Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle-Lite
提交
d51a0718
P
Paddle-Lite
项目概览
PaddlePaddle
/
Paddle-Lite
通知
331
Star
4
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
271
列表
看板
标记
里程碑
合并请求
78
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle-Lite
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
271
Issue
271
列表
看板
标记
里程碑
合并请求
78
合并请求
78
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
d51a0718
编写于
10月 15, 2018
作者:
X
xiebaiyuan
提交者:
GitHub
10月 15, 2018
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #1081 from xiebaiyuan/develop
trans gemm to class && add multi instance support && to unit test
上级
f90dd802
a058f56d
变更
7
显示空白变更内容
内联
并排
Showing
7 changed file
with
488 addition
and
335 deletion
+488
-335
src/operators/math/gemm.cpp
src/operators/math/gemm.cpp
+187
-176
src/operators/math/gemm.h
src/operators/math/gemm.h
+149
-129
src/operators/math/gru_compute.cpp
src/operators/math/gru_compute.cpp
+9
-6
src/operators/math/math_function.cpp
src/operators/math/math_function.cpp
+29
-20
test/CMakeLists.txt
test/CMakeLists.txt
+7
-2
test/common/test_gemm_accuracy.cpp
test/common/test_gemm_accuracy.cpp
+3
-2
test/net/test_multi_inference_predict.cpp
test/net/test_multi_inference_predict.cpp
+104
-0
未找到文件。
src/operators/math/gemm.cpp
浏览文件 @
d51a0718
...
...
@@ -26,7 +26,7 @@ limitations under the License. */
namespace
paddle_mobile
{
namespace
operators
{
namespace
math
{
int
MC
=
0
;
/*
int MC = 0;
int KC = 0;
int NC = 0;
...
...
@@ -40,7 +40,7 @@ typedef void (*FnAddDot)(int, const float *, const float *, float *, int);
FnPack procPackA;
FnPack procPackB;
FnAddDot
procAddDot
;
FnAddDot procAddDot;
*/
/*
// 将A矩阵分块复制到连续内存(ColMajor)
...
...
@@ -101,7 +101,7 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
*/
// 将A矩阵分块复制到连续内存(RowMajor)
void
PackMatrixA_4r
(
int
m
,
int
k
,
int
m_tail
,
const
float
*
A
,
int
lda
,
void
Gemm
::
PackMatrixA_4r
(
int
m
,
int
k
,
int
m_tail
,
const
float
*
A
,
int
lda
,
float
*
buffer
)
{
const
float
*
a0
,
*
a1
,
*
a2
,
*
a3
;
for
(
int
i
=
0
;
i
<
m
-
m_tail
;
i
+=
MR
)
{
...
...
@@ -142,7 +142,7 @@ void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda,
}
}
void
PackMatrixA_6r
(
int
m
,
int
k
,
int
m_tail
,
const
float
*
A
,
int
lda
,
void
Gemm
::
PackMatrixA_6r
(
int
m
,
int
k
,
int
m_tail
,
const
float
*
A
,
int
lda
,
float
*
buffer
)
{
const
int
i_length
=
m
-
m_tail
;
for
(
int
i
=
0
;
i
<
i_length
;
i
+=
MR
)
{
...
...
@@ -196,7 +196,7 @@ void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda,
}
}
void
PackMatrixA_omp_6r
(
int
m
,
int
k
,
int
m_tail
,
const
float
*
A
,
int
lda
,
void
Gemm
::
PackMatrixA_omp_6r
(
int
m
,
int
k
,
int
m_tail
,
const
float
*
A
,
int
lda
,
float
*
buffer
)
{
const
int
i_length
=
m
-
m_tail
;
#pragma omp parallel for
...
...
@@ -251,7 +251,7 @@ void PackMatrixA_omp_6r(int m, int k, int m_tail, const float *A, int lda,
}
}
void
PackMatrixA_8r
(
int
m
,
int
k
,
int
m_tail
,
const
float
*
A
,
int
lda
,
void
Gemm
::
PackMatrixA_8r
(
int
m
,
int
k
,
int
m_tail
,
const
float
*
A
,
int
lda
,
float
*
buffer
)
{
const
int
i_length
=
m
-
m_tail
;
for
(
int
i
=
0
;
i
<
i_length
;
i
+=
MR
)
{
...
...
@@ -317,7 +317,7 @@ void PackMatrixA_8r(int m, int k, int m_tail, const float *A, int lda,
}
}
void
PackMatrixA_omp_8r
(
int
m
,
int
k
,
int
m_tail
,
const
float
*
A
,
int
lda
,
void
Gemm
::
PackMatrixA_omp_8r
(
int
m
,
int
k
,
int
m_tail
,
const
float
*
A
,
int
lda
,
float
*
buffer
)
{
const
int
i_length
=
m
-
m_tail
;
#pragma omp parallel for
...
...
@@ -385,7 +385,7 @@ void PackMatrixA_omp_8r(int m, int k, int m_tail, const float *A, int lda,
}
// 将B矩阵分块复制到连续内存(RowMajor)
void
PackMatrixB_8c
(
int
k
,
int
n
,
int
n_tail
,
const
float
*
B
,
int
ldb
,
void
Gemm
::
PackMatrixB_8c
(
int
k
,
int
n
,
int
n_tail
,
const
float
*
B
,
int
ldb
,
float
*
buffer
)
{
const
int
j_length
=
n
-
n_tail
;
for
(
int
j
=
0
;
j
<
j_length
;
j
+=
NR
)
{
...
...
@@ -436,7 +436,7 @@ void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb,
}
}
void
PackMatrixB_omp_8c
(
int
k
,
int
n
,
int
n_tail
,
const
float
*
B
,
int
ldb
,
void
Gemm
::
PackMatrixB_omp_8c
(
int
k
,
int
n
,
int
n_tail
,
const
float
*
B
,
int
ldb
,
float
*
buffer
)
{
const
int
j_length
=
n
-
n_tail
;
#pragma omp parallel for
...
...
@@ -489,7 +489,7 @@ void PackMatrixB_omp_8c(int k, int n, int n_tail, const float *B, int ldb,
}
#if __aarch64__
void
PackMatrixB_12c
(
int
k
,
int
n
,
int
n_tail
,
const
float
*
B
,
int
ldb
,
void
Gemm
::
PackMatrixB_12c
(
int
k
,
int
n
,
int
n_tail
,
const
float
*
B
,
int
ldb
,
float
*
buffer
)
{
const
int
j_length
=
n
-
n_tail
;
for
(
int
j
=
0
;
j
<
j_length
;
j
+=
NR
)
{
...
...
@@ -519,8 +519,8 @@ void PackMatrixB_12c(int k, int n, int n_tail, const float *B, int ldb,
}
}
void
PackMatrixB_omp_12c
(
int
k
,
int
n
,
int
n_tail
,
const
float
*
B
,
int
ldb
,
float
*
buffer
)
{
void
Gemm
::
PackMatrixB_omp_12c
(
int
k
,
int
n
,
int
n_tail
,
const
float
*
B
,
int
ldb
,
float
*
buffer
)
{
const
int
j_length
=
n
-
n_tail
;
#pragma omp parallel for
for
(
int
j
=
0
;
j
<
j_length
;
j
+=
NR
)
{
...
...
@@ -550,7 +550,7 @@ void PackMatrixB_omp_12c(int k, int n, int n_tail, const float *B, int ldb,
}
}
void
PackMatrixB_16c
(
int
k
,
int
n
,
int
n_tail
,
const
float
*
B
,
int
ldb
,
void
Gemm
::
PackMatrixB_16c
(
int
k
,
int
n
,
int
n_tail
,
const
float
*
B
,
int
ldb
,
float
*
buffer
)
{
const
int
j_length
=
n
-
n_tail
;
for
(
int
j
=
0
;
j
<
n
-
n_tail
;
j
+=
NR
)
{
...
...
@@ -580,8 +580,8 @@ void PackMatrixB_16c(int k, int n, int n_tail, const float *B, int ldb,
}
}
void
PackMatrixB_omp_16c
(
int
k
,
int
n
,
int
n_tail
,
const
float
*
B
,
int
ldb
,
float
*
buffer
)
{
void
Gemm
::
PackMatrixB_omp_16c
(
int
k
,
int
n
,
int
n_tail
,
const
float
*
B
,
int
ldb
,
float
*
buffer
)
{
const
int
j_length
=
n
-
n_tail
;
#pragma omp parallel for
for
(
int
j
=
0
;
j
<
n
-
n_tail
;
j
+=
NR
)
{
...
...
@@ -613,8 +613,9 @@ void PackMatrixB_omp_16c(int k, int n, int n_tail, const float *B, int ldb,
#endif // __aarch64__
// 分块矩阵乘法
void
InnerKernel
(
int
mc
,
int
nc
,
float
alpha
,
const
float
*
a
,
const
float
*
b
,
float
beta
,
float
*
c
,
float
*
C
,
int
ldc
,
bool
relu
)
{
void
Gemm
::
InnerKernel
(
int
mc
,
int
nc
,
float
alpha
,
const
float
*
a
,
const
float
*
b
,
float
beta
,
float
*
c
,
float
*
C
,
int
ldc
,
bool
relu
)
{
#pragma omp parallel for
for
(
int
j
=
0
;
j
<
nc
;
j
+=
NR
)
{
for
(
int
i
=
0
;
i
<
mc
;
i
+=
MR
)
{
...
...
@@ -648,7 +649,7 @@ void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
}
// 分块矩阵乘法
void
InnerKernelWithBias
(
int
mc
,
int
nc
,
float
alpha
,
const
float
*
a
,
void
Gemm
::
InnerKernelWithBias
(
int
mc
,
int
nc
,
float
alpha
,
const
float
*
a
,
const
float
*
b
,
float
beta
,
float
*
c
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
bias
)
{
#pragma omp parallel for
...
...
@@ -692,9 +693,10 @@ void InnerKernelWithBias(int mc, int nc, float alpha, const float *a,
}
// 分块矩阵乘法
void
InnerKernelWithBn
(
int
mc
,
int
nc
,
float
alpha
,
const
float
*
a
,
const
float
*
b
,
float
beta
,
float
*
c
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
new_scale
,
float
*
new_bias
)
{
void
Gemm
::
InnerKernelWithBn
(
int
mc
,
int
nc
,
float
alpha
,
const
float
*
a
,
const
float
*
b
,
float
beta
,
float
*
c
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
new_scale
,
float
*
new_bias
)
{
#pragma omp parallel for
for
(
int
j
=
0
;
j
<
nc
;
j
+=
NR
)
{
for
(
int
i
=
0
;
i
<
mc
;
i
+=
MR
)
{
...
...
@@ -717,10 +719,10 @@ void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
}
// 分块矩阵乘法
void
InnerKernelWithBnAdd
(
int
mc
,
int
nc
,
float
alpha
,
const
float
*
a
,
void
Gemm
::
InnerKernelWithBnAdd
(
int
mc
,
int
nc
,
float
alpha
,
const
float
*
a
,
const
float
*
b
,
float
beta
,
float
*
c
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
new_scale
,
float
*
new_bias
,
float
*
bias
)
{
int
ldc
,
bool
relu
,
float
*
new_scale
,
float
*
new_bias
,
float
*
bias
)
{
#pragma omp parallel for
for
(
int
j
=
0
;
j
<
nc
;
j
+=
NR
)
{
for
(
int
i
=
0
;
i
<
mc
;
i
+=
MR
)
{
...
...
@@ -737,7 +739,7 @@ void InnerKernelWithBnAdd(int mc, int nc, float alpha, const float *a,
WriteWithBnAddRelu
(
mc
,
nc
,
c
,
C
,
ldc
,
new_scale
,
new_bias
,
bias
);
}
void
InnerKernelWithPRelu
(
int
mc
,
int
nc
,
const
float
*
a
,
const
float
*
b
,
void
Gemm
::
InnerKernelWithPRelu
(
int
mc
,
int
nc
,
const
float
*
a
,
const
float
*
b
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
p
,
std
::
string
mode
,
float
*
bias
,
float
*
bias1
)
{
#pragma omp parallel for
...
...
@@ -759,7 +761,7 @@ void InnerKernelWithPRelu(int mc, int nc, const float *a, const float *b,
#if __ARM_NEON
#if __aarch64__
void
AddDot4x4
(
int
k
,
const
float
*
a
,
const
float
*
b
,
float
*
c
,
int
ldc
)
{
void
Gemm
::
AddDot4x4
(
int
k
,
const
float
*
a
,
const
float
*
b
,
float
*
c
,
int
ldc
)
{
// init C
float32x4_t
cv0
=
vdupq_n_f32
(
0.0
);
float32x4_t
cv1
=
vdupq_n_f32
(
0.0
);
...
...
@@ -794,7 +796,7 @@ void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
// float32x4x4_t cv = {cv0, cv1, cv2, cv3};
}
void
AddDot4x8
(
int
k
,
const
float
*
a
,
const
float
*
b
,
float
*
c
,
int
ldc
)
{
void
Gemm
::
AddDot4x8
(
int
k
,
const
float
*
a
,
const
float
*
b
,
float
*
c
,
int
ldc
)
{
// init C
float32x4_t
cv0
=
vdupq_n_f32
(
0.0
);
float32x4_t
cv1
=
vdupq_n_f32
(
0.0
);
...
...
@@ -844,7 +846,7 @@ void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) {
// 分块矩阵乘法结果回写
// C = A * B
void
WriteBasic
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
)
{
void
Gemm
::
WriteBasic
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
)
{
int
nc1
=
nc
/
4
;
int
_nc1
=
nc
%
4
;
...
...
@@ -877,10 +879,10 @@ void WriteBasic(int mc, int nc, float *c, float *C, int ldc) {
}
// C = alpha * A * B + beta * C
void
WriteWithAlphaBeta
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
)
{}
void
Gemm
::
WriteWithAlphaBeta
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
)
{}
// C = A * B + C
void
WriteWithAdd
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
)
{
void
Gemm
::
WriteWithAdd
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
)
{
int
nc1
=
nc
/
4
;
int
_nc1
=
nc
%
4
;
...
...
@@ -917,7 +919,8 @@ void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) {
}
}
// C = A * B + bias
void
WriteWithAddV1
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
bias
)
{
void
Gemm
::
WriteWithAddV1
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
bias
)
{
int
nc1
=
nc
/
4
;
int
_nc1
=
nc
%
4
;
...
...
@@ -955,7 +958,7 @@ void WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc, float *bias) {
}
// C = A * B + C, relu(C)
void
WriteWithAddRelu
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
)
{
void
Gemm
::
WriteWithAddRelu
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
)
{
int
nc1
=
nc
/
4
;
int
_nc1
=
nc
%
4
;
...
...
@@ -996,7 +999,7 @@ void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) {
}
// C = A * B + bias, relu(C)
void
WriteWithAddReluV1
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
void
Gemm
::
WriteWithAddReluV1
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
bias
)
{
int
nc1
=
nc
/
4
;
int
_nc1
=
nc
%
4
;
...
...
@@ -1038,8 +1041,9 @@ void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc,
}
// C = A * B + C,prelu(C)
void
WriteWithAddPRelu
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
p
,
std
::
string
mode
,
float
*
bias
,
float
*
bias1
)
{
void
Gemm
::
WriteWithAddPRelu
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
p
,
std
::
string
mode
,
float
*
bias
,
float
*
bias1
)
{
int
nc1
=
nc
/
4
;
int
_nc1
=
nc
%
4
;
...
...
@@ -1114,8 +1118,8 @@ void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p,
}
// C = A * B, batchnorm(C)
void
WriteWithBn
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
new_scale
,
float
*
new_bias
)
{
void
Gemm
::
WriteWithBn
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
new_scale
,
float
*
new_bias
)
{
int
nc1
=
nc
/
4
;
int
_nc1
=
nc
%
4
;
...
...
@@ -1159,7 +1163,7 @@ void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale,
}
// C = A * B, batchnorm(C), relu(C)
void
WriteWithBnRelu
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
void
Gemm
::
WriteWithBnRelu
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
new_scale
,
float
*
new_bias
)
{
int
nc1
=
nc
/
4
;
int
_nc1
=
nc
%
4
;
...
...
@@ -1205,7 +1209,7 @@ void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
}
// C = A * B, batchnorm(C),C = C + bias; relu(C)
void
WriteWithBnAddRelu
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
void
Gemm
::
WriteWithBnAddRelu
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
new_scale
,
float
*
new_bias
,
float
*
bias
)
{
int
nc1
=
nc
/
4
;
int
_nc1
=
nc
%
4
;
...
...
@@ -1259,7 +1263,7 @@ void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
#else
void
AddDot4x4
(
int
k
,
const
float
*
a
,
const
float
*
b
,
float
*
c
,
int
ldc
)
{
void
Gemm
::
AddDot4x4
(
int
k
,
const
float
*
a
,
const
float
*
b
,
float
*
c
,
int
ldc
)
{
const
float
*
a_ptr
,
*
b_ptr
;
a_ptr
=
a
;
b_ptr
=
b
;
...
...
@@ -1330,10 +1334,9 @@ void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
}
/*
void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc,
bool relu) {
float *bufferC = static_cast<float *>(memory::Alloc(sizeof(float) * n));
void Gemm::VectorKernel(int m, int n, int k, float alpha, const float *A, int
lda, const float *B, int ldb, float beta, float *C, int ldc, bool relu) { float
*bufferC = static_cast<float *>(memory::Alloc(sizeof(float) * n));
const float *a0, *b0, *b1, *b2, *b3;
float *c0, *C0;
...
...
@@ -1552,7 +1555,7 @@ void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
}
}
void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
void
Gemm::
VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
int lda, const float *B, int ldb, float beta, float *C,
int ldc, bool relu, float *new_scale, float *new_bias) {
float *bufferC = static_cast<float *>(memory::Alloc(sizeof(float) * n));
...
...
@@ -1764,7 +1767,7 @@ void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
}
*/
void
AddDot4x8
(
int
k
,
const
float
*
a
,
const
float
*
b
,
float
*
c
,
int
ldc
)
{
void
Gemm
::
AddDot4x8
(
int
k
,
const
float
*
a
,
const
float
*
b
,
float
*
c
,
int
ldc
)
{
const
float
*
a_ptr
,
*
b_ptr
;
a_ptr
=
a
;
b_ptr
=
b
;
...
...
@@ -1872,7 +1875,7 @@ void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) {
}
// C = A * B
void
WriteBasic
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
)
{
void
Gemm
::
WriteBasic
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
)
{
int
nc1
=
nc
/
16
;
int
_nc1
=
nc
%
16
;
int
step
=
4
*
ldc
;
...
...
@@ -1929,10 +1932,10 @@ void WriteBasic(int mc, int nc, float *c, float *C, int ldc) {
}
// C = alpha * A * B + beta * C
void
WriteWithAlphaBeta
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
)
{}
void
Gemm
::
WriteWithAlphaBeta
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
)
{}
// C = A * B + C
void
WriteWithAdd
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
)
{
void
Gemm
::
WriteWithAdd
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
)
{
int
nc1
=
nc
/
16
;
int
_nc1
=
nc
%
16
;
int
step
=
4
*
ldc
;
...
...
@@ -1996,7 +1999,8 @@ void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) {
}
// C = A * B + bias
void
WriteWithAddV1
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
bias
)
{
void
Gemm
::
WriteWithAddV1
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
bias
)
{
int
nc1
=
nc
/
4
;
int
_nc1
=
nc
%
4
;
...
...
@@ -2034,7 +2038,7 @@ void WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc, float *bias) {
}
// C = A * B + C, relu(C)
void
WriteWithAddRelu
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
)
{
void
Gemm
::
WriteWithAddRelu
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
)
{
int
nc1
=
nc
/
16
;
int
_nc1
=
nc
%
16
;
int
step
=
4
*
ldc
;
...
...
@@ -2108,7 +2112,7 @@ void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) {
}
// C = A * B + bias, relu(C)
void
WriteWithAddReluV1
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
void
Gemm
::
WriteWithAddReluV1
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
bias
)
{
int
nc1
=
nc
/
4
;
int
_nc1
=
nc
%
4
;
...
...
@@ -2149,8 +2153,9 @@ void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc,
}
}
void
WriteWithAddPRelu
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
p
,
std
::
string
mode
,
float
*
bias
,
float
*
bias1
)
{
void
Gemm
::
WriteWithAddPRelu
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
p
,
std
::
string
mode
,
float
*
bias
,
float
*
bias1
)
{
if
(
nc
<
4
)
{
if
(
bias1
==
nullptr
)
{
for
(
int
i
=
0
;
i
<
mc
;
++
i
)
{
...
...
@@ -2383,8 +2388,8 @@ void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p,
}
// C = A * B, batchnorm(C)
void
WriteWithBn
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
scale
,
float
*
bias
)
{
void
Gemm
::
WriteWithBn
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
scale
,
float
*
bias
)
{
if
(
nc
<
4
)
{
for
(
int
i
=
0
;
i
<
mc
;
++
i
)
{
for
(
int
j
=
0
;
j
<
nc
;
++
j
)
{
...
...
@@ -2484,8 +2489,8 @@ void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *scale,
}
// C = A * B, batchnorm(C), relu(C)
void
WriteWithBnRelu
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
scale
,
float
*
bias
)
{
void
Gemm
::
WriteWithBnRelu
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
scale
,
float
*
bias
)
{
if
(
nc
<
4
)
{
for
(
int
i
=
0
;
i
<
mc
;
++
i
)
{
for
(
int
j
=
0
;
j
<
nc
;
++
j
)
{
...
...
@@ -2595,7 +2600,7 @@ void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, float *scale,
}
// C = A * B, batchnorm(C),C = C + bias; relu(C)
void
WriteWithBnAddRelu
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
void
Gemm
::
WriteWithBnAddRelu
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
new_scale
,
float
*
new_bias
,
float
*
bias
)
{
int
nc1
=
nc
/
4
;
int
_nc1
=
nc
%
4
;
...
...
@@ -2649,7 +2654,7 @@ void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
/*
// C = A * B
void VecWriteBasic(int n, float *c, float *C, int ldc) {
void
Gemm::
VecWriteBasic(int n, float *c, float *C, int ldc) {
int nc1 = n / 16;
int _nc1 = n % 16;
int nc2 = _nc1 / 4;
...
...
@@ -2695,10 +2700,10 @@ void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
}
// C = alpha * A * B + beta * C
void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc) {}
void
Gemm::
VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc) {}
// C = A * B + C
void VecWriteWithAdd(int n, float *c, float *C, int ldc) {
void
Gemm::
VecWriteWithAdd(int n, float *c, float *C, int ldc) {
int nc1 = n / 16;
int _nc1 = n % 16;
...
...
@@ -2736,7 +2741,7 @@ void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
}
// C = A * B + C, relu(C)
void VecWriteWithAddRelu(int n, float *c, float *C, int ldc) {
void
Gemm::
VecWriteWithAddRelu(int n, float *c, float *C, int ldc) {
int nc1 = n / 16;
int _nc1 = n % 16;
...
...
@@ -2784,7 +2789,7 @@ void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
}
// C = A * B, batchnorm(C)
void VecWriteWithBn(int n, float *c, float *C, int ldc, float *scale,
void
Gemm::
VecWriteWithBn(int n, float *c, float *C, int ldc, float *scale,
float *bias) {
int nc1 = n / 16;
int _nc1 = n % 16;
...
...
@@ -2850,12 +2855,9 @@ void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
}
// C = A * B, batchnorm(C), relu(C)
void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *scale,
float *bias) {
int nc1 = n / 16;
int _nc1 = n % 16;
int nc2 = _nc1 / 4;
int nc3 = 16 - 4 * (_nc1 % 4);
void Gemm::VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float
*scale, float *bias) { int nc1 = n / 16; int _nc1 = n % 16; int nc2 = _nc1 /
4; int nc3 = 16 - 4 * (_nc1 % 4);
asm volatile(
"vmov.f32 q14, #0.0 \n\t"
...
...
@@ -2926,7 +2928,7 @@ void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
#endif // __aarch64__
#else
void
AddDot4x4
(
int
k
,
const
float
*
a
,
const
float
*
b
,
float
*
c
,
int
ldc
)
{
void
Gemm
::
AddDot4x4
(
int
k
,
const
float
*
a
,
const
float
*
b
,
float
*
c
,
int
ldc
)
{
float
*
c0
,
*
c1
,
*
c2
,
*
c3
;
c0
=
c
;
c1
=
c
+
ldc
;
...
...
@@ -2962,38 +2964,42 @@ void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
}
}
void
AddDot4x8
(
int
k
,
const
float
*
a
,
const
float
*
b
,
float
*
c
,
int
ldc
)
{}
void
Gemm
::
AddDot4x8
(
int
k
,
const
float
*
a
,
const
float
*
b
,
float
*
c
,
int
ldc
)
{
}
void
WriteBasic
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
)
{}
void
Gemm
::
WriteBasic
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
)
{}
void
WriteWithAlphaBeta
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
)
{}
void
Gemm
::
WriteWithAlphaBeta
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
)
{}
void
WriteWithAdd
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
)
{}
void
Gemm
::
WriteWithAdd
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
)
{}
void
WriteWithAddV1
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
bias
)
{}
void
Gemm
::
WriteWithAddV1
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
bias
)
{}
void
WriteWithAddRelu
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
)
{}
void
Gemm
::
WriteWithAddRelu
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
)
{}
void
WriteWithAddReluV1
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
void
Gemm
::
WriteWithAddReluV1
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
bias
)
{}
void
WriteWithAddPRelu
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
p
,
std
::
string
mode
,
float
*
bias
,
float
*
bias1
)
{}
void
Gemm
::
WriteWithAddPRelu
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
p
,
std
::
string
mode
,
float
*
bias
,
float
*
bias1
)
{}
void
WriteWithBn
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
new_scale
,
float
*
new_bias
)
{}
void
Gemm
::
WriteWithBn
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
new_scale
,
float
*
new_bias
)
{}
void
WriteWithBnRelu
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
void
Gemm
::
WriteWithBnRelu
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
new_scale
,
float
*
new_bias
)
{}
void
WriteWithBnAddRelu
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
new_scale
,
float
*
new_bias
,
float
*
bias1
)
{}
void
Gemm
::
WriteWithBnAddRelu
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
new_scale
,
float
*
new_bias
,
float
*
bias1
)
{
}
#endif // __ARM_NEON
// 32位 float 矩阵乘法
void
Sgemm
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
bias
)
{
void
Gemm
::
Sgemm
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
bias
)
{
// L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73)
// L2 cache is 0.5~4 Mib (Contex-A72 cluster)
int
L1
=
32
*
1024
;
...
...
@@ -3063,9 +3069,10 @@ void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
paddle_mobile
::
memory
::
Free
(
zero
);
}
void
SgemmWithBn
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
new_scale
,
float
*
new_bias
,
float
*
bias
)
{
void
Gemm
::
SgemmWithBn
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
new_scale
,
float
*
new_bias
,
float
*
bias
)
{
// L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73)
// L2 cache is 0.5~4 Mib (Contex-A72 cluster)
int
L1
=
32
*
1024
;
...
...
@@ -3136,7 +3143,7 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
paddle_mobile
::
memory
::
Free
(
zero
);
}
void
SgemmWithPRelu
(
int
m
,
int
n
,
int
k
,
const
float
*
A
,
int
lda
,
void
Gemm
::
SgemmWithPRelu
(
int
m
,
int
n
,
int
k
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
*
C
,
int
ldc
,
float
*
p
,
std
::
string
mode
,
float
*
bias
,
float
*
bias1
)
{
// L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73)
...
...
@@ -3212,7 +3219,7 @@ void SgemmWithPRelu(int m, int n, int k, const float *A, int lda,
}
// 32位 float 矩阵乘法
void
Sgemm_omp
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
void
Gemm
::
Sgemm_omp
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
bias
)
{
#ifdef _OPENMP
...
...
@@ -3237,18 +3244,18 @@ void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
NC
=
(
n
+
NR
-
1
)
/
NR
*
NR
;
#if __aarch64__
procPackA
=
PackMatrixA_6r
;
procPackB
=
PackMatrixB_omp_16c
;
procAddDot
=
AddDot6x16
;
procPackA
=
&
Gemm
::
PackMatrixA_6r
;
procPackB
=
&
Gemm
::
PackMatrixB_omp_16c
;
procAddDot
=
&
Gemm
::
AddDot6x16
;
#else
procPackA
=
PackMatrixA_6r
;
procPackB
=
PackMatrixB_omp_8c
;
procAddDot
=
AddDot6x8
;
procPackA
=
&
Gemm
::
PackMatrixA_6r
;
procPackB
=
&
Gemm
::
PackMatrixB_omp_8c
;
procAddDot
=
&
Gemm
::
AddDot6x8
;
#endif
packedB
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
KC
*
NC
));
procPackB
(
KC
,
NC
,
NC
%
NR
,
B
,
ldb
,
packedB
);
(
*
this
.
*
procPackB
)
(
KC
,
NC
,
NC
%
NR
,
B
,
ldb
,
packedB
);
packedA
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
MC
*
KC
*
max_threads
));
}
else
{
...
...
@@ -3265,18 +3272,19 @@ void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
MC
=
(
m
+
MR
-
1
)
/
MR
*
MR
;
#if __aarch64__
procPackA
=
PackMatrixA_omp_6r
;
procPackB
=
PackMatrixB_16c
;
procAddDot
=
AddDot6x16
;
procPackA
=
&
Gemm
::
PackMatrixA_omp_6r
;
procPackB
=
&
Gemm
::
PackMatrixB_16c
;
procAddDot
=
&
Gemm
::
AddDot6x16
;
#else
procPackA
=
PackMatrixA_omp_6r
;
procPackB
=
PackMatrixB_8c
;
procAddDot
=
AddDot6x8
;
procPackA
=
&
Gemm
::
PackMatrixA_omp_6r
;
procPackB
=
&
Gemm
::
PackMatrixB_8c
;
procAddDot
=
&
Gemm
::
AddDot6x8
;
#endif
packedA
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
MC
*
KC
));
procPackA
(
MC
,
KC
,
MC
%
MR
,
A
,
lda
,
packedA
);
(
*
this
.
*
procPackA
)
(
MC
,
KC
,
MC
%
MR
,
A
,
lda
,
packedA
);
packedB
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
KC
*
NC
*
max_threads
));
}
...
...
@@ -3298,7 +3306,7 @@ void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
mc
=
s_min
(
m
-
i
,
MC
);
float
*
local_A
=
packedA
+
MC
*
KC
*
local_threads
;
float
*
local_C
=
packedC
+
MC
*
NC
*
local_threads
;
procPackA
(
mc
,
KC
,
mc
%
MR
,
&
A
(
i
,
0
),
lda
,
local_A
);
(
*
this
.
*
procPackA
)
(
mc
,
KC
,
mc
%
MR
,
&
A
(
i
,
0
),
lda
,
local_A
);
InnerKernelWithBias
(
mc
,
n
,
alpha
,
local_A
,
packedB
,
beta
,
local_C
,
&
C
(
i
,
0
),
ldc
,
relu
,
bias
+
i
);
}
...
...
@@ -3315,7 +3323,7 @@ void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
nc
=
s_min
(
n
-
j
,
NC
);
float
*
local_B
=
packedB
+
KC
*
NC
*
local_threads
;
float
*
local_C
=
packedC
+
MC
*
NC
*
local_threads
;
procPackB
(
KC
,
nc
,
nc
%
NR
,
&
B
(
0
,
j
),
ldb
,
local_B
);
(
*
this
.
*
procPackB
)
(
KC
,
nc
,
nc
%
NR
,
&
B
(
0
,
j
),
ldb
,
local_B
);
InnerKernelWithBias
(
m
,
nc
,
alpha
,
packedA
,
local_B
,
beta
,
local_C
,
&
C
(
0
,
j
),
ldc
,
relu
,
bias
);
}
...
...
@@ -3327,10 +3335,10 @@ void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
paddle_mobile
::
memory
::
Free
(
zero
);
}
void
SgemmWithBn_omp
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
new_scale
,
float
*
new_bias
,
float
*
bias
)
{
void
Gemm
::
SgemmWithBn_omp
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
new_scale
,
float
*
new_bias
,
float
*
bias
)
{
#ifdef _OPENMP
int
max_threads
=
omp_get_max_threads
();
#else
...
...
@@ -3353,18 +3361,18 @@ void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda,
NC
=
(
n
+
NR
-
1
)
/
NR
*
NR
;
#if __aarch64__
procPackA
=
PackMatrixA_6r
;
procPackB
=
PackMatrixB_omp_16c
;
procAddDot
=
AddDot6x16
;
procPackA
=
&
Gemm
::
PackMatrixA_6r
;
procPackB
=
&
Gemm
::
PackMatrixB_omp_16c
;
procAddDot
=
&
Gemm
::
AddDot6x16
;
#else
procPackA
=
PackMatrixA_6r
;
procPackB
=
PackMatrixB_omp_8c
;
procAddDot
=
AddDot6x8
;
procPackA
=
&
Gemm
::
PackMatrixA_6r
;
procPackB
=
&
Gemm
::
PackMatrixB_omp_8c
;
procAddDot
=
&
Gemm
::
AddDot6x8
;
#endif
packedB
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
KC
*
NC
));
procPackB
(
KC
,
NC
,
NC
%
NR
,
B
,
ldb
,
packedB
);
(
*
this
.
*
procPackB
)
(
KC
,
NC
,
NC
%
NR
,
B
,
ldb
,
packedB
);
packedA
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
MC
*
KC
*
max_threads
));
}
else
{
...
...
@@ -3381,18 +3389,18 @@ void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda,
MC
=
(
m
+
MR
-
1
)
/
MR
*
MR
;
#if __aarch64__
procPackA
=
PackMatrixA_omp_6r
;
procPackB
=
PackMatrixB_16c
;
procAddDot
=
AddDot6x16
;
procPackA
=
&
Gemm
::
PackMatrixA_omp_6r
;
procPackB
=
&
Gemm
::
PackMatrixB_16c
;
procAddDot
=
&
Gemm
::
AddDot6x16
;
#else
procPackA
=
PackMatrixA_omp_6r
;
procPackB
=
PackMatrixB_8c
;
procAddDot
=
AddDot6x8
;
procPackA
=
&
Gemm
::
PackMatrixA_omp_6r
;
procPackB
=
&
Gemm
::
PackMatrixB_8c
;
procAddDot
=
&
Gemm
::
AddDot6x8
;
#endif
packedA
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
MC
*
KC
));
procPackA
(
MC
,
KC
,
MC
%
MR
,
A
,
lda
,
packedA
);
(
*
this
.
*
procPackA
)
(
MC
,
KC
,
MC
%
MR
,
A
,
lda
,
packedA
);
packedB
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
KC
*
NC
*
max_threads
));
}
...
...
@@ -3414,7 +3422,7 @@ void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda,
mc
=
s_min
(
m
-
i
,
MC
);
float
*
local_A
=
packedA
+
MC
*
KC
*
local_threads
;
float
*
local_C
=
packedC
+
MC
*
NC
*
local_threads
;
procPackA
(
mc
,
KC
,
mc
%
MR
,
&
A
(
i
,
0
),
lda
,
local_A
);
(
*
this
.
*
procPackA
)
(
mc
,
KC
,
mc
%
MR
,
&
A
(
i
,
0
),
lda
,
local_A
);
if
(
bias
==
nullptr
)
{
InnerKernelWithBn
(
mc
,
n
,
alpha
,
local_A
,
packedB
,
beta
,
local_C
,
&
C
(
i
,
0
),
ldc
,
relu
,
new_scale
+
i
,
new_bias
+
i
);
...
...
@@ -3437,7 +3445,7 @@ void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda,
nc
=
s_min
(
n
-
j
,
NC
);
float
*
local_B
=
packedB
+
KC
*
NC
*
local_threads
;
float
*
local_C
=
packedC
+
MC
*
NC
*
local_threads
;
procPackB
(
KC
,
nc
,
nc
%
NR
,
&
B
(
0
,
j
),
ldb
,
local_B
);
(
*
this
.
*
procPackB
)
(
KC
,
nc
,
nc
%
NR
,
&
B
(
0
,
j
),
ldb
,
local_B
);
if
(
bias
==
nullptr
)
{
InnerKernelWithBn
(
m
,
nc
,
alpha
,
packedA
,
local_B
,
beta
,
local_C
,
&
C
(
0
,
j
),
ldc
,
relu
,
new_scale
,
new_bias
);
...
...
@@ -3455,9 +3463,10 @@ void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda,
paddle_mobile
::
memory
::
Free
(
zero
);
}
void
SgemmWithPRelu_omp
(
int
m
,
int
n
,
int
k
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
*
C
,
int
ldc
,
float
*
p
,
std
::
string
mode
,
float
*
bias
,
float
*
bias1
)
{
void
Gemm
::
SgemmWithPRelu_omp
(
int
m
,
int
n
,
int
k
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
*
C
,
int
ldc
,
float
*
p
,
std
::
string
mode
,
float
*
bias
,
float
*
bias1
)
{
#ifdef _OPENMP
int
max_threads
=
omp_get_max_threads
();
#else
...
...
@@ -3480,18 +3489,18 @@ void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda,
NC
=
(
n
+
NR
-
1
)
/
NR
*
NR
;
#if __aarch64__
procPackA
=
PackMatrixA_6r
;
procPackB
=
PackMatrixB_omp_16c
;
procAddDot
=
AddDot6x16
;
procPackA
=
&
Gemm
::
PackMatrixA_6r
;
procPackB
=
&
Gemm
::
PackMatrixB_omp_16c
;
procAddDot
=
&
Gemm
::
AddDot6x16
;
#else
procPackA
=
PackMatrixA_6r
;
procPackB
=
PackMatrixB_omp_8c
;
procAddDot
=
AddDot6x8
;
procPackA
=
&
Gemm
::
PackMatrixA_6r
;
procPackB
=
&
Gemm
::
PackMatrixB_omp_8c
;
procAddDot
=
&
Gemm
::
AddDot6x8
;
#endif
packedB
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
KC
*
NC
));
procPackB
(
KC
,
NC
,
NC
%
NR
,
B
,
ldb
,
packedB
);
(
*
this
.
*
procPackB
)
(
KC
,
NC
,
NC
%
NR
,
B
,
ldb
,
packedB
);
packedA
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
MC
*
KC
*
max_threads
));
}
else
{
...
...
@@ -3508,18 +3517,18 @@ void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda,
MC
=
(
m
+
MR
-
1
)
/
MR
*
MR
;
#if __aarch64__
procPackA
=
PackMatrixA_omp_6r
;
procPackB
=
PackMatrixB_16c
;
procAddDot
=
AddDot6x16
;
procPackA
=
&
Gemm
::
PackMatrixA_omp_6r
;
procPackB
=
&
Gemm
::
PackMatrixB_16c
;
procAddDot
=
&
Gemm
::
AddDot6x16
;
#else
procPackA
=
PackMatrixA_omp_6r
;
procPackB
=
PackMatrixB_8c
;
procAddDot
=
AddDot6x8
;
procPackA
=
&
Gemm
::
PackMatrixA_omp_6r
;
procPackB
=
&
Gemm
::
PackMatrixB_8c
;
procAddDot
=
&
Gemm
::
AddDot6x8
;
#endif
packedA
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
MC
*
KC
));
procPackA
(
MC
,
KC
,
MC
%
MR
,
A
,
lda
,
packedA
);
(
*
this
.
*
procPackA
)
(
MC
,
KC
,
MC
%
MR
,
A
,
lda
,
packedA
);
packedB
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
KC
*
NC
*
max_threads
));
}
...
...
@@ -3541,7 +3550,7 @@ void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda,
mc
=
s_min
(
m
-
i
,
MC
);
float
*
local_A
=
packedA
+
MC
*
KC
*
local_threads
;
float
*
local_C
=
packedC
+
MC
*
NC
*
local_threads
;
procPackA
(
mc
,
KC
,
mc
%
MR
,
&
A
(
i
,
0
),
lda
,
local_A
);
(
*
this
.
*
procPackA
)
(
mc
,
KC
,
mc
%
MR
,
&
A
(
i
,
0
),
lda
,
local_A
);
if
(
bias1
==
nullptr
)
{
InnerKernelWithPRelu
(
mc
,
n
,
local_A
,
packedB
,
local_C
,
&
C
(
i
,
0
),
ldc
,
p
+
i
,
mode
,
bias
+
i
,
nullptr
);
...
...
@@ -3563,7 +3572,7 @@ void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda,
nc
=
s_min
(
n
-
j
,
NC
);
float
*
local_B
=
packedB
+
KC
*
NC
*
local_threads
;
float
*
local_C
=
packedC
+
MC
*
NC
*
local_threads
;
procPackB
(
KC
,
nc
,
nc
%
NR
,
&
B
(
0
,
j
),
ldb
,
local_B
);
(
*
this
.
*
procPackB
)
(
KC
,
nc
,
nc
%
NR
,
&
B
(
0
,
j
),
ldb
,
local_B
);
if
(
bias1
==
nullptr
)
{
InnerKernelWithPRelu
(
m
,
nc
,
packedA
,
local_B
,
local_C
,
&
C
(
0
,
j
),
ldc
,
p
,
mode
,
bias
,
nullptr
);
...
...
@@ -3580,7 +3589,7 @@ void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda,
paddle_mobile
::
memory
::
Free
(
zero
);
}
void
AddDot6x8
(
int
k
,
const
float
*
a
,
const
float
*
b
,
float
*
c
,
int
ldc
)
{
void
Gemm
::
AddDot6x8
(
int
k
,
const
float
*
a
,
const
float
*
b
,
float
*
c
,
int
ldc
)
{
#if __ARM_NEON
#if __aarch64__
...
...
@@ -3867,7 +3876,8 @@ void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) {
}
#if __aarch64__
void
AddDot8x12
(
int
k
,
const
float
*
a
,
const
float
*
b
,
float
*
c
,
int
ldc
)
{
void
Gemm
::
AddDot8x12
(
int
k
,
const
float
*
a
,
const
float
*
b
,
float
*
c
,
int
ldc
)
{
const
float
*
a_ptr
,
*
b_ptr
;
a_ptr
=
a
;
b_ptr
=
b
;
...
...
@@ -3956,7 +3966,8 @@ void AddDot8x12(int k, const float *a, const float *b, float *c, int ldc) {
"v20"
,
"v21"
,
"v22"
,
"v23"
,
"v24"
,
"v25"
,
"v26"
,
"v27"
,
"v28"
);
}
void
AddDot6x16
(
int
k
,
const
float
*
a
,
const
float
*
b
,
float
*
c
,
int
ldc
)
{
void
Gemm
::
AddDot6x16
(
int
k
,
const
float
*
a
,
const
float
*
b
,
float
*
c
,
int
ldc
)
{
const
float
*
a_ptr
,
*
b_ptr
;
a_ptr
=
a
;
b_ptr
=
b
;
...
...
src/operators/math/gemm.h
浏览文件 @
d51a0718
...
...
@@ -35,7 +35,9 @@ namespace paddle_mobile {
namespace
operators
{
namespace
math
{
/*
class
Gemm
{
public:
/*
// 将 A 矩阵分块复制到连续内存(ColMajor)
void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
float *buffer);
...
...
@@ -44,138 +46,156 @@ void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
float *buffer);
*/
// 将 A 矩阵分块复制到连续内存(RowMajor)
void
PackMatrixA_4r
(
int
m
,
int
k
,
int
m_tail
,
const
float
*
A
,
int
lda
,
typedef
void
(
Gemm
::*
FnPack
)(
int
,
int
,
int
,
const
float
*
,
int
,
float
*
);
typedef
void
(
Gemm
::*
FnAddDot
)(
int
,
const
float
*
,
const
float
*
,
float
*
,
int
);
FnPack
procPackA
;
FnPack
procPackB
;
FnAddDot
procAddDot
;
// 将 A 矩阵分块复制到连续内存(RowMajor)
void
PackMatrixA_4r
(
int
m
,
int
k
,
int
m_tail
,
const
float
*
A
,
int
lda
,
float
*
buffer
);
void
PackMatrixA_6r
(
int
m
,
int
k
,
int
m_tail
,
const
float
*
A
,
int
lda
,
void
PackMatrixA_6r
(
int
m
,
int
k
,
int
m_tail
,
const
float
*
A
,
int
lda
,
float
*
buffer
);
void
PackMatrixA_8r
(
int
m
,
int
k
,
int
m_tail
,
const
float
*
A
,
int
lda
,
void
PackMatrixA_8r
(
int
m
,
int
k
,
int
m_tail
,
const
float
*
A
,
int
lda
,
float
*
buffer
);
void
PackMatrixA_omp_6r
(
int
m
,
int
k
,
int
m_tail
,
const
float
*
A
,
int
lda
,
void
PackMatrixA_omp_6r
(
int
m
,
int
k
,
int
m_tail
,
const
float
*
A
,
int
lda
,
float
*
buffer
);
void
PackMatrixA_omp_8r
(
int
m
,
int
k
,
int
m_tail
,
const
float
*
A
,
int
lda
,
void
PackMatrixA_omp_8r
(
int
m
,
int
k
,
int
m_tail
,
const
float
*
A
,
int
lda
,
float
*
buffer
);
// 将 B 矩阵分块复制到连续内存(RowMajor)
void
PackMatrixB_8c
(
int
k
,
int
n
,
int
n_tail
,
const
float
*
B
,
int
ldb
,
// 将 B 矩阵分块复制到连续内存(RowMajor)
void
PackMatrixB_8c
(
int
k
,
int
n
,
int
n_tail
,
const
float
*
B
,
int
ldb
,
float
*
buffer
);
void
PackMatrixB_12c
(
int
k
,
int
n
,
int
n_tail
,
const
float
*
B
,
int
ldb
,
void
PackMatrixB_12c
(
int
k
,
int
n
,
int
n_tail
,
const
float
*
B
,
int
ldb
,
float
*
buffer
);
void
PackMatrixB_16c
(
int
k
,
int
n
,
int
n_tail
,
const
float
*
B
,
int
ldb
,
void
PackMatrixB_16c
(
int
k
,
int
n
,
int
n_tail
,
const
float
*
B
,
int
ldb
,
float
*
buffer
);
void
PackMatrixB_omp_8c
(
int
k
,
int
n
,
int
n_tail
,
const
float
*
B
,
int
ldb
,
void
PackMatrixB_omp_8c
(
int
k
,
int
n
,
int
n_tail
,
const
float
*
B
,
int
ldb
,
float
*
buffer
);
void
PackMatrixB_omp_12c
(
int
k
,
int
n
,
int
n_tail
,
const
float
*
B
,
int
ldb
,
void
PackMatrixB_omp_12c
(
int
k
,
int
n
,
int
n_tail
,
const
float
*
B
,
int
ldb
,
float
*
buffer
);
void
PackMatrixB_omp_16c
(
int
k
,
int
n
,
int
n_tail
,
const
float
*
B
,
int
ldb
,
void
PackMatrixB_omp_16c
(
int
k
,
int
n
,
int
n_tail
,
const
float
*
B
,
int
ldb
,
float
*
buffer
);
// 分块矩阵乘法
void
InnerKernel
(
int
mc
,
int
nc
,
float
alpha
,
const
float
*
a
,
const
float
*
b
,
// 分块矩阵乘法
void
InnerKernel
(
int
mc
,
int
nc
,
float
alpha
,
const
float
*
a
,
const
float
*
b
,
float
beta
,
float
*
c
,
float
*
C
,
int
ldc
,
bool
relu
);
void
InnerKernelWithBias
(
int
mc
,
int
nc
,
float
alpha
,
const
float
*
a
,
void
InnerKernelWithBias
(
int
mc
,
int
nc
,
float
alpha
,
const
float
*
a
,
const
float
*
b
,
float
beta
,
float
*
c
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
bias
);
void
InnerKernelWithBn
(
int
mc
,
int
nc
,
float
alpha
,
const
float
*
a
,
const
float
*
b
,
float
beta
,
float
*
c
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
new_scale
,
float
*
new_bias
);
void
InnerKernelWithBnAdd
(
int
mc
,
int
nc
,
float
alpha
,
const
float
*
a
,
void
InnerKernelWithBn
(
int
mc
,
int
nc
,
float
alpha
,
const
float
*
a
,
const
float
*
b
,
float
beta
,
float
*
c
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
new_scale
,
float
*
new_bias
,
float
*
bias
);
void
InnerKernelWithPRelu
(
int
mc
,
int
nc
,
const
float
*
a
,
const
float
*
b
,
int
ldc
,
bool
relu
,
float
*
new_scale
,
float
*
new_bias
);
void
InnerKernelWithBnAdd
(
int
mc
,
int
nc
,
float
alpha
,
const
float
*
a
,
const
float
*
b
,
float
beta
,
float
*
c
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
new_scale
,
float
*
new_bias
,
float
*
bias
);
void
InnerKernelWithPRelu
(
int
mc
,
int
nc
,
const
float
*
a
,
const
float
*
b
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
p
,
std
::
string
mode
,
float
*
bias
,
float
*
bias1
);
/*
// 向量矩阵乘法 (M = 1)
void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
/*
// 向量矩阵乘法 (M = 1)
void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc,
bool relu);
void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
int lda, const float *B, int ldb, float beta, float *C,
int ldc, bool relu, float *new_scale, float *new_bias);
*/
// 计算一个更小的 C 矩阵分块
void
AddDot4x4
(
int
k
,
const
float
*
a
,
const
float
*
b
,
float
*
c
,
int
ldc
);
void
AddDot4x8
(
int
k
,
const
float
*
a
,
const
float
*
b
,
float
*
c
,
int
ldc
);
void
AddDot6x8
(
int
k
,
const
float
*
a
,
const
float
*
b
,
float
*
c
,
int
ldc
);
void
AddDot8x12
(
int
k
,
const
float
*
a
,
const
float
*
b
,
float
*
c
,
int
ldc
);
void
AddDot6x16
(
int
k
,
const
float
*
a
,
const
float
*
b
,
float
*
c
,
int
ldc
);
// 分块矩阵乘法结果回写
// C = A * B
void
WriteBasic
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
);
// C = alpha * A * B + beta * C
void
WriteWithAlphaBeta
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
);
// C = A * B + C
void
WriteWithAdd
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
);
// C = A * B + bias
void
WriteWithAddV1
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
bias
);
// C = A * B + C, relu(C)
void
WriteWithAddRelu
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
);
// C = A * B + C,prelu(C)
void
WriteWithAddPRelu
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
p
,
void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
int lda, const float *B, int ldb, float beta, float
*C,
int ldc, bool relu, float *new_scale, float *new_bias);
*/
// 计算一个更小的 C 矩阵分块
void
AddDot4x4
(
int
k
,
const
float
*
a
,
const
float
*
b
,
float
*
c
,
int
ldc
);
void
AddDot4x8
(
int
k
,
const
float
*
a
,
const
float
*
b
,
float
*
c
,
int
ldc
);
void
AddDot6x8
(
int
k
,
const
float
*
a
,
const
float
*
b
,
float
*
c
,
int
ldc
);
void
AddDot8x12
(
int
k
,
const
float
*
a
,
const
float
*
b
,
float
*
c
,
int
ldc
);
void
AddDot6x16
(
int
k
,
const
float
*
a
,
const
float
*
b
,
float
*
c
,
int
ldc
);
// 分块矩阵乘法结果回写
// C = A * B
void
WriteBasic
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
);
// C = alpha * A * B + beta * C
void
WriteWithAlphaBeta
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
);
// C = A * B + C
void
WriteWithAdd
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
);
// C = A * B + bias
void
WriteWithAddV1
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
bias
);
// C = A * B + C, relu(C)
void
WriteWithAddRelu
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
);
// C = A * B + C,prelu(C)
void
WriteWithAddPRelu
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
p
,
std
::
string
mode
,
float
*
bias
,
float
*
bias1
);
// C = A * B + bias ,relu(C)
void
WriteWithAddReluV1
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
// C = A * B + bias ,relu(C)
void
WriteWithAddReluV1
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
bias
);
// C = A * B, batchnorm(C)
void
WriteWithBn
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
new_scale
,
float
*
new_bias
);
// C = A * B, batchnorm(C), relu(C)
void
WriteWithBnRelu
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
// C = A * B, batchnorm(C)
void
WriteWithBn
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
new_scale
,
float
*
new_bias
);
void
WriteWithBnAddRelu
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
// C = A * B, batchnorm(C), relu(C)
void
WriteWithBnRelu
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
new_scale
,
float
*
new_bias
);
void
WriteWithBnAddRelu
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
new_scale
,
float
*
new_bias
,
float
*
bias1
);
/*
// 向量矩阵乘法结果回写
// C = A * B
void VecWriteBasic(int n, float *c, float *C, int ldc);
// C = alpha * A * B + beta * C
void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc);
// C = A * B + C
void VecWriteWithAdd(int n, float *c, float *C, int ldc);
// C = A * B + C, relu(C)
void VecWriteWithAddRelu(int n, float *c, float *C, int ldc);
// C = A * B, batchnorm(C)
void VecWriteWithBn(int n, float *c, float *C, int ldc, float *new_scale,
/*
// 向量矩阵乘法结果回写
// C = A * B
void VecWriteBasic(int n, float *c, float *C, int ldc);
// C = alpha * A * B + beta * C
void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc);
// C = A * B + C
void VecWriteWithAdd(int n, float *c, float *C, int ldc);
// C = A * B + C, relu(C)
void VecWriteWithAddRelu(int n, float *c, float *C, int ldc);
// C = A * B, batchnorm(C)
void VecWriteWithBn(int n, float *c, float *C, int ldc, float *new_scale,
float *new_bias);
// C = A * B, batchnorm(C), relu(C)
void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *new_scale,
// C = A * B, batchnorm(C), relu(C)
void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *new_scale,
float *new_bias);
*/
*/
// 32位 float 矩阵乘法
void
Sgemm
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
// 32位 float 矩阵乘法
void
Sgemm
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
bias
);
// 32-bit float matrix multiplication, followed by batchnorm on the result
void
SgemmWithBn
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
// 32-bit float matrix multiplication, followed by batchnorm on the result
void
SgemmWithBn
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
new_scale
,
float
*
new_bias
,
float
*
bias
);
void
SgemmWithPRelu
(
int
m
,
int
n
,
int
k
,
const
float
*
A
,
int
lda
,
void
SgemmWithPRelu
(
int
m
,
int
n
,
int
k
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
*
C
,
int
ldc
,
float
*
p
,
std
::
string
mode
,
float
*
bias
,
float
*
bias1
);
// 32位 float 矩阵乘法(openmp 多线程版本)
void
Sgemm_omp
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
// 32位 float 矩阵乘法(openmp 多线程版本)
void
Sgemm_omp
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
bias
);
// 32-bit float matrix multiplication, followed by batchnorm on the result (OpenMP multi-threaded version)
void
SgemmWithBn_omp
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
new_scale
,
float
*
new_bias
,
float
*
bias
);
// 32-bit float matrix multiplication, followed by batchnorm on the result (OpenMP multi-threaded version)
void
SgemmWithBn_omp
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
new_scale
,
float
*
new_bias
,
float
*
bias
);
void
SgemmWithPRelu_omp
(
int
m
,
int
n
,
int
k
,
const
float
*
A
,
int
lda
,
void
SgemmWithPRelu_omp
(
int
m
,
int
n
,
int
k
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
*
C
,
int
ldc
,
float
*
p
,
std
::
string
mode
,
float
*
bias
,
float
*
bias1
);
private:
int
MC
=
0
;
int
KC
=
0
;
int
NC
=
0
;
float
*
packedA
;
float
*
packedB
;
float
*
packedC
;
float
*
zero
;
};
}
// namespace math
}
// namespace operators
}
// namespace paddle_mobile
src/operators/math/gru_compute.cpp
浏览文件 @
d51a0718
...
...
@@ -28,19 +28,22 @@ struct GRUUnitFunctor<CPU, T> {
static
void
compute
(
GRUMetaValue
<
T
>
value
,
int
frame_size
,
int
batch_size
,
const
ActivationType
active_node
,
const
ActivationType
active_gate
)
{
Gemm
gemm
;
if
(
value
.
prev_out_value
)
{
Sgemm
(
batch_size
,
frame_size
*
2
,
frame_size
,
1
,
value
.
prev_out_value
,
frame_size
,
value
.
gate_weight
,
frame_size
*
2
,
1
,
value
.
gate_value
,
frame_size
*
3
,
false
,
nullptr
);
gemm
.
Sgemm
(
batch_size
,
frame_size
*
2
,
frame_size
,
1
,
value
.
prev_out_value
,
frame_size
,
value
.
gate_weight
,
frame_size
*
2
,
1
,
value
.
gate_value
,
frame_size
*
3
,
false
,
nullptr
);
}
forward_reset_output
(
forward
::
gru_resetOutput
<
T
>
(),
value
,
frame_size
,
batch_size
,
active_gate
);
if
(
value
.
prev_out_value
)
{
Sgemm
(
batch_size
,
frame_size
,
frame_size
,
1
,
value
.
reset_output_value
,
frame_size
,
value
.
state_weight
,
frame_size
,
1
,
value
.
gate_value
+
frame_size
*
2
,
frame_size
*
3
,
false
,
nullptr
);
gemm
.
Sgemm
(
batch_size
,
frame_size
,
frame_size
,
1
,
value
.
reset_output_value
,
frame_size
,
value
.
state_weight
,
frame_size
,
1
,
value
.
gate_value
+
frame_size
*
2
,
frame_size
*
3
,
false
,
nullptr
);
}
forward_final_output
(
forward
::
gru_finalOutput
<
T
>
(),
value
,
frame_size
,
...
...
src/operators/math/math_function.cpp
浏览文件 @
d51a0718
...
...
@@ -36,6 +36,7 @@ void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
int
M
=
dim_out
[
0
];
int
N
=
dim_out
[
1
];
int
K
=
(
!
trans_a
)
?
dim_a
[
1
]
:
dim_a
[
0
];
Gemm
gemm
;
if
(
trans_a
)
{
int
numel
=
matrix_a
.
numel
();
...
...
@@ -50,20 +51,24 @@ void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
a
[
index
++
]
=
tmp
[
i
*
n
+
j
];
}
}
#ifdef _OPENMP
Sgemm_omp
(
M
,
N
,
K
,
alpha
,
a
,
K
,
matrix_b
.
data
<
float
>
(),
N
,
beta
,
gemm
.
Sgemm_omp
(
M
,
N
,
K
,
alpha
,
a
,
K
,
matrix_b
.
data
<
float
>
(),
N
,
beta
,
matrix_out
->
data
<
float
>
(),
N
,
relu
,
bias
);
#else
Sgemm
(
M
,
N
,
K
,
alpha
,
a
,
K
,
matrix_b
.
data
<
float
>
(),
N
,
beta
,
gemm
.
Sgemm
(
M
,
N
,
K
,
alpha
,
a
,
K
,
matrix_b
.
data
<
float
>
(),
N
,
beta
,
matrix_out
->
data
<
float
>
(),
N
,
relu
,
bias
);
#endif
}
else
{
#ifdef _OPENMP
Sgemm_omp
(
M
,
N
,
K
,
alpha
,
matrix_a
.
data
<
float
>
(),
K
,
matrix_b
.
data
<
float
>
(),
N
,
beta
,
matrix_out
->
data
<
float
>
(),
N
,
relu
,
bias
);
gemm
.
Sgemm_omp
(
M
,
N
,
K
,
alpha
,
matrix_a
.
data
<
float
>
(),
K
,
matrix_b
.
data
<
float
>
(),
N
,
beta
,
matrix_out
->
data
<
float
>
(),
N
,
relu
,
bias
);
#else
Sgemm
(
M
,
N
,
K
,
alpha
,
matrix_a
.
data
<
float
>
(),
K
,
matrix_b
.
data
<
float
>
(),
N
,
beta
,
matrix_out
->
data
<
float
>
(),
N
,
relu
,
bias
);
gemm
.
Sgemm
(
M
,
N
,
K
,
alpha
,
matrix_a
.
data
<
float
>
(),
K
,
matrix_b
.
data
<
float
>
(),
N
,
beta
,
matrix_out
->
data
<
float
>
(),
N
,
relu
,
bias
);
#endif
}
}
...
...
@@ -74,6 +79,7 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a,
float
alpha
,
framework
::
Tensor
*
matrix_out
,
float
beta
,
bool
relu
,
framework
::
Tensor
*
new_scale
,
framework
::
Tensor
*
new_bias
,
int
group
,
float
*
bias
)
{
Gemm
gemm
;
auto
dim_a
=
matrix_a
.
dims
();
auto
dim_b
=
matrix_b
.
dims
();
auto
dim_out
=
matrix_out
->
dims
();
...
...
@@ -86,21 +92,22 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a,
int
K
=
(
!
trans_a
)
?
dim_a
[
1
]
:
dim_a
[
0
];
#ifdef _OPENMP
SgemmWithBn_omp
(
M
,
N
,
K
,
alpha
,
matrix_a
.
data
<
float
>
(),
K
,
matrix_b
.
data
<
float
>
(),
N
,
beta
,
matrix_out
->
data
<
float
>
(),
N
,
relu
,
new_scale
->
data
<
float
>
()
+
group
,
new_bias
->
data
<
float
>
()
+
group
,
bias
);
gemm
.
SgemmWithBn_omp
(
M
,
N
,
K
,
alpha
,
matrix_a
.
data
<
float
>
(),
K
,
matrix_b
.
data
<
float
>
(),
N
,
beta
,
matrix_out
->
data
<
float
>
(),
N
,
relu
,
new_scale
->
data
<
float
>
()
+
group
,
new_bias
->
data
<
float
>
()
+
group
,
bias
);
#else
SgemmWithBn
(
M
,
N
,
K
,
alpha
,
matrix_a
.
data
<
float
>
(),
K
,
matrix_b
.
data
<
float
>
()
,
N
,
beta
,
matrix_out
->
data
<
float
>
(),
N
,
relu
,
new_scale
->
data
<
float
>
()
+
group
,
new_bias
->
data
<
float
>
()
+
group
,
bias
);
gemm
.
SgemmWithBn
(
M
,
N
,
K
,
alpha
,
matrix_a
.
data
<
float
>
(),
K
,
matrix_b
.
data
<
float
>
(),
N
,
beta
,
matrix_out
->
data
<
float
>
()
,
N
,
relu
,
new_scale
->
data
<
float
>
()
+
group
,
new_bias
->
data
<
float
>
()
+
group
,
bias
);
#endif
}
void
matmulWithPRelu
(
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
const
framework
::
Tensor
&
matrix_b
,
bool
trans_b
,
framework
::
Tensor
*
matrix_out
,
float
*
p
,
std
::
string
mode
,
float
*
bias
,
float
*
bias1
)
{
Gemm
gemm
;
auto
dim_a
=
matrix_a
.
dims
();
auto
dim_b
=
matrix_b
.
dims
();
auto
dim_out
=
matrix_out
->
dims
();
...
...
@@ -113,11 +120,13 @@ void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a,
int
K
=
(
!
trans_a
)
?
dim_a
[
1
]
:
dim_a
[
0
];
#ifdef _OPENMP
SgemmWithPRelu_omp
(
M
,
N
,
K
,
matrix_a
.
data
<
float
>
(),
K
,
matrix_b
.
data
<
float
>
(),
N
,
matrix_out
->
data
<
float
>
(),
N
,
p
,
mode
,
bias
,
bias1
);
gemm
.
SgemmWithPRelu_omp
(
M
,
N
,
K
,
matrix_a
.
data
<
float
>
(),
K
,
matrix_b
.
data
<
float
>
(),
N
,
matrix_out
->
data
<
float
>
(),
N
,
p
,
mode
,
bias
,
bias1
);
#else
SgemmWithPRelu
(
M
,
N
,
K
,
matrix_a
.
data
<
float
>
(),
K
,
matrix_b
.
data
<
float
>
(),
N
,
matrix_out
->
data
<
float
>
(),
N
,
p
,
mode
,
bias
,
bias1
);
gemm
.
SgemmWithPRelu
(
M
,
N
,
K
,
matrix_a
.
data
<
float
>
(),
K
,
matrix_b
.
data
<
float
>
(),
N
,
matrix_out
->
data
<
float
>
(),
N
,
p
,
mode
,
bias
,
bias1
);
#endif
}
...
...
test/CMakeLists.txt
浏览文件 @
d51a0718
...
...
@@ -35,8 +35,8 @@ if (CON GREATER -1)
ADD_EXECUTABLE
(
test-yolo net/test_yolo.cpp test_helper.h test_include.h executor_for_test.h
)
target_link_libraries
(
test-yolo paddle-mobile
)
# gen test
ADD_EXECUTABLE
(
test
_yolo_
combined net/test_yolo_combined.cpp test_helper.h test_include.h executor_for_test.h
)
target_link_libraries
(
test
_yolo_
combined paddle-mobile
)
ADD_EXECUTABLE
(
test
-yolo-
combined net/test_yolo_combined.cpp test_helper.h test_include.h executor_for_test.h
)
target_link_libraries
(
test
-yolo-
combined paddle-mobile
)
set
(
FOUND_MATCH ON
)
endif
()
...
...
@@ -323,5 +323,10 @@ if (NOT FOUND_MATCH)
target_link_libraries
(
test-fssd paddle-mobile
)
# gen test
ADD_EXECUTABLE
(
test-multi-process net/test_multi_inference_predict.cpp test_helper.h test_include.h
)
target_link_libraries
(
test-multi-process paddle-mobile
)
#add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp)
endif
()
test/common/test_gemm_accuracy.cpp
浏览文件 @
d51a0718
...
...
@@ -83,8 +83,9 @@ int do_sgemm(int m, int n, int k, bool relu, int t1, int t2, int pr) {
}
}
paddle_mobile
::
operators
::
math
::
SgemmWithBn
(
m
,
n
,
k
,
0.9
,
a
,
lda
,
b
,
ldb
,
0.3
,
c
,
ldc
,
relu
,
scale
,
bias
,
nullptr
);
paddle_mobile
::
operators
::
math
::
Gemm
gemm
;
gemm
.
SgemmWithBn
(
m
,
n
,
k
,
0.9
,
a
,
lda
,
b
,
ldb
,
0.3
,
c
,
ldc
,
relu
,
scale
,
bias
,
nullptr
);
int
eq
=
0
;
int
neq
=
0
;
for
(
int
i
=
0
;
i
<
m
*
n
;
++
i
)
{
...
...
test/net/test_multi_inference_predict.cpp
0 → 100644
浏览文件 @
d51a0718
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include <thread> // NOLINT
#include "../test_helper.h"
#include "../test_include.h"
// Worker entry points, defined further down in this file.
void fun_yolo();
int fun_mobilenet();

// Starts fun_yolo and fun_mobilenet on two separate threads and waits for
// both of them to finish before exiting.
int main() {
  // A PaddleMobile instance declared in main before the workers are started;
  // it is not used further in this function.
  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile2;

  // Sequential variant, left disabled:
  //   fun_yolo();
  //   fun_mobilenet();
  std::thread yolo_worker{fun_yolo};
  std::thread mobilenet_worker{fun_mobilenet};

  yolo_worker.join();
  mobilenet_worker.join();

  return 0;
}
// Loads the model at g_yolo into a function-local PaddleMobile<CPU> instance
// (4 threads requested), prints how long the load took, then runs Predict ten
// times on a 1x3x227x227 input and prints the average time per call.
// Nothing is printed if the load fails.
void fun_yolo() {
  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
  paddle_mobile.SetThreadNum(4);
  // ../../../test/models/googlenet
  // ../../../test/models/mobilenet
  auto time1 = time();
  if (paddle_mobile.Load(g_yolo, true)) {
    auto time2 = time();
    // Measure from before Load (time1) to after Load (time2). The previous
    // code passed time1 twice, so the reported load cost was always zero.
    std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl;

    vector<int64_t> dims{1, 3, 227, 227};
    Tensor input_tensor;
    // SetupTensor fills the tensor using 0 and 1 as its two value arguments
    // (presumably lower/upper bounds of generated data; confirm in
    // test_helper.h).
    SetupTensor<float>(&input_tensor, {1, 3, 227, 227}, static_cast<float>(0),
                       static_cast<float>(1));

    // Copy the tensor contents into a flat vector, the form Predict accepts.
    vector<float> input(input_tensor.data<float>(),
                        input_tensor.data<float>() + input_tensor.numel());

    auto time3 = time();
    for (int i = 0; i < 10; ++i) {
      paddle_mobile.Predict(input, dims);
    }
    auto time4 = time();
    // Average over the ten timed runs.
    std::cout << "thread 1: predict cost :" << time_diff(time3, time4) / 10
              << "ms" << std::endl;
  }
}
// Loads the model at g_mobilenet into a function-local PaddleMobile<CPU>
// instance (4 threads requested), prints how long the load took, runs one
// prediction on the banana test image and prints the largest output value and
// its index, then does ten warm-up runs followed by ten timed runs and prints
// the average time per call. Always returns 0.
int fun_mobilenet() {
  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
  paddle_mobile.SetThreadNum(4);
  auto time1 = time();
  //  auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model",
  //                     std::string(g_mobilenet_detect) + "/params", true);

  auto isok = paddle_mobile.Load(g_mobilenet, true);
  if (isok) {
    auto time2 = time();
    // Measure from before Load (time1) to after Load (time2). The previous
    // code passed time1 twice, so the reported load cost was always zero.
    std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl;

    vector<float> input;
    vector<int64_t> dims{1, 3, 224, 224};
    GetInput<float>(g_test_image_1x3x224x224_banana, &input, dims);

    auto vec_result = paddle_mobile.Predict(input, dims);
    auto biggest = max_element(begin(vec_result), end(vec_result));
    std::cout << " Max element is " << *biggest << " at position "
              << distance(begin(vec_result), biggest) << std::endl;

    // Warm up with ten untimed runs. The results are discarded; the original
    // stored them in a loop-local variable that shadowed vec_result and was
    // never read.
    for (int i = 0; i < 10; ++i) {
      paddle_mobile.Predict(input, dims);
    }

    auto time3 = time();
    for (int i = 0; i < 10; ++i) {
      paddle_mobile.Predict(input, dims);
    }
    // Logs the result of the first prediction above (the outer vec_result).
    DLOG << vec_result;
    auto time4 = time();
    // Average over the ten timed runs.
    std::cout << "thread 2: predict cost :" << time_diff(time3, time4) / 10
              << "ms" << std::endl;
  }

  // The message below tells the user to check that the test image file exists
  // if the result is NaN. It is printed whether or not the load succeeded.
  std::cout << "如果结果Nan请查看: test/images/g_test_image_1x3x224x224_banana "
               "是否存在?"
            << std::endl;
  return 0;
}
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录