Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle-Lite
提交
00417d29
P
Paddle-Lite
项目概览
PaddlePaddle
/
Paddle-Lite
通知
331
Star
4
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
271
列表
看板
标记
里程碑
合并请求
78
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle-Lite
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
271
Issue
271
列表
看板
标记
里程碑
合并请求
78
合并请求
78
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
00417d29
编写于
7月 10, 2018
作者:
E
eclipsess
浏览文件
操作
浏览文件
下载
差异文件
Merge remote-tracking branch 'upstream/develop' into develop
上级
6c86e48c
cd30eb8a
变更
9
隐藏空白更改
内联
并排
Showing
9 changed file
with
1329 addition
and
627 deletion
+1329
-627
CMakeLists.txt
CMakeLists.txt
+0
-1
demo/android/PaddleMobile_Android/app/src/main/java/com/baidu/paddle/MainActivity.java
...roid/app/src/main/java/com/baidu/paddle/MainActivity.java
+8
-1
demo/android/PaddleMobile_Android/app/src/main/java/com/baidu/paddle/PML.java
...obile_Android/app/src/main/java/com/baidu/paddle/PML.java
+8
-0
src/jni/paddle_mobile_jni.cpp
src/jni/paddle_mobile_jni.cpp
+9
-0
src/jni/paddle_mobile_jni.h
src/jni/paddle_mobile_jni.h
+6
-1
src/operators/math/gemm.cpp
src/operators/math/gemm.cpp
+1224
-590
src/operators/math/gemm.h
src/operators/math/gemm.h
+55
-21
src/operators/math/math_function.cpp
src/operators/math/math_function.cpp
+13
-13
src/operators/math/math_function.h
src/operators/math/math_function.h
+6
-0
未找到文件。
CMakeLists.txt
浏览文件 @
00417d29
...
...
@@ -9,7 +9,6 @@ option(LOG_PROFILE "log profile" ON)
option
(
CPU
"armv7 with neon"
ON
)
option
(
MALI_GPU
"mali gpu"
OFF
)
option
(
FPGA
"fpga"
OFF
)
set
(
DEBUGING ON
)
if
(
ARM_LINUX
)
include
(
"
${
CMAKE_CURRENT_LIST_DIR
}
/tools/arm-platform.cmake"
)
...
...
demo/android/PaddleMobile_Android/app/src/main/java/com/baidu/paddle/MainActivity.java
浏览文件 @
00417d29
...
...
@@ -121,7 +121,14 @@ public class MainActivity extends Activity {
String
assetPath
=
"pml_demo"
;
String
sdcardPath
=
Environment
.
getExternalStorageDirectory
()
+
File
.
separator
+
assetPath
+
File
.
separator
+
type
;
PML
.
load
(
sdcardPath
);
//PML.load(sdcardPath);
String
modelPath
=
Environment
.
getExternalStorageDirectory
()
+
File
.
separator
+
assetPath
+
File
.
separator
+
"googlenet_combine"
+
File
.
separator
+
"model"
;
String
paramPath
=
Environment
.
getExternalStorageDirectory
()
+
File
.
separator
+
assetPath
+
File
.
separator
+
"googlenet_combine"
+
File
.
separator
+
"params"
;
PML
.
loadCombined
(
modelPath
,
paramPath
);
}
});
...
...
demo/android/PaddleMobile_Android/app/src/main/java/com/baidu/paddle/PML.java
浏览文件 @
00417d29
...
...
@@ -8,6 +8,14 @@ public class PML {
*/
public
static
native
boolean
load
(
String
modelPath
);
/**
* Load
* @param modelPath
* @param paramPath
* @return
*/
public
static
native
boolean
loadCombined
(
String
modelPath
,
String
paramPath
);
/**
* object detection
...
...
src/jni/paddle_mobile_jni.cpp
浏览文件 @
00417d29
...
...
@@ -60,6 +60,15 @@ JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_load(JNIEnv *env,
optimize
);
}
JNIEXPORT
jboolean
JNICALL
Java_com_baidu_paddle_PML_loadCombined
(
JNIEnv
*
env
,
jclass
thiz
,
jstring
modelPath
,
jstring
paramPath
)
{
ANDROIDLOGI
(
"load invoked"
);
bool
optimize
=
true
;
return
getPaddleMobileInstance
()
->
Load
(
jstring2cppstring
(
env
,
modelPath
),
jstring2cppstring
(
env
,
paramPath
),
optimize
);
}
JNIEXPORT
jfloatArray
JNICALL
Java_com_baidu_paddle_PML_predict
(
JNIEnv
*
env
,
jclass
thiz
,
jfloatArray
buf
)
{
jfloatArray
result
=
NULL
;
...
...
src/jni/paddle_mobile_jni.h
浏览文件 @
00417d29
...
...
@@ -22,11 +22,16 @@ extern "C" {
namespace
paddle_mobile
{
namespace
jni
{
/**
* load
model & params of the net
for android
* load
separated model
for android
*/
JNIEXPORT
jboolean
JNICALL
Java_com_baidu_paddle_PML_load
(
JNIEnv
*
env
,
jclass
thiz
,
jstring
modelPath
);
/**
* load combined model for android
*/
JNIEXPORT
jboolean
JNICALL
Java_com_baidu_paddle_PML_loadCombined
(
JNIEnv
*
env
,
jclass
thiz
,
jstring
modelPath
,
jstring
paramPath
);
/**
* object detection for anroid
...
...
src/operators/math/gemm.cpp
浏览文件 @
00417d29
...
...
@@ -22,9 +22,14 @@ limitations under the License. */
namespace
paddle_mobile
{
namespace
operators
{
namespace
math
{
alignas
(
64
)
float
packedA
[
MC
*
KC
];
alignas
(
64
)
float
packedB
[
KC
*
NC
];
alignas
(
64
)
float
ab
[
MR
*
NR
];
int
MC
=
0
;
int
KC
=
0
;
int
NC
=
0
;
float
*
packedA
;
float
*
packedB
;
float
*
packedC
;
float
*
zero
;
// 将A矩阵分块复制到连续内存(ColMajor)
void
PackMatrixA
(
int
m
,
int
k
,
int
m_tail
,
const
float
*
A
,
int
lda
,
float
*
buffer
)
{
...
...
@@ -55,28 +60,39 @@ void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
// 将A矩阵分块复制到连续内存(RowMajor)
void
PackMatrixA_
(
int
m
,
int
k
,
int
m_tail
,
const
float
*
A
,
int
lda
,
float
*
buffer
)
{
int
i
,
j
;
const
float
*
Ai
,
*
Ai1
,
*
Ai2
,
*
Ai3
;
for
(
i
=
0
;
i
<
m
-
m_tail
;
i
+=
MR
)
{
Ai
=
&
A
(
i
,
0
);
Ai1
=
&
A
(
i
+
1
,
0
);
Ai2
=
&
A
(
i
+
2
,
0
);
Ai3
=
&
A
(
i
+
3
,
0
);
const
float
*
a0
,
*
a1
,
*
a2
,
*
a3
;
for
(
int
i
=
0
;
i
<
m
-
m_tail
;
i
+=
MR
)
{
a0
=
A
+
i
*
lda
;
a1
=
A
+
(
i
+
1
)
*
lda
;
a2
=
A
+
(
i
+
2
)
*
lda
;
a3
=
A
+
(
i
+
3
)
*
lda
;
for
(
int
j
=
0
;
j
<
k
;
++
j
)
{
*
buffer
++
=
*
Ai
++
;
*
buffer
++
=
*
Ai
1
++
;
*
buffer
++
=
*
Ai
2
++
;
*
buffer
++
=
*
Ai
3
++
;
*
buffer
++
=
*
a0
++
;
*
buffer
++
=
*
a
1
++
;
*
buffer
++
=
*
a
2
++
;
*
buffer
++
=
*
a
3
++
;
}
}
int
i
=
m
-
m_tail
;
a0
=
&
A
(
i
,
0
);
a1
=
a0
+
lda
;
a2
=
a0
+
2
*
lda
;
a3
=
a0
+
3
*
lda
;
if
(
m_tail
!=
0
)
{
for
(
j
=
0
;
j
<
k
;
++
j
)
{
for
(
i
=
m
-
m_tail
;
i
<
m
;
++
i
)
{
*
buffer
++
=
A
(
i
,
j
);
}
for
(
i
=
m
;
i
<
m
+
(
MR
-
m_tail
);
++
i
)
{
*
buffer
++
=
0
;
}
if
(
m_tail
<=
3
)
{
a3
=
zero
;
}
if
(
m_tail
<=
2
)
{
a2
=
zero
;
}
if
(
m_tail
<=
1
)
{
a1
=
zero
;
}
for
(
int
j
=
0
;
j
<
k
;
++
j
)
{
*
buffer
++
=
*
a0
++
;
*
buffer
++
=
*
a1
++
;
*
buffer
++
=
*
a2
++
;
*
buffer
++
=
*
a3
++
;
}
}
}
...
...
@@ -113,35 +129,24 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
// 将B矩阵分块复制到连续内存(RowMajor)
void
PackMatrixB_
(
int
k
,
int
n
,
int
n_tail
,
const
float
*
B
,
int
ldb
,
float
*
buffer
)
{
int
i
,
j
;
const
float
*
Bij
;
for
(
j
=
0
;
j
<
n
-
n_tail
;
j
+=
NR
)
{
#ifdef ARMV7
for
(
i
=
0
;
i
<
k
;
++
i
)
{
Bij
=
&
B
(
i
,
j
);
const
float
*
b0
;
for
(
int
j
=
0
;
j
<
n
-
n_tail
;
j
+=
NR
)
{
for
(
int
i
=
0
;
i
<
k
;
++
i
)
{
b0
=
&
B
(
i
,
j
);
asm
volatile
(
"vld1.32 {q0}, [%[Bij]]
\n\t
"
"vst1.32 {q0}, [%[buffer]]!
\n\t
"
"pld [%[b0]]
\n\t
"
"vld1.32 {q0, q1}, [%[b0]]
\n\t
"
"vst1.32 {q0, q1}, [%[buffer]]!
\n\t
"
:
[
buffer
]
"+r"
(
buffer
)
:
[
Bij
]
"r"
(
Bij
)
:
"memory"
,
"q0"
);
}
#else
for
(
i
=
0
;
i
<
k
;
++
i
)
{
Bij
=
&
B
(
i
,
j
);
*
buffer
++
=
*
Bij
;
*
buffer
++
=
*
(
Bij
+
1
);
*
buffer
++
=
*
(
Bij
+
2
);
*
buffer
++
=
*
(
Bij
+
3
);
:
[
b0
]
"r"
(
b0
)
:
"memory"
,
"q0"
,
"q0"
);
}
#endif
}
if
(
n_tail
!=
0
)
{
for
(
i
=
0
;
i
<
k
;
++
i
)
{
Bij
=
&
B
(
i
,
n
-
n_tail
);
for
(
i
nt
i
=
0
;
i
<
k
;
++
i
)
{
b0
=
&
B
(
i
,
n
-
n_tail
);
for
(
int
j
=
n
-
n_tail
;
j
<
n
;
++
j
)
{
*
buffer
++
=
*
Bij
++
;
*
buffer
++
=
*
b0
++
;
}
for
(
int
j
=
n
;
j
<
n
+
(
NR
-
n_tail
);
++
j
)
{
*
buffer
++
=
0
;
...
...
@@ -151,118 +156,53 @@ void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
}
// 分块矩阵乘法
void
InnerKernel
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
int
first_time
)
{
int
m_block
=
(
m
+
MR
-
1
)
/
MR
*
MR
;
int
n_block
=
(
n
+
NR
-
1
)
/
NR
*
NR
;
int
m_tail
=
m
%
MR
;
int
n_tail
=
n
%
NR
;
void
InnerKernel
(
int
m
c
,
int
nc
,
float
alpha
,
const
float
*
a
,
const
float
*
b
,
float
beta
,
float
*
c
,
float
*
C
,
int
ldc
,
bool
relu
)
{
for
(
int
j
=
0
;
j
<
nc
;
j
+=
NR
)
{
for
(
int
i
=
0
;
i
<
mc
;
i
+=
MR
)
{
// AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC)
;
AddDot4x8
(
KC
,
a
+
i
*
KC
,
b
+
j
*
KC
,
c
+
i
*
NC
+
j
,
NC
);
}
}
if
(
first_time
)
{
PackMatrixB_
(
k
,
n
,
n_tail
,
B
,
ldb
,
packedB
);
if
(
alpha
!=
1
)
{
WriteWithAlphaBeta
(
mc
,
nc
,
c
,
C
,
ldc
);
return
;
}
PackMatrixA_
(
m
,
k
,
m_tail
,
A
,
lda
,
packedA
);
int
i
,
j
,
mc
,
nc
;
// B 取 4 列, 打包预热
for
(
j
=
0
;
j
<
n_block
;
j
+=
NR
)
{
nc
=
(
n
-
j
)
<
NR
?
n_tail
:
NR
;
// A 取 4 行,打包预热
for
(
i
=
0
;
i
<
m_block
;
i
+=
MR
)
{
mc
=
(
m
-
i
)
<
MR
?
m_tail
:
MR
;
AddDot4x4
(
k
,
alpha
,
&
packedA
[
i
*
k
],
4
,
&
packedB
[
j
*
k
],
k
,
beta
,
&
C
(
i
,
j
),
ldc
,
mc
,
nc
);
}
if
(
beta
==
0
)
{
WriteBasic
(
mc
,
nc
,
c
,
C
,
ldc
);
return
;
}
if
(
beta
==
1
&&
!
relu
)
{
WriteWithAdd
(
mc
,
nc
,
c
,
C
,
ldc
);
return
;
}
if
(
beta
==
1
&&
relu
)
{
WriteWithAddRelu
(
mc
,
nc
,
c
,
C
,
ldc
);
return
;
}
}
// 分块矩阵乘法
void
InnerKernel_relu
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
int
first_time
,
bool
relu
=
false
)
{
int
m_block
=
(
m
+
MR
-
1
)
/
MR
*
MR
;
int
n_block
=
(
n
+
NR
-
1
)
/
NR
*
NR
;
int
m_tail
=
m
%
MR
;
int
n_tail
=
n
%
NR
;
if
(
first_time
)
{
PackMatrixB_
(
k
,
n
,
n_tail
,
B
,
ldb
,
packedB
);
}
PackMatrixA_
(
m
,
k
,
m_tail
,
A
,
lda
,
packedA
);
int
i
,
j
,
mc
,
nc
;
// B 取 4 列, 打包预热
for
(
j
=
0
;
j
<
n_block
;
j
+=
NR
)
{
nc
=
(
n
-
j
)
<
NR
?
n_tail
:
NR
;
// A 取 4 行,打包预热
for
(
i
=
0
;
i
<
m_block
;
i
+=
MR
)
{
mc
=
(
m
-
i
)
<
MR
?
m_tail
:
MR
;
AddDot4x4_relu
(
k
,
alpha
,
&
packedA
[
i
*
k
],
4
,
&
packedB
[
j
*
k
],
k
,
beta
,
&
C
(
i
,
j
),
ldc
,
mc
,
nc
,
relu
);
void
InnerKernelWithBn
(
int
mc
,
int
nc
,
float
alpha
,
const
float
*
a
,
const
float
*
b
,
float
beta
,
float
*
c
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
new_scale
,
float
*
new_bias
)
{
for
(
int
j
=
0
;
j
<
nc
;
j
+=
NR
)
{
for
(
int
i
=
0
;
i
<
mc
;
i
+=
MR
)
{
// AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
AddDot4x8
(
KC
,
a
+
i
*
KC
,
b
+
j
*
KC
,
c
+
i
*
NC
+
j
,
NC
);
}
}
}
// 计算一个更小的 4 * 4 的 C 矩阵分块
#if defined(IOS)
void
AddDot4x4
(
int
k
,
float
alpha
,
const
float
*
a
,
int
lda
,
const
float
*
b
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
int
mc
,
int
nc
)
{
// init C
float32x4_t
cv0
=
vdupq_n_f32
(
0.0
);
float32x4_t
cv1
=
vdupq_n_f32
(
0.0
);
float32x4_t
cv2
=
vdupq_n_f32
(
0.0
);
float32x4_t
cv3
=
vdupq_n_f32
(
0.0
);
float32x4_t
av
;
float32x4_t
bv
;
float32x2_t
av01
;
float32x2_t
av23
;
for
(
int
p
=
0
;
p
<
k
;
p
+=
1
)
{
av
=
vld1q_f32
(
a
);
bv
=
vld1q_f32
(
b
);
av01
=
vget_low_f32
(
av
);
cv0
=
vmlaq_lane_f32
(
cv0
,
bv
,
av01
,
0
);
cv1
=
vmlaq_lane_f32
(
cv1
,
bv
,
av01
,
1
);
av23
=
vget_high_f32
(
av
);
cv2
=
vmlaq_lane_f32
(
cv2
,
bv
,
av23
,
0
);
cv3
=
vmlaq_lane_f32
(
cv3
,
bv
,
av23
,
1
);
a
+=
MR
;
b
+=
NR
;
}
float32x4x4_t
cv
=
{
cv0
,
cv1
,
cv2
,
cv3
};
int
i
,
j
;
for
(
i
=
0
;
i
<
mc
;
++
i
)
{
for
(
j
=
0
;
j
<
nc
;
++
j
)
{
if
(
beta
==
0.0
)
{
C
(
i
,
j
)
=
0.0
;
}
else
if
(
beta
!=
1.0
)
{
C
(
i
,
j
)
*=
beta
;
}
if
(
j
==
0
)
{
C
(
i
,
j
)
+=
alpha
*
vgetq_lane_f32
(
cv
.
val
[
i
],
0
);
}
else
if
(
j
==
1
)
{
C
(
i
,
j
)
+=
alpha
*
vgetq_lane_f32
(
cv
.
val
[
i
],
1
);
}
else
if
(
j
==
2
)
{
C
(
i
,
j
)
+=
alpha
*
vgetq_lane_f32
(
cv
.
val
[
i
],
2
);
}
else
if
(
j
==
3
)
{
C
(
i
,
j
)
+=
alpha
*
vgetq_lane_f32
(
cv
.
val
[
i
],
3
);
}
}
if
(
relu
)
{
WriteWithBnRelu
(
mc
,
nc
,
c
,
C
,
ldc
,
new_scale
,
new_bias
);
}
else
{
WriteWithBn
(
mc
,
nc
,
c
,
C
,
ldc
,
new_scale
,
new_bias
);
}
}
void
AddDot4x4_relu
(
int
k
,
float
alpha
,
const
float
*
a
,
int
lda
,
const
float
*
b
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
int
mc
,
int
nc
,
bool
relu
=
false
)
{
#if defined(IOS)
void
AddDot4x4
(
int
k
,
const
float
*
a
,
const
float
*
b
,
float
*
C
,
int
ldc
)
{
// init C
float32x4_t
cv0
=
vdupq_n_f32
(
0.0
);
float32x4_t
cv1
=
vdupq_n_f32
(
0.0
);
...
...
@@ -307,183 +247,22 @@ void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
}
else
if
(
j
==
3
)
{
C
(
i
,
j
)
+=
alpha
*
vgetq_lane_f32
(
cv
.
val
[
i
],
3
);
}
if
(
C
(
i
,
j
)
<
0
)
{
C
(
i
,
j
)
=
0
;
}
}
}
}
}
// namespace math
#elif defined(ARMV7)
void
AddDot4x4
(
int
k
,
float
alpha
,
const
float
*
a
,
int
lda
,
const
float
*
b
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
int
mc
,
int
nc
)
{
int
kc1
=
k
/
4
,
kc2
=
k
%
4
;
int
bytes_ldc
=
4
*
ldc
;
int
flag_alpha
=
(
alpha
==
1.0
)
?
1
:
2
;
int
flag_beta
;
if
(
beta
==
0.0
)
{
flag_beta
=
0
;
}
else
if
(
beta
==
1.0
)
{
flag_beta
=
1
;
}
else
{
flag_beta
=
2
;
}
asm
volatile
(
"pld [%[a]]
\n\t
"
"pld [%[b]]
\n\t
"
"vmov.f32 q10, #0.0
\n\t
"
"vmov.f32 q11, #0.0
\n\t
"
"vmov.f32 q12, #0.0
\n\t
"
"vmov.f32 q13, #0.0
\n\t
"
"subs %[kc1], %[kc1], #1
\n\t
"
"blt end_kc1_%=
\n\t
"
"loop_kc1_%=:
\n\t
"
"pld [%[a], #64]
\n\t
"
"pld [%[b], #64]
\n\t
"
"vld1.32 {q0, q1}, [%[a]]!
\n\t
"
"vld1.32 {q2, q3}, [%[b]]!
\n\t
"
"vmla.f32 q10, q2, d0[0]
\n\t
"
"vmla.f32 q11, q2, d0[1]
\n\t
"
"vmla.f32 q12, q2, d1[0]
\n\t
"
"vmla.f32 q13, q2, d1[1]
\n\t
"
"vmla.f32 q10, q3, d2[0]
\n\t
"
"vmla.f32 q11, q3, d2[1]
\n\t
"
"vmla.f32 q12, q3, d3[0]
\n\t
"
"vmla.f32 q13, q3, d3[1]
\n\t
"
"vld1.32 {q0, q1}, [%[a]]!
\n\t
"
"vld1.32 {q2, q3}, [%[b]]!
\n\t
"
"vmla.f32 q10, q2, d0[0]
\n\t
"
"vmla.f32 q11, q2, d0[1]
\n\t
"
"vmla.f32 q12, q2, d1[0]
\n\t
"
"vmla.f32 q13, q2, d1[1]
\n\t
"
"vmla.f32 q10, q3, d2[0]
\n\t
"
"vmla.f32 q11, q3, d2[1]
\n\t
"
"vmla.f32 q12, q3, d3[0]
\n\t
"
"vmla.f32 q13, q3, d3[1]
\n\t
"
"subs %[kc1], %[kc1], #1
\n\t
"
"bge loop_kc1_%=
\n\t
"
"end_kc1_%=:
\n\t
"
"subs %[kc2], %[kc2], #1
\n\t
"
"blt end_kc2_%=
\n\t
"
"loop_kc2_%=:
\n\t
"
"vld1.32 {q0}, [%[a]]!
\n\t
"
"vld1.32 {q1}, [%[b]]!
\n\t
"
"vmla.f32 q10, q1, d0[0]
\n\t
"
"vmla.f32 q11, q1, d0[1]
\n\t
"
"vmla.f32 q12, q1, d1[0]
\n\t
"
"vmla.f32 q13, q1, d1[1]
\n\t
"
"subs %[kc2], %[kc2], #1
\n\t
"
"bge loop_kc2_%=
\n\t
"
"end_kc2_%=:
\n\t
"
"cmp %[mc], #4
\n\t
"
"bne temp_%=
\n\t
"
"cmp %[nc], #4
\n\t
"
"bne temp_%=
\n\t
"
"vmov.f32 d8[0], %[alpha]
\n\t
"
"vmov.f32 d8[1], %[beta]
\n\t
"
"cmp %[flag_alpha], #1
\n\t
"
"bne alpha_%=
\n\t
"
"alpha_%=:
\n\t
"
"vmul.f32 q10, q10, d8[0]
\n\t
"
"vmul.f32 q11, q11, d8[0]
\n\t
"
"vmul.f32 q12, q12, d8[0]
\n\t
"
"vmul.f32 q13, q13, d8[0]
\n\t
"
"beta_%=:
\n\t
"
"cmp %[flag_beta], #0
\n\t
"
"beq memory_%=
\n\t
"
"mov r4, %[C]
\n\t
"
"mov r6, %[bytes_ldc]
\n\t
"
"vld1.32 {q0}, [r4], r6
\n\t
"
"vld1.32 {q1}, [r4], r6
\n\t
"
"vld1.32 {q2}, [r4], r6
\n\t
"
"vld1.32 {q3}, [r4]
\n\t
"
"cmp %[flag_beta], #1
\n\t
"
"beq beta_eq1_%=
\n\t
"
"bne beta_ne1_%=
\n\t
"
"beta_eq1_%=:
\n\t
"
"vadd.f32 q10, q10, q0
\n\t
"
"vadd.f32 q11, q11, q1
\n\t
"
"vadd.f32 q12, q12, q2
\n\t
"
"vadd.f32 q13, q13, q3
\n\t
"
"b memory_%=
\n\t
"
"beta_ne1_%=:
\n\t
"
"vmla.f32 q10, q0, d8[1]
\n\t
"
"vmla.f32 q11, q1, d8[1]
\n\t
"
"vmla.f32 q12, q2, d8[1]
\n\t
"
"vmla.f32 q13, q3, d8[1]
\n\t
"
"memory_%=:
\n\t
"
"mov r5, %[C]
\n\t
"
"mov r6, %[bytes_ldc]
\n\t
"
"vst1.32 {q10}, [r5], r6
\n\t
"
"vst1.32 {q11}, [r5], r6
\n\t
"
"vst1.32 {q12}, [r5], r6
\n\t
"
"vst1.32 {q13}, [r5]
\n\t
"
"b end_%=
\n\t
"
"temp_%=:
\n\t
"
"vst1.32 {q10, q11}, [%[ab]]!
\n\t
"
"vst1.32 {q12, q13}, [%[ab]]
\n\t
"
"end_%=:
\n\t
"
:
:
[
a
]
"r"
(
a
),
[
b
]
"r"
(
b
),
[
C
]
"r"
(
C
),
[
ab
]
"r"
(
ab
),
[
kc1
]
"r"
(
kc1
),
[
kc2
]
"r"
(
kc2
),
[
mc
]
"r"
(
mc
),
[
nc
]
"r"
(
nc
),
[
alpha
]
"r"
(
alpha
),
[
beta
]
"r"
(
beta
),
[
bytes_ldc
]
"r"
(
bytes_ldc
),
[
flag_alpha
]
"r"
(
flag_alpha
),
[
flag_beta
]
"r"
(
flag_beta
)
:
"memory"
,
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q10"
,
"q11"
,
"q12"
,
"q13"
);
if
(
mc
!=
MR
||
nc
!=
NR
)
{
int
i
,
j
;
for
(
i
=
0
;
i
<
mc
;
++
i
)
{
for
(
j
=
0
;
j
<
nc
;
++
j
)
{
if
(
beta
==
0.0
)
{
if
(
alpha
!=
1.0
)
{
C
(
i
,
j
)
=
alpha
*
ab
[
i
*
MR
+
j
];
}
else
{
C
(
i
,
j
)
=
ab
[
i
*
MR
+
j
];
}
}
else
{
if
(
beta
!=
1.0
)
{
C
(
i
,
j
)
*=
beta
;
}
if
(
alpha
!=
1.0
)
{
C
(
i
,
j
)
+=
alpha
*
ab
[
i
*
MR
+
j
];
}
else
{
C
(
i
,
j
)
+=
ab
[
i
*
MR
+
j
];
}
}
}
}
}
}
void
AddDot4x4_relu
(
int
k
,
float
alpha
,
const
float
*
a
,
int
lda
,
const
float
*
b
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
int
mc
,
int
nc
,
bool
relu
=
false
)
{
int
kc1
=
k
/
4
,
kc2
=
k
%
4
;
int
bytes_ldc
=
4
*
ldc
;
int
flag_alpha
=
(
alpha
==
1.0
)
?
1
:
2
;
int
flag_beta
;
if
(
beta
==
0.0
)
{
flag_beta
=
0
;
}
else
if
(
beta
==
1.0
)
{
flag_beta
=
1
;
}
else
{
flag_beta
=
2
;
}
void
AddDot4x4
(
int
k
,
const
float
*
a
,
const
float
*
b
,
float
*
c
,
int
ldc
)
{
const
float
*
a_ptr
,
*
b_ptr
;
a_ptr
=
a
;
b_ptr
=
b
;
int
kc1
=
k
/
4
;
int
kc2
=
k
%
4
;
int
step
=
4
*
ldc
;
asm
volatile
(
"pld [%[a
]]
\n\t
"
"pld [%[b
]]
\n\t
"
"pld [%[a
_ptr]]
\n\t
"
"pld [%[b
_ptr]]
\n\t
"
"vmov.f32 q10, #0.0
\n\t
"
"vmov.f32 q11, #0.0
\n\t
"
"vmov.f32 q12, #0.0
\n\t
"
...
...
@@ -492,20 +271,10 @@ void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
"subs %[kc1], %[kc1], #1
\n\t
"
"blt end_kc1_%=
\n\t
"
"loop_kc1_%=:
\n\t
"
"pld [%[a], #64]
\n\t
"
"pld [%[b], #64]
\n\t
"
"vld1.32 {q0, q1}, [%[a]]!
\n\t
"
"vld1.32 {q2, q3}, [%[b]]!
\n\t
"
"vmla.f32 q10, q2, d0[0]
\n\t
"
"vmla.f32 q11, q2, d0[1]
\n\t
"
"vmla.f32 q12, q2, d1[0]
\n\t
"
"vmla.f32 q13, q2, d1[1]
\n\t
"
"vmla.f32 q10, q3, d2[0]
\n\t
"
"vmla.f32 q11, q3, d2[1]
\n\t
"
"vmla.f32 q12, q3, d3[0]
\n\t
"
"vmla.f32 q13, q3, d3[1]
\n\t
"
"vld1.32 {q0, q1}, [%[a]]!
\n\t
"
"vld1.32 {q2, q3}, [%[b]]!
\n\t
"
"pld [%[a_ptr], #64]
\n\t
"
"pld [%[b_ptr], #64]
\n\t
"
"vld1.32 {q0, q1}, [%[a_ptr]]!
\n\t
"
"vld1.32 {q2, q3}, [%[b_ptr]]!
\n\t
"
"vmla.f32 q10, q2, d0[0]
\n\t
"
"vmla.f32 q11, q2, d0[1]
\n\t
"
"vmla.f32 q12, q2, d1[0]
\n\t
"
...
...
@@ -514,6 +283,16 @@ void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
"vmla.f32 q11, q3, d2[1]
\n\t
"
"vmla.f32 q12, q3, d3[0]
\n\t
"
"vmla.f32 q13, q3, d3[1]
\n\t
"
"vld1.32 {q4, q5}, [%[a_ptr]]!
\n\t
"
"vld1.32 {q6, q7}, [%[b_ptr]]!
\n\t
"
"vmla.f32 q10, q6, d8[0]
\n\t
"
"vmla.f32 q11, q6, d8[1]
\n\t
"
"vmla.f32 q12, q6, d9[0]
\n\t
"
"vmla.f32 q13, q6, d9[1]
\n\t
"
"vmla.f32 q10, q7, d10[0]
\n\t
"
"vmla.f32 q11, q7, d10[1]
\n\t
"
"vmla.f32 q12, q7, d11[0]
\n\t
"
"vmla.f32 q13, q7, d11[1]
\n\t
"
"subs %[kc1], %[kc1], #1
\n\t
"
"bge loop_kc1_%=
\n\t
"
"end_kc1_%=:
\n\t
"
...
...
@@ -521,8 +300,8 @@ void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
"subs %[kc2], %[kc2], #1
\n\t
"
"blt end_kc2_%=
\n\t
"
"loop_kc2_%=:
\n\t
"
"vld1.32 {q0}, [%[a
]]!
\n\t
"
"vld1.32 {q1}, [%[b
]]!
\n\t
"
"vld1.32 {q0}, [%[a
_ptr]]!
\n\t
"
"vld1.32 {q1}, [%[b
_ptr]]!
\n\t
"
"vmla.f32 q10, q1, d0[0]
\n\t
"
"vmla.f32 q11, q1, d0[1]
\n\t
"
"vmla.f32 q12, q1, d1[0]
\n\t
"
...
...
@@ -531,290 +310,168 @@ void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
"bge loop_kc2_%=
\n\t
"
"end_kc2_%=:
\n\t
"
"cmp %[mc], #4
\n\t
"
"bne temp_%=
\n\t
"
"cmp %[nc], #4
\n\t
"
"bne temp_%=
\n\t
"
"vmov.f32 d8[0], %[alpha]
\n\t
"
"vmov.f32 d8[1], %[beta]
\n\t
"
"cmp %[flag_alpha], #1
\n\t
"
"bne alpha_%=
\n\t
"
"alpha_%=:
\n\t
"
"vmul.f32 q10, q10, d8[0]
\n\t
"
"vmul.f32 q11, q11, d8[0]
\n\t
"
"vmul.f32 q12, q12, d8[0]
\n\t
"
"vmul.f32 q13, q13, d8[0]
\n\t
"
"beta_%=:
\n\t
"
"cmp %[flag_beta], #0
\n\t
"
"beq memory_%=
\n\t
"
"mov r4, %[C]
\n\t
"
"mov r6, %[bytes_ldc]
\n\t
"
"vld1.32 {q0}, [r4], r6
\n\t
"
"vld1.32 {q1}, [r4], r6
\n\t
"
"vld1.32 {q2}, [r4], r6
\n\t
"
"vld1.32 {q3}, [r4]
\n\t
"
"cmp %[flag_beta], #1
\n\t
"
"beq beta_eq1_%=
\n\t
"
"bne beta_ne1_%=
\n\t
"
"beta_eq1_%=:
\n\t
"
"vadd.f32 q10, q10, q0
\n\t
"
"vadd.f32 q11, q11, q1
\n\t
"
"vadd.f32 q12, q12, q2
\n\t
"
"vadd.f32 q13, q13, q3
\n\t
"
"b memory_%=
\n\t
"
"beta_ne1_%=:
\n\t
"
"vmla.f32 q10, q0, d8[1]
\n\t
"
"vmla.f32 q11, q1, d8[1]
\n\t
"
"vmla.f32 q12, q2, d8[1]
\n\t
"
"vmla.f32 q13, q3, d8[1]
\n\t
"
"memory_%=:
\n\t
"
"vmax.f32 q10, q10, q14
\n\t
"
"vmax.f32 q11, q11, q14
\n\t
"
"vmax.f32 q12, q12, q14
\n\t
"
"vmax.f32 q13, q13, q14
\n\t
"
"mov r5, %[C]
\n\t
"
"mov r6, %[bytes_ldc]
\n\t
"
"mov r5, %[c]
\n\t
"
"mov r6, %[step]
\n\t
"
"vst1.32 {q10}, [r5], r6
\n\t
"
"vst1.32 {q11}, [r5], r6
\n\t
"
"vst1.32 {q12}, [r5], r6
\n\t
"
"vst1.32 {q13}, [r5]
\n\t
"
"b end_%=
\n\t
"
"temp_%=:
\n\t
"
"vst1.32 {q10, q11}, [%[ab]]!
\n\t
"
"vst1.32 {q12, q13}, [%[ab]]
\n\t
"
"end_%=:
\n\t
"
:
:
[
a
]
"r"
(
a
),
[
b
]
"r"
(
b
),
[
C
]
"r"
(
C
),
[
ab
]
"r"
(
ab
),
[
kc1
]
"r"
(
kc1
),
[
kc2
]
"r"
(
kc2
),
[
mc
]
"r"
(
mc
),
[
nc
]
"r"
(
nc
),
[
alpha
]
"r"
(
alpha
),
[
beta
]
"r"
(
beta
),
[
bytes_ldc
]
"r"
(
bytes_ldc
),
[
flag_alpha
]
"r"
(
flag_alpha
),
[
flag_beta
]
"r"
(
flag_beta
)
:
"memory"
,
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q10"
,
"q11"
,
"q12"
,
"q13"
,
"q14"
);
if
(
mc
!=
MR
||
nc
!=
NR
)
{
int
i
,
j
;
for
(
i
=
0
;
i
<
mc
;
++
i
)
{
for
(
j
=
0
;
j
<
nc
;
++
j
)
{
if
(
beta
==
0.0
)
{
if
(
alpha
!=
1.0
)
{
C
(
i
,
j
)
=
alpha
*
ab
[
i
*
MR
+
j
];
}
else
{
C
(
i
,
j
)
=
ab
[
i
*
MR
+
j
];
}
}
else
{
if
(
beta
!=
1.0
)
{
C
(
i
,
j
)
*=
beta
;
}
if
(
alpha
!=
1.0
)
{
C
(
i
,
j
)
+=
alpha
*
ab
[
i
*
MR
+
j
];
}
else
{
C
(
i
,
j
)
+=
ab
[
i
*
MR
+
j
];
}
}
if
(
relu
)
{
if
(
C
(
i
,
j
)
<
0
)
{
C
(
i
,
j
)
=
0
;
}
}
}
}
}
:
[
a_ptr
]
"r"
(
a_ptr
),
[
b_ptr
]
"r"
(
b_ptr
),
[
c
]
"r"
(
c
),
[
kc1
]
"r"
(
kc1
),
[
kc2
]
"r"
(
kc2
),
[
step
]
"r"
(
step
)
:
"memory"
,
"r5"
,
"r6"
,
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
,
"q10"
,
"q11"
,
"q12"
,
"q13"
);
}
#else
void
AddDot4x4
(
int
k
,
float
alpha
,
const
float
*
a
,
int
lda
,
const
float
*
b
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
int
mc
,
int
nc
)
{
float
c
[
16
]
=
{
0
};
float
reg_a0
,
reg_a1
,
reg_a2
,
reg_a3
,
reg_b0
,
reg_b1
,
reg_b2
,
reg_b3
;
void
AddDot4x4
(
int
k
,
const
float
*
a
,
const
float
*
b
,
float
*
c
,
int
ldc
)
{
float
*
c0
,
*
c1
,
*
c2
,
*
c3
;
c0
=
c
;
c1
=
c
+
ldc
;
c2
=
c
+
2
*
ldc
;
c3
=
c
+
3
*
ldc
;
for
(
int
p
=
0
;
p
<
k
;
p
+=
1
)
{
reg_b0
=
*
b
++
;
reg_b1
=
*
b
++
;
reg_b2
=
*
b
++
;
reg_b3
=
*
b
++
;
reg_a0
=
*
a
++
;
reg_a1
=
*
a
++
;
reg_a2
=
*
a
++
;
reg_a3
=
*
a
++
;
// first row
c
[
0
]
+=
reg_a0
*
reg_b0
;
c
[
1
]
+=
reg_a0
*
reg_b1
;
c
[
2
]
+=
reg_a0
*
reg_b2
;
c
[
3
]
+=
reg_a0
*
reg_b3
;
c
0
[
0
]
+=
a
[
0
]
*
b
[
0
]
;
c
0
[
1
]
+=
a
[
0
]
*
b
[
1
]
;
c
0
[
2
]
+=
a
[
0
]
*
b
[
2
]
;
c
0
[
3
]
+=
a
[
0
]
*
b
[
3
]
;
// second row
c
[
4
]
+=
reg_a1
*
reg_b0
;
c
[
5
]
+=
reg_a1
*
reg_b1
;
c
[
6
]
+=
reg_a1
*
reg_b2
;
c
[
7
]
+=
reg_a1
*
reg_b3
;
c
1
[
0
]
+=
a
[
1
]
*
b
[
0
]
;
c
1
[
1
]
+=
a
[
1
]
*
b
[
1
]
;
c
1
[
2
]
+=
a
[
1
]
*
b
[
2
]
;
c
1
[
3
]
+=
a
[
1
]
*
b
[
3
]
;
// third row
c
[
8
]
+=
reg_a2
*
reg_b0
;
c
[
9
]
+=
reg_a2
*
reg_b1
;
c
[
10
]
+=
reg_a2
*
reg_b2
;
c
[
11
]
+=
reg_a2
*
reg_b3
;
c
2
[
0
]
+=
a
[
2
]
*
b
[
0
]
;
c
2
[
1
]
+=
a
[
2
]
*
b
[
1
]
;
c
2
[
2
]
+=
a
[
2
]
*
b
[
2
]
;
c
2
[
3
]
+=
a
[
2
]
*
b
[
3
]
;
// fourth row
c
[
12
]
+=
reg_a3
*
reg_b0
;
c
[
13
]
+=
reg_a3
*
reg_b1
;
c
[
14
]
+=
reg_a3
*
reg_b2
;
c
[
15
]
+=
reg_a3
*
reg_b3
;
}
int
i
,
j
;
for
(
i
=
0
;
i
<
mc
;
++
i
)
{
for
(
j
=
0
;
j
<
nc
;
++
j
)
{
if
(
beta
==
0.0
)
{
C
(
i
,
j
)
=
0.0
;
}
else
if
(
beta
!=
1.0
)
{
C
(
i
,
j
)
*=
beta
;
}
if
(
alpha
!=
1.0
)
{
C
(
i
,
j
)
+=
alpha
*
c
[
i
*
MR
+
j
];
}
else
{
C
(
i
,
j
)
+=
c
[
i
*
MR
+
j
];
}
}
}
}
void
AddDot4x4_relu
(
int
k
,
float
alpha
,
const
float
*
a
,
int
lda
,
const
float
*
b
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
int
mc
,
int
nc
,
bool
relu
)
{
float
c
[
16
]
=
{
0
};
float
reg_a0
,
reg_a1
,
reg_a2
,
reg_a3
,
reg_b0
,
reg_b1
,
reg_b2
,
reg_b3
;
for
(
int
p
=
0
;
p
<
k
;
p
+=
1
)
{
reg_b0
=
*
b
++
;
reg_b1
=
*
b
++
;
reg_b2
=
*
b
++
;
reg_b3
=
*
b
++
;
reg_a0
=
*
a
++
;
reg_a1
=
*
a
++
;
reg_a2
=
*
a
++
;
reg_a3
=
*
a
++
;
// first row
c
[
0
]
+=
reg_a0
*
reg_b0
;
c
[
1
]
+=
reg_a0
*
reg_b1
;
c
[
2
]
+=
reg_a0
*
reg_b2
;
c
[
3
]
+=
reg_a0
*
reg_b3
;
// second row
c
[
4
]
+=
reg_a1
*
reg_b0
;
c
[
5
]
+=
reg_a1
*
reg_b1
;
c
[
6
]
+=
reg_a1
*
reg_b2
;
c
[
7
]
+=
reg_a1
*
reg_b3
;
// third row
c
[
8
]
+=
reg_a2
*
reg_b0
;
c
[
9
]
+=
reg_a2
*
reg_b1
;
c
[
10
]
+=
reg_a2
*
reg_b2
;
c
[
11
]
+=
reg_a2
*
reg_b3
;
c3
[
0
]
+=
a
[
3
]
*
b
[
0
];
c3
[
1
]
+=
a
[
3
]
*
b
[
1
];
c3
[
2
]
+=
a
[
3
]
*
b
[
2
];
c3
[
3
]
+=
a
[
3
]
*
b
[
3
];
// fourth row
c
[
12
]
+=
reg_a3
*
reg_b0
;
c
[
13
]
+=
reg_a3
*
reg_b1
;
c
[
14
]
+=
reg_a3
*
reg_b2
;
c
[
15
]
+=
reg_a3
*
reg_b3
;
}
int
i
,
j
;
for
(
i
=
0
;
i
<
mc
;
++
i
)
{
for
(
j
=
0
;
j
<
nc
;
++
j
)
{
if
(
beta
==
0.0
)
{
C
(
i
,
j
)
=
0.0
;
}
else
if
(
beta
!=
1.0
)
{
C
(
i
,
j
)
*=
beta
;
}
if
(
alpha
!=
1.0
)
{
C
(
i
,
j
)
+=
alpha
*
c
[
i
*
MR
+
j
];
}
else
{
C
(
i
,
j
)
+=
c
[
i
*
MR
+
j
];
}
if
(
relu
)
{
if
(
C
(
i
,
j
)
<
0
)
{
C
(
i
,
j
)
=
0
;
}
}
}
a
+=
4
;
b
+=
4
;
}
}
#endif
// 32位 float 矩阵乘法
void
sgemm
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
)
{
int
i
,
j
,
p
,
mc
,
nc
,
kc
;
float
beta_
;
#ifdef ARMV7
if
(
m
==
1
)
{
VectorKernel
(
1
,
n
,
k
,
alpha
,
A
,
lda
,
B
,
ldb
,
beta
,
C
,
ldc
);
return
;
void
Sgemm
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
bool
relu
)
{
// L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73)
// L2 cache is 0.5~4 Mib (Contex-A72 cluster)
int
L1
=
30
*
1024
;
int
L2
=
1
*
1024
*
1024
;
KC
=
k
;
MC
=
L2
/
(
2
*
KC
*
sizeof
(
float
));
NC
=
MC
;
// make sure MC is multiple of 4, and NC is multiple of 8
int
mblock_num
=
(
m
+
MC
-
1
)
/
MC
;
MC
=
(
m
+
mblock_num
-
1
)
/
mblock_num
;
MC
=
(
MC
+
4
-
1
)
/
4
*
4
;
// DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n";
int
nblock_num
=
(
n
+
NC
-
1
)
/
NC
;
NC
=
(
n
+
nblock_num
-
1
)
/
nblock_num
;
NC
=
(
NC
+
8
-
1
)
/
8
*
8
;
// DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n";
packedA
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
MC
*
KC
));
packedB
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
KC
*
NC
));
packedC
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
MC
*
NC
));
zero
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
KC
));
for
(
int
l
=
0
;
l
<
KC
;
++
l
)
{
zero
[
l
]
=
0
;
}
#endif
for
(
j
=
0
;
j
<
n
;
j
+=
NC
)
{
int
mc
,
nc
;
for
(
int
j
=
0
;
j
<
n
;
j
+=
NC
)
{
nc
=
s_min
(
n
-
j
,
NC
);
for
(
p
=
0
;
p
<
k
;
p
+=
KC
)
{
kc
=
s_min
(
k
-
p
,
KC
);
for
(
i
=
0
;
i
<
m
;
i
+=
MC
)
{
mc
=
s_min
(
m
-
i
,
MC
);
if
(
p
!=
0
)
{
beta_
=
1.0
;
}
else
{
beta_
=
beta
;
}
InnerKernel
(
mc
,
nc
,
kc
,
alpha
,
&
A
(
i
,
p
),
lda
,
&
B
(
p
,
j
),
ldb
,
beta_
,
&
C
(
i
,
j
),
ldc
,
i
==
0
);
}
PackMatrixB_
(
KC
,
nc
,
nc
%
NR
,
&
B
(
0
,
j
),
ldb
,
packedB
);
for
(
int
i
=
0
;
i
<
m
;
i
+=
MC
)
{
mc
=
s_min
(
m
-
i
,
MC
);
PackMatrixA_
(
mc
,
KC
,
mc
%
MR
,
&
A
(
i
,
0
),
lda
,
packedA
);
InnerKernel
(
mc
,
nc
,
alpha
,
packedA
,
packedB
,
beta
,
packedC
,
&
C
(
i
,
j
),
ldc
,
relu
);
}
}
paddle_mobile
::
memory
::
Free
(
packedA
);
paddle_mobile
::
memory
::
Free
(
packedB
);
paddle_mobile
::
memory
::
Free
(
packedC
);
paddle_mobile
::
memory
::
Free
(
zero
);
}
void
sgemm_relu
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
)
{
int
i
,
j
,
p
,
mc
,
nc
,
kc
;
float
beta_
;
for
(
j
=
0
;
j
<
n
;
j
+=
NC
)
{
nc
=
s_min
(
n
-
j
,
NC
);
for
(
p
=
0
;
p
<
k
;
p
+=
KC
)
{
kc
=
s_min
(
k
-
p
,
KC
);
for
(
i
=
0
;
i
<
m
;
i
+=
MC
)
{
mc
=
s_min
(
m
-
i
,
MC
);
if
(
p
!=
0
)
{
beta_
=
1.0
;
}
else
{
beta_
=
beta
;
}
void
SgemmWithBn
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
new_scale
,
float
*
new_bias
)
{
// L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73)
// L2 cache is 0.5~4 Mib (Contex-A72 cluster)
int
L1
=
30
*
1024
;
int
L2
=
1
*
1024
*
1024
;
KC
=
k
;
MC
=
L2
/
(
2
*
KC
*
sizeof
(
float
));
NC
=
MC
;
// make sure MC is multiple of 4, and NC is multiple of 8
int
mblock_num
=
(
m
+
MC
-
1
)
/
MC
;
MC
=
(
m
+
mblock_num
-
1
)
/
mblock_num
;
MC
=
(
MC
+
4
-
1
)
/
4
*
4
;
// DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n";
int
nblock_num
=
(
n
+
NC
-
1
)
/
NC
;
NC
=
(
n
+
nblock_num
-
1
)
/
nblock_num
;
NC
=
(
NC
+
8
-
1
)
/
8
*
8
;
// DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n";
packedA
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
MC
*
KC
));
packedB
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
KC
*
NC
));
packedC
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
MC
*
NC
));
zero
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
KC
));
for
(
int
l
=
0
;
l
<
KC
;
++
l
)
{
zero
[
l
]
=
0
;
}
if
(
p
+
KC
>=
k
)
{
InnerKernel_relu
(
mc
,
nc
,
kc
,
alpha
,
&
A
(
i
,
p
),
lda
,
&
B
(
p
,
j
),
ldb
,
beta_
,
&
C
(
i
,
j
),
ldc
,
i
==
0
,
true
);
}
else
{
InnerKernel
(
mc
,
nc
,
kc
,
alpha
,
&
A
(
i
,
p
),
lda
,
&
B
(
p
,
j
),
ldb
,
beta_
,
&
C
(
i
,
j
),
ldc
,
i
==
0
);
}
}
int
mc
,
nc
;
for
(
int
j
=
0
;
j
<
n
;
j
+=
NC
)
{
nc
=
s_min
(
n
-
j
,
NC
);
PackMatrixB_
(
KC
,
nc
,
nc
%
NR
,
&
B
(
0
,
j
),
ldb
,
packedB
);
for
(
int
i
=
0
;
i
<
m
;
i
+=
MC
)
{
mc
=
s_min
(
m
-
i
,
MC
);
PackMatrixA_
(
mc
,
KC
,
mc
%
MR
,
&
A
(
i
,
0
),
lda
,
packedA
);
InnerKernelWithBn
(
mc
,
nc
,
alpha
,
packedA
,
packedB
,
beta
,
packedC
,
&
C
(
i
,
j
),
ldc
,
relu
,
new_scale
+
ldc
*
i
+
j
,
new_bias
+
ldc
*
i
+
j
);
}
}
paddle_mobile
::
memory
::
Free
(
packedA
);
paddle_mobile
::
memory
::
Free
(
packedB
);
paddle_mobile
::
memory
::
Free
(
packedC
);
paddle_mobile
::
memory
::
Free
(
zero
);
}
#ifdef ARMV7
void
VectorKernel
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
)
{
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
bool
relu
)
{
float
*
bufferC
=
static_cast
<
float
*>
(
memory
::
Alloc
(
sizeof
(
float
)
*
n
));
const
float
*
a0
,
*
b0
,
*
b1
,
*
b2
,
*
b3
;
...
...
@@ -1016,18 +673,995 @@ void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
}
}
c0
=
bufferC
;
C0
=
C
;
for
(
int
i
=
0
;
i
<
n
;
i
++
)
{
if
(
beta
==
1.0
)
{
*
C0
++
+=
*
c0
++
;
}
else
{
*
C0
++
=
*
c0
++
;
}
if
(
alpha
!=
1
)
{
VecWriteWithAlphaBeta
(
n
,
bufferC
,
C
,
ldc
);
return
;
}
if
(
beta
==
0
)
{
VecWriteBasic
(
n
,
bufferC
,
C
,
ldc
);
return
;
}
if
(
beta
==
1
&&
!
relu
)
{
VecWriteWithAdd
(
n
,
bufferC
,
C
,
ldc
);
return
;
}
if
(
beta
==
1
&&
relu
)
{
VecWriteWithAddRelu
(
n
,
bufferC
,
C
,
ldc
);
return
;
}
}
#endif
}
// namespace math
}
// namespace operators
void
VectorKernelWithBn
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
new_scale
,
float
*
new_bias
)
{
float
*
bufferC
=
static_cast
<
float
*>
(
memory
::
Alloc
(
sizeof
(
float
)
*
n
));
const
float
*
a0
,
*
b0
,
*
b1
,
*
b2
,
*
b3
;
float
*
c0
,
*
C0
;
int
volatile
kc1
=
k
/
4
;
int
volatile
kc2
=
k
%
4
;
int
volatile
nc1
=
n
/
16
;
int
_nc1
=
n
%
16
;
int
volatile
nc2
=
_nc1
/
4
;
int
volatile
nc3
=
_nc1
%
4
;
for
(
int
i
=
0
;
i
<
kc1
;
i
++
)
{
a0
=
A
+
i
*
4
;
b0
=
B
+
i
*
4
*
ldb
;
b1
=
b0
+
ldb
;
b2
=
b1
+
ldb
;
b3
=
b2
+
ldb
;
c0
=
bufferC
;
asm
volatile
(
"pld [%[a0], #16]
\n\t
"
"vld1.32 {q0}, [%[a0]]
\n\t
"
"subs %[nc1], %[nc1], #1
\n\t
"
"blt end_nc1_%=
\n\t
"
"loop_nc1_%=:
\n\t
"
"cmp %[i], #0
\n\t
"
"beq i_eq0_%=
\n\t
"
"bne i_ne0_%=
\n\t
"
"i_eq0_%=:
\n\t
"
"vmov.f32 q10, #0.0
\n\t
"
"vmov.f32 q11, #0.0
\n\t
"
"vmov.f32 q12, #0.0
\n\t
"
"vmov.f32 q13, #0.0
\n\t
"
"b gemm_nc1_%=
\n\t
"
"i_ne0_%=:
\n\t
"
"pld [%[c0], #64]
\n\t
"
"vld1.32 {q10, q11}, [%[c0]]!
\n\t
"
"vld1.32 {q12, q13}, [%[c0]]
\n\t
"
"sub %[c0], %[c0], #32
\n\t
"
"gemm_nc1_%=:
\n\t
"
"pld [%[b0], #64]
\n\t
"
"vld1.32 {q2, q3}, [%[b0]]!
\n\t
"
"vld1.32 {q4, q5}, [%[b0]]!
\n\t
"
"vmla.f32 q10, q2, d0[0]
\n\t
"
"vmla.f32 q11, q3, d0[0]
\n\t
"
"vmla.f32 q12, q4, d0[0]
\n\t
"
"vmla.f32 q13, q5, d0[0]
\n\t
"
"pld [%[b1], #64]
\n\t
"
"vld1.32 {q2, q3}, [%[b1]]!
\n\t
"
"vld1.32 {q4, q5}, [%[b1]]!
\n\t
"
"vmla.f32 q10, q2, d0[1]
\n\t
"
"vmla.f32 q11, q3, d0[1]
\n\t
"
"vmla.f32 q12, q4, d0[1]
\n\t
"
"vmla.f32 q13, q5, d0[1]
\n\t
"
"pld [%[b2], #64]
\n\t
"
"vld1.32 {q2, q3}, [%[b2]]!
\n\t
"
"vld1.32 {q4, q5}, [%[b2]]!
\n\t
"
"vmla.f32 q10, q2, d1[0]
\n\t
"
"vmla.f32 q11, q3, d1[0]
\n\t
"
"vmla.f32 q12, q4, d1[0]
\n\t
"
"vmla.f32 q13, q5, d1[0]
\n\t
"
"pld [%[b3], #64]
\n\t
"
"vld1.32 {q2, q3}, [%[b3]]!
\n\t
"
"vld1.32 {q4, q5}, [%[b3]]!
\n\t
"
"vmla.f32 q10, q2, d1[1]
\n\t
"
"vmla.f32 q11, q3, d1[1]
\n\t
"
"vmla.f32 q12, q4, d1[1]
\n\t
"
"vmla.f32 q13, q5, d1[1]
\n\t
"
"vst1.32 {q10, q11}, [%[c0]]!
\n\t
"
"vst1.32 {q12, q13}, [%[c0]]!
\n\t
"
"subs %[nc1], %[nc1], #1
\n\t
"
"bge loop_nc1_%=
\n\t
"
"end_nc1_%=:
\n\t
"
"subs %[nc2], %[nc2], #1
\n\t
"
"blt end_nc2_%=
\n\t
"
"loop_nc2_%=:
\n\t
"
"cmp %[i], #0
\n\t
"
"beq ii_eq0_%=
\n\t
"
"bne ii_ne0_%=
\n\t
"
"ii_eq0_%=:
\n\t
"
"vmov.f32 q10, #0.0
\n\t
"
"b gemm_nc2_%=
\n\t
"
"ii_ne0_%=:
\n\t
"
"pld [%[c0], #16]
\n\t
"
"vld1.32 {q10}, [%[c0]]
\n\t
"
"gemm_nc2_%=:
\n\t
"
"pld [%[b0], #16]
\n\t
"
"vld1.32 {q2}, [%[b0]]!
\n\t
"
"vmla.f32 q10, q2, d0[0]
\n\t
"
"pld [%[b1], #16]
\n\t
"
"vld1.32 {q3}, [%[b1]]!
\n\t
"
"vmla.f32 q10, q3, d0[1]
\n\t
"
"pld [%[b2], #16]
\n\t
"
"vld1.32 {q4}, [%[b2]]!
\n\t
"
"vmla.f32 q10, q4, d1[0]
\n\t
"
"pld [%[b3], #16]
\n\t
"
"vld1.32 {q5}, [%[b3]]!
\n\t
"
"vmla.f32 q10, q5, d1[1]
\n\t
"
"vst1.32 {q10}, [%[c0]]!
\n\t
"
"subs %[nc2], %[nc2], #1
\n\t
"
"bge loop_nc2_%=
\n\t
"
"end_nc2_%=:
\n\t
"
:
[
b0
]
"+r"
(
b0
),
[
b1
]
"+r"
(
b1
),
[
b2
]
"+r"
(
b2
),
[
b3
]
"+r"
(
b3
),
[
c0
]
"+r"
(
c0
)
:
[
a0
]
"r"
(
a0
),
[
i
]
"r"
(
i
),
[
nc1
]
"r"
(
nc1
),
[
nc2
]
"r"
(
nc2
)
:
"memory"
,
"q0"
,
"q2"
,
"q3"
,
"q4"
,
"q5"
,
"q10"
,
"q11"
,
"q12"
,
"q13"
);
for
(
int
j
=
0
;
j
<
nc3
;
j
++
)
{
if
(
i
==
0
)
{
*
c0
=
(
*
a0
)
*
(
*
b0
++
);
}
else
{
*
c0
+=
(
*
a0
)
*
(
*
b0
++
);
}
*
c0
+=
(
*
(
a0
+
1
))
*
(
*
b1
++
);
*
c0
+=
(
*
(
a0
+
2
))
*
(
*
b2
++
);
*
c0
+=
(
*
(
a0
+
3
))
*
(
*
b3
++
);
c0
++
;
}
}
for
(
int
i
=
0
;
i
<
kc2
;
++
i
)
{
a0
=
A
+
4
*
kc1
+
i
;
b0
=
B
+
(
4
*
kc1
+
i
)
*
ldb
;
c0
=
bufferC
;
asm
volatile
(
"pld [%[a0], #16]
\n\t
"
"vld1.32 {d0}, [%[a0]]
\n\t
"
"subs %[nc1], %[nc1], #1
\n\t
"
"blt end_nc1_%=
\n\t
"
"loop_nc1_%=:
\n\t
"
"pld [%[c0], #64]
\n\t
"
"vld1.32 {q10, q11}, [%[c0]]!
\n\t
"
"vld1.32 {q12, q13}, [%[c0]]
\n\t
"
"sub %[c0], %[c0], #32
\n\t
"
"gemm_nc1_%=:
\n\t
"
"pld [%[b0], #64]
\n\t
"
"vld1.32 {q2, q3}, [%[b0]]!
\n\t
"
"vld1.32 {q4, q5}, [%[b0]]!
\n\t
"
"vmla.f32 q10, q2, d0[0]
\n\t
"
"vmla.f32 q11, q3, d0[0]
\n\t
"
"vmla.f32 q12, q4, d0[0]
\n\t
"
"vmla.f32 q13, q5, d0[0]
\n\t
"
"vst1.32 {q10, q11}, [%[c0]]!
\n\t
"
"vst1.32 {q12, q13}, [%[c0]]!
\n\t
"
"subs %[nc1], %[nc1], #1
\n\t
"
"bge loop_nc1_%=
\n\t
"
"end_nc1_%=:
\n\t
"
"subs %[nc2], %[nc2], #1
\n\t
"
"blt end_nc2_%=
\n\t
"
"loop_nc2_%=:
\n\t
"
"pld [%[c0], #16]
\n\t
"
"vld1.32 {q10}, [%[c0]]
\n\t
"
"gemm_nc2_%=:
\n\t
"
"vld1.32 {q2}, [%[b0]]!
\n\t
"
"vmla.f32 q10, q2, d0[0]
\n\t
"
"vst1.32 {q10}, [%[c0]]!
\n\t
"
"subs %[nc2], %[nc2], #1
\n\t
"
"bge loop_nc2_%=
\n\t
"
"end_nc2_%=:
\n\t
"
:
[
b0
]
"+r"
(
b0
),
[
b1
]
"+r"
(
b1
),
[
b2
]
"+r"
(
b2
),
[
b3
]
"+r"
(
b3
),
[
c0
]
"+r"
(
c0
)
:
[
a0
]
"r"
(
a0
),
[
nc1
]
"r"
(
nc1
),
[
nc2
]
"r"
(
nc2
)
:
"memory"
,
"q0"
,
"q2"
,
"q3"
,
"q4"
,
"q5"
,
"q10"
,
"q11"
,
"q12"
,
"q13"
);
for
(
int
j
=
0
;
j
<
nc3
;
j
++
)
{
*
c0
+=
(
*
a0
)
*
(
*
b0
++
);
c0
++
;
}
}
if
(
relu
)
{
VecWriteWithBnRelu
(
n
,
bufferC
,
C
,
ldc
,
new_scale
,
new_bias
);
}
else
{
VecWriteWithBn
(
n
,
bufferC
,
C
,
ldc
,
new_scale
,
new_bias
);
}
}
void
AddDot4x8
(
int
k
,
const
float
*
a
,
const
float
*
b
,
float
*
c
,
int
ldc
)
{
const
float
*
a_ptr
,
*
b_ptr
;
a_ptr
=
a
;
b_ptr
=
b
;
int
kc1
=
k
/
4
;
int
kc2
=
k
%
4
;
int
step
=
4
*
ldc
;
asm
volatile
(
"pld [%[a_ptr]]
\n\t
"
"pld [%[b_ptr]]
\n\t
"
"vmov.f32 q8, #0.0
\n\t
"
"vmov.f32 q9, #0.0
\n\t
"
"vmov.f32 q10, #0.0
\n\t
"
"vmov.f32 q11, #0.0
\n\t
"
"vmov.f32 q12, #0.0
\n\t
"
"vmov.f32 q13, #0.0
\n\t
"
"vmov.f32 q14, #0.0
\n\t
"
"vmov.f32 q15, #0.0
\n\t
"
"subs %[kc1], %[kc1], #1
\n\t
"
"blt end_kc1_%=
\n\t
"
"loop_kc1_%=:
\n\t
"
"pld [%[a_ptr], #64]
\n\t
"
"pld [%[b_ptr], #64]
\n\t
"
"vld1.32 {q0, q1}, [%[a_ptr]]!
\n\t
"
"vld1.32 {q2, q3}, [%[b_ptr]]!
\n\t
"
"vld1.32 {q4, q5}, [%[b_ptr]]!
\n\t
"
"vmla.f32 q8, q2, d0[0]
\n\t
"
"vmla.f32 q9, q3, d0[0]
\n\t
"
"vmla.f32 q10, q2, d0[1]
\n\t
"
"vmla.f32 q11, q3, d0[1]
\n\t
"
"vmla.f32 q12, q2, d1[0]
\n\t
"
"vmla.f32 q13, q3, d1[0]
\n\t
"
"vmla.f32 q14, q2, d1[1]
\n\t
"
"vmla.f32 q15, q3, d1[1]
\n\t
"
"vmla.f32 q8, q4, d2[0]
\n\t
"
"vmla.f32 q9, q5, d2[0]
\n\t
"
"vmla.f32 q10, q4, d2[1]
\n\t
"
"vmla.f32 q11, q5, d2[1]
\n\t
"
"vmla.f32 q12, q4, d3[0]
\n\t
"
"vmla.f32 q13, q5, d3[0]
\n\t
"
"vmla.f32 q14, q4, d3[1]
\n\t
"
"vmla.f32 q15, q5, d3[1]
\n\t
"
"pld [%[b_ptr], #64]
\n\t
"
"vld1.32 {q0, q1}, [%[a_ptr]]!
\n\t
"
"vld1.32 {q2, q3}, [%[b_ptr]]!
\n\t
"
"vld1.32 {q4, q5}, [%[b_ptr]]!
\n\t
"
"vmla.f32 q8, q2, d0[0]
\n\t
"
"vmla.f32 q9, q3, d0[0]
\n\t
"
"vmla.f32 q10, q2, d0[1]
\n\t
"
"vmla.f32 q11, q3, d0[1]
\n\t
"
"vmla.f32 q12, q2, d1[0]
\n\t
"
"vmla.f32 q13, q3, d1[0]
\n\t
"
"vmla.f32 q14, q2, d1[1]
\n\t
"
"vmla.f32 q15, q3, d1[1]
\n\t
"
"vmla.f32 q8, q4, d2[0]
\n\t
"
"vmla.f32 q9, q5, d2[0]
\n\t
"
"vmla.f32 q10, q4, d2[1]
\n\t
"
"vmla.f32 q11, q5, d2[1]
\n\t
"
"vmla.f32 q12, q4, d3[0]
\n\t
"
"vmla.f32 q13, q5, d3[0]
\n\t
"
"vmla.f32 q14, q4, d3[1]
\n\t
"
"vmla.f32 q15, q5, d3[1]
\n\t
"
"subs %[kc1], %[kc1], #1
\n\t
"
"bge loop_kc1_%=
\n\t
"
"end_kc1_%=:
\n\t
"
"subs %[kc2], %[kc2], #1
\n\t
"
"blt end_kc2_%=
\n\t
"
"loop_kc2_%=:
\n\t
"
"vld1.32 {q0}, [%[a_ptr]]!
\n\t
"
"vld1.32 {q2, q3}, [%[b_ptr]]!
\n\t
"
"vmla.f32 q8, q2, d0[0]
\n\t
"
"vmla.f32 q9, q3, d0[0]
\n\t
"
"vmla.f32 q10, q2, d0[1]
\n\t
"
"vmla.f32 q11, q3, d0[1]
\n\t
"
"vmla.f32 q12, q2, d1[0]
\n\t
"
"vmla.f32 q13, q3, d1[0]
\n\t
"
"vmla.f32 q14, q2, d1[1]
\n\t
"
"vmla.f32 q15, q3, d1[1]
\n\t
"
"subs %[kc2], %[kc2], #1
\n\t
"
"bge loop_kc2_%=
\n\t
"
"end_kc2_%=:
\n\t
"
"mov r5, %[c]
\n\t
"
"mov r6, %[step]
\n\t
"
"vst1.32 {q8, q9}, [r5], r6
\n\t
"
"vst1.32 {q10, q11}, [r5], r6
\n\t
"
"vst1.32 {q12, q13}, [r5], r6
\n\t
"
"vst1.32 {q14, q15}, [r5]
\n\t
"
:
:
[
a_ptr
]
"r"
(
a_ptr
),
[
b_ptr
]
"r"
(
b_ptr
),
[
c
]
"r"
(
c
),
[
kc1
]
"r"
(
kc1
),
[
kc2
]
"r"
(
kc2
),
[
step
]
"r"
(
step
)
:
"memory"
,
"r5"
,
"r6"
,
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q5"
,
"q8"
,
"q9"
,
"q10"
,
"q11"
,
"q12"
,
"q13"
,
"q14"
,
"q15"
);
}
// C = A * B
void
WriteBasic
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
)
{
int
nc1
=
nc
/
16
;
int
_nc1
=
nc
%
16
;
int
step
=
4
*
ldc
;
int
step1
=
4
*
(
NC
-
16
*
nc1
);
int
volatile
m
=
mc
;
float
*
volatile
c_ptr
,
*
volatile
C_ptr
;
float
*
C0
,
*
c0
;
c_ptr
=
c
;
C_ptr
=
C
;
if
(
nc1
>
0
)
{
asm
volatile
(
"subs %[mc], %[mc], #1
\n\t
"
"blt end_mc_%=
\n\t
"
"loop_mc_%=:
\n\t
"
"mov r6, %[C_ptr]
\n\t
"
"mov r5, %[nc1]
\n\t
"
"subs r5, r5, #1
\n\t
"
"blt end_nc1_%=
\n\t
"
"loop_nc1_%=:
\n\t
"
"vld1.32 {q0, q1}, [%[c_ptr]]!
\n\t
"
"vst1.32 {q0, q1}, [r6]!
\n\t
"
"vld1.32 {q2, q3}, [%[c_ptr]]!
\n\t
"
"vst1.32 {q2, q3}, [r6]!
\n\t
"
"subs r5, r5, #1
\n\t
"
"bge loop_nc1_%=
\n\t
"
"end_nc1_%=:
\n\t
"
"add %[C_ptr], %[C_ptr], %[step]
\n\t
"
"add %[c_ptr], %[c_ptr], %[step1]
\n\t
"
"subs %[mc], %[mc], #1
\n\t
"
"bge loop_mc_%=
\n\t
"
"end_mc_%=:
\n\t
"
:
:
[
C_ptr
]
"r"
(
C_ptr
),
[
c_ptr
]
"r"
(
c_ptr
),
[
mc
]
"r"
(
m
),
[
nc1
]
"r"
(
nc1
),
[
step
]
"r"
(
step
),
[
step1
]
"r"
(
step1
)
:
"memory"
,
"r5"
,
"r6"
,
"q0"
,
"q1"
,
"q2"
,
"q3"
);
}
if
(
_nc1
!=
0
)
{
for
(
int
i
=
0
;
i
<
mc
;
i
++
)
{
C0
=
C_ptr
+
nc1
*
16
+
i
*
ldc
;
c0
=
c_ptr
+
nc1
*
16
+
i
*
NC
;
for
(
int
j
=
0
;
j
<
_nc1
;
j
++
)
{
*
C0
++
=
*
c0
++
;
}
}
}
}
// C = alpha * A * B + beta * C
void
WriteWithAlphaBeta
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
)
{}
// C = A * B + C
void
WriteWithAdd
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
)
{
int
nc1
=
nc
/
16
;
int
_nc1
=
nc
%
16
;
int
step
=
4
*
ldc
;
int
step1
=
4
*
(
NC
-
16
*
nc1
);
int
volatile
m
=
mc
;
float
*
volatile
c_ptr
,
*
volatile
C_ptr
;
float
*
C0
,
*
c0
;
c_ptr
=
c
;
C_ptr
=
C
;
if
(
nc1
>
0
)
{
asm
volatile
(
"subs %[mc], %[mc], #1
\n\t
"
"blt end_mc_%=
\n\t
"
"loop_mc_%=:
\n\t
"
"mov r6, %[C_ptr]
\n\t
"
"mov r5, %[nc1]
\n\t
"
"subs r5, r5, #1
\n\t
"
"blt end_nc1_%=
\n\t
"
"loop_nc1_%=:
\n\t
"
"vld1.32 {q0, q1}, [r6]
\n\t
"
"vld1.32 {q2, q3}, [%[c_ptr]]!
\n\t
"
"vadd.f32 q10, q0, q2
\n\t
"
"vadd.f32 q11, q1, q3
\n\t
"
"vst1.32 {q10, q11}, [r6]!
\n\t
"
"vld1.32 {q4, q5}, [r6]
\n\t
"
"vld1.32 {q6, q7}, [%[c_ptr]]!
\n\t
"
"vadd.f32 q12, q4, q6
\n\t
"
"vadd.f32 q13, q5, q7
\n\t
"
"vst1.32 {q12, q13}, [r6]!
\n\t
"
"subs r5, r5, #1
\n\t
"
"bge loop_nc1_%=
\n\t
"
"end_nc1_%=:
\n\t
"
"add %[C_ptr], %[C_ptr], %[step]
\n\t
"
"add %[c_ptr], %[c_ptr], %[step1]
\n\t
"
"subs %[mc], %[mc], #1
\n\t
"
"bge loop_mc_%=
\n\t
"
"end_mc_%=:
\n\t
"
:
:
[
C_ptr
]
"r"
(
C_ptr
),
[
c_ptr
]
"r"
(
c_ptr
),
[
mc
]
"r"
(
m
),
[
nc1
]
"r"
(
nc1
),
[
step
]
"r"
(
step
),
[
step1
]
"r"
(
step1
)
:
"memory"
,
"r5"
,
"r6"
,
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
,
"q10"
,
"q11"
,
"q12"
,
"q13"
);
}
if
(
_nc1
!=
0
)
{
for
(
int
i
=
0
;
i
<
mc
;
i
++
)
{
C0
=
C_ptr
+
nc1
*
16
+
i
*
ldc
;
c0
=
c_ptr
+
nc1
*
16
+
i
*
NC
;
for
(
int
j
=
0
;
j
<
_nc1
;
j
++
)
{
*
C0
++
+=
*
c0
++
;
}
}
}
}
// C = A * B + C, relu(C)
void
WriteWithAddRelu
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
)
{
int
nc1
=
nc
/
16
;
int
_nc1
=
nc
%
16
;
int
step
=
4
*
ldc
;
int
step1
=
4
*
(
NC
-
16
*
nc1
);
int
volatile
m
=
mc
;
float
*
volatile
c_ptr
,
*
volatile
C_ptr
;
float
*
C0
,
*
c0
;
c_ptr
=
c
;
C_ptr
=
C
;
if
(
nc1
>
0
)
{
asm
volatile
(
"vmov.f32 q14, #0.0
\n\t
"
"subs %[mc], %[mc], #1
\n\t
"
"blt end_mc_%=
\n\t
"
"loop_mc_%=:
\n\t
"
"mov r6, %[C_ptr]
\n\t
"
"mov r5, %[nc1]
\n\t
"
"subs r5, r5, #1
\n\t
"
"blt end_nc1_%=
\n\t
"
"loop_nc1_%=:
\n\t
"
"vld1.32 {q0, q1}, [r6]
\n\t
"
"vld1.32 {q2, q3}, [%[c_ptr]]!
\n\t
"
"vadd.f32 q10, q0, q2
\n\t
"
"vadd.f32 q11, q1, q3
\n\t
"
"vmax.f32 q10, q10, q14
\n\t
"
"vmax.f32 q11, q11, q14
\n\t
"
"vst1.32 {q10, q11}, [r6]!
\n\t
"
"vld1.32 {q4, q5}, [r6]
\n\t
"
"vld1.32 {q6, q7}, [%[c_ptr]]!
\n\t
"
"vadd.f32 q12, q4, q6
\n\t
"
"vadd.f32 q13, q5, q7
\n\t
"
"vmax.f32 q12, q12, q14
\n\t
"
"vmax.f32 q13, q13, q14
\n\t
"
"vst1.32 {q12, q13}, [r6]!
\n\t
"
"subs r5, r5, #1
\n\t
"
"bge loop_nc1_%=
\n\t
"
"end_nc1_%=:
\n\t
"
"add %[C_ptr], %[C_ptr], %[step]
\n\t
"
"add %[c_ptr], %[c_ptr], %[step1]
\n\t
"
"subs %[mc], %[mc], #1
\n\t
"
"bge loop_mc_%=
\n\t
"
"end_mc_%=:
\n\t
"
:
:
[
C_ptr
]
"r"
(
C_ptr
),
[
c_ptr
]
"r"
(
c_ptr
),
[
mc
]
"r"
(
m
),
[
nc1
]
"r"
(
nc1
),
[
step
]
"r"
(
step
),
[
step1
]
"r"
(
step1
)
:
"memory"
,
"r5"
,
"r6"
,
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
,
"q10"
,
"q11"
,
"q12"
,
"q13"
);
}
if
(
_nc1
!=
0
)
{
for
(
int
i
=
0
;
i
<
mc
;
i
++
)
{
C0
=
C_ptr
+
nc1
*
16
+
i
*
ldc
;
c0
=
c_ptr
+
nc1
*
16
+
i
*
NC
;
for
(
int
j
=
0
;
j
<
_nc1
;
j
++
)
{
*
C0
+=
*
c0
;
if
(
*
C0
<
0
)
{
*
C0
=
0
;
}
C0
++
;
c0
++
;
}
}
}
}
// C = A * B, batchnorm(C)
void
WriteWithBn
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
scale
,
float
*
bias
)
{
int
nc1
=
nc
/
16
;
int
_nc1
=
nc
%
16
;
int
nc2
=
_nc1
/
4
;
int
nc3
=
16
-
4
*
(
_nc1
%
4
);
int
step
=
4
*
(
ldc
-
nc
);
int
step1
=
4
*
(
NC
-
nc
);
asm
volatile
(
"subs %[mc], %[mc], #1
\n\t
"
"blt end_mc_%=
\n\t
"
"loop_mc_%=:
\n\t
"
"mov r5, %[nc1]
\n\t
"
"mov r6, %[nc2]
\n\t
"
"subs r5, r5, #1
\n\t
"
"blt end_nc1_%=
\n\t
"
"loop_nc1_%=:
\n\t
"
"vld1.32 {q0, q1}, [%[c]]!
\n\t
"
"vld1.32 {q2, q3}, [%[scale]]!
\n\t
"
"vld1.32 {q10, q11}, [%[bias]]!
\n\t
"
"vmla.f32 q10, q0, q2
\n\t
"
"vmla.f32 q11, q1, q3
\n\t
"
"vst1.32 {q10, q11}, [%[C]]!
\n\t
"
"vld1.32 {q4, q5}, [%[c]]!
\n\t
"
"vld1.32 {q6, q7}, [%[scale]]!
\n\t
"
"vld1.32 {q12, q13}, [%[bias]]!
\n\t
"
"vmla.f32 q12, q4, q6
\n\t
"
"vmla.f32 q13, q5, q7
\n\t
"
"vst1.32 {q12, q13}, [%[C]]!
\n\t
"
"subs r5, r5, #1
\n\t
"
"bge loop_nc1_%=
\n\t
"
"end_nc1_%=:
\n\t
"
"subs r6, r6, #1
\n\t
"
"blt end_nc2_%=
\n\t
"
"loop_nc2_%=:
\n\t
"
"vld1.32 {q0}, [%[c]]!
\n\t
"
"vld1.32 {q1}, [%[scale]]!
\n\t
"
"vld1.32 {q10}, [%[bias]]!
\n\t
"
"vmla.f32 q10, q0, q1
\n\t
"
"vst1.32 {q10}, [%[C]]!
\n\t
"
"subs r6, r6, #1
\n\t
"
"bge loop_nc2_%=
\n\t
"
"end_nc2_%=:
\n\t
"
"cmp %[nc3], #16
\n\t
"
"beq end_nc3_%=
\n\t
"
"sub %[c], %[c], %[nc3]
\n\t
"
"sub %[scale], %[scale], %[nc3]
\n\t
"
"sub %[bias], %[bias], %[nc3]
\n\t
"
"sub %[C], %[C], %[nc3]
\n\t
"
"vld1.32 {q0}, [%[c]]!
\n\t
"
"vld1.32 {q1}, [%[scale]]!
\n\t
"
"vld1.32 {q10}, [%[bias]]!
\n\t
"
"vmla.f32 q10, q0, q1
\n\t
"
"vst1.32 {q10}, [%[C]]!
\n\t
"
"end_nc3_%=:
\n\t
"
"add %[c], %[c], %[step1]
\n\t
"
"add %[scale], %[scale], %[step]
\n\t
"
"add %[bias], %[bias], %[step]
\n\t
"
"add %[C], %[C], %[step]
\n\t
"
"subs %[mc], %[mc], #1
\n\t
"
"bge loop_mc_%=
\n\t
"
"end_mc_%=:
\n\t
"
:
:
[
C
]
"r"
(
C
),
[
c
]
"r"
(
c
),
[
mc
]
"r"
(
mc
),
[
nc1
]
"r"
(
nc1
),
[
nc2
]
"r"
(
nc2
),
[
nc3
]
"r"
(
nc3
),
[
step
]
"r"
(
step
),
[
step1
]
"r"
(
step1
),
[
scale
]
"r"
(
scale
),
[
bias
]
"r"
(
bias
)
:
"memory"
,
"cc"
,
"r5"
,
"r6"
,
"r7"
,
"r8"
,
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
,
"q10"
,
"q11"
,
"q12"
,
"q13"
);
}
// C = A * B, batchnorm(C), relu(C)
void
WriteWithBnRelu
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
scale
,
float
*
bias
)
{
int
nc1
=
nc
/
16
;
int
_nc1
=
nc
%
16
;
int
nc2
=
_nc1
/
4
;
int
nc3
=
16
-
4
*
(
_nc1
%
4
);
int
step
=
4
*
(
ldc
-
nc
);
int
step1
=
4
*
(
NC
-
nc
);
asm
volatile
(
"vmov.f32 q14, #0.0
\n\t
"
"subs %[mc], %[mc], #1
\n\t
"
"blt end_mc_%=
\n\t
"
"loop_mc_%=:
\n\t
"
"mov r5, %[nc1]
\n\t
"
"mov r6, %[nc2]
\n\t
"
"subs r5, r5, #1
\n\t
"
"blt end_nc1_%=
\n\t
"
"loop_nc1_%=:
\n\t
"
"vld1.32 {q0, q1}, [%[c]]!
\n\t
"
"vld1.32 {q2, q3}, [%[scale]]!
\n\t
"
"vld1.32 {q10, q11}, [%[bias]]!
\n\t
"
"vmla.f32 q10, q0, q2
\n\t
"
"vmla.f32 q11, q1, q3
\n\t
"
"vmax.f32 q10, q10, q14
\n\t
"
"vmax.f32 q11, q11, q14
\n\t
"
"vst1.32 {q10, q11}, [%[C]]!
\n\t
"
"vld1.32 {q4, q5}, [%[c]]!
\n\t
"
"vld1.32 {q6, q7}, [%[scale]]!
\n\t
"
"vld1.32 {q12, q13}, [%[bias]]!
\n\t
"
"vmla.f32 q12, q4, q6
\n\t
"
"vmla.f32 q13, q5, q7
\n\t
"
"vmax.f32 q12, q12, q14
\n\t
"
"vmax.f32 q13, q13, q14
\n\t
"
"vst1.32 {q12, q13}, [%[C]]!
\n\t
"
"subs r5, r5, #1
\n\t
"
"bge loop_nc1_%=
\n\t
"
"end_nc1_%=:
\n\t
"
"subs r6, r6, #1
\n\t
"
"blt end_nc2_%=
\n\t
"
"loop_nc2_%=:
\n\t
"
"vld1.32 {q0}, [%[c]]!
\n\t
"
"vld1.32 {q1}, [%[scale]]!
\n\t
"
"vld1.32 {q10}, [%[bias]]!
\n\t
"
"vmla.f32 q10, q0, q1
\n\t
"
"vmax.f32 q10, q10, q14
\n\t
"
"vst1.32 {q10}, [%[C]]!
\n\t
"
"subs r6, r6, #1
\n\t
"
"bge loop_nc2_%=
\n\t
"
"end_nc2_%=:
\n\t
"
"cmp %[nc3], #16
\n\t
"
"beq end_nc3_%=
\n\t
"
"sub %[c], %[c], %[nc3]
\n\t
"
"sub %[scale], %[scale], %[nc3]
\n\t
"
"sub %[bias], %[bias], %[nc3]
\n\t
"
"sub %[C], %[C], %[nc3]
\n\t
"
"vld1.32 {q0}, [%[c]]!
\n\t
"
"vld1.32 {q1}, [%[scale]]!
\n\t
"
"vld1.32 {q10}, [%[bias]]!
\n\t
"
"vmla.f32 q10, q0, q1
\n\t
"
"vmax.f32 q10, q10, q14
\n\t
"
"vst1.32 {q10}, [%[C]]!
\n\t
"
"end_nc3_%=:
\n\t
"
"add %[c], %[c], %[step1]
\n\t
"
"add %[scale], %[scale], %[step]
\n\t
"
"add %[bias], %[bias], %[step]
\n\t
"
"add %[C], %[C], %[step]
\n\t
"
"subs %[mc], %[mc], #1
\n\t
"
"bge loop_mc_%=
\n\t
"
"end_mc_%=:
\n\t
"
:
:
[
C
]
"r"
(
C
),
[
c
]
"r"
(
c
),
[
mc
]
"r"
(
mc
),
[
nc1
]
"r"
(
nc1
),
[
nc2
]
"r"
(
nc2
),
[
nc3
]
"r"
(
nc3
),
[
step
]
"r"
(
step
),
[
step1
]
"r"
(
step1
),
[
scale
]
"r"
(
scale
),
[
bias
]
"r"
(
bias
)
:
"memory"
,
"r5"
,
"r6"
,
"r7"
,
"r8"
,
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
,
"q10"
,
"q11"
,
"q12"
,
"q13"
,
"q14"
);
}
// C = A * B
void
VecWriteBasic
(
int
n
,
float
*
c
,
float
*
C
,
int
ldc
)
{
int
nc1
=
n
/
16
;
int
_nc1
=
n
%
16
;
int
nc2
=
_nc1
/
4
;
int
nc3
=
16
-
4
*
(
_nc1
%
4
);
asm
volatile
(
"subs %[nc1], %[nc1], #1
\n\t
"
"blt end_nc1_%=
\n\t
"
"loop_nc1_%=:
\n\t
"
"vld1.32 {q0, q1}, [%[c]]!
\n\t
"
"vst1.32 {q0, q1}, [%[C]]!
\n\t
"
"vld1.32 {q2, q3}, [%[c]]!
\n\t
"
"vst1.32 {q2, q3}, [%[C]]!
\n\t
"
"subs %[nc1], %[nc1], #1
\n\t
"
"bge loop_nc1_%=
\n\t
"
"end_nc1_%=:
\n\t
"
"subs %[nc2], %[nc2], #1
\n\t
"
"blt end_nc2_%=
\n\t
"
"loop_nc2_%=:
\n\t
"
"vld1.32 {q4}, [%[c]]!
\n\t
"
"vst1.32 {q4}, [%[C]]!
\n\t
"
"subs %[nc2], %[nc2], #1
\n\t
"
"bge loop_nc2_%=
\n\t
"
"end_nc2_%=:
\n\t
"
"cmp %[nc3], #16
\n\t
"
"beq end_nc3_%=
\n\t
"
"sub %[c], %[c], %[nc3]
\n\t
"
"sub %[C], %[C], %[nc3]
\n\t
"
"vld1.32 {q5}, [%[c]]!
\n\t
"
"vst1.32 {q5}, [%[C]]!
\n\t
"
"end_nc3_%=:
\n\t
"
:
:
[
C
]
"r"
(
C
),
[
c
]
"r"
(
c
),
[
nc1
]
"r"
(
nc1
),
[
nc2
]
"r"
(
nc2
),
[
nc3
]
"r"
(
nc3
)
:
"memory"
,
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q5"
);
}
// C = alpha * A * B + beta * C
void
VecWriteWithAlphaBeta
(
int
n
,
float
*
c
,
float
*
C
,
int
ldc
)
{}
// C = A * B + C
void
VecWriteWithAdd
(
int
n
,
float
*
c
,
float
*
C
,
int
ldc
)
{
int
nc1
=
n
/
16
;
int
_nc1
=
n
%
16
;
asm
volatile
(
"subs %[nc1], %[nc1], #1
\n\t
"
"blt end_nc1_%=
\n\t
"
"loop_nc1_%=:
\n\t
"
"vld1.32 {q0, q1}, [%[c]]!
\n\t
"
"vld1.32 {q2, q3}, [%[C]]
\n\t
"
"vadd.f32 q10, q0, q2
\n\t
"
"vadd.f32 q11, q1, q3
\n\t
"
"vst1.32 {q10, q11}, [%[C]]!
\n\t
"
"vld1.32 {q4, q5}, [%[c]]!
\n\t
"
"vld1.32 {q6, q7}, [%[C]]
\n\t
"
"vadd.f32 q12, q4, q6
\n\t
"
"vadd.f32 q13, q5, q7
\n\t
"
"vst1.32 {q12, q13}, [%[C]]!
\n\t
"
"subs %[nc1], %[nc1], #1
\n\t
"
"bge loop_nc1_%=
\n\t
"
"end_nc1_%=:
\n\t
"
:
[
C
]
"+r"
(
C
),
[
c
]
"+r"
(
c
)
:
[
nc1
]
"r"
(
nc1
)
:
"memory"
,
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
,
"q10"
,
"q11"
,
"q12"
,
"q13"
);
if
(
_nc1
!=
0
)
{
for
(
int
j
=
0
;
j
<
_nc1
;
j
++
)
{
*
C
++
+=
*
c
++
;
}
}
}
// C = A * B + C, relu(C)
void
VecWriteWithAddRelu
(
int
n
,
float
*
c
,
float
*
C
,
int
ldc
)
{
int
nc1
=
n
/
16
;
int
_nc1
=
n
%
16
;
asm
volatile
(
"vmov.f32 q14, #0.0
\n\t
"
"subs %[nc1], %[nc1], #1
\n\t
"
"blt end_nc1_%=
\n\t
"
"loop_nc1_%=:
\n\t
"
"vld1.32 {q0, q1}, [%[c]]!
\n\t
"
"vld1.32 {q2, q3}, [%[C]]
\n\t
"
"vadd.f32 q10, q0, q2
\n\t
"
"vadd.f32 q11, q1, q3
\n\t
"
"vmax.f32 q10, q10, q14
\n\t
"
"vmax.f32 q11, q11, q14
\n\t
"
"vst1.32 {q10, q11}, [%[C]]!
\n\t
"
"vld1.32 {q4, q5}, [%[c]]!
\n\t
"
"vld1.32 {q6, q7}, [%[C]]
\n\t
"
"vadd.f32 q12, q4, q6
\n\t
"
"vadd.f32 q13, q5, q7
\n\t
"
"vmax.f32 q12, q12, q14
\n\t
"
"vmax.f32 q13, q13, q14
\n\t
"
"vst1.32 {q12, q13}, [%[C]]!
\n\t
"
"subs %[nc1], %[nc1], #1
\n\t
"
"bge loop_nc1_%=
\n\t
"
"end_nc1_%=:
\n\t
"
:
[
C
]
"+r"
(
C
),
[
c
]
"+r"
(
c
)
:
[
nc1
]
"r"
(
nc1
)
:
"memory"
,
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
,
"q10"
,
"q11"
,
"q12"
,
"q13"
);
if
(
_nc1
!=
0
)
{
for
(
int
j
=
0
;
j
<
_nc1
;
j
++
)
{
*
C
+=
*
c
;
if
(
*
C
<
0
)
{
*
C
=
0
;
}
C
++
;
c
++
;
}
}
}
// C = A * B, batchnorm(C)
void
VecWriteWithBn
(
int
n
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
scale
,
float
*
bias
)
{
int
nc1
=
n
/
16
;
int
_nc1
=
n
%
16
;
int
nc2
=
_nc1
/
4
;
int
nc3
=
16
-
4
*
(
_nc1
%
4
);
asm
volatile
(
"subs %[nc1], %[nc1], #1
\n\t
"
"blt end_nc1_%=
\n\t
"
"loop_nc1_%=:
\n\t
"
"vld1.32 {q0, q1}, [%[c]]!
\n\t
"
"vld1.32 {q2, q3}, [%[scale]]!
\n\t
"
"vld1.32 {q10, q11}, [%[bias]]!
\n\t
"
"vmla.f32 q10, q0, q2
\n\t
"
"vmla.f32 q11, q1, q3
\n\t
"
"vst1.32 {q10, q11}, [%[C]]!
\n\t
"
"vld1.32 {q4, q5}, [%[c]]!
\n\t
"
"vld1.32 {q6, q7}, [%[scale]]!
\n\t
"
"vld1.32 {q12, q13}, [%[bias]]!
\n\t
"
"vmla.f32 q12, q4, q6
\n\t
"
"vmla.f32 q13, q5, q7
\n\t
"
"vst1.32 {q12, q13}, [%[C]]!
\n\t
"
"subs %[nc1], %[nc1], #1
\n\t
"
"bge loop_nc1_%=
\n\t
"
"end_nc1_%=:
\n\t
"
"subs %[nc2], %[nc2], #1
\n\t
"
"blt end_nc2_%=
\n\t
"
"loop_nc2_%=:
\n\t
"
"vld1.32 {q0}, [%[c]]!
\n\t
"
"vld1.32 {q1}, [%[scale]]!
\n\t
"
"vld1.32 {q10}, [%[bias]]!
\n\t
"
"vmla.f32 q10, q0, q1
\n\t
"
"vst1.32 {q10}, [%[C]]!
\n\t
"
"subs %[nc2], %[nc2], #1
\n\t
"
"bge loop_nc2_%=
\n\t
"
"end_nc2_%=:
\n\t
"
"cmp %[nc3], #16
\n\t
"
"beq end_nc3_%=
\n\t
"
"sub %[c], %[c], %[nc3]
\n\t
"
"sub %[scale], %[scale], %[nc3]
\n\t
"
"sub %[bias], %[bias], %[nc3]
\n\t
"
"sub %[C], %[C], %[nc3]
\n\t
"
"vld1.32 {q0}, [%[c]]!
\n\t
"
"vld1.32 {q1}, [%[scale]]!
\n\t
"
"vld1.32 {q10}, [%[bias]]!
\n\t
"
"vmla.f32 q10, q0, q1
\n\t
"
"vst1.32 {q10}, [%[C]]!
\n\t
"
"end_nc3_%=:
\n\t
"
:
:
[
C
]
"r"
(
C
),
[
c
]
"r"
(
c
),
[
nc1
]
"r"
(
nc1
),
[
nc2
]
"r"
(
nc2
),
[
nc3
]
"r"
(
nc3
),
[
scale
]
"r"
(
scale
),
[
bias
]
"r"
(
bias
)
:
"memory"
,
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
,
"q10"
,
"q11"
,
"q12"
,
"q13"
);
}
// C = A * B, batchnorm(C), relu(C)
void
VecWriteWithBnRelu
(
int
n
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
scale
,
float
*
bias
)
{
int
nc1
=
n
/
16
;
int
_nc1
=
n
%
16
;
int
nc2
=
_nc1
/
4
;
int
nc3
=
16
-
4
*
(
_nc1
%
4
);
asm
volatile
(
"vmov.f32 q14, #0.0
\n\t
"
"subs %[nc1], %[nc1], #1
\n\t
"
"blt end_nc1_%=
\n\t
"
"loop_nc1_%=:
\n\t
"
"vld1.32 {q0, q1}, [%[c]]!
\n\t
"
"vld1.32 {q2, q3}, [%[scale]]!
\n\t
"
"vld1.32 {q10, q11}, [%[bias]]!
\n\t
"
"vmla.f32 q10, q0, q2
\n\t
"
"vmla.f32 q11, q1, q3
\n\t
"
"vmax.f32 q10, q10, q14
\n\t
"
"vmax.f32 q11, q11, q14
\n\t
"
"vst1.32 {q10, q11}, [%[C]]!
\n\t
"
"vld1.32 {q4, q5}, [%[c]]!
\n\t
"
"vld1.32 {q6, q7}, [%[scale]]!
\n\t
"
"vld1.32 {q12, q13}, [%[bias]]!
\n\t
"
"vmla.f32 q12, q4, q6
\n\t
"
"vmla.f32 q13, q5, q7
\n\t
"
"vmax.f32 q12, q12, q14
\n\t
"
"vmax.f32 q13, q13, q14
\n\t
"
"vst1.32 {q12, q13}, [%[C]]!
\n\t
"
"subs %[nc1], %[nc1], #1
\n\t
"
"bge loop_nc1_%=
\n\t
"
"end_nc1_%=:
\n\t
"
"subs %[nc2], %[nc2], #1
\n\t
"
"blt end_nc2_%=
\n\t
"
"loop_nc2_%=:
\n\t
"
"vld1.32 {q0}, [%[c]]!
\n\t
"
"vld1.32 {q1}, [%[scale]]!
\n\t
"
"vld1.32 {q10}, [%[bias]]!
\n\t
"
"vmla.f32 q10, q0, q1
\n\t
"
"vmax.f32 q10, q10, q14
\n\t
"
"vst1.32 {q10}, [%[C]]!
\n\t
"
"subs %[nc2], %[nc2], #1
\n\t
"
"bge loop_nc2_%=
\n\t
"
"end_nc2_%=:
\n\t
"
"cmp %[nc3], #16
\n\t
"
"beq end_nc3_%=
\n\t
"
"sub %[c], %[c], %[nc3]
\n\t
"
"sub %[scale], %[scale], %[nc3]
\n\t
"
"sub %[bias], %[bias], %[nc3]
\n\t
"
"sub %[C], %[C], %[nc3]
\n\t
"
"vld1.32 {q0}, [%[c]]!
\n\t
"
"vld1.32 {q1}, [%[scale]]!
\n\t
"
"vld1.32 {q10}, [%[bias]]!
\n\t
"
"vmla.f32 q10, q0, q1
\n\t
"
"vmax.f32 q10, q10, q14
\n\t
"
"vst1.32 {q10}, [%[C]]!
\n\t
"
"end_nc3_%=:
\n\t
"
:
:
[
C
]
"r"
(
C
),
[
c
]
"r"
(
c
),
[
nc1
]
"r"
(
nc1
),
[
nc2
]
"r"
(
nc2
),
[
nc3
]
"r"
(
nc3
),
[
scale
]
"r"
(
scale
),
[
bias
]
"r"
(
bias
)
:
"memory"
,
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
,
"q10"
,
"q11"
,
"q12"
,
"q13"
,
"q14"
);
}
}
// namespace operators
}
// namespace paddle_mobile
}
// namespace paddle_mobile
src/operators/math/gemm.h
浏览文件 @
00417d29
...
...
@@ -19,12 +19,8 @@ limitations under the License. */
#define B(i, j) B[(i)*ldb + (j)]
#define C(i, j) C[(i)*ldc + (j)]
// 分块计算的块大小,mc 与 kc 分别对应分块计算时的 m 与 k
#define MC 128
#define KC 128
#define NC 1024
#define MR 4
#define NR
4
#define NR
8
#define s_min(i, j) ((i) < (j) ? (i) : (j))
...
...
@@ -49,28 +45,66 @@ void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
float
*
buffer
);
// 分块矩阵乘法
void
InnerKernel
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
int
first_time
);
void
InnerKernel
(
int
mc
,
int
nc
,
float
alpha
,
const
float
*
a
,
const
float
*
b
,
float
beta
,
float
*
c
,
float
*
C
,
int
ldc
,
bool
relu
);
void
InnerKernelWithBn
(
int
mc
,
int
nc
,
float
alpha
,
const
float
*
a
,
const
float
*
b
,
float
beta
,
float
*
c
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
new_scale
,
float
*
new_bias
);
// 向量矩阵乘法 (M = 1)
void
VectorKernel
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
);
// 计算一个更小的 4 * 4 的 C 矩阵分块
void
AddDot4x4
(
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
int
mc
,
int
nc
);
void
AddDot4x4_relu
(
int
k
,
float
alpha
,
const
float
*
a
,
int
lda
,
const
float
*
b
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
int
mc
,
int
nc
,
bool
relu
);
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
bool
relu
);
void
VectorKernelWithBn
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
new_scale
,
float
*
new_bias
);
// 计算一个更小的 C 矩阵分块
void
AddDot4x4
(
int
k
,
const
float
*
a
,
const
float
*
b
,
float
*
c
,
int
ldc
);
void
AddDot4x8
(
int
k
,
const
float
*
a
,
const
float
*
b
,
float
*
c
,
int
ldc
);
// 分块矩阵乘法结果回写
// C = A * B
void
WriteBasic
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
);
// C = alpha * A * B + beta * C
void
WriteWithAlphaBeta
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
);
// C = A * B + C
void
WriteWithAdd
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
);
// C = A * B + C, relu(C)
void
WriteWithAddRelu
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
);
// C = A * B, batchnorm(C)
void
WriteWithBn
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
new_scale
,
float
*
new_bias
);
// C = A * B, batchnorm(C), relu(C)
void
WriteWithBnRelu
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
new_scale
,
float
*
new_bias
);
// 向量矩阵乘法结果回写
// C = A * B
void
VecWriteBasic
(
int
n
,
float
*
c
,
float
*
C
,
int
ldc
);
// C = alpha * A * B + beta * C
void
VecWriteWithAlphaBeta
(
int
n
,
float
*
c
,
float
*
C
,
int
ldc
);
// C = A * B + C
void
VecWriteWithAdd
(
int
n
,
float
*
c
,
float
*
C
,
int
ldc
);
// C = A * B + C, relu(C)
void
VecWriteWithAddRelu
(
int
n
,
float
*
c
,
float
*
C
,
int
ldc
);
// C = A * B, batchnorm(C)
void
VecWriteWithBn
(
int
n
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
new_scale
,
float
*
new_bias
);
// C = A * B, batchnorm(C), relu(C)
void
VecWriteWithBnRelu
(
int
n
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
new_scale
,
float
*
new_bias
);
// 32位 float 矩阵乘法
void
s
gemm
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
);
void
S
gemm
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
bool
relu
);
void
sgemm_relu
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
);
// 32位 float 矩阵乘法, 并对结果进行 batchnrom
void
SgemmWithBn
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
new_scale
,
float
*
new_bias
);
// 64位 double 矩阵乘法
void
dgemm
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
double
*
A
,
int
lda
,
...
...
src/operators/math/math_function.cpp
浏览文件 @
00417d29
...
...
@@ -39,22 +39,18 @@ void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
int
M
=
dim_out
[
0
];
int
N
=
dim_out
[
1
];
int
K
=
(
trans_a
==
false
)
?
dim_a
[
1
]
:
dim_a
[
0
];
int
K
=
(
!
trans_a
)
?
dim_a
[
1
]
:
dim_a
[
0
];
if
(
relu
)
{
sgemm_relu
(
M
,
N
,
K
,
alpha
,
matrix_a
.
data
<
float
>
(),
K
,
matrix_b
.
data
<
float
>
(),
N
,
beta
,
matrix_out
->
data
<
float
>
(),
N
);
}
else
{
sgemm
(
M
,
N
,
K
,
alpha
,
matrix_a
.
data
<
float
>
(),
K
,
matrix_b
.
data
<
float
>
(),
N
,
beta
,
matrix_out
->
data
<
float
>
(),
N
);
}
Sgemm
(
M
,
N
,
K
,
alpha
,
matrix_a
.
data
<
float
>
(),
K
,
matrix_b
.
data
<
float
>
(),
N
,
beta
,
matrix_out
->
data
<
float
>
(),
N
,
relu
);
}
template
<
>
void
matmul
<
double
>
(
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
const
framework
::
Tensor
&
matrix_b
,
bool
trans_b
,
double
alpha
,
framework
::
Tensor
*
matrix_out
,
double
beta
,
bool
relu
)
{
void
matmulWithBn
<
float
>
(
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
const
framework
::
Tensor
&
matrix_b
,
bool
trans_b
,
float
alpha
,
framework
::
Tensor
*
matrix_out
,
float
beta
,
bool
relu
,
framework
::
Tensor
*
new_scale
,
framework
::
Tensor
*
new_bias
)
{
auto
dim_a
=
matrix_a
.
dims
();
auto
dim_b
=
matrix_b
.
dims
();
auto
dim_out
=
matrix_out
->
dims
();
...
...
@@ -71,7 +67,11 @@ void matmul<double>(const framework::Tensor &matrix_a, bool trans_a,
int
M
=
dim_out
[
0
];
int
N
=
dim_out
[
1
];
int
K
=
(
trans_a
==
false
)
?
dim_a
[
1
]
:
dim_a
[
0
];
int
K
=
(
!
trans_a
)
?
dim_a
[
1
]
:
dim_a
[
0
];
SgemmWithBn
(
M
,
N
,
K
,
alpha
,
matrix_a
.
data
<
float
>
(),
K
,
matrix_b
.
data
<
float
>
(),
N
,
beta
,
matrix_out
->
data
<
float
>
(),
N
,
relu
,
new_scale
->
data
<
float
>
(),
new_bias
->
data
<
float
>
());
}
}
// namespace math
...
...
src/operators/math/math_function.h
浏览文件 @
00417d29
...
...
@@ -26,6 +26,12 @@ template <typename T>
void
matmul
(
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
const
framework
::
Tensor
&
matrix_b
,
bool
trans_b
,
T
alpha
,
framework
::
Tensor
*
matrix_out
,
T
beta
,
bool
relu
=
false
);
template
<
typename
T
>
void
matmulWithBn
(
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
const
framework
::
Tensor
&
matrix_b
,
bool
trans_b
,
T
alpha
,
framework
::
Tensor
*
matrix_out
,
T
beta
,
bool
relu
,
framework
::
Tensor
*
new_scale
,
framework
::
Tensor
*
new_bias
);
}
// namespace math
}
// namespace operators
}
// namespace paddle_mobile
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录