Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
magicwindyyd
mindspore
提交
0f4e6d73
M
mindspore
项目概览
magicwindyyd
/
mindspore
与 Fork 源项目一致
Fork自
MindSpore / mindspore
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
M
mindspore
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
0f4e6d73
编写于
8月 24, 2020
作者:
M
mindspore-ci-bot
提交者:
Gitee
8月 24, 2020
浏览文件
操作
浏览文件
下载
差异文件
!5004 [MS][LITE][Develop]optimize fp32 matmul kernel
Merge pull request !5004 from lixian/master
上级
56835aaf
811940bc
变更
6
隐藏空白更改
内联
并排
Showing
6 changed file
with
932 addition
and
5 deletion
+932
-5
mindspore/lite/nnacl/assembly/arm64/MatmulFp32Opt.S
mindspore/lite/nnacl/assembly/arm64/MatmulFp32Opt.S
+784
-0
mindspore/lite/nnacl/fp32/matmul.c
mindspore/lite/nnacl/fp32/matmul.c
+136
-0
mindspore/lite/nnacl/fp32/matmul.h
mindspore/lite/nnacl/fp32/matmul.h
+5
-0
mindspore/lite/nnacl/matmul_parameter.h
mindspore/lite/nnacl/matmul_parameter.h
+1
-0
mindspore/lite/nnacl/op_base.h
mindspore/lite/nnacl/op_base.h
+1
-0
mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1.cc
...spore/lite/src/runtime/kernel/arm/fp32/convolution_1x1.cc
+5
-5
未找到文件。
mindspore/lite/nnacl/assembly/arm64/MatmulFp32Opt.S
0 → 100644
浏览文件 @
0f4e6d73
#ifdef __aarch64__
    .text
    .align 5
    .global MatmulFloatNeon64Opt
#ifndef __APPLE__
    .type MatmulFloatNeon64Opt, %function
#endif

// fp32 matrix multiply, 12 (rows) x 8 (cols) register tile.
//
// A (lhs): packed in blocks of 12 rows; every depth step supplies 12 consecutive floats
//          (one per row of the block), i.e. [row_12 * depth] col12-major.
// B (rhs): packed in blocks of 8 columns; every depth step supplies 8 consecutive floats
//          (one per column of the block), i.e. [depth * col_8] row8-major.
// C (dst): write_nhwc != 0 -> row-major, `stride` floats between consecutive rows;
//                             only the valid rows/cols of a tile are stored.
//          write_nhwc == 0 -> consecutive full 12x8 tiles (96 floats each), emitted in the
//                             order the tiles are computed (row blocks inside column blocks).
//
// Per depth step:
//   lhs 12x1 : v0.s[0..3] v1.s[0..3] v2.s[0..3]
//   rhs 1x8  : v3.s[0..3] v4.s[0..3]
//   acc 12x8 : row i (0..11) lives in v(8+2i) (cols 0-3) and v(9+2i) (cols 4-7)
//
// void MatmulFloatNeon64Opt(const float *a, const float *b, float *c, const float *bias, int act_type, int depth,
//                           int row, int col, size_t stride, bool write_nhwc)
// x0: a
// x1: b
// x2: c
// x3: bias (may be NULL)
// w4: act_type (1: relu, 2: relu6, anything else: none)
// w5: depth
// w6: row
// w7: col
// stack arg 0: stride     (loaded into x17 and converted to bytes)
// stack arg 1: write_nhwc (re-read into w13 where needed)
//
// Scratch registers: x8-x17 (x18 is deliberately left untouched, it is the platform register).

MatmulFloatNeon64Opt:
    // Reserve the save area and keep SP below it for the whole function: memory under SP
    // may be overwritten asynchronously (e.g. by a signal frame), so the callee-saved
    // v8-v15 must live at or above SP.
    sub sp, sp, #128
    mov x9, sp
    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x9], #64
    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]

    lsl w15, w5, #5        // rhs block stride in bytes: sizeof(float) * 8 * depth
    mov x11, x3            // bias flag (original bias pointer, NULL means no bias)
    ldr x17, [sp, #128]    // stride (first stack argument, above the 128-byte save area)
    lsl x17, x17, #2       // stride in bytes

L1:
    mov w10, w6            // reload lhs row count
    mov x12, x0            // reload lhs ptr
    mov x9, x2             // reload dst ptr

L2:
    mov x16, x1            // reload rhs ptr
    mov w13, w5            // reload depth
    mov x14, x3            // reload bias ptr
    dup v8.4s, wzr
    dup v9.4s, wzr
    dup v10.4s, wzr
    dup v11.4s, wzr
    dup v12.4s, wzr
    dup v13.4s, wzr
    dup v14.4s, wzr
    dup v15.4s, wzr
    dup v16.4s, wzr
    dup v17.4s, wzr
    dup v18.4s, wzr
    dup v19.4s, wzr
    dup v20.4s, wzr
    dup v21.4s, wzr
    dup v22.4s, wzr
    dup v23.4s, wzr
    dup v24.4s, wzr
    dup v25.4s, wzr
    dup v26.4s, wzr
    dup v27.4s, wzr
    dup v28.4s, wzr
    dup v29.4s, wzr
    dup v30.4s, wzr
    dup v31.4s, wzr

LoopStart:
    // First depth step: load the whole 12x1 lhs column and 1x8 rhs row, start rows 0-3.
    ld1 {v0.4s, v1.4s, v2.4s}, [x12], #48
    ld1 {v3.4s, v4.4s}, [x16], #32
    fmla v8.4s, v3.4s, v0.s[0]
    fmla v10.4s, v3.4s, v0.s[1]
    fmla v12.4s, v3.4s, v0.s[2]
    fmla v14.4s, v3.4s, v0.s[3]
    fmla v9.4s, v4.4s, v0.s[0]
    fmla v11.4s, v4.4s, v0.s[1]
    fmla v13.4s, v4.4s, v0.s[2]
    fmla v15.4s, v4.4s, v0.s[3]

    subs w13, w13, #1
    beq LoopEnd

Loop:
    // Software pipelined: finish rows 4-11 of the current depth step while loading the
    // next step's operands, then start rows 0-3 of the next step.
    ld1 {v0.4s}, [x12], #16
    fmla v16.4s, v3.4s, v1.s[0]
    fmla v18.4s, v3.4s, v1.s[1]
    fmla v20.4s, v3.4s, v1.s[2]
    fmla v22.4s, v3.4s, v1.s[3]
    fmla v17.4s, v4.4s, v1.s[0]
    fmla v19.4s, v4.4s, v1.s[1]
    fmla v21.4s, v4.4s, v1.s[2]
    fmla v23.4s, v4.4s, v1.s[3]
    ld1 {v1.4s}, [x12], #16
    fmla v24.4s, v3.4s, v2.s[0]
    fmla v26.4s, v3.4s, v2.s[1]
    fmla v28.4s, v3.4s, v2.s[2]
    fmla v30.4s, v3.4s, v2.s[3]
    ld1 {v3.4s}, [x16], #16
    fmla v25.4s, v4.4s, v2.s[0]
    fmla v27.4s, v4.4s, v2.s[1]
    fmla v29.4s, v4.4s, v2.s[2]
    fmla v31.4s, v4.4s, v2.s[3]
    ld1 {v4.4s}, [x16], #16
    fmla v8.4s, v3.4s, v0.s[0]
    fmla v10.4s, v3.4s, v0.s[1]
    fmla v12.4s, v3.4s, v0.s[2]
    fmla v14.4s, v3.4s, v0.s[3]
    ld1 {v2.4s}, [x12], #16
    fmla v9.4s, v4.4s, v0.s[0]
    fmla v11.4s, v4.4s, v0.s[1]
    fmla v13.4s, v4.4s, v0.s[2]
    fmla v15.4s, v4.4s, v0.s[3]

    subs w13, w13, #1
    bgt Loop

LoopEnd:
    // Drain the pipeline: rows 4-11 of the last depth step.
    fmla v16.4s, v3.4s, v1.s[0]
    fmla v18.4s, v3.4s, v1.s[1]
    fmla v20.4s, v3.4s, v1.s[2]
    fmla v22.4s, v3.4s, v1.s[3]
    fmla v17.4s, v4.4s, v1.s[0]
    fmla v19.4s, v4.4s, v1.s[1]
    fmla v21.4s, v4.4s, v1.s[2]
    fmla v23.4s, v4.4s, v1.s[3]
    fmla v24.4s, v3.4s, v2.s[0]
    fmla v26.4s, v3.4s, v2.s[1]
    fmla v28.4s, v3.4s, v2.s[2]
    fmla v30.4s, v3.4s, v2.s[3]
    fmla v25.4s, v4.4s, v2.s[0]
    fmla v27.4s, v4.4s, v2.s[1]
    fmla v29.4s, v4.4s, v2.s[2]
    fmla v31.4s, v4.4s, v2.s[3]

Bias:
    cbz x11, Activation
    // The same 8 bias values (one per output column) are added to every row of the tile.
    ld1 {v0.4s}, [x14], #16
    ld1 {v1.4s}, [x14], #16
    fadd v8.4s, v8.4s, v0.4s
    fadd v9.4s, v9.4s, v1.4s
    fadd v10.4s, v10.4s, v0.4s
    fadd v11.4s, v11.4s, v1.4s
    fadd v12.4s, v12.4s, v0.4s
    fadd v13.4s, v13.4s, v1.4s
    fadd v14.4s, v14.4s, v0.4s
    fadd v15.4s, v15.4s, v1.4s
    fadd v16.4s, v16.4s, v0.4s
    fadd v17.4s, v17.4s, v1.4s
    fadd v18.4s, v18.4s, v0.4s
    fadd v19.4s, v19.4s, v1.4s
    fadd v20.4s, v20.4s, v0.4s
    fadd v21.4s, v21.4s, v1.4s
    fadd v22.4s, v22.4s, v0.4s
    fadd v23.4s, v23.4s, v1.4s
    fadd v24.4s, v24.4s, v0.4s
    fadd v25.4s, v25.4s, v1.4s
    fadd v26.4s, v26.4s, v0.4s
    fadd v27.4s, v27.4s, v1.4s
    fadd v28.4s, v28.4s, v0.4s
    fadd v29.4s, v29.4s, v1.4s
    fadd v30.4s, v30.4s, v0.4s
    fadd v31.4s, v31.4s, v1.4s

Activation:
    cmp w4, #2
    beq Relu6
    cmp w4, #1
    beq Relu
    b Write

Relu6:
    // Clamp to 6.0 from above, then fall through into Relu for the lower clamp.
    mov w8, #6
    dup v2.4s, w8
    scvtf v2.4s, v2.4s
    fmin v8.4s, v8.4s, v2.4s
    fmin v9.4s, v9.4s, v2.4s
    fmin v10.4s, v10.4s, v2.4s
    fmin v11.4s, v11.4s, v2.4s
    fmin v12.4s, v12.4s, v2.4s
    fmin v13.4s, v13.4s, v2.4s
    fmin v14.4s, v14.4s, v2.4s
    fmin v15.4s, v15.4s, v2.4s
    fmin v16.4s, v16.4s, v2.4s
    fmin v17.4s, v17.4s, v2.4s
    fmin v18.4s, v18.4s, v2.4s
    fmin v19.4s, v19.4s, v2.4s
    fmin v20.4s, v20.4s, v2.4s
    fmin v21.4s, v21.4s, v2.4s
    fmin v22.4s, v22.4s, v2.4s
    fmin v23.4s, v23.4s, v2.4s
    fmin v24.4s, v24.4s, v2.4s
    fmin v25.4s, v25.4s, v2.4s
    fmin v26.4s, v26.4s, v2.4s
    fmin v27.4s, v27.4s, v2.4s
    fmin v28.4s, v28.4s, v2.4s
    fmin v29.4s, v29.4s, v2.4s
    fmin v30.4s, v30.4s, v2.4s
    fmin v31.4s, v31.4s, v2.4s

Relu:
    dup v3.4s, wzr
    fmax v8.4s, v8.4s, v3.4s
    fmax v9.4s, v9.4s, v3.4s
    fmax v10.4s, v10.4s, v3.4s
    fmax v11.4s, v11.4s, v3.4s
    fmax v12.4s, v12.4s, v3.4s
    fmax v13.4s, v13.4s, v3.4s
    fmax v14.4s, v14.4s, v3.4s
    fmax v15.4s, v15.4s, v3.4s
    fmax v16.4s, v16.4s, v3.4s
    fmax v17.4s, v17.4s, v3.4s
    fmax v18.4s, v18.4s, v3.4s
    fmax v19.4s, v19.4s, v3.4s
    fmax v20.4s, v20.4s, v3.4s
    fmax v21.4s, v21.4s, v3.4s
    fmax v22.4s, v22.4s, v3.4s
    fmax v23.4s, v23.4s, v3.4s
    fmax v24.4s, v24.4s, v3.4s
    fmax v25.4s, v25.4s, v3.4s
    fmax v26.4s, v26.4s, v3.4s
    fmax v27.4s, v27.4s, v3.4s
    fmax v28.4s, v28.4s, v3.4s
    fmax v29.4s, v29.4s, v3.4s
    fmax v30.4s, v30.4s, v3.4s
    fmax v31.4s, v31.4s, v3.4s

Write:
    ldrb w13, [sp, #136]   // write_nhwc (second stack argument)
    cbz w13, WriteC8
    // Row-major output: w7 holds the columns still to be produced, so anything other than
    // 1..7 means a full 8-column tile. In every WriteN block, w10 holds the rows still to
    // be produced; the store sequence stops early when fewer than 12 rows remain.
    cmp w7, #1
    beq Write1
    cmp w7, #2
    beq Write2
    cmp w7, #3
    beq Write3
    cmp w7, #4
    beq Write4
    cmp w7, #5
    beq Write5
    cmp w7, #6
    beq Write6
    cmp w7, #7
    beq Write7
    b Write8

Write1:
    str s8, [x9]
    cmp w10, #1
    beq WriteEnd
    add x9, x9, x17
    str s10, [x9]
    cmp w10, #2
    beq WriteEnd
    add x9, x9, x17
    str s12, [x9]
    cmp w10, #3
    beq WriteEnd
    add x9, x9, x17
    str s14, [x9]
    cmp w10, #4
    beq WriteEnd
    add x9, x9, x17
    str s16, [x9]
    cmp w10, #5
    beq WriteEnd
    add x9, x9, x17
    str s18, [x9]
    cmp w10, #6
    beq WriteEnd
    add x9, x9, x17
    str s20, [x9]
    cmp w10, #7
    beq WriteEnd
    add x9, x9, x17
    str s22, [x9]
    cmp w10, #8
    beq WriteEnd
    add x9, x9, x17
    str s24, [x9]
    cmp w10, #9
    beq WriteEnd
    add x9, x9, x17
    str s26, [x9]
    cmp w10, #10
    beq WriteEnd
    add x9, x9, x17
    str s28, [x9]
    cmp w10, #11
    beq WriteEnd
    add x9, x9, x17
    str s30, [x9]
    add x9, x9, x17
    b WriteEnd

Write2:
    // The odd accumulator of each row is unused for <= 4 columns, so it is reused to hold
    // lane 1 of the even accumulator for the paired store.
    dup s9, v8.s[1]
    stp s8, s9, [x9]
    cmp w10, #1
    beq WriteEnd
    add x9, x9, x17
    dup s11, v10.s[1]
    stp s10, s11, [x9]
    cmp w10, #2
    beq WriteEnd
    add x9, x9, x17
    dup s13, v12.s[1]
    stp s12, s13, [x9]
    cmp w10, #3
    beq WriteEnd
    add x9, x9, x17
    dup s15, v14.s[1]
    stp s14, s15, [x9]
    cmp w10, #4
    beq WriteEnd
    add x9, x9, x17
    dup s17, v16.s[1]
    stp s16, s17, [x9]
    cmp w10, #5
    beq WriteEnd
    add x9, x9, x17
    dup s19, v18.s[1]
    stp s18, s19, [x9]
    cmp w10, #6
    beq WriteEnd
    add x9, x9, x17
    dup s21, v20.s[1]
    stp s20, s21, [x9]
    cmp w10, #7
    beq WriteEnd
    add x9, x9, x17
    dup s23, v22.s[1]
    stp s22, s23, [x9]
    cmp w10, #8
    beq WriteEnd
    add x9, x9, x17
    dup s25, v24.s[1]
    stp s24, s25, [x9]
    cmp w10, #9
    beq WriteEnd
    add x9, x9, x17
    dup s27, v26.s[1]
    stp s26, s27, [x9]
    cmp w10, #10
    beq WriteEnd
    add x9, x9, x17
    dup s29, v28.s[1]
    stp s28, s29, [x9]
    cmp w10, #11
    beq WriteEnd
    add x9, x9, x17
    dup s31, v30.s[1]
    stp s30, s31, [x9]
    add x9, x9, x17
    b WriteEnd

Write3:
    add x13, x9, #8        // x13 tracks column 2 of the current row
    dup s9, v8.s[1]
    stp s8, s9, [x9]
    add x9, x9, x17
    st1 {v8.s}[2], [x13], x17
    cmp w10, #1
    beq WriteEnd
    dup s11, v10.s[1]
    stp s10, s11, [x9]
    add x9, x9, x17
    st1 {v10.s}[2], [x13], x17
    cmp w10, #2
    beq WriteEnd
    dup s13, v12.s[1]
    stp s12, s13, [x9]
    add x9, x9, x17
    st1 {v12.s}[2], [x13], x17
    cmp w10, #3
    beq WriteEnd
    dup s15, v14.s[1]
    stp s14, s15, [x9]
    add x9, x9, x17
    st1 {v14.s}[2], [x13], x17
    cmp w10, #4
    beq WriteEnd
    dup s17, v16.s[1]
    stp s16, s17, [x9]
    add x9, x9, x17
    st1 {v16.s}[2], [x13], x17
    cmp w10, #5
    beq WriteEnd
    dup s19, v18.s[1]
    stp s18, s19, [x9]
    add x9, x9, x17
    st1 {v18.s}[2], [x13], x17
    cmp w10, #6
    beq WriteEnd
    dup s21, v20.s[1]
    stp s20, s21, [x9]
    add x9, x9, x17
    st1 {v20.s}[2], [x13], x17
    cmp w10, #7
    beq WriteEnd
    dup s23, v22.s[1]
    stp s22, s23, [x9]
    add x9, x9, x17
    st1 {v22.s}[2], [x13], x17
    cmp w10, #8
    beq WriteEnd
    dup s25, v24.s[1]
    stp s24, s25, [x9]
    add x9, x9, x17
    st1 {v24.s}[2], [x13], x17
    cmp w10, #9
    beq WriteEnd
    dup s27, v26.s[1]
    stp s26, s27, [x9]
    add x9, x9, x17
    st1 {v26.s}[2], [x13], x17
    cmp w10, #10
    beq WriteEnd
    dup s29, v28.s[1]
    stp s28, s29, [x9]
    add x9, x9, x17
    st1 {v28.s}[2], [x13], x17
    cmp w10, #11
    beq WriteEnd
    dup s31, v30.s[1]
    stp s30, s31, [x9]
    add x9, x9, x17
    st1 {v30.s}[2], [x13]
    b WriteEnd

Write4:
    st1 {v8.4s}, [x9], x17
    cmp w10, #1
    beq WriteEnd
    st1 {v10.4s}, [x9], x17
    cmp w10, #2
    beq WriteEnd
    st1 {v12.4s}, [x9], x17
    cmp w10, #3
    beq WriteEnd
    st1 {v14.4s}, [x9], x17
    cmp w10, #4
    beq WriteEnd
    st1 {v16.4s}, [x9], x17
    cmp w10, #5
    beq WriteEnd
    st1 {v18.4s}, [x9], x17
    cmp w10, #6
    beq WriteEnd
    st1 {v20.4s}, [x9], x17
    cmp w10, #7
    beq WriteEnd
    st1 {v22.4s}, [x9], x17
    cmp w10, #8
    beq WriteEnd
    st1 {v24.4s}, [x9], x17
    cmp w10, #9
    beq WriteEnd
    st1 {v26.4s}, [x9], x17
    cmp w10, #10
    beq WriteEnd
    st1 {v28.4s}, [x9], x17
    cmp w10, #11
    beq WriteEnd
    st1 {v30.4s}, [x9], x17
    b WriteEnd

Write5:
    add x13, x9, #16       // x13 tracks column 4 of the current row
    st1 {v8.4s}, [x9], x17
    str s9, [x13]
    cmp w10, #1
    beq WriteEnd
    add x13, x13, x17
    st1 {v10.4s}, [x9], x17
    str s11, [x13]
    cmp w10, #2
    beq WriteEnd
    add x13, x13, x17
    st1 {v12.4s}, [x9], x17
    str s13, [x13]
    cmp w10, #3
    beq WriteEnd
    add x13, x13, x17
    st1 {v14.4s}, [x9], x17
    str s15, [x13]
    cmp w10, #4
    beq WriteEnd
    add x13, x13, x17
    st1 {v16.4s}, [x9], x17
    str s17, [x13]
    cmp w10, #5
    beq WriteEnd
    add x13, x13, x17
    st1 {v18.4s}, [x9], x17
    str s19, [x13]
    cmp w10, #6
    beq WriteEnd
    add x13, x13, x17
    st1 {v20.4s}, [x9], x17
    str s21, [x13]
    cmp w10, #7
    beq WriteEnd
    add x13, x13, x17
    st1 {v22.4s}, [x9], x17
    str s23, [x13]
    cmp w10, #8
    beq WriteEnd
    add x13, x13, x17
    st1 {v24.4s}, [x9], x17
    str s25, [x13]
    cmp w10, #9
    beq WriteEnd
    add x13, x13, x17
    st1 {v26.4s}, [x9], x17
    str s27, [x13]
    cmp w10, #10
    beq WriteEnd
    add x13, x13, x17
    st1 {v28.4s}, [x9], x17
    str s29, [x13]
    cmp w10, #11
    beq WriteEnd
    add x13, x13, x17
    st1 {v30.4s}, [x9], x17
    str s31, [x13]
    b WriteEnd

Write6:
    // Once the even accumulator of a row has been stored it is reused to hold lane 1 of
    // the odd accumulator, so columns 4 and 5 can go out with a single paired store.
    add x13, x9, #16       // x13 tracks column 4 of the current row
    st1 {v8.4s}, [x9], x17
    dup s8, v9.s[1]
    stp s9, s8, [x13]
    cmp w10, #1
    beq WriteEnd
    add x13, x13, x17
    st1 {v10.4s}, [x9], x17
    dup s10, v11.s[1]
    stp s11, s10, [x13]
    cmp w10, #2
    beq WriteEnd
    add x13, x13, x17
    st1 {v12.4s}, [x9], x17
    dup s12, v13.s[1]
    stp s13, s12, [x13]
    cmp w10, #3
    beq WriteEnd
    add x13, x13, x17
    st1 {v14.4s}, [x9], x17
    dup s14, v15.s[1]
    stp s15, s14, [x13]
    cmp w10, #4
    beq WriteEnd
    add x13, x13, x17
    st1 {v16.4s}, [x9], x17
    dup s16, v17.s[1]
    stp s17, s16, [x13]
    cmp w10, #5
    beq WriteEnd
    add x13, x13, x17
    st1 {v18.4s}, [x9], x17
    dup s18, v19.s[1]
    stp s19, s18, [x13]
    cmp w10, #6
    beq WriteEnd
    add x13, x13, x17
    st1 {v20.4s}, [x9], x17
    dup s20, v21.s[1]
    stp s21, s20, [x13]
    cmp w10, #7
    beq WriteEnd
    add x13, x13, x17
    st1 {v22.4s}, [x9], x17
    dup s22, v23.s[1]
    stp s23, s22, [x13]
    cmp w10, #8
    beq WriteEnd
    add x13, x13, x17
    st1 {v24.4s}, [x9], x17
    dup s24, v25.s[1]
    stp s25, s24, [x13]
    cmp w10, #9
    beq WriteEnd
    add x13, x13, x17
    st1 {v26.4s}, [x9], x17
    dup s26, v27.s[1]
    stp s27, s26, [x13]
    cmp w10, #10
    beq WriteEnd
    add x13, x13, x17
    st1 {v28.4s}, [x9], x17
    dup s28, v29.s[1]
    stp s29, s28, [x13]
    cmp w10, #11
    beq WriteEnd
    add x13, x13, x17
    st1 {v30.4s}, [x9], x17
    dup s30, v31.s[1]
    stp s31, s30, [x13]
    b WriteEnd

Write7:
    add x13, x9, #16       // x13 tracks column 4 of the current row
    add x16, x9, #24       // x16 tracks column 6 (rhs ptr is reloaded at L2, so x16 is free here)
    st1 {v8.4s}, [x9], x17
    dup s8, v9.s[1]
    stp s9, s8, [x13]
    add x13, x13, x17
    st1 {v9.s}[2], [x16], x17
    cmp w10, #1
    beq WriteEnd
    st1 {v10.4s}, [x9], x17
    dup s10, v11.s[1]
    stp s11, s10, [x13]
    add x13, x13, x17
    st1 {v11.s}[2], [x16], x17
    cmp w10, #2
    beq WriteEnd
    st1 {v12.4s}, [x9], x17
    dup s12, v13.s[1]
    stp s13, s12, [x13]
    add x13, x13, x17
    st1 {v13.s}[2], [x16], x17
    cmp w10, #3
    beq WriteEnd
    st1 {v14.4s}, [x9], x17
    dup s14, v15.s[1]
    stp s15, s14, [x13]
    add x13, x13, x17
    st1 {v15.s}[2], [x16], x17
    cmp w10, #4
    beq WriteEnd
    st1 {v16.4s}, [x9], x17
    dup s16, v17.s[1]
    stp s17, s16, [x13]
    add x13, x13, x17
    st1 {v17.s}[2], [x16], x17
    cmp w10, #5
    beq WriteEnd
    st1 {v18.4s}, [x9], x17
    dup s18, v19.s[1]
    stp s19, s18, [x13]
    add x13, x13, x17
    st1 {v19.s}[2], [x16], x17
    cmp w10, #6
    beq WriteEnd
    st1 {v20.4s}, [x9], x17
    dup s20, v21.s[1]
    stp s21, s20, [x13]
    add x13, x13, x17
    st1 {v21.s}[2], [x16], x17
    cmp w10, #7
    beq WriteEnd
    st1 {v22.4s}, [x9], x17
    dup s22, v23.s[1]
    stp s23, s22, [x13]
    add x13, x13, x17
    st1 {v23.s}[2], [x16], x17
    cmp w10, #8
    beq WriteEnd
    st1 {v24.4s}, [x9], x17
    dup s24, v25.s[1]
    stp s25, s24, [x13]
    add x13, x13, x17
    st1 {v25.s}[2], [x16], x17
    cmp w10, #9
    beq WriteEnd
    st1 {v26.4s}, [x9], x17
    dup s26, v27.s[1]
    stp s27, s26, [x13]
    add x13, x13, x17
    st1 {v27.s}[2], [x16], x17
    cmp w10, #10
    beq WriteEnd
    st1 {v28.4s}, [x9], x17
    dup s28, v29.s[1]
    stp s29, s28, [x13]
    add x13, x13, x17
    st1 {v29.s}[2], [x16], x17
    cmp w10, #11
    beq WriteEnd
    st1 {v30.4s}, [x9], x17
    dup s30, v31.s[1]
    stp s31, s30, [x13]
    add x13, x13, x17
    st1 {v31.s}[2], [x16], x17
    b WriteEnd

WriteC8:
    // Tiled output: dump the complete 12x8 tile (96 floats) and advance x2 past it.
    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x2], #64
    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x2], #64
    st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x2], #64
    st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x2], #64
    st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x2], #64
    st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x2], #64
    b WriteEnd

Write8:
    st1 {v8.4s, v9.4s}, [x9], x17
    cmp w10, #1
    beq WriteEnd
    st1 {v10.4s, v11.4s}, [x9], x17
    cmp w10, #2
    beq WriteEnd
    st1 {v12.4s, v13.4s}, [x9], x17
    cmp w10, #3
    beq WriteEnd
    st1 {v14.4s, v15.4s}, [x9], x17
    cmp w10, #4
    beq WriteEnd
    st1 {v16.4s, v17.4s}, [x9], x17
    cmp w10, #5
    beq WriteEnd
    st1 {v18.4s, v19.4s}, [x9], x17
    cmp w10, #6
    beq WriteEnd
    st1 {v20.4s, v21.4s}, [x9], x17
    cmp w10, #7
    beq WriteEnd
    st1 {v22.4s, v23.4s}, [x9], x17
    cmp w10, #8
    beq WriteEnd
    st1 {v24.4s, v25.4s}, [x9], x17
    cmp w10, #9
    beq WriteEnd
    st1 {v26.4s, v27.4s}, [x9], x17
    cmp w10, #10
    beq WriteEnd
    st1 {v28.4s, v29.4s}, [x9], x17
    cmp w10, #11
    beq WriteEnd
    st1 {v30.4s, v31.4s}, [x9], x17

WriteEnd:
    subs w10, w10, #12     // lhs row - 12
    bgt L2

End2:
    subs w7, w7, #8        // rhs col - 8 (these flags are consumed by the bgt below)
    add x1, x1, x15        // rhs ptr + block stride
    add x3, x3, #32        // bias ptr + 8 floats
    ldrb w13, [sp, #136]   // write_nhwc
    cbz w13, NoDstStep     // tiled output: x2 was already advanced by the tile stores
    add x2, x2, #32        // row-major output: step to the next 8 columns
NoDstStep:
    bgt L1                 // none of the instructions after subs modify the flags

End1:
    mov x9, sp
    ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x9], #64
    ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
    add sp, sp, #128
    ret
#endif
mindspore/lite/nnacl/fp32/matmul.c
浏览文件 @
0f4e6d73
...
...
@@ -28,6 +28,108 @@ void RowMajor2Row8Major(float *src_ptr, float *dst_ptr, int row, int col) {
return
;
}
/*
 * Repack a row-major [row x col] float matrix into tiles of 12 rows, each tile stored
 * column by column (element (r, c) of a tile lands at tile_base[c * C12NUM + r % C12NUM]).
 * Consecutive tiles are C12NUM * col floats apart in dst_ptr.
 *
 * Full 12-row tiles are transposed 12x4 at a time (NEON on ARM64, scalar otherwise), with a
 * scalar loop for the columns left over when col is not a multiple of 4. Rows left over
 * when row is not a multiple of 12 are scattered into the first slots of one final tile;
 * the remaining slots of that tile are NOT written here, so the caller has to provide
 * whatever padding value it needs.
 *
 * src_ptr: source matrix, row * col floats, row-major.
 * dst_ptr: destination buffer; must hold every tile touched (the last one possibly partial).
 * row/col: dimensions of the source matrix.
 */
void RowMajor2Col12Major(float *src_ptr, float *dst_ptr, size_t row, size_t col) {
  size_t row12 = row / C12NUM * C12NUM;
  size_t col4 = col / C4NUM * C4NUM;
  float *src_r = src_ptr;
  float *dst_r = dst_ptr;

  size_t ri = 0;
  for (; ri < row12; ri += C12NUM) {
    size_t ci = 0;
    for (; ci < col4; ci += C4NUM) {
      float *src_c = src_r + ci;
      float *dst_c = dst_r + ci * C12NUM;

      /* 12x4 row-major to col-major */
#ifdef ENABLE_ARM64
      size_t stride = col * sizeof(float);
      /*
       * The asm loads from src_c and stores to dst_c, but both are passed as plain register
       * inputs. The "memory" clobber tells the compiler about those accesses so it neither
       * keeps stale copies of the buffers in registers nor moves surrounding loads/stores
       * across the asm statement.
       */
      asm volatile(
        "mov x10, %[src_c]\n"
        "mov x11, %[dst_c]\n"

        "ld1 {v0.4s}, [x10], %[stride]\n"
        "ld1 {v1.4s}, [x10], %[stride]\n"
        "ld1 {v2.4s}, [x10], %[stride]\n"
        "ld1 {v3.4s}, [x10], %[stride]\n"

        "ld1 {v4.4s}, [x10], %[stride]\n"
        "ld1 {v5.4s}, [x10], %[stride]\n"
        "ld1 {v6.4s}, [x10], %[stride]\n"
        "ld1 {v7.4s}, [x10], %[stride]\n"

        "zip1 v12.4s, v0.4s, v1.4s\n"
        "zip2 v13.4s, v0.4s, v1.4s\n"
        "zip1 v14.4s, v2.4s, v3.4s\n"
        "zip2 v15.4s, v2.4s, v3.4s\n"

        "ld1 {v8.4s}, [x10], %[stride]\n"
        "ld1 {v9.4s}, [x10], %[stride]\n"
        "ld1 {v10.4s}, [x10], %[stride]\n"
        "ld1 {v11.4s}, [x10], %[stride]\n"

        "zip1 v16.4s, v4.4s, v5.4s\n"
        "zip2 v17.4s, v4.4s, v5.4s\n"
        "zip1 v18.4s, v6.4s, v7.4s\n"
        "zip2 v19.4s, v6.4s, v7.4s\n"

        "trn1 v20.2d, v12.2d, v14.2d\n"
        "trn2 v23.2d, v12.2d, v14.2d\n"
        "trn1 v26.2d, v13.2d, v15.2d\n"
        "trn2 v29.2d, v13.2d, v15.2d\n"

        "trn1 v21.2d, v16.2d, v18.2d\n"
        "trn2 v24.2d, v16.2d, v18.2d\n"
        "trn1 v27.2d, v17.2d, v19.2d\n"
        "trn2 v30.2d, v17.2d, v19.2d\n"

        "zip1 v12.4s, v8.4s, v9.4s\n"
        "zip2 v13.4s, v8.4s, v9.4s\n"
        "zip1 v14.4s, v10.4s, v11.4s\n"
        "zip2 v15.4s, v10.4s, v11.4s\n"

        "trn1 v22.2d, v12.2d, v14.2d\n"
        "trn2 v25.2d, v12.2d, v14.2d\n"
        "trn1 v28.2d, v13.2d, v15.2d\n"
        "trn2 v31.2d, v13.2d, v15.2d\n"

        "st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x11], #64\n"
        "st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x11], #64\n"
        "st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x11], #64\n"

        :
        : [ dst_c ] "r"(dst_c), [ src_c ] "r"(src_c), [ stride ] "r"(stride)
        : "memory", "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
          "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
          "v28", "v29", "v30", "v31");
#else
      for (int tr = 0; tr < C12NUM; tr++) {
        for (int tc = 0; tc < C4NUM; tc++) {
          dst_c[tc * C12NUM + tr] = src_c[tr * col + tc];
        }
      }
#endif
    }
    /* Columns that do not fill a 4-wide group: copy one 12-element column at a time. */
    for (; ci < col; ci++) {
      float *src_c = src_r + ci;
      float *dst_c = dst_r + ci * C12NUM;
      for (size_t i = 0; i < C12NUM; i++) {
        dst_c[i] = src_c[i * col];
      }
    }
    src_r += C12NUM * col;
    dst_r += C12NUM * col;
  }

  /* Remaining (< 12) rows: each row fills one slot in every column of the last tile. */
  for (; ri < row; ri++) {
    for (size_t i = 0; i < col; i++) {
      dst_r[i * C12NUM] = src_r[i];
    }
    src_r += col;
    dst_r += 1;
  }
  return;
}
void
RowMajor2Col8Major
(
float
*
src_ptr
,
float
*
dst_ptr
,
size_t
row
,
size_t
col
)
{
size_t
row8
=
row
/
C8NUM
*
C8NUM
;
size_t
col4
=
col
/
C4NUM
*
C4NUM
;
...
...
@@ -267,6 +369,31 @@ void MatMul8x8(const float *a, const float *b, float *dst, const float *bias, Ac
return
;
}
/*
 * Portable reference implementation of the 12x8-tiled fp32 matmul.
 *
 * a:    lhs packed in blocks of 12 rows; element (r, d) is read from
 *       a[(r / 12) * deep * 12 + d * 12 + r % 12].
 * b:    rhs packed in blocks of 8 columns; element (d, c) is read from
 *       b[(c / 8) * deep * 8 + d * 8 + c % 8].
 * dst:  row-major output; element (r, c) is written to dst[r * stride + c].
 * bias: optional per-column bias (bias[c]); may be NULL.
 * act_type: ActType_Relu6 clamps to [0, 6], any other value except ActType_No clamps
 *       to [0, +inf).
 *
 * NOTE: only the write_nhwc == true (row-major) output is implemented; when write_nhwc
 * is false the function returns without touching dst.
 */
void MatMul12x8(const float *a, const float *b, float *dst, const float *bias, ActType act_type, int deep, int row,
                int col, int stride, bool write_nhwc) {
  if (!write_nhwc) {
    return;
  }
  /* col12-major * row8-major => row-major */
  for (int r = 0; r < row; r++) {
    /* Index arithmetic is done in size_t so large matrices cannot overflow int. */
    const size_t a_base = (size_t)(r / 12) * (size_t)deep * 12 + (size_t)(r % 12);
    const size_t dst_row = (size_t)r * (size_t)stride;
    for (int c = 0; c < col; c++) {
      const size_t b_base = (size_t)(c / 8) * (size_t)deep * 8 + (size_t)(c % 8);
      float value = 0;
      for (int d = 0; d < deep; d++) {
        size_t ai = a_base + (size_t)d * 12;
        size_t bi = b_base + (size_t)d * 8;
        value = value + a[ai] * b[bi];
      }
      if (bias != NULL) value += bias[c];
      if (act_type == ActType_Relu6) value = MSMIN(6.0f, value);
      if (act_type != ActType_No) value = MSMAX(0.0f, value);
      dst[dst_row + (size_t)c] = value;
    }
  }
  return;
}
void
MatMul
(
const
float
*
a
,
const
float
*
b
,
float
*
c
,
const
float
*
bias
,
ActType
act_type
,
int
deep
,
int
row
,
int
col
,
int
stride
,
bool
write_nhwc
)
{
#ifdef ENABLE_ARM64
...
...
@@ -275,3 +402,12 @@ void MatMul(const float *a, const float *b, float *c, const float *bias, ActType
MatMul8x8
(
a
,
b
,
c
,
bias
,
act_type
,
deep
,
row
,
col
,
stride
,
write_nhwc
);
#endif
}
/*
 * Entry point of the optimized fp32 matmul: forwards every argument unchanged to the
 * hand-written NEON kernel when ENABLE_ARM64 is defined, and to the portable
 * MatMul12x8 implementation otherwise.
 */
void MatMulOpt(const float *a, const float *b, float *c, const float *bias, ActType act_type, int deep, int row,
               int col, int stride, bool write_nhwc) {
#ifndef ENABLE_ARM64
  /* Portable C fallback. */
  MatMul12x8(a, b, c, bias, act_type, deep, row, col, stride, write_nhwc);
#else
  /* The assembly kernel takes the activation type as a plain int. */
  MatmulFloatNeon64Opt(a, b, c, bias, (int)act_type, deep, row, col, stride, write_nhwc);
#endif
}
mindspore/lite/nnacl/fp32/matmul.h
浏览文件 @
0f4e6d73
...
...
@@ -28,12 +28,17 @@ extern "C" {
#endif
void
MatMul
(
const
float
*
a
,
const
float
*
b
,
float
*
c
,
const
float
*
bias
,
ActType
act_type
,
int
depth
,
int
row
,
int
col
,
int
stride
,
bool
write_nhwc
);
void
MatMulOpt
(
const
float
*
a
,
const
float
*
b
,
float
*
c
,
const
float
*
bias
,
ActType
act_type
,
int
depth
,
int
row
,
int
col
,
int
stride
,
bool
write_nhwc
);
void
RowMajor2Row8Major
(
float
*
src_ptr
,
float
*
dst_ptr
,
int
row
,
int
col
);
void
RowMajor2Col8Major
(
float
*
src_ptr
,
float
*
dst_ptr
,
size_t
row
,
size_t
col
);
void
RowMajor2Col12Major
(
float
*
src_ptr
,
float
*
dst_ptr
,
size_t
row
,
size_t
col
);
void
Row8x8Major2RowMajor
(
float
*
src_ptr
,
float
*
dst_ptr
,
size_t
row
,
size_t
col
,
size_t
stride
);
#ifdef ENABLE_ARM64
void
MatmulFloatNeon64
(
const
float
*
a
,
const
float
*
b
,
float
*
c
,
const
float
*
bias
,
int
act_type
,
int
depth
,
int
row
,
int
col
,
size_t
stride
,
bool
write_nhwc
);
void
MatmulFloatNeon64Opt
(
const
float
*
a
,
const
float
*
b
,
float
*
c
,
const
float
*
bias
,
int
act_type
,
int
depth
,
int
row
,
int
col
,
size_t
stride
,
bool
write_nhwc
);
#endif
#ifdef __cplusplus
}
...
...
mindspore/lite/nnacl/matmul_parameter.h
浏览文件 @
0f4e6d73
...
...
@@ -31,6 +31,7 @@ typedef struct MatMulParameter {
int
row_
;
int
col_
;
int
row_8_
;
int
row_12_
;
int
row_16_
;
int
col_8_
;
int
deep_
;
...
...
mindspore/lite/nnacl/op_base.h
浏览文件 @
0f4e6d73
...
...
@@ -23,6 +23,7 @@
#define C4NUM 4
#define C8NUM 8
#define C12NUM 12
#define C16NUM 16
#define BLOCK 4
#define TILE_NUM 8
...
...
mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1.cc
浏览文件 @
0f4e6d73
...
...
@@ -59,7 +59,7 @@ void Convolution1x1CPUKernel::InitConv1x1MatmulParam() {
matmul_param_
->
row_
=
conv_param_
->
output_h_
*
conv_param_
->
output_w_
;
matmul_param_
->
col_
=
conv_param_
->
output_channel_
;
matmul_param_
->
deep_
=
conv_param_
->
input_channel_
;
matmul_param_
->
row_
8_
=
UP_ROUND
(
matmul_param_
->
row_
,
C8
NUM
);
matmul_param_
->
row_
12_
=
UP_ROUND
(
matmul_param_
->
row_
,
C12
NUM
);
matmul_param_
->
col_8_
=
UP_ROUND
(
matmul_param_
->
col_
,
C8NUM
);
matmul_param_
->
act_type_
=
(
conv_param_
->
is_relu6_
)
?
ActType_Relu6
:
ActType_No
;
matmul_param_
->
act_type_
=
(
conv_param_
->
is_relu_
)
?
ActType_Relu
:
matmul_param_
->
act_type_
;
...
...
@@ -100,12 +100,12 @@ int Convolution1x1CPUKernel::InitConv1x1Param() {
thread_count_
=
MSMIN
(
op_parameter_
->
thread_num_
,
UP_DIV
(
matmul_param_
->
col_
,
C8NUM
));
thread_stride_
=
UP_DIV
(
UP_DIV
(
matmul_param_
->
col_
,
C8NUM
),
thread_count_
)
*
C8NUM
;
pack_input_
=
reinterpret_cast
<
float
*>
(
malloc
(
matmul_param_
->
row_
8
_
*
matmul_param_
->
deep_
*
sizeof
(
float
)));
pack_input_
=
reinterpret_cast
<
float
*>
(
malloc
(
matmul_param_
->
row_
12
_
*
matmul_param_
->
deep_
*
sizeof
(
float
)));
if
(
pack_input_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Conv1x1 Malloc pack_input_ error!"
;
return
RET_MEMORY_FAILED
;
}
memset
(
pack_input_
,
0
,
matmul_param_
->
row_
8
_
*
matmul_param_
->
deep_
*
sizeof
(
float
));
memset
(
pack_input_
,
0
,
matmul_param_
->
row_
12
_
*
matmul_param_
->
deep_
*
sizeof
(
float
));
return
RET_OK
;
}
...
...
@@ -118,7 +118,7 @@ void Convolution1x1CPUKernel::Pre1x1Trans(float *src_input, float *src_output) {
input_ptr_
=
src_input
;
}
RowMajor2Col
8
Major
(
input_ptr_
,
pack_input_
,
matmul_param_
->
row_
,
matmul_param_
->
deep_
);
RowMajor2Col
12
Major
(
input_ptr_
,
pack_input_
,
matmul_param_
->
row_
,
matmul_param_
->
deep_
);
return
;
}
...
...
@@ -143,7 +143,7 @@ int Convolution1x1CPUKernel::DoConv1x1(int task_id) {
auto
bias
=
(
bias_data_
==
nullptr
)
?
nullptr
:
reinterpret_cast
<
float
*>
(
bias_data_
)
+
thread_stride_
*
task_id
;
MatMul
(
pack_input_
,
weight_ptr_
+
task_id
*
thread_stride_
*
matmul_param_
->
deep_
,
MatMul
Opt
(
pack_input_
,
weight_ptr_
+
task_id
*
thread_stride_
*
matmul_param_
->
deep_
,
output_ptr_
+
task_id
*
thread_stride_
,
bias
,
matmul_param_
->
act_type_
,
matmul_param_
->
deep_
,
matmul_param_
->
row_
,
cur_oc
,
matmul_param_
->
col_
,
true
);
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录