Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
magicwindyyd
mindspore
提交
6a5ae9bb
M
mindspore
项目概览
magicwindyyd
/
mindspore
与 Fork 源项目一致
Fork自
MindSpore / mindspore
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
M
mindspore
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
6a5ae9bb
编写于
8月 28, 2020
作者:
Z
zhanyuan
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Optimize matmul int8 sdot
上级
6fd3f2e9
变更
2
隐藏空白更改
内联
并排
Showing 2 changed files with 827 additions and 1 deletion
+827
-1
mindspore/lite/nnacl/assembly/opt/MatmulDpInt8.S
mindspore/lite/nnacl/assembly/opt/MatmulDpInt8.S
+820
-0
mindspore/lite/nnacl/opt_op_handler.c
mindspore/lite/nnacl/opt_op_handler.c
+7
-1
未找到文件。
mindspore/lite/nnacl/assembly/opt/MatmulDpInt8.S
0 → 100644
浏览文件 @
6a5ae9bb
#ifdef __aarch64__
.
text
.
align
5
.
global
MatmulInt8DpNeon64
#ifndef __APPLE__
.
type
MatmulInt8DpNeon64
,
%
function
#endif
//
//
int8
RHS
4
x8
block
//
/-----------------------------------------
|
//
|v2.b[0] ... v2.b[12] v3.b[0] ... v3.b[12]|
//
|
...
...
|
//
|v2.b[3] ... v2.b[15] v3.b[3] ... v3.b[15]|
//
\-----------------------------------------/
//
int8
LHS
8
x4
block
//
/---------------------\
/-------------------------------------------|
//
|v0.b[0] ... v0.b[3] |
|v16.s[0] ... v16.s[3] v17.s[0] ... v17.s[3]|
//
|v0.b[4] ... v0.b[7] |
v18.s
[0
]
...
v18
.
s
[
3
]
v19
.
s
[
0
]
...
v19
.
s
[
3
]|
//
|v0.b[8] ... v0.b[11] |
v20.s
[0
]
...
v20
.
s
[
3
]
v21
.
s
[
0
]
...
v21
.
s
[
3
]|
//
|v0.b[12] ... v0.b[15]|
|v22.s[0] ... v22.s[3] v23.s[0] ... v23.s[3]|
//
|v1.b[0] ... v1.b[3] |
|v24.s[0] ... v24.s[3] v25.s[0] ... v25.s[3]|
//
|v1.b[4] ... v1.b[7] |
|v26.s[0] ... v26.s[3] v27.s[0] ... v27.s[3]|
//
|v1.b[8] ... v1.b[11]|
|v28.s[0] ... v28.s[3] v29.s[0] ... v29.s[3]|
//
|v1.b[12] ... v1.b[15]|
|v30.s[0] ... v30.s[3] v31.s[0] ... v31.s[3]|
//
\---------------------/
\
-------------------------------------------/
//
int32
accumulators
8
x8
block
// int8 RHS 16x8 block (four successive 4x8 slices of b, loaded from x18)
//       /-------------\
//       |  v2     v3  |
//       |  v6     v7  |
//       | v10    v11  |
//       | v14    v15  |
//       \-------------/
//
int8
LHS
8
x16
block
//
/--------------------\
/-------------|
//
|v0 v4 v8 v12|
| |
//
|v1 v5 v9 v13|
| |
//
\--------------------/
\
-------------/
//
void
MatmulInt8DpNeon64
(
const
int8_t
*
a
,
const
int8_t
*
b
,
int8_t
*
dst
,
int
row8
,
int
col8
,
int
deep4
,
//
const
int
*
a_sums
,
const
int
*
bias
,
int
act_min
,
int
act_max
,
int
out_zp
,
//
int
multiplier
,
int
left_shift
,
int
right_shift
,
int
row
,
int
col
,
int
stride
)
;
//
x0
:
a
(
left
matrix
ptr
)
//
x1
:
b
(
right
matrix
ptr
)
//
x2
:
out
ptr
//
w3
:
row8
//
w4
:
col8
//
w5
:
deep4
//
x6
:
a_sums
//
x7
:
bias
//
w8
:
act_min
//
w9
:
act_max
//
w10
:
out_zp
//
w11
:
multiplier
//
w12
:
left_shift
//
w13
:
right_shift
//
w14
:
row
//
w15
:
col
//
w24
:
stride
MatmulInt8DpNeon64
:
sub
sp
,
sp
,
#
192
st1
{
v8
.4
s
,
v9
.4
s
,
v10
.4
s
,
v11
.4
s
},
[
sp
],
#
64
st1
{
v12
.4
s
,
v13
.4
s
,
v14
.4
s
,
v15
.4
s
},
[
sp
],
#
64
stp
x19
,
x20
,
[
sp
],
#
16
stp
x21
,
x22
,
[
sp
],
#
16
stp
x23
,
x24
,
[
sp
],
#
16
stp
x25
,
x26
,
[
sp
],
#
16
ldr
w8
,
[
sp
]
ldr
w9
,
[
sp
,
#
8
]
ldr
w10
,
[
sp
,
#
16
]
ldr
w11
,
[
sp
,
#
24
]
ldr
w12
,
[
sp
,
#
32
]
ldr
w13
,
[
sp
,
#
40
]
ldr
w14
,
[
sp
,
#
48
]
ldr
w15
,
[
sp
,
#
56
]
ldr
w24
,
[
sp
,
#
64
]
mov
w17
,
#
8
//
sizeof
(
int8
)*
8
mul
w21
,
w5
,
w17
//
the
stride
of
a
/
b
:
sizeof
(
int8
)*
8
*
deep4
mov
x25
,
x2
L1
:
cmp
w4
,
#
0
//
if
at
the
end
of
col8
beq
End1
mov
w16
,
w3
//
reset
a
row8
counter
mov
w23
,
w14
//
reset
a
row
counter
mov
x17
,
x0
//
reload
a
ptr
mov
x22
,
x6
//
reload
a_sums
ptr
L2
:
cmp
w16
,
#
0
beq
End2
mov
x18
,
x1
//
reload
b
ptr
mov
x19
,
x7
//
reload
bias
ptr
mov
w20
,
w5
//
reload
depth
dup
v16
.4
s
,
wzr
dup
v17
.4
s
,
wzr
dup
v18
.4
s
,
wzr
dup
v19
.4
s
,
wzr
dup
v20
.4
s
,
wzr
dup
v21
.4
s
,
wzr
dup
v22
.4
s
,
wzr
dup
v23
.4
s
,
wzr
dup
v24
.4
s
,
wzr
dup
v25
.4
s
,
wzr
dup
v26
.4
s
,
wzr
dup
v27
.4
s
,
wzr
dup
v28
.4
s
,
wzr
dup
v29
.4
s
,
wzr
dup
v30
.4
s
,
wzr
dup
v31
.4
s
,
wzr
L3
:
cmp
w20
,
#
16
blt
LoopD4
LoopD16
:
ld1
{
v0
.16
b
,
v1
.16
b
},
[
x17
],
#
32
ld1
{
v2
.16
b
,
v3
.16
b
},
[
x18
],
#
32
sdot
v16
.4
s
,
v2
.16
b
,
v0
.4
b
[
0
]
sdot
v18
.4
s
,
v2
.16
b
,
v0
.4
b
[
1
]
sdot
v20
.4
s
,
v2
.16
b
,
v0
.4
b
[
2
]
sdot
v22
.4
s
,
v2
.16
b
,
v0
.4
b
[
3
]
ld1
{
v4
.16
b
,
v5
.16
b
},
[
x17
],
#
32
sdot
v24
.4
s
,
v2
.16
b
,
v1
.4
b
[
0
]
sdot
v26
.4
s
,
v2
.16
b
,
v1
.4
b
[
1
]
sdot
v28
.4
s
,
v2
.16
b
,
v1
.4
b
[
2
]
sdot
v30
.4
s
,
v2
.16
b
,
v1
.4
b
[
3
]
ld1
{
v6
.16
b
,
v7
.16
b
},
[
x18
],
#
32
sdot
v17
.4
s
,
v3
.16
b
,
v0
.4
b
[
0
]
sdot
v19
.4
s
,
v3
.16
b
,
v0
.4
b
[
1
]
sdot
v21
.4
s
,
v3
.16
b
,
v0
.4
b
[
2
]
sdot
v23
.4
s
,
v3
.16
b
,
v0
.4
b
[
3
]
sdot
v25
.4
s
,
v3
.16
b
,
v1
.4
b
[
0
]
sdot
v27
.4
s
,
v3
.16
b
,
v1
.4
b
[
1
]
sdot
v29
.4
s
,
v3
.16
b
,
v1
.4
b
[
2
]
sdot
v31
.4
s
,
v3
.16
b
,
v1
.4
b
[
3
]
ld1
{
v8
.16
b
,
v9
.16
b
},
[
x17
],
#
32
sdot
v16
.4
s
,
v6
.16
b
,
v4
.4
b
[
0
]
sdot
v18
.4
s
,
v6
.16
b
,
v4
.4
b
[
1
]
sdot
v20
.4
s
,
v6
.16
b
,
v4
.4
b
[
2
]
sdot
v22
.4
s
,
v6
.16
b
,
v4
.4
b
[
3
]
sdot
v24
.4
s
,
v6
.16
b
,
v5
.4
b
[
0
]
sdot
v26
.4
s
,
v6
.16
b
,
v5
.4
b
[
1
]
sdot
v28
.4
s
,
v6
.16
b
,
v5
.4
b
[
2
]
sdot
v30
.4
s
,
v6
.16
b
,
v5
.4
b
[
3
]
ld1
{
v10
.16
b
,
v11
.16
b
},
[
x18
],
#
32
sdot
v17
.4
s
,
v7
.16
b
,
v4
.4
b
[
0
]
sdot
v19
.4
s
,
v7
.16
b
,
v4
.4
b
[
1
]
sdot
v21
.4
s
,
v7
.16
b
,
v4
.4
b
[
2
]
sdot
v23
.4
s
,
v7
.16
b
,
v4
.4
b
[
3
]
sdot
v25
.4
s
,
v7
.16
b
,
v5
.4
b
[
0
]
sdot
v27
.4
s
,
v7
.16
b
,
v5
.4
b
[
1
]
sdot
v29
.4
s
,
v7
.16
b
,
v5
.4
b
[
2
]
sdot
v31
.4
s
,
v7
.16
b
,
v5
.4
b
[
3
]
ld1
{
v12
.16
b
,
v13
.16
b
},
[
x17
],
#
32
sdot
v16
.4
s
,
v10
.16
b
,
v8
.4
b
[
0
]
sdot
v18
.4
s
,
v10
.16
b
,
v8
.4
b
[
1
]
sdot
v20
.4
s
,
v10
.16
b
,
v8
.4
b
[
2
]
sdot
v22
.4
s
,
v10
.16
b
,
v8
.4
b
[
3
]
sdot
v24
.4
s
,
v10
.16
b
,
v9
.4
b
[
0
]
sdot
v26
.4
s
,
v10
.16
b
,
v9
.4
b
[
1
]
sdot
v28
.4
s
,
v10
.16
b
,
v9
.4
b
[
2
]
sdot
v30
.4
s
,
v10
.16
b
,
v9
.4
b
[
3
]
ld1
{
v14
.16
b
,
v15
.16
b
},
[
x18
],
#
32
sdot
v17
.4
s
,
v11
.16
b
,
v8
.4
b
[
0
]
sdot
v19
.4
s
,
v11
.16
b
,
v8
.4
b
[
1
]
sdot
v21
.4
s
,
v11
.16
b
,
v8
.4
b
[
2
]
sdot
v23
.4
s
,
v11
.16
b
,
v8
.4
b
[
3
]
sdot
v25
.4
s
,
v11
.16
b
,
v9
.4
b
[
0
]
sdot
v27
.4
s
,
v11
.16
b
,
v9
.4
b
[
1
]
sdot
v29
.4
s
,
v11
.16
b
,
v9
.4
b
[
2
]
sdot
v31
.4
s
,
v11
.16
b
,
v9
.4
b
[
3
]
sdot
v16
.4
s
,
v14
.16
b
,
v12
.4
b
[
0
]
sdot
v18
.4
s
,
v14
.16
b
,
v12
.4
b
[
1
]
sdot
v20
.4
s
,
v14
.16
b
,
v12
.4
b
[
2
]
sdot
v22
.4
s
,
v14
.16
b
,
v12
.4
b
[
3
]
sdot
v24
.4
s
,
v14
.16
b
,
v13
.4
b
[
0
]
sdot
v26
.4
s
,
v14
.16
b
,
v13
.4
b
[
1
]
sdot
v28
.4
s
,
v14
.16
b
,
v13
.4
b
[
2
]
sdot
v30
.4
s
,
v14
.16
b
,
v13
.4
b
[
3
]
sdot
v17
.4
s
,
v15
.16
b
,
v12
.4
b
[
0
]
sdot
v19
.4
s
,
v15
.16
b
,
v12
.4
b
[
1
]
sdot
v21
.4
s
,
v15
.16
b
,
v12
.4
b
[
2
]
sdot
v23
.4
s
,
v15
.16
b
,
v12
.4
b
[
3
]
sdot
v25
.4
s
,
v15
.16
b
,
v13
.4
b
[
0
]
sdot
v27
.4
s
,
v15
.16
b
,
v13
.4
b
[
1
]
sdot
v29
.4
s
,
v15
.16
b
,
v13
.4
b
[
2
]
sdot
v31
.4
s
,
v15
.16
b
,
v13
.4
b
[
3
]
subs
w20
,
w20
,
#
16
//
depth
-
16
b
L3
LoopD4
:
cmp
w20
,
#
0
beq
End3
ld1
{
v0
.16
b
,
v1
.16
b
},
[
x17
],
#
32
ld1
{
v2
.16
b
,
v3
.16
b
},
[
x18
],
#
32
sdot
v16
.4
s
,
v2
.16
b
,
v0
.4
b
[
0
]
sdot
v18
.4
s
,
v2
.16
b
,
v0
.4
b
[
1
]
sdot
v20
.4
s
,
v2
.16
b
,
v0
.4
b
[
2
]
sdot
v22
.4
s
,
v2
.16
b
,
v0
.4
b
[
3
]
sdot
v24
.4
s
,
v2
.16
b
,
v1
.4
b
[
0
]
sdot
v26
.4
s
,
v2
.16
b
,
v1
.4
b
[
1
]
sdot
v28
.4
s
,
v2
.16
b
,
v1
.4
b
[
2
]
sdot
v30
.4
s
,
v2
.16
b
,
v1
.4
b
[
3
]
sdot
v17
.4
s
,
v3
.16
b
,
v0
.4
b
[
0
]
sdot
v19
.4
s
,
v3
.16
b
,
v0
.4
b
[
1
]
sdot
v21
.4
s
,
v3
.16
b
,
v0
.4
b
[
2
]
sdot
v23
.4
s
,
v3
.16
b
,
v0
.4
b
[
3
]
sdot
v25
.4
s
,
v3
.16
b
,
v1
.4
b
[
0
]
sdot
v27
.4
s
,
v3
.16
b
,
v1
.4
b
[
1
]
sdot
v29
.4
s
,
v3
.16
b
,
v1
.4
b
[
2
]
sdot
v31
.4
s
,
v3
.16
b
,
v1
.4
b
[
3
]
subs
w20
,
w20
,
#
4
//
depth
-
4
b
LoopD4
End3
:
//
Add
(
Bias
+
Depth
*
Za
*
Zb
-
Za
*
Bsums
)
ld1
{
v15
.4
s
},
[
x19
],
#
16
ld1
{
v14
.4
s
},
[
x19
],
#
16
add
v16
.4
s
,
v16
.4
s
,
v15
.4
s
add
v18
.4
s
,
v18
.4
s
,
v15
.4
s
add
v20
.4
s
,
v20
.4
s
,
v15
.4
s
add
v22
.4
s
,
v22
.4
s
,
v15
.4
s
add
v24
.4
s
,
v24
.4
s
,
v15
.4
s
add
v26
.4
s
,
v26
.4
s
,
v15
.4
s
add
v28
.4
s
,
v28
.4
s
,
v15
.4
s
add
v30
.4
s
,
v30
.4
s
,
v15
.4
s
add
v17
.4
s
,
v17
.4
s
,
v14
.4
s
add
v19
.4
s
,
v19
.4
s
,
v14
.4
s
add
v21
.4
s
,
v21
.4
s
,
v14
.4
s
add
v23
.4
s
,
v23
.4
s
,
v14
.4
s
add
v25
.4
s
,
v25
.4
s
,
v14
.4
s
add
v27
.4
s
,
v27
.4
s
,
v14
.4
s
add
v29
.4
s
,
v29
.4
s
,
v14
.4
s
add
v31
.4
s
,
v31
.4
s
,
v14
.4
s
//
Subtract
(
Asums
*
Zb
)
ld1
{
v13
.4
s
},
[
x22
],
#
16
ld1
{
v12
.4
s
},
[
x22
],
#
16
dup
v0
.4
s
,
v13
.
s
[
0
]
dup
v1
.4
s
,
v13
.
s
[
1
]
dup
v2
.4
s
,
v13
.
s
[
2
]
dup
v3
.4
s
,
v13
.
s
[
3
]
dup
v4
.4
s
,
v12
.
s
[
0
]
dup
v5
.4
s
,
v12
.
s
[
1
]
dup
v6
.4
s
,
v12
.
s
[
2
]
dup
v7
.4
s
,
v12
.
s
[
3
]
sub
v16
.4
s
,
v16
.4
s
,
v0
.4
s
sub
v17
.4
s
,
v17
.4
s
,
v0
.4
s
sub
v18
.4
s
,
v18
.4
s
,
v1
.4
s
sub
v19
.4
s
,
v19
.4
s
,
v1
.4
s
sub
v20
.4
s
,
v20
.4
s
,
v2
.4
s
sub
v21
.4
s
,
v21
.4
s
,
v2
.4
s
sub
v22
.4
s
,
v22
.4
s
,
v3
.4
s
sub
v23
.4
s
,
v23
.4
s
,
v3
.4
s
sub
v24
.4
s
,
v24
.4
s
,
v4
.4
s
sub
v25
.4
s
,
v25
.4
s
,
v4
.4
s
sub
v26
.4
s
,
v26
.4
s
,
v5
.4
s
sub
v27
.4
s
,
v27
.4
s
,
v5
.4
s
sub
v28
.4
s
,
v28
.4
s
,
v6
.4
s
sub
v29
.4
s
,
v29
.4
s
,
v6
.4
s
sub
v30
.4
s
,
v30
.4
s
,
v7
.4
s
sub
v31
.4
s
,
v31
.4
s
,
v7
.4
s
//
Apply
left
shift
dup
v11
.4
s
,
w12
sqshl
v16
.4
s
,
v16
.4
s
,
v11
.4
s
sqshl
v17
.4
s
,
v17
.4
s
,
v11
.4
s
sqshl
v18
.4
s
,
v18
.4
s
,
v11
.4
s
sqshl
v19
.4
s
,
v19
.4
s
,
v11
.4
s
sqshl
v20
.4
s
,
v20
.4
s
,
v11
.4
s
sqshl
v21
.4
s
,
v21
.4
s
,
v11
.4
s
sqshl
v22
.4
s
,
v22
.4
s
,
v11
.4
s
sqshl
v23
.4
s
,
v23
.4
s
,
v11
.4
s
sqshl
v24
.4
s
,
v24
.4
s
,
v11
.4
s
sqshl
v25
.4
s
,
v25
.4
s
,
v11
.4
s
sqshl
v26
.4
s
,
v26
.4
s
,
v11
.4
s
sqshl
v27
.4
s
,
v27
.4
s
,
v11
.4
s
sqshl
v28
.4
s
,
v28
.4
s
,
v11
.4
s
sqshl
v29
.4
s
,
v29
.4
s
,
v11
.4
s
sqshl
v30
.4
s
,
v30
.4
s
,
v11
.4
s
sqshl
v31
.4
s
,
v31
.4
s
,
v11
.4
s
//
Apply
the
fixed
-
point
part
of
the
multiplier
.
dup
v10
.4
s
,
w11
sqrdmulh
v16
.4
s
,
v16
.4
s
,
v10
.4
s
sqrdmulh
v17
.4
s
,
v17
.4
s
,
v10
.4
s
sqrdmulh
v18
.4
s
,
v18
.4
s
,
v10
.4
s
sqrdmulh
v19
.4
s
,
v19
.4
s
,
v10
.4
s
sqrdmulh
v20
.4
s
,
v20
.4
s
,
v10
.4
s
sqrdmulh
v21
.4
s
,
v21
.4
s
,
v10
.4
s
sqrdmulh
v22
.4
s
,
v22
.4
s
,
v10
.4
s
sqrdmulh
v23
.4
s
,
v23
.4
s
,
v10
.4
s
sqrdmulh
v24
.4
s
,
v24
.4
s
,
v10
.4
s
sqrdmulh
v25
.4
s
,
v25
.4
s
,
v10
.4
s
sqrdmulh
v26
.4
s
,
v26
.4
s
,
v10
.4
s
sqrdmulh
v27
.4
s
,
v27
.4
s
,
v10
.4
s
sqrdmulh
v28
.4
s
,
v28
.4
s
,
v10
.4
s
sqrdmulh
v29
.4
s
,
v29
.4
s
,
v10
.4
s
sqrdmulh
v30
.4
s
,
v30
.4
s
,
v10
.4
s
sqrdmulh
v31
.4
s
,
v31
.4
s
,
v10
.4
s
//
Apply
right
shift
dup
v9
.4
s
,
w13
and
v0
.16
b
,
v9
.16
b
,
v16
.16
b
sshr
v0
.4
s
,
v0
.4
s
,
#
31
sqadd
v16
.4
s
,
v16
.4
s
,
v0
.4
s
srshl
v16
.4
s
,
v16
.4
s
,
v9
.4
s
and
v1
.16
b
,
v9
.16
b
,
v17
.16
b
sshr
v1
.4
s
,
v1
.4
s
,
#
31
sqadd
v17
.4
s
,
v17
.4
s
,
v1
.4
s
srshl
v17
.4
s
,
v17
.4
s
,
v9
.4
s
and
v2
.16
b
,
v9
.16
b
,
v18
.16
b
sshr
v2
.4
s
,
v2
.4
s
,
#
31
sqadd
v18
.4
s
,
v18
.4
s
,
v2
.4
s
srshl
v18
.4
s
,
v18
.4
s
,
v9
.4
s
and
v3
.16
b
,
v9
.16
b
,
v19
.16
b
sshr
v3
.4
s
,
v3
.4
s
,
#
31
sqadd
v19
.4
s
,
v19
.4
s
,
v3
.4
s
srshl
v19
.4
s
,
v19
.4
s
,
v9
.4
s
and
v0
.16
b
,
v9
.16
b
,
v20
.16
b
sshr
v0
.4
s
,
v0
.4
s
,
#
31
sqadd
v20
.4
s
,
v20
.4
s
,
v0
.4
s
srshl
v20
.4
s
,
v20
.4
s
,
v9
.4
s
and
v1
.16
b
,
v9
.16
b
,
v21
.16
b
sshr
v1
.4
s
,
v1
.4
s
,
#
31
sqadd
v21
.4
s
,
v21
.4
s
,
v1
.4
s
srshl
v21
.4
s
,
v21
.4
s
,
v9
.4
s
and
v2
.16
b
,
v9
.16
b
,
v22
.16
b
sshr
v2
.4
s
,
v2
.4
s
,
#
31
sqadd
v22
.4
s
,
v22
.4
s
,
v2
.4
s
srshl
v22
.4
s
,
v22
.4
s
,
v9
.4
s
and
v3
.16
b
,
v9
.16
b
,
v23
.16
b
sshr
v3
.4
s
,
v3
.4
s
,
#
31
sqadd
v23
.4
s
,
v23
.4
s
,
v3
.4
s
srshl
v23
.4
s
,
v23
.4
s
,
v9
.4
s
and
v0
.16
b
,
v9
.16
b
,
v24
.16
b
sshr
v0
.4
s
,
v0
.4
s
,
#
31
sqadd
v24
.4
s
,
v24
.4
s
,
v0
.4
s
srshl
v24
.4
s
,
v24
.4
s
,
v9
.4
s
and
v1
.16
b
,
v9
.16
b
,
v25
.16
b
sshr
v1
.4
s
,
v1
.4
s
,
#
31
sqadd
v25
.4
s
,
v25
.4
s
,
v1
.4
s
srshl
v25
.4
s
,
v25
.4
s
,
v9
.4
s
and
v2
.16
b
,
v9
.16
b
,
v26
.16
b
sshr
v2
.4
s
,
v2
.4
s
,
#
31
sqadd
v26
.4
s
,
v26
.4
s
,
v2
.4
s
srshl
v26
.4
s
,
v26
.4
s
,
v9
.4
s
and
v3
.16
b
,
v9
.16
b
,
v27
.16
b
sshr
v3
.4
s
,
v3
.4
s
,
#
31
sqadd
v27
.4
s
,
v27
.4
s
,
v3
.4
s
srshl
v27
.4
s
,
v27
.4
s
,
v9
.4
s
and
v0
.16
b
,
v9
.16
b
,
v28
.16
b
sshr
v0
.4
s
,
v0
.4
s
,
#
31
sqadd
v28
.4
s
,
v28
.4
s
,
v0
.4
s
srshl
v28
.4
s
,
v28
.4
s
,
v9
.4
s
and
v1
.16
b
,
v9
.16
b
,
v29
.16
b
sshr
v1
.4
s
,
v1
.4
s
,
#
31
sqadd
v29
.4
s
,
v29
.4
s
,
v1
.4
s
srshl
v29
.4
s
,
v29
.4
s
,
v9
.4
s
and
v2
.16
b
,
v9
.16
b
,
v30
.16
b
sshr
v2
.4
s
,
v2
.4
s
,
#
31
sqadd
v30
.4
s
,
v30
.4
s
,
v2
.4
s
srshl
v30
.4
s
,
v30
.4
s
,
v9
.4
s
and
v3
.16
b
,
v9
.16
b
,
v31
.16
b
sshr
v3
.4
s
,
v3
.4
s
,
#
31
sqadd
v31
.4
s
,
v31
.4
s
,
v3
.4
s
srshl
v31
.4
s
,
v31
.4
s
,
v9
.4
s
//
Add
the
destination
zero
point
dup
v8
.4
s
,
w10
add
v16
.4
s
,
v16
.4
s
,
v8
.4
s
add
v17
.4
s
,
v17
.4
s
,
v8
.4
s
add
v18
.4
s
,
v18
.4
s
,
v8
.4
s
add
v19
.4
s
,
v19
.4
s
,
v8
.4
s
add
v20
.4
s
,
v20
.4
s
,
v8
.4
s
add
v21
.4
s
,
v21
.4
s
,
v8
.4
s
add
v22
.4
s
,
v22
.4
s
,
v8
.4
s
add
v23
.4
s
,
v23
.4
s
,
v8
.4
s
add
v24
.4
s
,
v24
.4
s
,
v8
.4
s
add
v25
.4
s
,
v25
.4
s
,
v8
.4
s
add
v26
.4
s
,
v26
.4
s
,
v8
.4
s
add
v27
.4
s
,
v27
.4
s
,
v8
.4
s
add
v28
.4
s
,
v28
.4
s
,
v8
.4
s
add
v29
.4
s
,
v29
.4
s
,
v8
.4
s
add
v30
.4
s
,
v30
.4
s
,
v8
.4
s
add
v31
.4
s
,
v31
.4
s
,
v8
.4
s
//
Apply
the
act_min
bound
dup
v7
.4
s
,
w8
smax
v16
.4
s
,
v16
.4
s
,
v7
.4
s
smax
v17
.4
s
,
v17
.4
s
,
v7
.4
s
smax
v18
.4
s
,
v18
.4
s
,
v7
.4
s
smax
v19
.4
s
,
v19
.4
s
,
v7
.4
s
// Apply the act_max bound (w9 holds act_max; used with smin below)
dup
v6
.4
s
,
w9
smin
v16
.4
s
,
v16
.4
s
,
v6
.4
s
smin
v17
.4
s
,
v17
.4
s
,
v6
.4
s
smin
v18
.4
s
,
v18
.4
s
,
v6
.4
s
smin
v19
.4
s
,
v19
.4
s
,
v6
.4
s
//
int32
->
int16
sqxtn
v0
.4
h
,
v16
.4
s
sqxtn2
v0
.8
h
,
v17
.4
s
sqxtn
v1
.4
h
,
v18
.4
s
sqxtn2
v1
.8
h
,
v19
.4
s
sqxtn
v2
.4
h
,
v20
.4
s
sqxtn2
v2
.8
h
,
v21
.4
s
sqxtn
v3
.4
h
,
v22
.4
s
sqxtn2
v3
.8
h
,
v23
.4
s
sqxtn
v4
.4
h
,
v24
.4
s
sqxtn2
v4
.8
h
,
v25
.4
s
sqxtn
v5
.4
h
,
v26
.4
s
sqxtn2
v5
.8
h
,
v27
.4
s
sqxtn
v6
.4
h
,
v28
.4
s
sqxtn2
v6
.8
h
,
v29
.4
s
sqxtn
v7
.4
h
,
v30
.4
s
sqxtn2
v7
.8
h
,
v31
.4
s
//
int16
->
int8
sqxtn
v8
.8
b
,
v0
.8
h
sqxtn2
v8
.16
b
,
v1
.8
h
sqxtn
v9
.8
b
,
v2
.8
h
sqxtn2
v9
.16
b
,
v3
.8
h
sqxtn
v10
.8
b
,
v4
.8
h
sqxtn2
v10
.16
b
,
v5
.8
h
sqxtn
v11
.8
b
,
v6
.8
h
sqxtn2
v11
.16
b
,
v7
.8
h
cmp
w23
,
#
8
blt
Write
//
if
rows
<
8
cmp
w15
,
#
8
blt
Write
//
if
cols
<
8
st1
{
v8
.
d
}[
0
],
[
x2
],
x24
st1
{
v8
.
d
}[
1
],
[
x2
],
x24
st1
{
v9
.
d
}[
0
],
[
x2
],
x24
st1
{
v9
.
d
}[
1
],
[
x2
],
x24
st1
{
v10
.
d
}[
0
],
[
x2
],
x24
st1
{
v10
.
d
}[
1
],
[
x2
],
x24
st1
{
v11
.
d
}[
0
],
[
x2
],
x24
st1
{
v11
.
d
}[
1
],
[
x2
],
x24
b
Endwrite
Write
:
cmp
w15
,
#
8
bge
WriteCol8
cmp
w15
,
#
7
beq
WriteCol7
cmp
w15
,
#
6
beq
WriteCol6
cmp
w15
,
#
5
beq
WriteCol5
cmp
w15
,
#
4
beq
WriteCol4
cmp
w15
,
#
3
beq
WriteCol3
cmp
w15
,
#
2
beq
WriteCol2
cmp
w15
,
#
1
beq
WriteCol1
WriteCol8
:
st1
{
v8
.
d
}[
0
],
[
x2
],
x24
cmp
w23
,
#
1
beq
Endwrite
st1
{
v8
.
d
}[
1
],
[
x2
],
x24
cmp
w23
,
#
2
beq
Endwrite
st1
{
v9
.
d
}[
0
],
[
x2
],
x24
cmp
w23
,
#
3
beq
Endwrite
st1
{
v9
.
d
}[
1
],
[
x2
],
x24
cmp
w23
,
#
4
beq
Endwrite
st1
{
v10
.
d
}[
0
],
[
x2
],
x24
cmp
w23
,
#
5
beq
Endwrite
st1
{
v10
.
d
}[
1
],
[
x2
],
x24
cmp
w23
,
#
6
beq
Endwrite
st1
{
v11
.
d
}[
0
],
[
x2
],
x24
cmp
w23
,
#
7
beq
Endwrite
st1
{
v11
.
d
}[
1
],
[
x2
],
x24
b
Endwrite
WriteCol7
:
mov
x26
,
x2
st1
{
v8
.
s
}[
0
],
[
x26
],
#
4
st1
{
v8
.
h
}[
2
],
[
x26
],
#
2
st1
{
v8
.
b
}[
6
],
[
x26
],
#
1
add
x2
,
x2
,
x24
cmp
w23
,
#
1
beq
Endwrite
mov
x26
,
x2
st1
{
v8
.
s
}[
2
],
[
x26
],
#
4
st1
{
v8
.
h
}[
6
],
[
x26
],
#
2
st1
{
v8
.
b
}[
14
],
[
x26
],
#
1
add
x2
,
x2
,
x24
cmp
w23
,
#
2
beq
Endwrite
mov
x26
,
x2
st1
{
v9
.
s
}[
0
],
[
x26
],
#
4
st1
{
v9
.
h
}[
2
],
[
x26
],
#
2
st1
{
v9
.
b
}[
6
],
[
x26
],
#
1
add
x2
,
x2
,
x24
cmp
w23
,
#
3
beq
Endwrite
mov
x26
,
x2
st1
{
v9
.
s
}[
2
],
[
x26
],
#
4
st1
{
v9
.
h
}[
6
],
[
x26
],
#
2
st1
{
v9
.
b
}[
14
],
[
x26
],
#
1
add
x2
,
x2
,
x24
cmp
w23
,
#
4
beq
Endwrite
mov
x26
,
x2
st1
{
v10
.
s
}[
0
],
[
x26
],
#
4
st1
{
v10
.
h
}[
2
],
[
x26
],
#
2
st1
{
v10
.
b
}[
6
],
[
x26
],
#
1
add
x2
,
x2
,
x24
cmp
w23
,
#
5
beq
Endwrite
mov
x26
,
x2
st1
{
v10
.
s
}[
2
],
[
x26
],
#
4
st1
{
v10
.
h
}[
6
],
[
x26
],
#
2
st1
{
v10
.
b
}[
14
],
[
x26
],
#
1
add
x2
,
x2
,
x24
cmp
w23
,
#
6
beq
Endwrite
mov
x26
,
x2
st1
{
v11
.
s
}[
0
],
[
x26
],
#
4
st1
{
v11
.
h
}[
2
],
[
x26
],
#
2
st1
{
v11
.
b
}[
6
],
[
x26
],
#
1
add
x2
,
x2
,
x24
cmp
w23
,
#
7
beq
Endwrite
mov
x26
,
x2
st1
{
v11
.
s
}[
2
],
[
x26
],
#
4
st1
{
v11
.
h
}[
6
],
[
x26
],
#
2
st1
{
v11
.
b
}[
14
],
[
x26
],
#
1
add
x2
,
x2
,
x24
b
Endwrite
WriteCol6
:
mov
x26
,
x2
st1
{
v8
.
s
}[
0
],
[
x26
],
#
4
st1
{
v8
.
h
}[
2
],
[
x26
],
#
2
add
x2
,
x2
,
x24
cmp
w23
,
#
1
beq
Endwrite
mov
x26
,
x2
st1
{
v8
.
s
}[
2
],
[
x26
],
#
4
st1
{
v8
.
h
}[
6
],
[
x26
],
#
2
add
x2
,
x2
,
x24
cmp
w23
,
#
2
beq
Endwrite
mov
x26
,
x2
st1
{
v9
.
s
}[
0
],
[
x26
],
#
4
st1
{
v9
.
h
}[
2
],
[
x26
],
#
2
add
x2
,
x2
,
x24
cmp
w23
,
#
3
beq
Endwrite
mov
x26
,
x2
st1
{
v9
.
s
}[
2
],
[
x26
],
#
4
st1
{
v9
.
h
}[
6
],
[
x26
],
#
2
add
x2
,
x2
,
x24
cmp
w23
,
#
4
beq
Endwrite
mov
x26
,
x2
st1
{
v10
.
s
}[
0
],
[
x26
],
#
4
st1
{
v10
.
h
}[
2
],
[
x26
],
#
2
add
x2
,
x2
,
x24
cmp
w23
,
#
5
beq
Endwrite
mov
x26
,
x2
st1
{
v10
.
s
}[
2
],
[
x26
],
#
4
st1
{
v10
.
h
}[
6
],
[
x26
],
#
2
add
x2
,
x2
,
x24
cmp
w23
,
#
6
beq
Endwrite
mov
x26
,
x2
st1
{
v11
.
s
}[
0
],
[
x26
],
#
4
st1
{
v11
.
h
}[
2
],
[
x26
],
#
2
add
x2
,
x2
,
x24
cmp
w23
,
#
7
beq
Endwrite
mov
x26
,
x2
st1
{
v11
.
s
}[
2
],
[
x26
],
#
4
st1
{
v11
.
h
}[
6
],
[
x26
],
#
2
add
x2
,
x2
,
x24
b
Endwrite
WriteCol5
:
mov
x26
,
x2
st1
{
v8
.
s
}[
0
],
[
x26
],
#
4
st1
{
v8
.
b
}[
4
],
[
x26
],
#
1
add
x2
,
x2
,
x24
cmp
w23
,
#
1
beq
Endwrite
mov
x26
,
x2
st1
{
v8
.
s
}[
2
],
[
x26
],
#
4
st1
{
v8
.
b
}[
12
],
[
x26
],
#
1
add
x2
,
x2
,
x24
cmp
w23
,
#
2
beq
Endwrite
mov
x26
,
x2
st1
{
v9
.
s
}[
0
],
[
x26
],
#
4
st1
{
v9
.
b
}[
4
],
[
x26
],
#
1
add
x2
,
x2
,
x24
cmp
w23
,
#
3
beq
Endwrite
mov
x26
,
x2
st1
{
v9
.
s
}[
2
],
[
x26
],
#
4
st1
{
v9
.
b
}[
12
],
[
x26
],
#
1
add
x2
,
x2
,
x24
cmp
w23
,
#
4
beq
Endwrite
mov
x26
,
x2
st1
{
v10
.
s
}[
0
],
[
x26
],
#
4
st1
{
v10
.
b
}[
4
],
[
x26
],
#
1
add
x2
,
x2
,
x24
cmp
w23
,
#
5
beq
Endwrite
mov
x26
,
x2
st1
{
v10
.
s
}[
2
],
[
x26
],
#
4
st1
{
v10
.
b
}[
12
],
[
x26
],
#
1
add
x2
,
x2
,
x24
cmp
w23
,
#
6
beq
Endwrite
mov
x26
,
x2
st1
{
v11
.
s
}[
0
],
[
x26
],
#
4
st1
{
v11
.
b
}[
4
],
[
x26
],
#
1
add
x2
,
x2
,
x24
cmp
w23
,
#
7
beq
Endwrite
mov
x26
,
x2
st1
{
v11
.
s
}[
2
],
[
x26
],
#
4
st1
{
v11
.
b
}[
12
],
[
x26
],
#
1
add
x2
,
x2
,
x24
b
Endwrite
WriteCol4
:
st1
{
v8
.
s
}[
0
],
[
x2
],
x24
cmp
w23
,
#
1
beq
Endwrite
st1
{
v8
.
s
}[
2
],
[
x2
],
x24
cmp
w23
,
#
2
beq
Endwrite
st1
{
v9
.
s
}[
0
],
[
x2
],
x24
cmp
w23
,
#
3
beq
Endwrite
st1
{
v9
.
s
}[
2
],
[
x2
],
x24
cmp
w23
,
#
4
beq
Endwrite
st1
{
v10
.
s
}[
0
],
[
x2
],
x24
cmp
w23
,
#
5
beq
Endwrite
st1
{
v10
.
s
}[
2
],
[
x2
],
x24
cmp
w23
,
#
6
beq
Endwrite
st1
{
v11
.
s
}[
0
],
[
x2
],
x24
cmp
w23
,
#
7
beq
Endwrite
st1
{
v11
.
s
}[
2
],
[
x2
],
x24
b
Endwrite
WriteCol3
:
mov
x26
,
x2
st1
{
v8
.
h
}[
0
],
[
x26
],
#
2
st1
{
v8
.
b
}[
2
],
[
x26
],
#
1
add
x2
,
x2
,
x24
cmp
w23
,
#
1
beq
Endwrite
mov
x26
,
x2
st1
{
v8
.
h
}[
4
],
[
x26
],
#
2
st1
{
v8
.
b
}[
10
],
[
x26
],
#
1
add
x2
,
x2
,
x24
cmp
w23
,
#
2
beq
Endwrite
mov
x26
,
x2
st1
{
v9
.
h
}[
0
],
[
x26
],
#
2
st1
{
v9
.
b
}[
2
],
[
x26
],
#
1
add
x2
,
x2
,
x24
cmp
w23
,
#
3
beq
Endwrite
mov
x26
,
x2
st1
{
v9
.
h
}[
4
],
[
x26
],
#
2
st1
{
v9
.
b
}[
10
],
[
x26
],
#
1
add
x2
,
x2
,
x24
cmp
w23
,
#
4
beq
Endwrite
mov
x26
,
x2
st1
{
v10
.
h
}[
0
],
[
x26
],
#
2
st1
{
v10
.
b
}[
2
],
[
x26
],
#
1
add
x2
,
x2
,
x24
cmp
w23
,
#
5
beq
Endwrite
mov
x26
,
x2
st1
{
v10
.
h
}[
4
],
[
x26
],
#
2
st1
{
v10
.
b
}[
10
],
[
x26
],
#
1
add
x2
,
x2
,
x24
cmp
w23
,
#
6
beq
Endwrite
mov
x26
,
x2
st1
{
v11
.
h
}[
0
],
[
x26
],
#
2
st1
{
v11
.
b
}[
2
],
[
x26
],
#
1
add
x2
,
x2
,
x24
cmp
w23
,
#
7
beq
Endwrite
mov
x26
,
x2
st1
{
v11
.
h
}[
4
],
[
x26
],
#
2
st1
{
v11
.
b
}[
10
],
[
x26
],
#
1
add
x2
,
x2
,
x24
b
Endwrite
WriteCol2
:
st1
{
v8
.
h
}[
0
],
[
x2
],
x24
cmp
w23
,
#
1
beq
Endwrite
st1
{
v8
.
h
}[
4
],
[
x2
],
x24
cmp
w23
,
#
2
beq
Endwrite
st1
{
v9
.
h
}[
0
],
[
x2
],
x24
cmp
w23
,
#
3
beq
Endwrite
st1
{
v9
.
h
}[
4
],
[
x2
],
x24
cmp
w23
,
#
4
beq
Endwrite
st1
{
v10
.
h
}[
0
],
[
x2
],
x24
cmp
w23
,
#
5
beq
Endwrite
st1
{
v10
.
h
}[
4
],
[
x2
],
x24
cmp
w23
,
#
6
beq
Endwrite
st1
{
v11
.
h
}[
0
],
[
x2
],
x24
cmp
w23
,
#
7
beq
Endwrite
st1
{
v11
.
h
}[
4
],
[
x2
],
x24
b
Endwrite
WriteCol1
:
st1
{
v8
.
b
}[
0
],
[
x2
],
x24
cmp
w23
,
#
1
beq
Endwrite
st1
{
v8
.
b
}[
8
],
[
x2
],
x24
cmp
w23
,
#
2
beq
Endwrite
st1
{
v9
.
b
}[
0
],
[
x2
],
x24
cmp
w23
,
#
3
beq
Endwrite
st1
{
v9
.
b
}[
8
],
[
x2
],
x24
cmp
w23
,
#
4
beq
Endwrite
st1
{
v10
.
b
}[
0
],
[
x2
],
x24
cmp
w23
,
#
5
beq
Endwrite
st1
{
v10
.
b
}[
8
],
[
x2
],
x24
cmp
w23
,
#
6
beq
Endwrite
st1
{
v11
.
b
}[
0
],
[
x2
],
x24
cmp
w23
,
#
7
beq
Endwrite
st1
{
v11
.
b
}[
8
],
[
x2
],
x24
b
Endwrite
Endwrite
:
sub
w16
,
w16
,
#
8
//
a
row8
counter
-
8
sub
w23
,
w23
,
#
8
//
a
row
counter
-
8
b
L2
End2
:
sub
w4
,
w4
,
#
8
//
b
col8
counter
-
8
sub
w15
,
w15
,
#
8
//
b
col
counter
-
8
add
x1
,
x1
,
x21
//
b
ptr
+
stride
add
x7
,
x7
,
#
32
//
bias
ptr
+
stride
add
x25
,
x25
,
#
8
//
output
+
stride
(
8
*
sizeof
(
int8
))
mov
x2
,
x25
b
L1
End1
:
sub
sp
,
sp
,
#
192
ld1
{
v8
.4
s
,
v9
.4
s
,
v10
.4
s
,
v11
.4
s
},
[
sp
],
#
64
ld1
{
v12
.4
s
,
v13
.4
s
,
v14
.4
s
,
v15
.4
s
},
[
sp
],
#
64
ldp
x19
,
x20
,
[
sp
],
#
16
ldp
x21
,
x22
,
[
sp
],
#
16
ldp
x23
,
x24
,
[
sp
],
#
16
ldp
x25
,
x26
,
[
sp
],
#
16
ret
#endif
mindspore/lite/nnacl/opt_op_handler.c
浏览文件 @
6a5ae9bb
...
...
@@ -16,6 +16,7 @@
#include <stdlib.h>
#include <stdbool.h>
#include "nnacl/op_base.h"
#ifdef __cplusplus
extern
"C"
{
...
...
@@ -28,6 +29,10 @@ extern void IndirectGemmInt8_24x4_dp(int8_t *dst, const int8_t *src, const int8_
extern
void
MatMulOptR4Int8Neon64
(
const
int8_t
*
a
,
const
int8_t
*
b
,
int
*
dst
,
int
row4
,
int
col4
,
int
deep16
,
const
int
*
input_sum
,
const
int
*
bias
);
extern
void
MatmulInt8DpNeon64
(
const
int8_t
*
a
,
const
int8_t
*
b
,
int8_t
*
dst
,
int
row8
,
int
col8
,
int
deep4
,
const
int
*
a_sums
,
const
int
*
bias
,
int
act_min
,
int
act_max
,
int
out_zp
,
int
multiplier
,
int
left_shift
,
int
right_shift
,
int
row
,
int
col
,
int
stride
);
#ifdef __cplusplus
}
#endif
...
...
@@ -51,6 +56,7 @@ void MatMulRInt8_optimize_handler(const int8_t *a, const int8_t *b, int8_t *dst,
size_t
stride
,
const
int32_t
*
input_sum
,
const
int32_t
*
bias
,
int32_t
*
left_shift
,
int32_t
*
right_shift
,
int32_t
*
multiplier
,
int32_t
output_zp
,
int32_t
mini
,
int32_t
maxi
,
bool
per_channel
)
{
return
;
return
MatmulInt8DpNeon64
(
a
,
b
,
dst
,
UP_ROUND
(
row
,
8
),
UP_ROUND
(
col
,
8
),
deep_4
,
input_sum
,
bias
,
mini
,
maxi
,
output_zp
,
multiplier
[
0
],
left_shift
[
0
],
right_shift
[
0
],
row
,
col
,
col
);
}
#endif
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录