Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
magicwindyyd
mindspore
提交
e8475691
M
mindspore
项目概览
magicwindyyd
/
mindspore
与 Fork 源项目一致
Fork自
MindSpore / mindspore
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
M
mindspore
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
e8475691
编写于
4年前
作者:
L
lixian
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
optimization for winograd matmul
上级
2b23de61
master
无相关合并请求
变更
4
隐藏空白更改
内联
并排
Showing
4 changed files
with
154 additions
and
6 deletions
+154
-6
mindspore/lite/nnacl/assembly/arm64/MatmulFp32OptRemain.S
mindspore/lite/nnacl/assembly/arm64/MatmulFp32OptRemain.S
+144
-0
mindspore/lite/nnacl/fp32/conv.c
mindspore/lite/nnacl/fp32/conv.c
+2
-3
mindspore/lite/nnacl/fp32/matmul.c
mindspore/lite/nnacl/fp32/matmul.c
+7
-3
mindspore/lite/nnacl/fp32/matmul.h
mindspore/lite/nnacl/fp32/matmul.h
+1
-0
未找到文件。
mindspore/lite/nnacl/assembly/arm64/MatmulFp32OptRemain.S
0 → 100644
浏览文件 @
e8475691
#ifdef __aarch64__
    .text
    .align 5
    .global MatmulFloatNeon64OptRemain
#ifndef __APPLE__
    .type MatmulFloatNeon64OptRemain, %function
#endif

// Remainder matmul kernel for the winograd fp32 path ("only for winograd" below):
// void MatmulFloatNeon64OptRemain(const float *a, const float *b, float *c, int depth,
//                                 int row, int col, size_t stride)
// x0: a       lhs, packed 12 floats (C12NUM) per depth step — see ld1 of v0-v2 below
// x1: b       rhs, packed 8 floats (one 8-wide col block) per depth step
// x2: c       dst
// x3: depth
// x4: row
// x5: col
// x6: stride  element stride; converted to bytes below (* sizeof(float))
//
// NOTE(review): x18 is used as a scratch register throughout. x18 is the reserved
// platform register on Apple/Windows AArch64 ABIs — presumably only Linux/Android
// targets are intended here; confirm before building for other platforms.
// NOTE(review): rows are consumed 8 (LoopW8) or 4 (LoopW4) at a time with no tail
// handling; callers presumably guarantee row is a multiple of the tile — confirm.
MatmulFloatNeon64OptRemain:
    mov x18, #32               // sizeof(float) * 8
    mul x9, x3, x18            // block stride of lhs/rhs: sizeof(float) * 8 * depth
    mov x18, #4                // sizeof(float)
    mul x8, x5, x6             // col * stride (elements)
    mov x11, #8
    mul x11, x11, x6           // 8 * stride (elements)
    mul x8, x8, x18            // dst advance per stored 8-float row: col * stride * sizeof(float)
    mul x11, x11, x18          // dst advance per 8-col block: 8 * stride * sizeof(float)

    cmp x4, #4
    ble LoopH4                 // row <= 4: take the 4-row kernel; otherwise 8-row kernel

// ---- 8-row x 8-col tile kernel ------------------------------------------------
LoopH8:                        // one iteration per 8-wide rhs col block
    mov x10, x4                // reload lhs row
    mov x12, x0                // reload lhs ptr
    mov x18, x2                // reload dst ptr

    LoopW8:                    // one iteration per 8 lhs rows
        mov x16, x1            // reload rhs ptr
        mov x13, x3            // reload depth
        // zero the 16 accumulators: v16..v31 = 8 rows x 8 cols of fp32
        dup v16.4s, wzr
        dup v17.4s, wzr
        dup v18.4s, wzr
        dup v19.4s, wzr
        dup v20.4s, wzr
        dup v21.4s, wzr
        dup v22.4s, wzr
        dup v23.4s, wzr
        dup v24.4s, wzr
        dup v25.4s, wzr
        dup v26.4s, wzr
        dup v27.4s, wzr
        dup v28.4s, wzr
        dup v29.4s, wzr
        dup v30.4s, wzr
        dup v31.4s, wzr

        LoopD8:                // accumulate over depth, one step per iteration
            // v0-v2: 12 packed lhs floats for this depth step;
            // v2 (lanes for rows 8-11 of the 12-row packing) is never used here.
            ld1 {v0.4s, v1.4s, v2.4s}, [x12], #48
            // v3: rhs cols 0-3, v4: rhs cols 4-7
            ld1 {v3.4s, v4.4s}, [x16], #32
            // rows 0-3 (v0 lanes) x cols 0-3 (v3)
            fmla v16.4s, v3.4s, v0.s[0]
            fmla v18.4s, v3.4s, v0.s[1]
            fmla v20.4s, v3.4s, v0.s[2]
            fmla v22.4s, v3.4s, v0.s[3]
            // rows 0-3 x cols 4-7 (v4)
            fmla v17.4s, v4.4s, v0.s[0]
            fmla v19.4s, v4.4s, v0.s[1]
            fmla v21.4s, v4.4s, v0.s[2]
            fmla v23.4s, v4.4s, v0.s[3]
            // rows 4-7 (v1 lanes) x cols 0-3
            fmla v24.4s, v3.4s, v1.s[0]
            fmla v26.4s, v3.4s, v1.s[1]
            fmla v28.4s, v3.4s, v1.s[2]
            fmla v30.4s, v3.4s, v1.s[3]
            // rows 4-7 x cols 4-7
            fmla v25.4s, v4.4s, v1.s[0]
            fmla v27.4s, v4.4s, v1.s[1]
            fmla v29.4s, v4.4s, v1.s[2]
            fmla v31.4s, v4.4s, v1.s[3]
            // 32-bit decrement of the depth counter
            // (LoopD4 below uses x13 — equivalent for a non-negative int depth)
            subs w13, w13, #1
            bgt LoopD8

        // store one 8-float dst row per st1, advancing by x8 each time
        st1 {v16.4s, v17.4s}, [x18], x8
        st1 {v18.4s, v19.4s}, [x18], x8
        st1 {v20.4s, v21.4s}, [x18], x8
        st1 {v22.4s, v23.4s}, [x18], x8
        st1 {v24.4s, v25.4s}, [x18], x8
        st1 {v26.4s, v27.4s}, [x18], x8
        st1 {v28.4s, v29.4s}, [x18], x8
        st1 {v30.4s, v31.4s}, [x18], x8

        subs x10, x10, #8      // lhs row - 8
        bgt LoopW8

    subs x5, x5, #8            // rhs col - 8
    add x1, x1, x9             // rhs ptr + stride (next 8-col block of b)
    add x2, x2, x11            // dst ptr + 8-col block advance
    bgt LoopH8
    ret

// ---- 4-row x 8-col tile kernel (row <= 4 path) --------------------------------
LoopH4:
    mov x10, x4                // reload lhs row
    mov x12, x0                // reload lhs ptr
    mov x18, x2                // reload dst ptr

    LoopW4:                    // one iteration per 4 lhs rows
        mov x16, x1            // reload rhs ptr
        mov x13, x3            // reload depth
        // zero the 8 accumulators: v16..v23 = 4 rows x 8 cols of fp32
        dup v16.4s, wzr
        dup v17.4s, wzr
        dup v18.4s, wzr
        dup v19.4s, wzr
        dup v20.4s, wzr
        dup v21.4s, wzr
        dup v22.4s, wzr
        dup v23.4s, wzr

        LoopD4:                // accumulate over depth
            // same 12-float lhs packing as LoopD8; only v0 (rows 0-3) is used,
            // v1/v2 are loaded to keep the 48-byte packed stride.
            ld1 {v0.4s, v1.4s, v2.4s}, [x12], #48
            ld1 {v3.4s, v4.4s}, [x16], #32
            // rows 0-3 x cols 0-3 (v3)
            fmla v16.4s, v3.4s, v0.s[0]
            fmla v18.4s, v3.4s, v0.s[1]
            fmla v20.4s, v3.4s, v0.s[2]
            fmla v22.4s, v3.4s, v0.s[3]
            // rows 0-3 x cols 4-7 (v4)
            fmla v17.4s, v4.4s, v0.s[0]
            fmla v19.4s, v4.4s, v0.s[1]
            fmla v21.4s, v4.4s, v0.s[2]
            fmla v23.4s, v4.4s, v0.s[3]
            subs x13, x13, #1
            bgt LoopD4

        st1 {v16.4s, v17.4s}, [x18], x8
        st1 {v18.4s, v19.4s}, [x18], x8
        st1 {v20.4s, v21.4s}, [x18], x8
        st1 {v22.4s, v23.4s}, [x18], x8

        subs x10, x10, #4      // lhs row - 4
        bgt LoopW4

    subs x5, x5, #8            // rhs col - 8
    add x1, x1, x9             // rhs ptr + stride
    add x2, x2, x11            // dst ptr + 8-col block advance
    bgt LoopH4
    ret
#endif
This diff is collapsed.
Click to expand it.
mindspore/lite/nnacl/fp32/conv.c
浏览文件 @
e8475691
...
...
@@ -303,7 +303,7 @@ void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_
for
(
int
i
=
0
;
i
<
input_unit_square
;
++
i
)
{
RowMajor2Col12Major
(
src_ptr
+
i
*
C12NUM
*
ic4
*
C4NUM
,
tmp_col_ptr
,
C12NUM
,
ic4
*
C4NUM
);
MatMulOpt
(
tmp_col_ptr
,
trans_weight
+
i
*
ic4
*
C4NUM
*
oc8
*
C8NUM
,
dst_ptr
+
i
*
C8NUM
,
NULL
,
0
,
ic4
*
C4NUM
,
C12NUM
,
oc8
*
C8NUM
,
input_unit_square
,
2
);
cal_num
,
oc8
*
C8NUM
,
input_unit_square
,
2
);
}
// step 4 : output transform
...
...
@@ -489,9 +489,8 @@ void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_dat
for
(
int
i
=
0
;
i
<
input_unit_square
;
++
i
)
{
RowMajor2Col12Major
(
src_ptr
+
i
*
C12NUM
*
ic4
*
C4NUM
,
tmp_col_ptr
,
C12NUM
,
ic4
*
C4NUM
);
MatMulOpt
(
tmp_col_ptr
,
transed_weight
+
i
*
ic4
*
C4NUM
*
oc8
*
C8NUM
,
dst_ptr
+
i
*
C8NUM
,
NULL
,
0
,
ic4
*
C4NUM
,
C12NUM
,
oc8
*
C8NUM
,
input_unit_square
,
2
);
ic4
*
C4NUM
,
real_cal_num
,
oc8
*
C8NUM
,
input_unit_square
,
2
);
}
Conv3x3Fp32OutputTransform
(
tmp_dst_buffer
+
task_id
*
tmp_dst_buffer_offset
,
nc4hw4_out
+
nc4hw4_buffer_offset
,
bias_data
,
start_index
,
real_cal_num
,
out_w_block
,
conv_param
);
}
...
...
This diff is collapsed.
Click to expand it.
mindspore/lite/nnacl/fp32/matmul.c
浏览文件 @
e8475691
...
...
@@ -386,7 +386,7 @@ void MatMul12x8(const float *a, const float *b, float *dst, const float *bias, A
size_t
ci
=
dst_r_offset
+
c8div
*
8
*
stride
+
c8mod
;
float
value
=
0
;
for
(
int
d
=
0
;
d
<
deep
;
++
d
)
{
size_t
ai
=
src_r_offset
+
d
*
row
;
size_t
ai
=
src_r_offset
+
d
*
C12NUM
;
size_t
bi
=
c8div
*
deep
*
8
+
d
*
8
+
c8mod
;
value
=
value
+
a
[
ai
]
*
b
[
bi
];
}
...
...
@@ -403,8 +403,12 @@ void MatMul12x8(const float *a, const float *b, float *dst, const float *bias, A
void
MatMulOpt
(
const
float
*
a
,
const
float
*
b
,
float
*
c
,
const
float
*
bias
,
ActType
act_type
,
int
deep
,
int
row
,
int
col
,
size_t
stride
,
int
out_type
)
{
#ifdef ENABLE_ARM64
MatmulFloatNeon64Opt
(
a
,
b
,
c
,
bias
,
(
int
)
act_type
,
deep
,
row
,
col
,
stride
,
(
int
)(
out_type
==
OutType_Nhwc
),
(
int
)(
out_type
==
OutType_TileC8
));
if
(
out_type
==
2
&&
row
<=
8
)
{
MatmulFloatNeon64OptRemain
(
a
,
b
,
c
,
deep
,
row
,
col
,
stride
);
}
else
{
MatmulFloatNeon64Opt
(
a
,
b
,
c
,
bias
,
(
int
)
act_type
,
deep
,
row
,
col
,
stride
,
(
int
)(
out_type
==
OutType_Nhwc
),
(
int
)(
out_type
==
OutType_TileC8
));
}
#else
MatMul12x8
(
a
,
b
,
c
,
bias
,
act_type
,
deep
,
row
,
col
,
stride
,
out_type
);
#endif
...
...
This diff is collapsed.
Click to expand it.
mindspore/lite/nnacl/fp32/matmul.h
浏览文件 @
e8475691
...
...
@@ -39,6 +39,7 @@ void MatmulFloatNeon64(const float *a, const float *b, float *c, const float *bi
int
col
,
size_t
stride
,
bool
write_nhwc
);
void
MatmulFloatNeon64Opt
(
const
float
*
a
,
const
float
*
b
,
float
*
c
,
const
float
*
bias
,
int
act_type
,
int
depth
,
int
row
,
int
col
,
size_t
stride
,
size_t
write_nhwc
,
size_t
write_c4
);
void
MatmulFloatNeon64OptRemain
(
const
float
*
a
,
const
float
*
b
,
float
*
c
,
int
depth
,
int
row
,
int
col
,
size_t
stride
);
#endif
#ifdef __cplusplus
}
...
...
This diff is collapsed.
Click to expand it.
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录
反馈
建议
客服
返回
顶部