Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
项目经理老王
Mace
提交
f2f05c0d
Mace
项目概览
项目经理老王
/
Mace
与 Fork 源项目一致
Fork自
Xiaomi / Mace
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
f2f05c0d
编写于
2月 19, 2019
作者:
李
李滨
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'opt_gemm' into 'master'
Minor improvement of gemv asm See merge request !984
上级
f39cc3ce
7aad041e
变更
1
隐藏空白更改
内联
并排
Showing
1 changed file
with
82 additions
and
20 deletions
+82
-20
mace/ops/arm/fp32/gemv.cc
mace/ops/arm/fp32/gemv.cc
+82
-20
未找到文件。
mace/ops/arm/fp32/gemv.cc
浏览文件 @
f2f05c0d
...
...
@@ -19,7 +19,11 @@
#include <algorithm>
#if !defined(__aarch64__)
#define vaddvq_f32(v) ((v)[0] + (v)[1] + (v)[2] + (v)[3])
float
vaddvq_f32
(
float32x4_t
v
)
{
float32x2_t
_sum
=
vadd_f32
(
vget_low_f32
(
v
),
vget_high_f32
(
v
));
_sum
=
vpadd_f32
(
_sum
,
_sum
);
return
vget_lane_f32
(
_sum
,
0
);
}
#endif
// Disable unroll by default, since cache set conflict could be significant
...
...
@@ -202,8 +206,7 @@ MaceStatus Gemv::Compute(const OpContext *context,
:
// clobbers
"cc"
,
"memory"
,
"r0"
,
"r1"
,
"r2"
,
"r3"
,
"r4"
,
"r5"
,
"d0"
,
"d1"
,
"d2"
,
"d3"
,
"d4"
,
"d5"
,
"d6"
,
"d7"
,
"d8"
,
"d9"
,
"d10"
,
"d11"
,
"d12"
,
"d13"
,
"d14"
,
"d15"
,
"d16"
,
"d17"
,
"d18"
,
"d19"
,
"d20"
,
"d21"
);
"d11"
,
"d12"
,
"d13"
,
"d14"
,
"d15"
,
"d16"
,
"d17"
,
"d18"
,
"d19"
);
lhs_ptr
+=
w_block_count
*
w_block_size
;
rhs_ptr
+=
w_block_count
*
w_block_size
;
...
...
@@ -257,7 +260,7 @@ MaceStatus Gemv::Compute(const OpContext *context,
float32x4_t
vbias
=
vdupq_n_f32
(
0
);
if
(
bias
)
{
vbias
=
vld1q_f32
(
bias_data
+
h_
offse
t
);
vbias
=
vld1q_f32
(
bias_data
+
h_
star
t
);
}
vo
=
vaddq_f32
(
vo
,
vbias
);
vst1q_f32
(
ret_ptr
,
vo
);
...
...
@@ -268,24 +271,82 @@ MaceStatus Gemv::Compute(const OpContext *context,
for
(
index_t
h
=
0
;
h
<
h_block_len
;
++
h
)
{
lhs_ptr
=
tmp_lhs_ptr
+
h
*
lhs_width
;
rhs_ptr
=
tmp_rhs_ptr
;
float32x4_t
vo0
=
vdupq_n_f32
(
0
);
float32x4_t
vo0n
=
vdupq_n_f32
(
0
);
for
(
index_t
w
=
0
;
w
<
w_block_count
;
++
w
)
{
float32x4_t
vr0
=
vld1q_f32
(
rhs_ptr
);
float32x4_t
vr0n
=
vld1q_f32
(
rhs_ptr
+
4
);
float32x4_t
vl0
=
vld1q_f32
(
lhs_ptr
);
float32x4_t
vl0n
=
vld1q_f32
(
lhs_ptr
+
4
);
// may cause some precision error depending on the compute order
vo0
=
vmlaq_f32
(
vo0
,
vl0
,
vr0
);
vo0n
=
vmlaq_f32
(
vo0n
,
vl0n
,
vr0n
);
float
s0
=
bias
?
bias_data
[
h_start
+
h
]
:
0
;
lhs_ptr
+=
8
;
rhs_ptr
+=
8
;
}
// w
vo0
=
vaddq_f32
(
vo0
,
vo0n
);
float
s0
=
vaddvq_f32
(
vo0
)
+
(
bias
?
bias_data
[
h_start
+
h
]
:
0
);
if
(
w_block_count
)
{
#if not defined(__aarch64__)
index_t
r_w_block_count
=
w_block_count
;
float32x4_t
vo
=
vdupq_n_f32
(
0.
f
);
asm
volatile
(
"mov r0, #0
\n
"
"vdup.f32 q2, r0
\n
"
"vdup.f32 q3, r0
\n
"
// prelogue
"vld1.f32 {d16-d17}, [%[rhs_ptr]]!
\n
"
"vld1.f32 {d18-d19}, [%[rhs_ptr]]!
\n
"
"subs %[r_w_block_count], #1
\n
"
"vld1.f32 {d0-d1}, [%[lhs_ptr]]!
\n
"
"vld1.f32 {d2-d3}, [%[lhs_ptr]]!
\n
"
"beq 1f
\n
"
"0:
\n
"
"vmla.f32 q2, q0, q8
\n
"
"vld1.f32 {d0-d1}, [%[lhs_ptr]]!
\n
"
"vld1.f32 {d16-d17}, [%[rhs_ptr]]!
\n
"
"subs %[r_w_block_count], #1
\n
"
"vmla.f32 q3, q1, q9
\n
"
"vld1.f32 {d2-d3}, [%[lhs_ptr]]!
\n
"
"vld1.f32 {d18-d19}, [%[rhs_ptr]]!
\n
"
"bne 0b
\n
"
// prologue
"1:
\n
"
"vmla.f32 q2, q0, q8
\n
"
"vmla.f32 q3, q1, q9
\n
"
"vaddq.f32 %q[vo], q2, q3
\n
"
:
// outputs
[
r_w_block_count
]
"+r"
(
r_w_block_count
),
[
lhs_ptr
]
"+r"
(
lhs_ptr
),
[
rhs_ptr
]
"+r"
(
rhs_ptr
),
[
vo
]
"+w"
(
vo
)
:
// inputs
:
// clobbers
"cc"
,
"memory"
,
"r0"
,
"d0"
,
"d1"
,
"d2"
,
"d3"
,
// lhs
"d4"
,
"d5"
,
"d6"
,
"d7"
,
// output
"d16"
,
"d17"
,
"d18"
,
"d19"
// rhs
);
s0
+=
vaddvq_f32
(
vo
);
#else
float32x4_t
vo0
=
vdupq_n_f32
(
0
);
float32x4_t
vo0n
=
vdupq_n_f32
(
0
);
for
(
index_t
w
=
0
;
w
<
w_block_count
;
++
w
)
{
float32x4_t
vr0
=
vld1q_f32
(
rhs_ptr
);
float32x4_t
vr0n
=
vld1q_f32
(
rhs_ptr
+
4
);
float32x4_t
vl0
=
vld1q_f32
(
lhs_ptr
);
float32x4_t
vl0n
=
vld1q_f32
(
lhs_ptr
+
4
);
vo0
=
vmlaq_f32
(
vo0
,
vl0
,
vr0
);
vo0n
=
vmlaq_f32
(
vo0n
,
vl0n
,
vr0n
);
lhs_ptr
+=
8
;
rhs_ptr
+=
8
;
}
// w
vo0
=
vaddq_f32
(
vo0
,
vo0n
);
s0
+=
vaddvq_f32
(
vo0
);
#endif // __aarch64__
}
// if
for
(
index_t
w
=
0
;
w
<
w_remain
;
++
w
)
{
s0
+=
lhs_ptr
[
0
]
*
rhs_ptr
[
0
];
++
lhs_ptr
;
...
...
@@ -294,6 +355,7 @@ MaceStatus Gemv::Compute(const OpContext *context,
ret_ptr
[
h
]
=
s0
;
}
// h
#ifdef MACE_GEMV_UNROLL
}
// if
#endif // MACE_GEMV_UNROLL
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录