Xiaomi / Mace

Commit b777ba0f
Authored Apr 26, 2018 by 吴承辉

Merge branch 'master' into 'master'

Optimize gemm v7 output pipeline

See merge request !424

Parents: 9e81d3fb, b158770b
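The commit message is terse, so here is a minimal, illustrative sketch (not the MACE code itself) of the idea behind "optimize gemm v7 output pipeline". The old inner loop reused a single accumulator register variable c0 for every output row, so each row's load → multiply-accumulate → store chain had to finish before the next row could begin; the new code keeps one accumulator per output row (c0..c3), which breaks that dependency chain and lets the ARMv7 pipeline overlap the rows. The function and parameter names below (UpdateTileSerial, UpdateTilePipelined, c_ptr0, c_ptr1, ...) are invented for this example and do not appear in the repository.

// Illustrative only: updating two output rows of a tile with NEON intrinsics,
// first with one reused accumulator (serialized), then with one accumulator
// per row (pipelined), mirroring the change in this commit.
#include <arm_neon.h>

// Serialized form: c0 is loaded, updated and stored for every output row,
// so the second row's chain cannot start until the first row's retires.
inline void UpdateTileSerial(const float32x2_t a0, const float32x2_t a1,
                             const float32x4_t b0, const float32x4_t b1,
                             float *c_ptr0, float *c_ptr1) {
  float32x4_t c0 = vld1q_f32(c_ptr0);
  c0 = vmlaq_lane_f32(c0, b0, a0, 0);
  c0 = vmlaq_lane_f32(c0, b1, a0, 1);
  vst1q_f32(c_ptr0, c0);

  c0 = vld1q_f32(c_ptr1);  // reusing c0 serializes the rows
  c0 = vmlaq_lane_f32(c0, b0, a1, 0);
  c0 = vmlaq_lane_f32(c0, b1, a1, 1);
  vst1q_f32(c_ptr1, c0);
}

// Pipelined form: one accumulator per row, loaded up front, so the two
// multiply-accumulate chains are independent and can be interleaved.
inline void UpdateTilePipelined(const float32x2_t a0, const float32x2_t a1,
                                const float32x4_t b0, const float32x4_t b1,
                                float *c_ptr0, float *c_ptr1) {
  float32x4_t c0 = vld1q_f32(c_ptr0);
  float32x4_t c1 = vld1q_f32(c_ptr1);
  c0 = vmlaq_lane_f32(c0, b0, a0, 0);
  c1 = vmlaq_lane_f32(c1, b0, a1, 0);
  c0 = vmlaq_lane_f32(c0, b1, a0, 1);
  c1 = vmlaq_lane_f32(c1, b1, a1, 1);
  vst1q_f32(c_ptr0, c0);
  vst1q_f32(c_ptr1, c1);
}

The diff below applies the same transformation to the four-row GemmTile micro-kernel: the per-row reloads of c0 disappear, c1..c3 are loaded once at the top, and each row accumulates into its own register before a single store.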
Showing 1 changed file with 23 additions and 20 deletions

mace/kernels/gemm.cc  +23 −20
mace/kernels/gemm.cc

@@ -471,7 +471,7 @@ inline void GemmTile(const float *A,
           // TODO(liyin): asm v7 prefetch and load optimization
           while (nw--) {
             float32x4_t b0, b1, b2, b3;
-            float32x4_t c0;
+            float32x4_t c0, c1, c2, c3;

             c0 = vld1q_f32(c_ptr0);
@@ -480,36 +480,37 @@ inline void GemmTile(const float *A,
             b2 = vld1q_f32(b_ptr2);
             b3 = vld1q_f32(b_ptr3);

+            c1 = vld1q_f32(c_ptr1);
+            c2 = vld1q_f32(c_ptr2);
+            c3 = vld1q_f32(c_ptr3);
+
             c0 = vmlaq_lane_f32(c0, b0, a00, 0);
             c0 = vmlaq_lane_f32(c0, b1, a00, 1);
             c0 = vmlaq_lane_f32(c0, b2, a01, 0);
             c0 = vmlaq_lane_f32(c0, b3, a01, 1);
             vst1q_f32(c_ptr0, c0);

-            c0 = vld1q_f32(c_ptr1);
-            c0 = vmlaq_lane_f32(c0, b0, a10, 0);
-            c0 = vmlaq_lane_f32(c0, b1, a10, 1);
-            c0 = vmlaq_lane_f32(c0, b2, a11, 0);
-            c0 = vmlaq_lane_f32(c0, b3, a11, 1);
+            c1 = vmlaq_lane_f32(c1, b0, a10, 0);
+            c1 = vmlaq_lane_f32(c1, b1, a10, 1);
+            c1 = vmlaq_lane_f32(c1, b2, a11, 0);
+            c1 = vmlaq_lane_f32(c1, b3, a11, 1);
-            vst1q_f32(c_ptr1, c0);
-            c0 = vld1q_f32(c_ptr2);
+            vst1q_f32(c_ptr1, c1);

-            c0 = vmlaq_lane_f32(c0, b0, a20, 0);
-            c0 = vmlaq_lane_f32(c0, b1, a20, 1);
-            c0 = vmlaq_lane_f32(c0, b2, a21, 0);
-            c0 = vmlaq_lane_f32(c0, b3, a21, 1);
+            c2 = vmlaq_lane_f32(c2, b0, a20, 0);
+            c2 = vmlaq_lane_f32(c2, b1, a20, 1);
+            c2 = vmlaq_lane_f32(c2, b2, a21, 0);
+            c2 = vmlaq_lane_f32(c2, b3, a21, 1);
-            vst1q_f32(c_ptr2, c0);
-            c0 = vld1q_f32(c_ptr3);
+            vst1q_f32(c_ptr2, c2);

-            c0 = vmlaq_lane_f32(c0, b0, a30, 0);
-            c0 = vmlaq_lane_f32(c0, b1, a30, 1);
-            c0 = vmlaq_lane_f32(c0, b2, a31, 0);
-            c0 = vmlaq_lane_f32(c0, b3, a31, 1);
+            c3 = vmlaq_lane_f32(c3, b0, a30, 0);
+            c3 = vmlaq_lane_f32(c3, b1, a30, 1);
+            c3 = vmlaq_lane_f32(c3, b2, a31, 0);
+            c3 = vmlaq_lane_f32(c3, b3, a31, 1);
-            vst1q_f32(c_ptr3, c0);
+            vst1q_f32(c_ptr3, c3);

             b_ptr0 += 4;
             b_ptr1 += 4;
@@ -586,7 +587,9 @@ void Gemm(const float *A,
   // It is better to use large block size if it fits for fast cache.
   // Assume l1 cache size is 32k, we load three blocks at a time (A, B, C),
   // the block size should be sqrt(32k / sizeof(T) / 3).
-  const index_t block_size = 48;
+  // As number of input channels of convolution is normally power of 2, and
+  // we have not optimized tiling remains, we use the following magic number
+  const index_t block_size = 64;
  const index_t block_tile_height = RoundUpDiv(height, block_size);
  const index_t block_tile_width = RoundUpDiv(width, block_size);
  const index_t block_tile_k = RoundUpDiv(K, block_size);
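For reference, the arithmetic behind the last hunk's comment: fitting three square float blocks (A, B, C) into a 32 KB L1 cache gives a block size of sqrt(32 * 1024 / sizeof(float) / 3) ≈ 52, which is why the old code picked 48. The commit rounds up to 64, overshooting the strict cache fit slightly in exchange for a tile that divides the power-of-two channel counts typical of convolutions (the "magic number" the new comment mentions). The check below is illustrative only and not part of the repository:

#include <cmath>
#include <cstdio>

int main() {
  // Block size so that three float blocks (A, B, C) fit in a 32 KB L1 cache:
  // 3 * block_size^2 * sizeof(float) <= 32 KB.
  const double kL1Bytes = 32 * 1024;
  const double fit = std::sqrt(kL1Bytes / sizeof(float) / 3);  // ~52.3
  std::printf("largest block that fits: %.1f -> old choice 48, new choice 64\n",
              fit);
  return 0;
}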