Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Xiaomi
Mace
提交
fddc1e84
Mace
项目概览
Xiaomi
/
Mace
通知
106
Star
40
Fork
27
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
fddc1e84
编写于
5月 08, 2018
作者:
李
李寅
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'invalid_read' into 'master'
fix neon 5x5 7x7 invalid memory read See merge request !456
上级
4df34723
fb7b353e
变更
2
隐藏空白更改
内联
并排
Showing
2 changed file
with
52 addition
and
52 deletion
+52
-52
mace/kernels/arm/conv_2d_neon_5x5.cc
mace/kernels/arm/conv_2d_neon_5x5.cc
+10
-10
mace/kernels/arm/conv_2d_neon_7x7.cc
mace/kernels/arm/conv_2d_neon_7x7.cc
+42
-42
未找到文件。
mace/kernels/arm/conv_2d_neon_5x5.cc
浏览文件 @
fddc1e84
...
...
@@ -26,55 +26,55 @@ namespace kernels {
float32x4_t vf00, vf10, vf20, vf30; \
float32x2_t vf01, vf11, vf21, vf31; \
vf00 = vld1q_f32(filter_ptr0); \
vf01 = vld1_f32(filter_ptr0 +
4
); \
vf01 = vld1_f32(filter_ptr0 +
3
); \
vf10 = vld1q_f32(filter_ptr1); \
vf11 = vld1_f32(filter_ptr1 +
4
); \
vf11 = vld1_f32(filter_ptr1 +
3
); \
vf20 = vld1q_f32(filter_ptr2); \
vf21 = vld1_f32(filter_ptr2 +
4
); \
vf21 = vld1_f32(filter_ptr2 +
3
); \
vf30 = vld1q_f32(filter_ptr3); \
vf31 = vld1_f32(filter_ptr3 +
4
); \
vf31 = vld1_f32(filter_ptr3 +
3
); \
\
/* outch 0 */
\
vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); \
vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); \
vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); \
vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); \
vo0 = vmlaq_lane_f32(vo0, vi4, vf01,
0
); \
vo0 = vmlaq_lane_f32(vo0, vi4, vf01,
1
); \
\
/* outch 1 */
\
vo1 = vmlaq_lane_f32(vo1, vi0, vget_low_f32(vf10), 0); \
vo1 = vmlaq_lane_f32(vo1, vi1, vget_low_f32(vf10), 1); \
vo1 = vmlaq_lane_f32(vo1, vi2, vget_high_f32(vf10), 0); \
vo1 = vmlaq_lane_f32(vo1, vi3, vget_high_f32(vf10), 1); \
vo1 = vmlaq_lane_f32(vo1, vi4, vf11,
0
); \
vo1 = vmlaq_lane_f32(vo1, vi4, vf11,
1
); \
\
/* outch 2 */
\
vo2 = vmlaq_lane_f32(vo2, vi0, vget_low_f32(vf20), 0); \
vo2 = vmlaq_lane_f32(vo2, vi1, vget_low_f32(vf20), 1); \
vo2 = vmlaq_lane_f32(vo2, vi2, vget_high_f32(vf20), 0); \
vo2 = vmlaq_lane_f32(vo2, vi3, vget_high_f32(vf20), 1); \
vo2 = vmlaq_lane_f32(vo2, vi4, vf21,
0
); \
vo2 = vmlaq_lane_f32(vo2, vi4, vf21,
1
); \
\
/* outch 3 */
\
vo3 = vmlaq_lane_f32(vo3, vi0, vget_low_f32(vf30), 0); \
vo3 = vmlaq_lane_f32(vo3, vi1, vget_low_f32(vf30), 1); \
vo3 = vmlaq_lane_f32(vo3, vi2, vget_high_f32(vf30), 0); \
vo3 = vmlaq_lane_f32(vo3, vi3, vget_high_f32(vf30), 1); \
vo3 = vmlaq_lane_f32(vo3, vi4, vf31,
0
);
vo3 = vmlaq_lane_f32(vo3, vi4, vf31,
1
);
#define Conv2dNeonK5x5SnLoadCalc1 \
/* load filter (1 outch x 1 height x 4 width) */
\
float32x4_t vf00; \
float32x2_t vf01; \
vf00 = vld1q_f32(filter_ptr0); \
vf01 = vld1_f32(filter_ptr0 +
4
); \
vf01 = vld1_f32(filter_ptr0 +
3
); \
\
/* outch 0 */
\
vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); \
vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); \
vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); \
vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); \
vo0 = vmlaq_lane_f32(vo0, vi4, vf01,
0
);
vo0 = vmlaq_lane_f32(vo0, vi4, vf01,
1
);
inline
void
Conv2dCPUK5x5Calc
(
const
float
*
in_ptr_base
,
const
float
*
filter_ptr0
,
...
...
mace/kernels/arm/conv_2d_neon_7x7.cc
浏览文件 @
fddc1e84
...
...
@@ -28,130 +28,130 @@ namespace kernels {
float32x4_t vf20, vf21; \
float32x4_t vf30, vf31; \
vf00 = vld1q_f32(filter_ptr0); \
vf01 = vld1q_f32(filter_ptr0 +
4
); \
vf01 = vld1q_f32(filter_ptr0 +
3
); \
vf10 = vld1q_f32(filter_ptr1); \
vf11 = vld1q_f32(filter_ptr1 +
4
); \
vf11 = vld1q_f32(filter_ptr1 +
3
); \
vf20 = vld1q_f32(filter_ptr2); \
vf21 = vld1q_f32(filter_ptr2 +
4
); \
vf21 = vld1q_f32(filter_ptr2 +
3
); \
vf30 = vld1q_f32(filter_ptr3); \
vf31 = vld1q_f32(filter_ptr3 +
4
); \
vf31 = vld1q_f32(filter_ptr3 +
3
); \
\
/* outch 0 */
\
vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0); \
vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1); \
vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2); \
vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3); \
vo0 = vfmaq_laneq_f32(vo0, vi4, vf01,
0
); \
vo0 = vfmaq_laneq_f32(vo0, vi5, vf01,
1
); \
vo0 = vfmaq_laneq_f32(vo0, vi6, vf01,
2
); \
vo0 = vfmaq_laneq_f32(vo0, vi4, vf01,
1
); \
vo0 = vfmaq_laneq_f32(vo0, vi5, vf01,
2
); \
vo0 = vfmaq_laneq_f32(vo0, vi6, vf01,
3
); \
\
/* outch 1 */
\
vo1 = vfmaq_laneq_f32(vo1, vi0, vf10, 0); \
vo1 = vfmaq_laneq_f32(vo1, vi1, vf10, 1); \
vo1 = vfmaq_laneq_f32(vo1, vi2, vf10, 2); \
vo1 = vfmaq_laneq_f32(vo1, vi3, vf10, 3); \
vo1 = vfmaq_laneq_f32(vo1, vi4, vf11,
0
); \
vo1 = vfmaq_laneq_f32(vo1, vi5, vf11,
1
); \
vo1 = vfmaq_laneq_f32(vo1, vi6, vf11,
2
); \
vo1 = vfmaq_laneq_f32(vo1, vi4, vf11,
1
); \
vo1 = vfmaq_laneq_f32(vo1, vi5, vf11,
2
); \
vo1 = vfmaq_laneq_f32(vo1, vi6, vf11,
3
); \
\
/* outch 2 */
\
vo2 = vfmaq_laneq_f32(vo2, vi0, vf20, 0); \
vo2 = vfmaq_laneq_f32(vo2, vi1, vf20, 1); \
vo2 = vfmaq_laneq_f32(vo2, vi2, vf20, 2); \
vo2 = vfmaq_laneq_f32(vo2, vi3, vf20, 3); \
vo2 = vfmaq_laneq_f32(vo2, vi4, vf21,
0
); \
vo2 = vfmaq_laneq_f32(vo2, vi5, vf21,
1
); \
vo2 = vfmaq_laneq_f32(vo2, vi6, vf21,
2
); \
vo2 = vfmaq_laneq_f32(vo2, vi4, vf21,
1
); \
vo2 = vfmaq_laneq_f32(vo2, vi5, vf21,
2
); \
vo2 = vfmaq_laneq_f32(vo2, vi6, vf21,
3
); \
\
/* outch 3 */
\
vo3 = vfmaq_laneq_f32(vo3, vi0, vf30, 0); \
vo3 = vfmaq_laneq_f32(vo3, vi1, vf30, 1); \
vo3 = vfmaq_laneq_f32(vo3, vi2, vf30, 2); \
vo3 = vfmaq_laneq_f32(vo3, vi3, vf30, 3); \
vo3 = vfmaq_laneq_f32(vo3, vi4, vf31,
0
); \
vo3 = vfmaq_laneq_f32(vo3, vi5, vf31,
1
); \
vo3 = vfmaq_laneq_f32(vo3, vi6, vf31,
2
);
vo3 = vfmaq_laneq_f32(vo3, vi4, vf31,
1
); \
vo3 = vfmaq_laneq_f32(vo3, vi5, vf31,
2
); \
vo3 = vfmaq_laneq_f32(vo3, vi6, vf31,
3
);
#define Conv2dArmv8NeonK7x7SnLoadCalc1 \
/* load filter (1 outch x 1 height x 4 width) */
\
float32x4_t vf00, vf01; \
vf00 = vld1q_f32(filter_ptr0); \
vf01 = vld1q_f32(filter_ptr0 +
4
); \
vf01 = vld1q_f32(filter_ptr0 +
3
); \
\
/* outch 0 */
\
vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0); \
vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1); \
vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2); \
vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3); \
vo0 = vfmaq_laneq_f32(vo0, vi4, vf01,
0
); \
vo0 = vfmaq_laneq_f32(vo0, vi5, vf01,
1
); \
vo0 = vfmaq_laneq_f32(vo0, vi6, vf01,
2
);
vo0 = vfmaq_laneq_f32(vo0, vi4, vf01,
1
); \
vo0 = vfmaq_laneq_f32(vo0, vi5, vf01,
2
); \
vo0 = vfmaq_laneq_f32(vo0, vi6, vf01,
3
);
#define Conv2dArmv7NeonK7x7SnLoadCalc4
\
#define Conv2dArmv7NeonK7x7SnLoadCalc4 \
/* load filter (4 outch x 1 height x 4 width) */
\
float32x4_t vf00, vf01; \
float32x4_t vf10, vf11; \
float32x4_t vf20, vf21; \
float32x4_t vf30, vf31; \
vf00 = vld1q_f32(filter_ptr0); \
vf01 = vld1q_f32(filter_ptr0 +
4
); \
vf01 = vld1q_f32(filter_ptr0 +
3
); \
vf10 = vld1q_f32(filter_ptr1); \
vf11 = vld1q_f32(filter_ptr1 +
4
); \
vf11 = vld1q_f32(filter_ptr1 +
3
); \
vf20 = vld1q_f32(filter_ptr2); \
vf21 = vld1q_f32(filter_ptr2 +
4
); \
vf21 = vld1q_f32(filter_ptr2 +
3
); \
vf30 = vld1q_f32(filter_ptr3); \
vf31 = vld1q_f32(filter_ptr3 +
4
); \
vf31 = vld1q_f32(filter_ptr3 +
3
); \
\
/* outch 0 */
\
vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); \
vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); \
vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); \
vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); \
vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01),
0
); \
vo0 = vmlaq_lane_f32(vo0, vi5, vget_
low_f32(vf01), 1);
\
vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01),
0
); \
vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01),
1
); \
vo0 = vmlaq_lane_f32(vo0, vi5, vget_
high_f32(vf01), 0);
\
vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01),
1
); \
\
/* outch 1 */
\
vo1 = vmlaq_lane_f32(vo1, vi0, vget_low_f32(vf10), 0); \
vo1 = vmlaq_lane_f32(vo1, vi1, vget_low_f32(vf10), 1); \
vo1 = vmlaq_lane_f32(vo1, vi2, vget_high_f32(vf10), 0); \
vo1 = vmlaq_lane_f32(vo1, vi3, vget_high_f32(vf10), 1); \
vo1 = vmlaq_lane_f32(vo1, vi4, vget_low_f32(vf11),
0
); \
vo1 = vmlaq_lane_f32(vo1, vi5, vget_
low_f32(vf11), 1);
\
vo1 = vmlaq_lane_f32(vo1, vi6, vget_high_f32(vf11),
0
); \
vo1 = vmlaq_lane_f32(vo1, vi4, vget_low_f32(vf11),
1
); \
vo1 = vmlaq_lane_f32(vo1, vi5, vget_
high_f32(vf11), 0);
\
vo1 = vmlaq_lane_f32(vo1, vi6, vget_high_f32(vf11),
1
); \
\
/* outch 2 */
\
vo2 = vmlaq_lane_f32(vo2, vi0, vget_low_f32(vf20), 0); \
vo2 = vmlaq_lane_f32(vo2, vi1, vget_low_f32(vf20), 1); \
vo2 = vmlaq_lane_f32(vo2, vi2, vget_high_f32(vf20), 0); \
vo2 = vmlaq_lane_f32(vo2, vi3, vget_high_f32(vf20), 1); \
vo2 = vmlaq_lane_f32(vo2, vi4, vget_low_f32(vf21),
0
); \
vo2 = vmlaq_lane_f32(vo2, vi5, vget_
low_f32(vf21), 1);
\
vo2 = vmlaq_lane_f32(vo2, vi6, vget_high_f32(vf21),
0
); \
vo2 = vmlaq_lane_f32(vo2, vi4, vget_low_f32(vf21),
1
); \
vo2 = vmlaq_lane_f32(vo2, vi5, vget_
high_f32(vf21), 0);
\
vo2 = vmlaq_lane_f32(vo2, vi6, vget_high_f32(vf21),
1
); \
\
/* outch 3 */
\
vo3 = vmlaq_lane_f32(vo3, vi0, vget_low_f32(vf30), 0); \
vo3 = vmlaq_lane_f32(vo3, vi1, vget_low_f32(vf30), 1); \
vo3 = vmlaq_lane_f32(vo3, vi2, vget_high_f32(vf30), 0); \
vo3 = vmlaq_lane_f32(vo3, vi3, vget_high_f32(vf30), 1); \
vo3 = vmlaq_lane_f32(vo3, vi4, vget_low_f32(vf31),
0
); \
vo3 = vmlaq_lane_f32(vo3, vi5, vget_
low_f32(vf31), 1);
\
vo3 = vmlaq_lane_f32(vo3, vi6, vget_high_f32(vf31),
0
);
vo3 = vmlaq_lane_f32(vo3, vi4, vget_low_f32(vf31),
1
); \
vo3 = vmlaq_lane_f32(vo3, vi5, vget_
high_f32(vf31), 0);
\
vo3 = vmlaq_lane_f32(vo3, vi6, vget_high_f32(vf31),
1
);
#define Conv2dArmv7NeonK7x7SnLoadCalc1
\
#define Conv2dArmv7NeonK7x7SnLoadCalc1 \
/* load filter (1 outch x 1 height x 4 width) */
\
float32x4_t vf00, vf01; \
vf00 = vld1q_f32(filter_ptr0); \
vf01 = vld1q_f32(filter_ptr0 +
4
); \
vf01 = vld1q_f32(filter_ptr0 +
3
); \
\
/* outch 0 */
\
vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); \
vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); \
vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); \
vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); \
vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01),
0
); \
vo0 = vmlaq_lane_f32(vo0, vi5, vget_
low_f32(vf01), 1);
\
vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01),
0
);
vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01),
1
); \
vo0 = vmlaq_lane_f32(vo0, vi5, vget_
high_f32(vf01), 0);
\
vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01),
1
);
inline
void
Conv2dCPUK7x7Calc
(
const
float
*
in_ptr_base
,
const
float
*
filter_ptr0
,
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录