Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Xiaomi
Mace
提交
1e5d7f7a
Mace
项目概览
Xiaomi
/
Mace
通知
107
Star
40
Fork
27
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
提交
1e5d7f7a
编写于
4月 27, 2018
作者:
李
李寅
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'depthwise_v7' into 'master'
Optimize depthwise conv3x3 (stride 1 and stride 2) for ARMv7 NEON. See merge request !433
上级
3fef8b17
ad0b1a8a
变更
1
隐藏空白更改
内联
并排
Showing
1 changed file
with
39 additions
and
4 deletions
+39
-4
mace/kernels/arm/depthwise_conv2d_neon_3x3.cc
mace/kernels/arm/depthwise_conv2d_neon_3x3.cc
+39
-4
未找到文件。
mace/kernels/arm/depthwise_conv2d_neon_3x3.cc
浏览文件 @
1e5d7f7a
...
...
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#if defined(MACE_ENABLE_NEON)
&& defined(__aarch64__)
#if defined(MACE_ENABLE_NEON)
#include <arm_neon.h>
#endif
...
...
@@ -99,7 +99,7 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
}
}
#if defined(MACE_ENABLE_NEON)
&& defined(__aarch64__)
#if defined(MACE_ENABLE_NEON)
// load filter (1 outch x 3 height x 3 width): vf_outch_height
float32x4_t
vf00
,
vf01
,
vf02
;
vf00
=
vld1q_f32
(
filter_ptr
);
...
...
@@ -172,6 +172,7 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
vo00
=
vld1q_f32
(
out_base
+
out_offset
);
vo01
=
vld1q_f32
(
out_base
+
out_offset
+
out_width
);
#if defined(__aarch64__)
// outch 0, height 0
vo00
=
vfmaq_laneq_f32
(
vo00
,
vi00
,
vf00
,
0
);
vo00
=
vfmaq_laneq_f32
(
vo00
,
vi01
,
vf00
,
1
);
...
...
@@ -193,7 +194,29 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
vo01
=
vfmaq_laneq_f32
(
vo01
,
vi30
,
vf02
,
0
);
vo01
=
vfmaq_laneq_f32
(
vo01
,
vi31
,
vf02
,
1
);
vo01
=
vfmaq_laneq_f32
(
vo01
,
vi32
,
vf02
,
2
);
#else
// outch 0, height 0
vo00
=
vmlaq_lane_f32
(
vo00
,
vi00
,
vget_low_f32
(
vf00
),
0
);
vo00
=
vmlaq_lane_f32
(
vo00
,
vi01
,
vget_low_f32
(
vf00
),
1
);
vo00
=
vmlaq_lane_f32
(
vo00
,
vi02
,
vget_high_f32
(
vf00
),
0
);
vo00
=
vmlaq_lane_f32
(
vo00
,
vi10
,
vget_low_f32
(
vf01
),
0
);
vo00
=
vmlaq_lane_f32
(
vo00
,
vi11
,
vget_low_f32
(
vf01
),
1
);
vo00
=
vmlaq_lane_f32
(
vo00
,
vi12
,
vget_high_f32
(
vf01
),
0
);
vo00
=
vmlaq_lane_f32
(
vo00
,
vi20
,
vget_low_f32
(
vf02
),
0
);
vo00
=
vmlaq_lane_f32
(
vo00
,
vi21
,
vget_low_f32
(
vf02
),
1
);
vo00
=
vmlaq_lane_f32
(
vo00
,
vi22
,
vget_high_f32
(
vf02
),
0
);
// outch 0, height 1
vo01
=
vmlaq_lane_f32
(
vo01
,
vi10
,
vget_low_f32
(
vf00
),
0
);
vo01
=
vmlaq_lane_f32
(
vo01
,
vi11
,
vget_low_f32
(
vf00
),
1
);
vo01
=
vmlaq_lane_f32
(
vo01
,
vi12
,
vget_high_f32
(
vf00
),
0
);
vo01
=
vmlaq_lane_f32
(
vo01
,
vi20
,
vget_low_f32
(
vf01
),
0
);
vo01
=
vmlaq_lane_f32
(
vo01
,
vi21
,
vget_low_f32
(
vf01
),
1
);
vo01
=
vmlaq_lane_f32
(
vo01
,
vi22
,
vget_high_f32
(
vf01
),
0
);
vo01
=
vmlaq_lane_f32
(
vo01
,
vi30
,
vget_low_f32
(
vf02
),
0
);
vo01
=
vmlaq_lane_f32
(
vo01
,
vi31
,
vget_low_f32
(
vf02
),
1
);
vo01
=
vmlaq_lane_f32
(
vo01
,
vi32
,
vget_high_f32
(
vf02
),
0
);
#endif
vst1q_f32
(
out_base
+
out_offset
,
vo00
);
vst1q_f32
(
out_base
+
out_offset
+
out_width
,
vo01
);
}
// w
...
...
@@ -316,7 +339,7 @@ void DepthwiseConv2dNeonK3x3S2(const float *input,
}
}
#if defined(MACE_ENABLE_NEON)
&& defined(__aarch64__)
#if defined(MACE_ENABLE_NEON)
// load filter (1 outch x 3 height x 3 width): vf_outch_height
float32x4_t
vf00
,
vf01
,
vf02
;
vf00
=
vld1q_f32
(
filter_ptr
);
...
...
@@ -378,6 +401,7 @@ void DepthwiseConv2dNeonK3x3S2(const float *input,
vi21
=
vi2
.
val
[
1
];
vi22
=
vextq_f32
(
vi20
,
vi2n
,
1
);
#if defined(__aarch64__)
// outch 0, height 0
vo
=
vfmaq_laneq_f32
(
vo
,
vi00
,
vf00
,
0
);
vo
=
vfmaq_laneq_f32
(
vo
,
vi01
,
vf00
,
1
);
...
...
@@ -388,7 +412,18 @@ void DepthwiseConv2dNeonK3x3S2(const float *input,
vo
=
vfmaq_laneq_f32
(
vo
,
vi20
,
vf02
,
0
);
vo
=
vfmaq_laneq_f32
(
vo
,
vi21
,
vf02
,
1
);
vo
=
vfmaq_laneq_f32
(
vo
,
vi22
,
vf02
,
2
);
#else
// outch 0, height 0
vo
=
vmlaq_lane_f32
(
vo
,
vi00
,
vget_low_f32
(
vf00
),
0
);
vo
=
vmlaq_lane_f32
(
vo
,
vi01
,
vget_low_f32
(
vf00
),
1
);
vo
=
vmlaq_lane_f32
(
vo
,
vi02
,
vget_high_f32
(
vf00
),
0
);
vo
=
vmlaq_lane_f32
(
vo
,
vi10
,
vget_low_f32
(
vf01
),
0
);
vo
=
vmlaq_lane_f32
(
vo
,
vi11
,
vget_low_f32
(
vf01
),
1
);
vo
=
vmlaq_lane_f32
(
vo
,
vi12
,
vget_high_f32
(
vf01
),
0
);
vo
=
vmlaq_lane_f32
(
vo
,
vi20
,
vget_low_f32
(
vf02
),
0
);
vo
=
vmlaq_lane_f32
(
vo
,
vi21
,
vget_low_f32
(
vf02
),
1
);
vo
=
vmlaq_lane_f32
(
vo
,
vi22
,
vget_high_f32
(
vf02
),
0
);
#endif
vst1q_f32
(
out_base
+
out_offset
,
vo
);
}
// w
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录