Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
as350144
Mace
提交
ba4ca883
Mace
项目概览
as350144
/
Mace
与 Fork 源项目一致
Fork自
Xiaomi / Mace
通知
2
Star
1
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
ba4ca883
编写于
11月 15, 2017
作者:
L
liuqi
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Adjust the position of judge clauses of conv opencl kernel.
上级
5bbd271e
变更
2
隐藏空白更改
内联
并排
Showing
2 changed files
with
58 additions
and
42 deletions
+58
-42
mace/kernels/opencl/cl/conv_2d_1x1.cl
mace/kernels/opencl/cl/conv_2d_1x1.cl
+48
-37
mace/kernels/opencl/cl/conv_2d_3x3.cl
mace/kernels/opencl/cl/conv_2d_3x3.cl
+10
-5
未找到文件。
mace/kernels/opencl/cl/conv_2d_1x1.cl
浏览文件 @
ba4ca883
...
@@ -24,25 +24,14 @@ __kernel void conv_2d_1x1_naive(__global const float *input, /* n, c, h, w */
...
@@ -24,25 +24,14 @@ __kernel void conv_2d_1x1_naive(__global const float *input, /* n, c, h, w */
}
}
}
}
#
define
vec_conv_2d_1x1_s1
(
out_size
)
\
#
define
vec_conv_2d_1x1_s1
\
do
{
\
float4
in0
=
vload4
(
0
,
input_ptr
)
; \
float4
in0
=
vload4
(
0
,
input_ptr
)
; \
float4
in1
=
vload4
(
0
,
input_ptr
+
in_pixel
)
; \
float4
in1
=
vload4
(
0
,
input_ptr
+
in_pixel
)
; \
float4
in2
=
vload4
(
0
,
input_ptr
+
2
*
in_pixel
)
; \
float4
in2
=
vload4
(
0
,
input_ptr
+
2
*
in_pixel
)
; \
float4
in3
=
vload4
(
0
,
input_ptr
+
3
*
in_pixel
)
; \
float4
in3
=
vload4
(
0
,
input_ptr
+
3
*
in_pixel
)
;
for
(
int
oc
=
0
; oc < out_size; ++oc) { \
float4
weights
=
vload4
(
0
,
filter_ptr
+
oc
*
in_chan_num
)
; \
float4
out
=
vload4
(
0
,
output_ptr
+
oc
*
out_pixel
)
; \
out
+=
in0
*
weights.x
; \
out
+=
in1
*
weights.y
; \
out
+=
in2
*
weights.z
; \
out
+=
in3
*
weights.w
; \
vstore4
(
out,
0
,
output_ptr
+
oc
*
out_pixel
)
; \
}
\
}
while
(
0
)
#
define
vec_conv_2d_1x1_s2
(
out_size
)
\
do
{
\
#
define
vec_conv_2d_1x1_s2
\
float4
in00
=
vload4
(
0
,
input_ptr
)
; \
float4
in00
=
vload4
(
0
,
input_ptr
)
; \
float3
in01
=
vload3
(
0
,
input_ptr
+
4
)
; \
float3
in01
=
vload3
(
0
,
input_ptr
+
4
)
; \
float4
in10
=
vload4
(
0
,
input_ptr
+
in_pixel
)
; \
float4
in10
=
vload4
(
0
,
input_ptr
+
in_pixel
)
; \
...
@@ -54,8 +43,11 @@ do { \
...
@@ -54,8 +43,11 @@ do { \
float4
in0
=
(
float4
)(
in00.s02,
in01.s02
)
; \
float4
in0
=
(
float4
)(
in00.s02,
in01.s02
)
; \
float4
in1
=
(
float4
)(
in10.s02,
in11.s02
)
; \
float4
in1
=
(
float4
)(
in10.s02,
in11.s02
)
; \
float4
in2
=
(
float4
)(
in20.s02,
in21.s02
)
; \
float4
in2
=
(
float4
)(
in20.s02,
in21.s02
)
; \
float4
in3
=
(
float4
)(
in30.s02,
in31.s02
)
; \
float4
in3
=
(
float4
)(
in30.s02,
in31.s02
)
;
for
(
int
oc
=
0
; oc < out_size; ++oc) { \
#
define
vec_conv_2d_1x1_compute_loop
\
for
(
int
oc
=
0
; oc < 4; ++oc) { \
float4
weights
=
vload4
(
0
,
filter_ptr
+
oc
*
in_chan_num
)
; \
float4
weights
=
vload4
(
0
,
filter_ptr
+
oc
*
in_chan_num
)
; \
float4
out
=
vload4
(
0
,
output_ptr
+
oc
*
out_pixel
)
; \
float4
out
=
vload4
(
0
,
output_ptr
+
oc
*
out_pixel
)
; \
out
+=
in0
*
weights.x
; \
out
+=
in0
*
weights.x
; \
...
@@ -63,10 +55,16 @@ do { \
...
@@ -63,10 +55,16 @@ do { \
out
+=
in2
*
weights.z
; \
out
+=
in2
*
weights.z
; \
out
+=
in3
*
weights.w
; \
out
+=
in3
*
weights.w
; \
vstore4
(
out,
0
,
output_ptr
+
oc
*
out_pixel
)
; \
vstore4
(
out,
0
,
output_ptr
+
oc
*
out_pixel
)
; \
}
\
}
}
while
(
0
)
#
define
vec_conv_2d_1x1_compute
\
float4
weights
=
vload4
(
0
,
filter_ptr
)
; \
float4
out
=
vload4
(
0
,
output_ptr
)
; \
out
+=
in0
*
weights.x
; \
out
+=
in1
*
weights.y
; \
out
+=
in2
*
weights.z
; \
out
+=
in3
*
weights.w
; \
vstore4
(
out,
0
,
output_ptr
)
;
__kernel
void
conv_2d_1x1_v2
(
__global
const
float
*input,
/*
n,
c,
h,
w
*/
__kernel
void
conv_2d_1x1_v2
(
__global
const
float
*input,
/*
n,
c,
h,
w
*/
__global
const
float
*filter,
/*
o,
i,
kh,
kw
*/
__global
const
float
*filter,
/*
o,
i,
kh,
kw
*/
...
@@ -115,25 +113,38 @@ __kernel void conv_2d_1x1_v2(__global const float *input, /* n, c, h, w */
...
@@ -115,25 +113,38 @@ __kernel void conv_2d_1x1_v2(__global const float *input, /* n, c, h, w */
int
in_chan
=
0
;
int
in_chan
=
0
;
if
(
pixel_len
==
4
)
{
if
(
pixel_len
==
4
)
{
for
(
; in_chan + 3 < in_chan_num; in_chan += 4) {
if
(
stride
==
1
)
{
const
float
*input_ptr
=
input_base
+
in_chan
*
in_pixel
;
for
(
; in_chan + 3 < in_chan_num; in_chan += 4) {
int
out_chan
=
out_chan_begin
;
const
float
*input_ptr
=
input_base
+
in_chan
*
in_pixel
;
for
(
; out_chan + 3 < out_chan_end; out_chan += 4) {
int
out_chan
=
out_chan_begin
;
const
float*
filter_ptr
=
filter
+
out_chan
*
in_chan_num
+
in_chan
;
for
(
; out_chan + 3 < out_chan_end; out_chan += 4) {
float
*output_ptr
=
output_base
+
out_chan
*
out_pixel
;
const
float*
filter_ptr
=
filter
+
out_chan
*
in_chan_num
+
in_chan
;
if
(
stride
==
1
)
{
float
*output_ptr
=
output_base
+
out_chan
*
out_pixel
;
vec_conv_2d_1x1_s1
(
4
)
;
vec_conv_2d_1x1_s1
;
}
else
if
(
stride
==
2
)
{
vec_conv_2d_1x1_compute_loop
;
vec_conv_2d_1x1_s2
(
4
)
;
}
for
(
; out_chan < out_chan_end; ++out_chan) {
const
float*
filter_ptr
=
filter
+
out_chan
*
in_chan_num
+
in_chan
;
float
*output_ptr
=
output_base
+
out_chan
*
out_pixel
;
vec_conv_2d_1x1_s1
;
vec_conv_2d_1x1_compute
;
}
}
}
}
for
(
; out_chan < out_chan_end; ++out_chan) {
}
else
if
(
stride
==
2
)
{
const
float*
filter_ptr
=
filter
+
out_chan
*
in_chan_num
+
in_chan
;
for
(
; in_chan + 3 < in_chan_num; in_chan += 4) {
float
*output_ptr
=
output_base
+
out_chan
*
out_pixel
;
const
float
*input_ptr
=
input_base
+
in_chan
*
in_pixel
;
if
(
stride
==
1
)
{
int
out_chan
=
out_chan_begin
;
vec_conv_2d_1x1_s1
(
1
)
;
for
(
; out_chan + 3 < out_chan_end; out_chan += 4) {
}
else
if
(
stride
==
2
)
{
const
float*
filter_ptr
=
filter
+
out_chan
*
in_chan_num
+
in_chan
;
vec_conv_2d_1x1_s2
(
1
)
;
float
*output_ptr
=
output_base
+
out_chan
*
out_pixel
;
vec_conv_2d_1x1_s2
;
vec_conv_2d_1x1_compute_loop
;
}
for
(
; out_chan < out_chan_end; ++out_chan) {
const
float*
filter_ptr
=
filter
+
out_chan
*
in_chan_num
+
in_chan
;
float
*output_ptr
=
output_base
+
out_chan
*
out_pixel
;
vec_conv_2d_1x1_s2
;
vec_conv_2d_1x1_compute
;
}
}
}
}
}
}
...
...
mace/kernels/opencl/cl/conv_2d_3x3.cl
浏览文件 @
ba4ca883
...
@@ -41,14 +41,19 @@ void kernel conv_2d_3x3(global const float *input,
...
@@ -41,14 +41,19 @@ void kernel conv_2d_3x3(global const float *input,
if
(
pixels
==
4
)
{
if
(
pixels
==
4
)
{
float4
res
=
bias
==
NULL
?
0
:
(
float4
)
bias[i]
;
float4
res
=
bias
==
NULL
?
0
:
(
float4
)
bias[i]
;
for
(
int
in_chan_idx
=
0
; in_chan_idx < in_chan_num; ++in_chan_idx) {
const
float*
input_ptr
=
input_base
+
in_chan_idx
*
in_pixel
;
if
(
stride_w
==
1
)
{
const
float*
filter_ptr
=
filter_base
+
in_chan_idx
*
9
;
for
(
int
in_chan_idx
=
0
; in_chan_idx < in_chan_num; ++in_chan_idx) {
if
(
stride_w
==
1
)
{
const
float*
input_ptr
=
input_base
+
in_chan_idx
*
in_pixel
;
const
float*
filter_ptr
=
filter_base
+
in_chan_idx
*
9
;
res
+=
conv1x3_s1
(
input_ptr
+
0
*
in_width,
filter_ptr
+
0
*
3
)
;
res
+=
conv1x3_s1
(
input_ptr
+
0
*
in_width,
filter_ptr
+
0
*
3
)
;
res
+=
conv1x3_s1
(
input_ptr
+
1
*
in_width,
filter_ptr
+
1
*
3
)
;
res
+=
conv1x3_s1
(
input_ptr
+
1
*
in_width,
filter_ptr
+
1
*
3
)
;
res
+=
conv1x3_s1
(
input_ptr
+
2
*
in_width,
filter_ptr
+
2
*
3
)
;
res
+=
conv1x3_s1
(
input_ptr
+
2
*
in_width,
filter_ptr
+
2
*
3
)
;
}
else
{
}
}
else
{
for
(
int
in_chan_idx
=
0
; in_chan_idx < in_chan_num; ++in_chan_idx) {
const
float*
input_ptr
=
input_base
+
in_chan_idx
*
in_pixel
;
const
float*
filter_ptr
=
filter_base
+
in_chan_idx
*
9
;
res
+=
conv1x3_s2
(
input_ptr
+
0
*
in_width,
filter_ptr
+
0
*
3
)
;
res
+=
conv1x3_s2
(
input_ptr
+
0
*
in_width,
filter_ptr
+
0
*
3
)
;
res
+=
conv1x3_s2
(
input_ptr
+
1
*
in_width,
filter_ptr
+
1
*
3
)
;
res
+=
conv1x3_s2
(
input_ptr
+
1
*
in_width,
filter_ptr
+
1
*
3
)
;
res
+=
conv1x3_s2
(
input_ptr
+
2
*
in_width,
filter_ptr
+
2
*
3
)
;
res
+=
conv1x3_s2
(
input_ptr
+
2
*
in_width,
filter_ptr
+
2
*
3
)
;
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录