慢慢CG / Mace (forked from Xiaomi / Mace)
Commit 8049fb22
Authored Nov 29, 2017 by Liangliang He
Merge branch '1x1' into 'master'
Update 1x1 opencl kernel

See merge request !126

Parents: cec5af91, 564e5ec4
Showing 3 changed files with 86 additions and 331 deletions (+86 -331)
mace/kernels/opencl/cl/common.h            +4   -0
mace/kernels/opencl/cl/conv_2d_1x1.cl      +58  -251
mace/kernels/opencl/conv_2d_opencl_1x1.cc  +24  -80
mace/kernels/opencl/cl/common.h

...
@@ -14,4 +14,8 @@
 #define CMD_TYPE_STR(cmd, type) cmd##type
 #define CMD_TYPE(cmd, type) CMD_TYPE_STR(cmd, type)

+#define DATA_TYPE4 VEC_DATA_TYPE(DATA_TYPE, 4)
+#define READ_IMAGET CMD_TYPE(read_image, CMD_DATA_TYPE)
+#define WRITE_IMAGET CMD_TYPE(write_image, CMD_DATA_TYPE)
+
 #endif  // MACE_KERNELS_OPENCL_CL_COMMON_H_
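The three new macros build on the token-pasting helpers shown above. Presumably VEC_DATA_TYPE is defined near the top of common.h (outside this hunk) with the same two-level paste pattern; under that assumption, here is a quick host-side check of how the expansions behave. DATA_TYPE and CMD_DATA_TYPE stand in for the values that the host injects via -D build options:

#include <cstdio>

// Assumed definition of VEC_DATA_TYPE (not shown in this hunk), using the
// same indirection trick as CMD_TYPE so that arguments expand before pasting.
#define VEC_DATA_TYPE_STR(t, n) t##n
#define VEC_DATA_TYPE(t, n) VEC_DATA_TYPE_STR(t, n)
#define CMD_TYPE_STR(cmd, type) cmd##type
#define CMD_TYPE(cmd, type) CMD_TYPE_STR(cmd, type)

// Stand-ins for what -DDATA_TYPE=half -DCMD_DATA_TYPE=h would define.
#define DATA_TYPE half
#define CMD_DATA_TYPE h

#define STRINGIZE_(x) #x
#define STRINGIZE(x) STRINGIZE_(x)

int main() {
  std::puts(STRINGIZE(VEC_DATA_TYPE(DATA_TYPE, 4)));           // half4
  std::puts(STRINGIZE(CMD_TYPE(read_image, CMD_DATA_TYPE)));   // read_imageh
  std::puts(STRINGIZE(CMD_TYPE(write_image, CMD_DATA_TYPE)));  // write_imageh
  return 0;
}

So the same kernel source compiles against half4/read_imageh or float4/read_imagef depending solely on the build options.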
mace/kernels/opencl/cl/conv_2d_1x1.cl

 #include <common.h>

-#define vec_conv_2d_1x1_s1 \
-  VEC_DATA_TYPE(DATA_TYPE,4) in0 = vload4(0, input_ptr); \
-  VEC_DATA_TYPE(DATA_TYPE,4) in1 = vload4(0, input_ptr + in_pixel); \
-  VEC_DATA_TYPE(DATA_TYPE,4) in2 = vload4(0, input_ptr + 2 * in_pixel); \
-  VEC_DATA_TYPE(DATA_TYPE,4) in3 = vload4(0, input_ptr + 3 * in_pixel);
-
-#define vec_conv_2d_1x1_s2 \
-  VEC_DATA_TYPE(DATA_TYPE,4) in00 = vload4(0, input_ptr); \
-  VEC_DATA_TYPE(DATA_TYPE,3) in01 = vload3(0, input_ptr + 4); \
-  VEC_DATA_TYPE(DATA_TYPE,4) in10 = vload4(0, input_ptr + in_pixel); \
-  VEC_DATA_TYPE(DATA_TYPE,3) in11 = vload3(0, input_ptr + in_pixel + 4); \
-  VEC_DATA_TYPE(DATA_TYPE,4) in20 = vload4(0, input_ptr + 2 * in_pixel); \
-  VEC_DATA_TYPE(DATA_TYPE,3) in21 = vload3(0, input_ptr + 2 * in_pixel + 4); \
-  VEC_DATA_TYPE(DATA_TYPE,4) in30 = vload4(0, input_ptr + 3 * in_pixel); \
-  VEC_DATA_TYPE(DATA_TYPE,3) in31 = vload3(0, input_ptr + 3 * in_pixel + 4); \
-  VEC_DATA_TYPE(DATA_TYPE,4) in0 = (VEC_DATA_TYPE(DATA_TYPE,4))(in00.s02, in01.s02); \
-  VEC_DATA_TYPE(DATA_TYPE,4) in1 = (VEC_DATA_TYPE(DATA_TYPE,4))(in10.s02, in11.s02); \
-  VEC_DATA_TYPE(DATA_TYPE,4) in2 = (VEC_DATA_TYPE(DATA_TYPE,4))(in20.s02, in21.s02); \
-  VEC_DATA_TYPE(DATA_TYPE,4) in3 = (VEC_DATA_TYPE(DATA_TYPE,4))(in30.s02, in31.s02);
-
-#define vec_conv_2d_1x1_compute_loop \
-  for (int oc = 0; oc < 4; ++oc) { \
-    VEC_DATA_TYPE(DATA_TYPE,4) weights = vload4(0, filter_ptr + oc * in_chan_num); \
-    VEC_DATA_TYPE(DATA_TYPE,4) out = vload4(0, output_ptr + oc * out_pixel); \
-    out += in0 * weights.x; \
-    out += in1 * weights.y; \
-    out += in2 * weights.z; \
-    out += in3 * weights.w; \
-    vstore4(out, 0, output_ptr + oc * out_pixel); \
-  }
-
-#define vec_conv_2d_1x1_compute \
-  VEC_DATA_TYPE(DATA_TYPE,4) weights = vload4(0, filter_ptr); \
-  VEC_DATA_TYPE(DATA_TYPE,4) out = vload4(0, output_ptr); \
-  out += in0 * weights.x; \
-  out += in1 * weights.y; \
-  out += in2 * weights.z; \
-  out += in3 * weights.w; \
-  vstore4(out, 0, output_ptr);
-
-// Supported data type: half/float
-__kernel void conv_2d_1x1_v2(__global const DATA_TYPE *input,   /* n, c, h, w */
-                             __global const DATA_TYPE *filter,  /* o, i, kh, kw */
-#ifdef BIAS
-                             __global const DATA_TYPE *bias,    /* o */
-#endif /* defined(BIAS) */
-                             __global DATA_TYPE *output,        /* n, c, h, w */
-                             __private const int in_chan_num,
-                             __private const int out_chan_num,
-                             __private const int in_height,
-                             __private const int in_width,
-                             __private const int out_height,
-                             __private const int out_width) {
-  int batch = get_global_id(0);
-  int out_chan_blk = get_global_id(1);
-  int out_pixel_blk = get_global_id(2);
-
-  const int in_pixel = in_height * in_width;
-  const int out_pixel = out_height * out_width;
-
-  const int round_out_width = (out_width + 3) / 4;
-  const int out_pixel_height = out_pixel_blk / round_out_width;
-  const int out_pixel_width = out_pixel_blk % round_out_width;
-
-  const int out_chan_begin = out_chan_blk * 4;
-  const int out_chan_end = min(out_chan_begin + 4, out_chan_num);
-  const int out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4;
-  const int out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width);
-#ifdef STRIDE_1
-  const int stride = 1;
-#else
-  const int stride = 2;
-#endif
-  const int in_pixel_begin = out_pixel_height * stride * in_width + out_pixel_width * stride * 4;
-
-  const int in_offset = batch * in_chan_num * in_pixel;
-  const int out_offset = batch * out_chan_num * out_pixel;
-
-  const DATA_TYPE *input_base = input + in_offset + in_pixel_begin;
-  DATA_TYPE *output_base = output + out_offset + out_pixel_begin;
-
-  int out_chan_len = out_chan_end - out_chan_begin;
-  int pixel_len = out_pixel_end - out_pixel_begin;
-
-  for (int out_chan = out_chan_begin; out_chan < out_chan_end; ++out_chan) {
-    DATA_TYPE *output_ptr = output_base + out_chan * out_pixel;
-#ifdef BIAS
-    DATA_TYPE bias_value = bias[out_chan];
-#else
-    DATA_TYPE bias_value = 0;
-#endif
-    for (int p = 0; p < pixel_len; ++p) {
-      output_ptr[p] = bias_value;
-    }
-  }
-
-  int in_chan = 0;
-  if (pixel_len == 4) {
-    for (; in_chan + 3 < in_chan_num; in_chan += 4) {
-      const DATA_TYPE *input_ptr = input_base + in_chan * in_pixel;
-      int out_chan = out_chan_begin;
-      for (; out_chan + 3 < out_chan_end; out_chan += 4) {
-        const DATA_TYPE *filter_ptr = filter + out_chan * in_chan_num + in_chan;
-        DATA_TYPE *output_ptr = output_base + out_chan * out_pixel;
-#ifdef STRIDE_1
-        vec_conv_2d_1x1_s1;
-#else
-        vec_conv_2d_1x1_s2;
-#endif
-        vec_conv_2d_1x1_compute_loop;
-      }
-      for (; out_chan < out_chan_end; ++out_chan) {
-        const DATA_TYPE *filter_ptr = filter + out_chan * in_chan_num + in_chan;
-        DATA_TYPE *output_ptr = output_base + out_chan * out_pixel;
-#ifdef STRIDE_1
-        vec_conv_2d_1x1_s1;
-#else
-        vec_conv_2d_1x1_s2;
-#endif
-        vec_conv_2d_1x1_compute;
-      }
-    }
-  }
-  for (; in_chan < in_chan_num; ++in_chan) {
-    const DATA_TYPE *input_ptr = input_base + in_chan * in_pixel;
-    for (int out_chan = out_chan_begin; out_chan < out_chan_end; ++out_chan) {
-      DATA_TYPE weights = filter[out_chan * in_chan_num + in_chan];
-      DATA_TYPE *output_ptr = output_base + out_chan * out_pixel;
-      for (int p = 0; p < pixel_len; ++p) {
-        float in = input_ptr[p * stride];
-        output_ptr[p] += in * weights;
-      }
-    }
-  }
-}
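For orientation: with a 1x1 kernel, convolution reduces to a per-pixel dot product across input channels, which is what the deleted conv_2d_1x1_v2 computed over NCHW buffers (vectorized and blocked by 4). A plain single-threaded C++ reference of that computation, a sketch that ignores padding and whose names are mine, not MACE's:

#include <cstddef>
#include <vector>

// Reference 1x1 convolution on NCHW buffers:
//   output[n][o][y][x] = bias[o] + sum_i input[n][i][y*stride][x*stride] * filter[o][i]
void Conv1x1Reference(const std::vector<float> &input,   // n * in_c * in_h * in_w
                      const std::vector<float> &filter,  // out_c * in_c (kh = kw = 1)
                      const std::vector<float> &bias,    // out_c, may be empty
                      int n, int in_c, int in_h, int in_w,
                      int out_c, int stride,
                      std::vector<float> &output) {      // n * out_c * out_h * out_w
  const int out_h = (in_h - 1) / stride + 1;  // no padding assumed
  const int out_w = (in_w - 1) / stride + 1;
  output.assign(static_cast<size_t>(n) * out_c * out_h * out_w, 0.f);
  for (int b = 0; b < n; ++b)
    for (int o = 0; o < out_c; ++o)
      for (int y = 0; y < out_h; ++y)
        for (int x = 0; x < out_w; ++x) {
          float acc = bias.empty() ? 0.f : bias[o];
          for (int i = 0; i < in_c; ++i)
            acc += input[((b * in_c + i) * in_h + y * stride) * in_w + x * stride]
                 * filter[o * in_c + i];
          output[((b * out_c + o) * out_h + y) * out_w + x] = acc;
        }
}

The kernel's vload4/vstore4 paths and the s1/s2 macros are just this inner loop blocked four input channels, four output channels, and four pixels at a time.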
 __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                           __read_only image2d_t filter, /* cout%4 * cin, cout/4 */
+#ifdef BIAS
                           __read_only image2d_t bias, /* cout%4 * cout/4 */
+#endif
+#ifdef FUSED_BATCH_NORM
+                          __read_only image2d_t bn_scale, /* cout%4 * cout/4 */
+                          __read_only image2d_t bn_offset, /* cout%4 * cout/4 */
+#endif
                           __write_only image2d_t output,
                           __private const int in_ch_blks,
                           __private const int width) {
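The layout comment [c%4 * w * c/4, h * b] says that four consecutive channels share one RGBA texel, channel blocks run along the image x axis together with width, and batch runs together with height along y. A small sketch of the implied coordinate mapping (a hypothetical helper of mine, but consistent with in_x_base = in_ch_blk * width and the (in_x_base + w[i], out_hb) reads in the kernel body):

#include <cstdio>

// Maps tensor coordinates (n, c, h, w) to the image texel holding the value:
//   x = (c / 4) * W + w,  y = n * H + h,  with c % 4 selecting the RGBA lane.
struct TexelCoord { int x, y, lane; };

TexelCoord InputImageCoord(int n, int c, int h, int w, int H, int W) {
  return TexelCoord{(c / 4) * W + w, n * H + h, c % 4};
}

int main() {
  // Example: W = 8, H = 4; channel 6 is lane 2 of channel block 1.
  TexelCoord t = InputImageCoord(/*n=*/0, /*c=*/6, /*h=*/3, /*w=*/5, 4, 8);
  std::printf("x=%d y=%d lane=%d\n", t.x, t.y, t.lane);  // x=13 y=3 lane=2
  return 0;
}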
...
@@ -154,12 +19,14 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
   const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

-  half4 bias_value = read_imageh(bias, sampler, (int2)(out_ch_blk, 0));
-  half4 out[4];
-  out[0] = (half4)(bias_value.x);
-  out[1] = (half4)(bias_value.y);
-  out[2] = (half4)(bias_value.z);
-  out[3] = (half4)(bias_value.w);
+  DATA_TYPE4 out[4] = {0};
+#ifdef BIAS
+  out[0] = READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0));
+  out[1] = out[0];
+  out[2] = out[0];
+  out[3] = out[0];
+#endif

   int w[4];
   w[0] = out_w_blk;
...
@@ -170,135 +37,75 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
   // Unrolling this loop hurt performance
   int in_x_base = 0;
   for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
-    half4 in[4];
-    in[0] = read_imageh(input, sampler, (int2)(in_x_base + w[0], out_hb));
+    DATA_TYPE4 in[4];
+    in[0] = READ_IMAGET(input, sampler, (int2)(in_x_base + w[0], out_hb));
     if (w[1] < width) {
       // conditional load hurt perf, this branching helps sometimes
-      in[1] = read_imageh(input, sampler, (int2)(in_x_base + w[1], out_hb));
-      in[2] = read_imageh(input, sampler, (int2)(in_x_base + w[2], out_hb));
-      in[3] = read_imageh(input, sampler, (int2)(in_x_base + w[3], out_hb));
+      in[1] = READ_IMAGET(input, sampler, (int2)(in_x_base + w[1], out_hb));
+      in[2] = READ_IMAGET(input, sampler, (int2)(in_x_base + w[2], out_hb));
+      in[3] = READ_IMAGET(input, sampler, (int2)(in_x_base + w[3], out_hb));
     }

     // The order matters, load input first then load filter, why?
     const int filter_x0 = in_ch_blk << 2;
-    half4 weights[4];
+    DATA_TYPE4 weights[4];
 #pragma unroll
     for (int c = 0; c < 4; ++c) {
-      weights[c] = read_imageh(filter, sampler, (int2)(filter_x0 + c, out_ch_blk));
+      weights[c] = READ_IMAGET(filter, sampler, (int2)(filter_x0 + c, out_ch_blk));
     }

     // Will prefetch L2 improve performance? How to prefetch image data?
     // Interleaving load and mul does not improve performance as expected
 #pragma unroll
-    for (int c = 0; c < 4; ++c) {
-      out[c] += in[c].x * weights[0];
-      out[c] += in[c].y * weights[1];
-      out[c] += in[c].z * weights[2];
-      out[c] += in[c].w * weights[3];
+    for (int wi = 0; wi < 4; ++wi) {
+      out[wi] += in[wi].x * weights[0];
+      out[wi] += in[wi].y * weights[1];
+      out[wi] += in[wi].z * weights[2];
+      out[wi] += in[wi].w * weights[3];
     }

     in_x_base += width;
   }

-  const int out_x_base = out_ch_blk * width;
-  write_imageh(output, (int2)(out_x_base + w[0], out_hb), out[0]);
-  if (w[1] >= width) return;
-  write_imageh(output, (int2)(out_x_base + w[1], out_hb), out[1]);
-  if (w[2] >= width) return;
-  write_imageh(output, (int2)(out_x_base + w[2], out_hb), out[2]);
-  if (w[3] >= width) return;
-  write_imageh(output, (int2)(out_x_base + w[3], out_hb), out[3]);
-}
-
-__kernel void conv_2d_1x1_h8(__read_only image2d_t input, /* [c%8 * w * c/8, h * b] */
-                             __read_only image2d_t filter, /* cout%8 * cin, cout/8 */
-                             __read_only image2d_t bias, /* cout%8 * cout/8 */
-                             __write_only image2d_t output,
-                             __private const int in_ch_blks,
-                             __private const int width) {
-  const int out_ch_blk = get_global_id(0);
-  const int out_w_blk = get_global_id(1);
-  const int out_w_blks = get_global_size(1);
-  const int out_hb = get_global_id(2);
-
-  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-
-  float4 bias_value = read_imagef(bias, sampler, (int2)(out_ch_blk, 0));
-  half4 bias_value03 = as_half4(bias_value.xy);
-  half4 bias_value47 = as_half4(bias_value.zw);
-  half4 out[8];
-  out[0] = (half4)(bias_value03.x);
-  out[1] = (half4)(bias_value03.y);
-  out[2] = (half4)(bias_value03.z);
-  out[3] = (half4)(bias_value03.w);
-  out[4] = (half4)(bias_value47.x);
-  out[5] = (half4)(bias_value47.y);
-  out[6] = (half4)(bias_value47.z);
-  out[7] = (half4)(bias_value47.w);
-
-  int w[4];
-  w[0] = out_w_blk;
-  w[1] = w[0] + out_w_blks;
-  w[2] = w[1] + out_w_blks;
-  w[3] = w[2] + out_w_blks;
-
-  // Unrolling this loop hurt performance
-  int in_x_base = 0;
-  for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
-    half4 in[8];
-#pragma unroll
-    for (int wi = 0; wi < 4; ++wi) {
-      float4 in_value = read_imagef(input, sampler, (int2)(in_x_base + w[0], out_hb));
-      in[wi << 1] = as_half4(in_value.xy);
-      in[wi << 1 + 1] = as_half4(in_value.zw);
-    }
-    // The order matters, load input first then load filter, why?
-    const int filter_x0 = in_ch_blk << 2;
-    half4 weights[8];
-#pragma unroll
-    for (int wi = 0; wi < 4; ++wi) {
-      float4 weights_value = read_imagef(filter, sampler, (int2)(filter_x0 + wi, out_ch_blk));
-      weights[wi << 1] = as_half4(weights_value.xy);
-      weights[wi << 1 + 1] = as_half4(weights_value.zw);
-    }
-    // Will prefetch L2 improve performance? How to prefetch image data?
-    // Interleaving load and mul does not improve performance as expected
-#pragma unroll
-    for (int wi = 0; wi < 4; ++wi) {
-      int idx = wi << 1;
-      out[idx] += in[idx].x * weights[0];
-      out[idx] += in[idx].y * weights[1];
-      out[idx] += in[idx].z * weights[2];
-      out[idx] += in[idx].w * weights[3];
-      ++idx;
-      out[idx] += in[idx].x * weights[4];
-      out[idx] += in[idx].y * weights[5];
-      out[idx] += in[idx].z * weights[6];
-      out[idx] += in[idx].w * weights[7];
-    }
-    in_x_base += width;
-  }
-
-  const int out_x_base = out_ch_blk * width;
-  float4 out_value = (float4)(as_float2(out[0]), as_float2(out[1]));
-  write_imagef(output, (int2)(out_x_base + w[0], out_hb), out_value);
-  if (w[1] >= width) return;
-  out_value = (float4)(as_float2(out[2]), as_float2(out[3]));
-  write_imagef(output, (int2)(out_x_base + w[0], out_hb), out_value);
-  if (w[2] >= width) return;
-  out_value = (float4)(as_float2(out[4]), as_float2(out[5]));
-  write_imagef(output, (int2)(out_x_base + w[0], out_hb), out_value);
-  if (w[3] >= width) return;
-  out_value = (float4)(as_float2(out[6]), as_float2(out[7]));
-  write_imagef(output, (int2)(out_x_base + w[0], out_hb), out_value);
-}
+#ifdef FUSED_BATCH_NORM
+  // batch norm
+  DATA_TYPE4 bn_scale_value =
+      READ_IMAGET(bn_scale, sampler, (int2)(out_ch_blk, 0));
+  DATA_TYPE4 scale[4];
+  scale[0] = (DATA_TYPE4)(bn_scale_value.x);
+  scale[1] = (DATA_TYPE4)(bn_scale_value.y);
+  scale[2] = (DATA_TYPE4)(bn_scale_value.z);
+  scale[3] = (DATA_TYPE4)(bn_scale_value.w);
+  DATA_TYPE4 bn_offset_value =
+      READ_IMAGET(bn_offset, sampler, (int2)(out_ch_blk, 0));
+  DATA_TYPE4 offset[4];
+  offset[0] = (DATA_TYPE4)(bn_offset_value.x);
+  offset[1] = (DATA_TYPE4)(bn_offset_value.y);
+  offset[2] = (DATA_TYPE4)(bn_offset_value.z);
+  offset[3] = (DATA_TYPE4)(bn_offset_value.w);
+
+#pragma unroll
+  for (int wi = 0; wi < 4; ++wi) {
+    out[wi] = out[wi] * scale[wi] + offset[wi];
+  }
+#endif
+
+#ifdef FUSED_RELU
+#pragma unroll
+  for (int wi = 0; wi < 4; ++wi) {
+    // TODO relux
+    out[wi] = fmax(out[wi], 0);
+  }
+#endif
+
+  const int out_x_base = out_ch_blk * width;
+  WRITE_IMAGET(output, (int2)(out_x_base + w[0], out_hb), out[0]);
+  if (w[1] >= width) return;
+  WRITE_IMAGET(output, (int2)(out_x_base + w[1], out_hb), out[1]);
+  if (w[2] >= width) return;
+  WRITE_IMAGET(output, (int2)(out_x_base + w[2], out_hb), out[2]);
+  if (w[3] >= width) return;
+  WRITE_IMAGET(output, (int2)(out_x_base + w[3], out_hb), out[3]);
+}
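The new fused batch-norm path applies out = out * scale + offset per channel block, which assumes the host has already folded the normalization statistics into per-channel scale and offset values. That folding is not part of this diff; presumably it is the standard inference-time reduction, sketched here with my own helper name:

#include <cmath>
#include <vector>

// Standard batch-norm folding for inference (assumed, not shown in this diff):
//   y = gamma * (x - mean) / sqrt(var + eps) + beta
//     = x * scale + offset, where
//   scale  = gamma / sqrt(var + eps)
//   offset = beta - mean * scale
void FoldBatchNorm(const std::vector<float> &gamma, const std::vector<float> &beta,
                   const std::vector<float> &mean, const std::vector<float> &var,
                   float eps,
                   std::vector<float> &scale, std::vector<float> &offset) {
  const std::size_t channels = gamma.size();
  scale.resize(channels);
  offset.resize(channels);
  for (std::size_t c = 0; c < channels; ++c) {
    scale[c] = gamma[c] / std::sqrt(var[c] + eps);
    offset[c] = beta[c] - mean[c] * scale[c];
  }
}

Folding this way turns the whole conv + batch-norm pair into a single multiply-add per output vector in the kernel.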
mace/kernels/opencl/conv_2d_opencl_1x1.cc

...
@@ -11,75 +11,31 @@
 namespace mace {
 namespace kernels {

-void Conv1x1V2(const Tensor *input,
-               const Tensor *filter,
-               const Tensor *bias,
-               const int stride,
-               Tensor *output) {
+void Conv1x1(const Tensor *input,
+             const Tensor *filter,
+             const Tensor *bias,
+             const int stride,
+             Tensor *output) {
   const index_t batch = output->dim(0);
-  const index_t channels = output->dim(1);
-  const index_t height = output->dim(2);
-  const index_t width = output->dim(3);
-  const index_t input_channels = input->dim(1);
-
-  auto runtime = OpenCLRuntime::Get();
-  auto program = runtime->program();
-  const index_t channel_blocks = (channels + 3) / 4;
-  const index_t pixel_blocks = (width + 3) / 4 * height;
-
-  // TODO KernelFunctor has an extra clReleaseCommandQueue due to a copy
-  // TODO check weird clReleaseCommandQueue latency
-  // The KernelFunctor can cause segment faults in cb_retain_event
-  std::set<std::string> built_options;
-  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
-  built_options.emplace(stride == 1 ? "-DSTRIDE_1" : "");
-  built_options.emplace(bias != nullptr ? "-DBIAS" : "");
-  auto conv_2d_kernel = runtime->BuildKernel("conv_2d_1x1", "conv_2d_1x1_v2", built_options);
-  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
-
-  uint32_t idx = 0;
-  conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
-  conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(filter->buffer())));
-  if (bias != nullptr) {
-    conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(bias->buffer())));
-  }
-  conv_2d_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
-  conv_2d_kernel.setArg(idx++, static_cast<int>(input_channels));
-  conv_2d_kernel.setArg(idx++, static_cast<int>(channels));
-  conv_2d_kernel.setArg(idx++, static_cast<int>(input->dim(2)));
-  conv_2d_kernel.setArg(idx++, static_cast<int>(input->dim(3)));
-  conv_2d_kernel.setArg(idx++, static_cast<int>(height));
-  conv_2d_kernel.setArg(idx++, static_cast<int>(width));
-
-  auto command_queue = runtime->command_queue();
-  cl_int error = command_queue.enqueueNDRangeKernel(
-      conv_2d_kernel, cl::NullRange,
-      cl::NDRange(static_cast<int>(batch), static_cast<int>(channel_blocks),
-                  static_cast<int>(pixel_blocks)),
-      cl::NDRange(1, 2, kwg_size / 2),
-      NULL, OpenCLRuntime::Get()->GetDefaultEvent());
-  MACE_CHECK(error == CL_SUCCESS, error);
-}
-
-void Conv1x1V3(const Tensor *input,
-               const Tensor *filter,
-               const Tensor *bias,
-               const int stride,
-               Tensor *output) {
-  const index_t batch = output->dim(0);
-  const index_t channels = output->dim(1);
-  const index_t height = output->dim(2);
-  const index_t width = output->dim(3);
-  const index_t input_channels = input->dim(1);
+  const index_t height = output->dim(1);
+  const index_t width = output->dim(2);
+  const index_t channels = output->dim(3);
+  const index_t input_batch = input->dim(0);
+  const index_t input_height = input->dim(1);
+  const index_t input_width = input->dim(2);
+  const index_t input_channels = input->dim(3);

   const index_t channel_blocks = RoundUpDiv4(channels);
+  const index_t width_blocks = RoundUpDiv4(width);
   const index_t input_channel_blocks = RoundUpDiv4(input_channels);

-  MACE_CHECK(stride == 1);
+  MACE_CHECK(input_batch == batch);
+  MACE_CHECK(stride != 1 || (input_height == height && input_width == width));

   std::set<std::string> built_options;
   built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
+  built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(input->dtype()));
   built_options.emplace("-DSTRIDE_1");
   built_options.emplace(bias != nullptr ? "-DBIAS" : "");
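RoundUpDiv4 and DataTypeToOPENCLCMDDataType are defined outside this diff. A sketch of how the build options presumably tie the host code to the kernel's #ifdef branches, with conventional definitions assumed rather than quoted:

#include <cstdint>
#include <set>
#include <string>

// Conventional definition of RoundUpDiv4 (assumed; its body is not in this diff).
inline int64_t RoundUpDiv4(int64_t v) { return (v + 3) / 4; }

// Each emplaced "-D..." flag becomes a preprocessor definition when the .cl
// source is compiled, selecting branches in the kernel:
//   -DDATA_TYPE=half -DCMD_DATA_TYPE=h -> DATA_TYPE4 = half4 and
//                                         READ_IMAGET/WRITE_IMAGET = read_imageh/write_imageh
//   -DBIAS                             -> compiles the #ifdef BIAS blocks
//   -DSTRIDE_1                         -> selects the stride-1 path
std::set<std::string> ExampleBuildOptions() {
  return {"-DDATA_TYPE=half", "-DCMD_DATA_TYPE=h", "-DSTRIDE_1", "-DBIAS"};
}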
...
@@ -103,10 +59,11 @@ void Conv1x1V3(const Tensor *input,
   cl_int error;
   error = command_queue.enqueueNDRangeKernel(
       conv_2d_kernel, cl::NullRange,
-      cl::NDRange(static_cast<uint32_t>(channel_blocks), static_cast<uint32_t>(height),
+      cl::NDRange(static_cast<uint32_t>(channel_blocks), static_cast<uint32_t>(width_blocks),
                   static_cast<uint32_t>(height * batch)),
       cl::NDRange(4, 15, 8),
-      NULL, OpenCLRuntime::Get()->GetDefaultEvent());
+      // TODO auto tuning
+      nullptr, OpenCLRuntime::Get()->GetDefaultEvent());
   MACE_CHECK(error == CL_SUCCESS, error);
 }
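The corrected launch geometry matches the kernel's ID usage: get_global_id(0) walks channel blocks, get_global_id(1) width blocks, and get_global_id(2) the fused height * batch axis. A sketch with made-up shapes:

#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical shapes, only to illustrate the launch geometry.
  const int64_t batch = 1, channels = 64, height = 32, width = 32;
  const int64_t channel_blocks = (channels + 3) / 4;  // gid(0): out_ch_blk
  const int64_t width_blocks = (width + 3) / 4;       // gid(1): out_w_blk
  const int64_t height_batch = height * batch;        // gid(2): out_hb = b * H + h
  std::printf("global = (%lld, %lld, %lld)\n",
              static_cast<long long>(channel_blocks),
              static_cast<long long>(width_blocks),
              static_cast<long long>(height_batch));  // global = (16, 8, 32)
  // With a fixed local size such as (4, 15, 8), OpenCL 1.x requires every
  // global dimension to be a multiple of the corresponding local one, which is
  // presumably why the "TODO auto tuning" comment was added beside it.
  return 0;
}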
...
@@ -115,18 +72,7 @@ extern void Conv2dOpenclK1x1S1(const Tensor *input,
                                const Tensor *bias,
                                const int *padding,
                                Tensor *output) {
-  const index_t batch = output->dim(0);
-  const index_t height = output->dim(2);
-  const index_t width = output->dim(3);
-  const index_t input_batch = input->dim(0);
-  const index_t input_height = input->dim(2);
-  const index_t input_width = input->dim(3);
-
-  MACE_CHECK(input_batch == batch && input_height == height && input_width == width);
-
-  Conv1x1V2(input, filter, bias, 1, output);
+  Conv1x1(input, filter, bias, 1, output);
 };

 extern void Conv2dOpenclK1x1S2(const Tensor *input,
...
@@ -134,9 +80,7 @@ extern void Conv2dOpenclK1x1S2(const Tensor *input,
                                const Tensor *bias,
                                const int *padding,
                                Tensor *output) {
-  MACE_CHECK(input->dim(0) == output->dim(0));
-
-  Conv1x1V2(input, filter, bias, 2, output);
+  Conv1x1(input, filter, bias, 2, output);
 };

 }  // namespace kernels
...