Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
项目经理老王
Mace
提交
79ae9e97
Mace
项目概览
项目经理老王
/
Mace
与 Fork 源项目一致
Fork自
Xiaomi / Mace
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
79ae9e97
编写于
1月 22, 2019
作者:
刘
刘琦
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'fix-reduce-half-err' into 'master'
fix reduce half err See merge request !958
上级
4c9a444e
9380a0ca
变更
2
隐藏空白更改
内联
并排
Showing
2 changed file
with
61 addition
and
55 deletion
+61
-55
mace/ops/opencl/cl/reduce.cl
mace/ops/opencl/cl/reduce.cl
+44
-44
mace/ops/opencl/image/reduce.h
mace/ops/opencl/image/reduce.h
+17
-11
未找到文件。
mace/ops/opencl/cl/reduce.cl
浏览文件 @
79ae9e97
...
...
@@ -3,81 +3,81 @@
__kernel
void
reduce
(
OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only
image2d_t
input,
__local
float4
*group_sum,
__private
const
int
group_size,
__private
const
int
partial_len,
__private
const
int
remain_index,
__private
const
int
batch,
__local
float4
*local_buffer,
__private
const
int
group_num,
__private
const
int
compute_size,
__private
const
int
last_index,
__private
const
int
in_height,
__private
const
int
in_width,
__private
const
float
image_size_reciprocal
,
__private
const
float
scale
,
__private
const
int
channel_blocks,
__write_only
image2d_t
output
)
{
const
int
i
=
get_local_id
(
0
)
;
const
int
j
=
get_local_id
(
1
)
;
const
int
k
=
get_global_id
(
2
)
;
const
int
w
=
get_local_id
(
0
)
;
const
int
h
=
get_local_id
(
1
)
;
const
int
bc
=
get_global_id
(
2
)
;
#
ifndef
NON_UNIFORM_WORK_GROUP
if
(
k
>=
global_size_dim2
)
if
(
bc
>=
global_size_dim2
)
return
;
#
endif
const
int
dim0_size
=
get_local_size
(
0
)
;
const
int
index
=
mad24
(
j,
dim0_size,
i
)
;
const
int
b
=
k
/
channel_blocks
;
const
int
ch
=
mad24
(
b,
-channel_blocks,
k
)
;
const
int
width
=
get_local_size
(
0
)
;
const
int
index
=
mad24
(
h,
width,
w
)
;
const
int
b
=
bc
/
channel_blocks
;
const
int
ch
=
mad24
(
b,
-channel_blocks,
bc
)
;
DATA_TYPE4
in
;
#
if
REDUCE_TYPE
==
1
float4
tmp
=
(
float
4
)
{MAXFLOAT,
MAXFLOAT,
MAXFLOAT,
MAXFLOAT}
;
DATA_TYPE4
part_result
=
(
DATA_TYPE
4
)
{MAXFLOAT,
MAXFLOAT,
MAXFLOAT,
MAXFLOAT}
;
#
elif
REDUCE_TYPE
==
2
float4
tmp
=
(
float
4
)
{-MAXFLOAT,
-MAXFLOAT,
-MAXFLOAT,
-MAXFLOAT}
;
DATA_TYPE4
part_result
=
(
DATA_TYPE
4
)
{-MAXFLOAT,
-MAXFLOAT,
-MAXFLOAT,
-MAXFLOAT}
;
#
elif
REDUCE_TYPE
==
3
float4
tmp
=
(
float
4
)
{1,
1
,
1
,
1}
;
DATA_TYPE4
part_result
=
(
DATA_TYPE
4
)
{1,
1
,
1
,
1}
;
#
else
float4
tmp
=
(
float
4
)
{0,
0
,
0
,
0}
;
DATA_TYPE4
part_result
=
(
DATA_TYPE
4
)
{0,
0
,
0
,
0}
;
#
endif
const
int
valid_part_len
=
select
(
partial_len,
partial_len
-
1
,
remain_index
>
0
&&
index
>=
remain_index
)
;
const
int
full_offset
=
mul24
(
index,
partial_len
)
;
const
int
base_offset
=
select
(
full_offset,
full_offset
-
(
index
-
remain_index
)
,
valid_part_len
<
partial_len
)
;
const
bool
after_last
=
(
last_index
>
0
&&
index
>=
last_index
)
;
//
After
last
index,
each
kernel
only
computes
(
compute_size
-
1
)
elements.
const
int
actual_compute_size
=
select
(
compute_size,
compute_size
-
1
,
after_last
)
;
const
int
base_offset
=
mul24
(
index,
actual_compute_size
)
;
const
int
offset=
select
(
base_offset,
base_offset
+
last_index,
after_last
)
;
#
pragma
unroll
for
(
int
l
=
0
; l < valid_part_len; ++l
) {
int
offset
=
base_offset
+
l
;
int
h_id
=
offset
/
in_width
;
int
w_id
=
mad24
(
h_id,
-in_width,
offset
)
;
int
pos_x
=
mad24
(
ch,
in_width,
w_id
)
;
int
pos_y
=
mad24
(
b,
in_height,
h_id
)
;
for
(
int
i
=
0
; i < actual_compute_size; ++i
) {
int
element_idx
=
offset
+
i
;
int
h_id
x
=
element_idx
/
in_width
;
int
w_id
x
=
mad24
(
h_idx,
-in_width,
element_idx
)
;
int
pos_x
=
mad24
(
ch,
in_width,
w_id
x
)
;
int
pos_y
=
mad24
(
b,
in_height,
h_id
x
)
;
in
=
READ_IMAGET
(
input,
SAMPLER,
(
int2
)(
pos_x,
pos_y
))
;
//
MIN
#
if
REDUCE_TYPE
==
1
tmp
=
fmin
(
tmp
,
in
)
;
part_result
=
fmin
(
part_result
,
in
)
;
//
MAX
#
elif
REDUCE_TYPE
==
2
tmp
=
fmax
(
tmp
,
in
)
;
part_result
=
fmax
(
part_result
,
in
)
;
//
PROD
#
elif
REDUCE_TYPE
==
3
tmp
=
tmp
*
in
;
part_result
=
part_result
*
in
;
//
MEAN
#
else
tmp
=
tmp
+
in
;
part_result
=
part_result
+
in
;
#
endif
}
#
if
REDUCE_TYPE
==
0
tmp
=
tmp
*
image_size_reciprocal
;
part_result
=
part_result
*
scale
;
#
endif
group_sum[index]
=
tmp
;
local_buffer[index]
=
part_result
;
#
ifdef
NON_QUALCOMM_ADRENO
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
#
endif
if
(
i
==
0
&&
j
==
0
)
{
if
(
w
==
0
&&
h
==
0
)
{
#
if
REDUCE_TYPE
==
1
DATA_TYPE4
out
=
(
DATA_TYPE4
)
{MAXFLOAT,
MAXFLOAT,
MAXFLOAT,
MAXFLOAT}
;
#
elif
REDUCE_TYPE
==
2
...
...
@@ -88,15 +88,15 @@ __kernel void reduce(OUT_OF_RANGE_PARAMS
DATA_TYPE4
out
=
(
DATA_TYPE4
)
{0,
0
,
0
,
0}
;
#
endif
#
pragma
unroll
for
(
int
l
=
0
; l < group_size; ++l
) {
for
(
int
i
=
0
; i < group_num; ++i
) {
#
if
REDUCE_TYPE
==
1
out
=
fmin
(
out,
group_sum[l
]
)
;
out
=
fmin
(
out,
local_buffer[i
]
)
;
#
elif
REDUCE_TYPE
==
2
out
=
fmax
(
out,
group_sum[l
]
)
;
out
=
fmax
(
out,
local_buffer[i
]
)
;
#
elif
REDUCE_TYPE
==
3
out
=
out
*
group_sum[l
]
;
out
=
out
*
local_buffer[i
]
;
#
else
out
=
out
+
group_sum[l
]
;
out
=
out
+
local_buffer[i
]
;
#
endif
}
WRITE_IMAGET
(
output,
(
int2
)(
ch,
b
)
,
out
)
;
...
...
mace/ops/opencl/image/reduce.h
浏览文件 @
79ae9e97
...
...
@@ -109,13 +109,20 @@ MaceStatus ReduceKernel<T>::Compute(
static_cast
<
uint32_t
>
(
runtime
->
GetKernelWaveSize
(
kernel_
));
gws
=
{
4
,
(
wave_size
/
4
),
static_cast
<
uint32_t
>
(
batch
*
channel_blocks
)};
}
else
{
gws
=
{
4
,
16
,
static_cast
<
uint32_t
>
(
batch
*
channel_blocks
)};
// Ensure each kernel has at least 4 input elements.
gws
=
{
4
,
image_size
/
16
,
static_cast
<
uint32_t
>
(
batch
*
channel_blocks
)};
if
(
gws
[
1
]
==
0
)
{
gws
[
1
]
=
1
;
}
else
if
(
gws
[
1
]
>
16
)
{
gws
[
1
]
=
16
;
}
}
lws
=
{
gws
[
0
],
gws
[
1
],
1
};
const
int
group_size
=
lws
[
0
]
*
lws
[
1
]
*
lws
[
2
];
const
int
partial_len
=
(
image_size
+
group_size
-
1
)
/
group_size
;
const
int
remain_index
=
image_size
%
group_size
;
const
float
img_size_reciprocal
=
1.
f
/
(
in_width
*
in_height
);
const
int
group_num
=
lws
[
0
]
*
lws
[
1
]
*
lws
[
2
];
// Each kernel intends to compute compute_size elements.
const
int
compute_size
=
(
image_size
+
group_num
-
1
)
/
group_num
;
const
int
last_index
=
image_size
%
group_num
;
const
float
scale
=
1.
f
/
(
in_width
*
in_height
);
MACE_OUT_OF_RANGE_INIT
(
kernel_
);
if
(
!
IsVecEqual
(
input_shape_
,
input
->
shape
()))
{
...
...
@@ -123,15 +130,14 @@ MaceStatus ReduceKernel<T>::Compute(
MACE_OUT_OF_RANGE_SET_ARGS
(
kernel_
);
MACE_SET_3D_GWS_ARGS
(
kernel_
,
gws
);
kernel_
.
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
(
group_
size
*
4
*
sizeof
(
T
)),
kernel_
.
setArg
(
idx
++
,
(
group_
num
*
4
*
sizeof
(
float
)),
nullptr
);
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
group_size
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
partial_len
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
remain_index
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
batch
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
group_num
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
compute_size
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
last_index
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
in_height
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
in_width
));
kernel_
.
setArg
(
idx
++
,
img_size_reciprocal
);
kernel_
.
setArg
(
idx
++
,
scale
);
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
channel_blocks
));
kernel_
.
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录