Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
野马c
Mace
提交
aeb4c35e
Mace
项目概览
野马c
/
Mace
与 Fork 源项目一致
Fork自
Xiaomi / Mace
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
aeb4c35e
编写于
10月 30, 2017
作者:
L
liuqi
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Use vector operation to optimize batch_norm opencl kernel.
上级
5e103649
变更
2
隐藏空白更改
内联
并排
Showing
2 changed files
with
24 additions
and
10 deletions
+24
-10
mace/kernels/opencl/batch_norm_opencl.cc
mace/kernels/opencl/batch_norm_opencl.cc
+7
-4
mace/kernels/opencl/cl/batch_norm.cl
mace/kernels/opencl/cl/batch_norm.cl
+17
-6
未找到文件。
mace/kernels/opencl/batch_norm_opencl.cc
浏览文件 @
aeb4c35e
...
...
@@ -20,9 +20,12 @@ void BatchNormFunctor<DeviceType::OPENCL, float>::operator()(
const
Tensor
*
epsilon
,
Tensor
*
output
)
{
index_t
pixel_size
=
input
->
dim
(
2
)
*
input
->
dim
(
3
);
index_t
blocks
=
(
pixel_size
+
3
)
/
4
;
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
input
->
dim
(
0
)),
static_cast
<
uint32_t
>
(
input
->
dim
(
1
)),
static_cast
<
uint32_t
>
(
input
->
dim
(
2
)
*
input
->
dim
(
3
)
)};
static_cast
<
uint32_t
>
(
blocks
)};
auto
runtime
=
OpenCLRuntime
::
Get
();
...
...
@@ -39,10 +42,10 @@ void BatchNormFunctor<DeviceType::OPENCL, float>::operator()(
bm_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
mean
->
buffer
())));
bm_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
var
->
buffer
())));
bm_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
epsilon
->
buffer
())));
bm_kernel
.
setArg
(
idx
++
,
gws
[
2
]
);
bm_kernel
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
pixel_size
)
);
bm_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
output
->
buffer
())));
bm_kernel
.
setArg
(
idx
++
,
lws
[
1
]
*
sizeof
(
float
),
nullptr
);
bm_kernel
.
setArg
(
idx
++
,
lws
[
1
]
*
sizeof
(
float
),
nullptr
);
bm_kernel
.
setArg
(
idx
++
,
lws
[
1
]
*
sizeof
(
float
)
*
4
,
nullptr
);
bm_kernel
.
setArg
(
idx
++
,
lws
[
1
]
*
sizeof
(
float
)
*
4
,
nullptr
);
auto
params_generator
=
[
&
kwg_size
]()
->
std
::
vector
<
std
::
vector
<
uint32_t
>>
{
return
{{
1
,
1
,
64
},
...
...
mace/kernels/opencl/cl/batch_norm.cl
浏览文件 @
aeb4c35e
...
...
@@ -6,8 +6,8 @@ void kernel batch_norm(global const float *input,
global
const
float
*epsilon,
private
const
uint
pixels,
global
float
*output,
__local
float
*new_scale,
__local
float
*new_offset
)
{
__local
float
4
*new_scale,
__local
float
4
*new_offset
)
{
const
int
batch
=
get_global_id
(
0
)
;
const
int
channel
=
get_global_id
(
1
)
;
const
int
channels
=
get_global_size
(
1
)
;
...
...
@@ -16,15 +16,26 @@ void kernel batch_norm(global const float *input,
const
int
local_pixel_idx
=
get_local_id
(
2
)
;
if
(
local_pixel_idx
==
0
)
{
new_scale[local_channel]
=
scale[channel]
*
rsqrt
(
var[channel]
+
*epsilon
)
;
new_offset[local_channel]
=
offset[channel]
-
mean[channel]
*
new_scale[local_channel]
;
new_scale[local_channel]
=
(
float4
)(
scale[channel]
*
rsqrt
(
var[channel]
+
*epsilon
)
)
;
new_offset[local_channel]
=
(
float4
)(
offset[channel]
-
mean[channel]
*
new_scale[local_channel].x
)
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
const
int
sample_offset
=
(
batch
*
channels
+
channel
)
*
pixels
+
pixel_offset
;
const
int
sample_offset
=
(
batch
*
channels
+
channel
)
*
pixels
+
pixel_offset
*4
;
const
float
*input_ptr
=
input
+
sample_offset
;
float
*output_ptr
=
output
+
sample_offset
;
*output_ptr
=
new_scale[local_channel]
*
*input_ptr
+
new_offset[local_channel]
;
const
int
end
=
(
batch
*
channels
+
channel
+
1
)
*
pixels
;
if
((
sample_offset+4
)
>
end
)
{
for
(
int
i
=
sample_offset
; i < end; ++i) {
*output_ptr
=
new_scale[local_channel].x
*
*input_ptr
+
new_offset[local_channel].x
;
++input_ptr
;
++output_ptr
;
}
}
else
{
float4
values
=
vload4
(
0
,
input_ptr
)
;
values
=
values
*
new_scale[local_channel]
+
new_offset[local_channel]
;
vstore4
(
values,
0
,
output_ptr
)
;
}
}
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录