Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Xiaomi
Mace
提交
845377f2
Mace
项目概览
Xiaomi
/
Mace
通知
106
Star
40
Fork
27
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
845377f2
编写于
10月 26, 2017
作者:
L
liuqi
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Optimize the batch norm opencl kernel.
上级
dff4b94c
变更
6
隐藏空白更改
内联
并排
Showing
6 changed files
with
71 additions
and
26 deletions
+71
-26
mace/core/runtime/opencl/opencl_runtime.cc
mace/core/runtime/opencl/opencl_runtime.cc
+12
-0
mace/core/runtime/opencl/opencl_runtime.h
mace/core/runtime/opencl/opencl_runtime.h
+2
-0
mace/core/runtime/opencl/opencl_wrapper.cc
mace/core/runtime/opencl/opencl_wrapper.cc
+19
-0
mace/kernels/opencl/batch_norm_opencl.cc
mace/kernels/opencl/batch_norm_opencl.cc
+26
-17
mace/kernels/opencl/cl/batch_norm.cl
mace/kernels/opencl/cl/batch_norm.cl
+1
-2
mace/ops/batch_norm.h
mace/ops/batch_norm.h
+11
-7
未找到文件。
mace/core/runtime/opencl/opencl_runtime.cc
浏览文件 @
845377f2
...
...
@@ -160,4 +160,16 @@ cl::Program &OpenCLRuntime::program() {
return
program_
;
}
int
OpenCLRuntime
::
GetDeviceMaxWorkGroupSize
()
{
unsigned
long
long
size
=
0
;
device_
.
getInfo
(
CL_DEVICE_MAX_WORK_GROUP_SIZE
,
&
size
);
return
static_cast
<
int
>
(
size
);
}
// Queries CL_KERNEL_WORK_GROUP_SIZE for `kernel` on device_ and returns it as
// an int. Returns 0 if the query reports an error, so a failed query is never
// mistaken for a usable work-group size.
int OpenCLRuntime::GetKernelMaxWorkGroupSize(const cl::Kernel &kernel) {
  // Zero-initialised 64-bit receive buffer, unchanged from the original code.
  unsigned long long size = 0;
  const cl_int status =
      kernel.getWorkGroupInfo(device_, CL_KERNEL_WORK_GROUP_SIZE, &size);
  if (status != CL_SUCCESS) {
    return 0;
  }
  return static_cast<int>(size);
}
}
// namespace mace
mace/core/runtime/opencl/opencl_runtime.h
浏览文件 @
845377f2
...
...
@@ -21,6 +21,8 @@ class OpenCLRuntime {
public:
static
OpenCLRuntime
*
Get
();
// Returns the device's CL_DEVICE_MAX_WORK_GROUP_SIZE value, converted to int.
int
GetDeviceMaxWorkGroupSize
();
// Returns the CL_KERNEL_WORK_GROUP_SIZE value of `kernel` on this runtime's
// device, converted to int.
int
GetKernelMaxWorkGroupSize
(
const
cl
::
Kernel
&
kernel
);
cl
::
Context
&
context
();
cl
::
Device
&
device
();
cl
::
CommandQueue
&
command_queue
();
...
...
mace/core/runtime/opencl/opencl_wrapper.cc
浏览文件 @
845377f2
...
...
@@ -136,6 +136,8 @@ class OpenCLLibraryImpl final {
using
clRetainDeviceFunc
=
cl_int
(
*
)(
cl_device_id
);
using
clReleaseDeviceFunc
=
cl_int
(
*
)(
cl_device_id
);
using
clRetainEventFunc
=
cl_int
(
*
)(
cl_event
);
using
clGetKernelWorkGroupInfoFunc
=
cl_int
(
*
)(
cl_kernel
,
cl_device_id
,
cl_kernel_work_group_info
,
size_t
,
void
*
,
size_t
*
);
#define DEFINE_FUNC_PTR(func) func##Func func = nullptr
...
...
@@ -177,6 +179,7 @@ class OpenCLLibraryImpl final {
DEFINE_FUNC_PTR
(
clRetainDevice
);
DEFINE_FUNC_PTR
(
clReleaseDevice
);
DEFINE_FUNC_PTR
(
clRetainEvent
);
DEFINE_FUNC_PTR
(
clGetKernelWorkGroupInfo
);
#undef DEFINE_FUNC_PTR
...
...
@@ -296,6 +299,7 @@ void *OpenCLLibraryImpl::LoadFromPath(const std::string &path) {
ASSIGN_FROM_DLSYM
(
clRetainDevice
);
ASSIGN_FROM_DLSYM
(
clReleaseDevice
);
ASSIGN_FROM_DLSYM
(
clRetainEvent
);
ASSIGN_FROM_DLSYM
(
clGetKernelWorkGroupInfo
);
#undef ASSIGN_FROM_DLSYM
...
...
@@ -782,3 +786,18 @@ cl_int clRetainEvent(cl_event event) {
return
CL_OUT_OF_RESOURCES
;
}
}
// Shim for clGetKernelWorkGroupInfo: forwards every argument to the function
// pointer of the same name held by mace::OpenCLLibraryImpl::Get(). If that
// pointer is null the call is not forwarded and CL_OUT_OF_RESOURCES is
// returned instead.
cl_int clGetKernelWorkGroupInfo(cl_kernel kernel,
                                cl_device_id device,
                                cl_kernel_work_group_info param_name,
                                size_t param_value_size,
                                void *param_value,
                                size_t *param_value_size_ret) {
  auto target = mace::OpenCLLibraryImpl::Get().clGetKernelWorkGroupInfo;
  if (target == nullptr) {
    return CL_OUT_OF_RESOURCES;
  }
  return target(kernel, device, param_name, param_value_size, param_value,
                param_value_size_ret);
}
mace/kernels/opencl/batch_norm_opencl.cc
浏览文件 @
845377f2
...
...
@@ -18,27 +18,36 @@ void BatchNormFunctor<DeviceType::OPENCL, float>::operator()(
const
Tensor
*
var
,
const
Tensor
*
epsilon
,
Tensor
*
output
)
{
const
index_t
n
=
input
->
dim
(
0
);
const
index_t
channel
=
input
->
dim
(
1
);
const
index_t
sample_size
=
input
->
dim
(
2
)
*
input
->
dim
(
3
);
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
input
->
dim
(
0
)),
static_cast
<
uint32_t
>
(
input
->
dim
(
1
)),
static_cast
<
uint32_t
>
(
input
->
dim
(
2
)
*
input
->
dim
(
3
))};
const
uint32_t
lws
[
3
]
=
{
1
,
2
,
128
};
auto
runtime
=
OpenCLRuntime
::
Get
();
auto
program
=
runtime
->
program
();
auto
_kernel
=
cl
::
Kernel
(
program
,
"batch_norm"
);
_kernel
.
setArg
(
0
,
*
(
static_cast
<
const
cl
::
Buffer
*>
(
input
->
buffer
())));
_kernel
.
setArg
(
1
,
*
(
static_cast
<
cl
::
Buffer
*>
(
scale
->
buffer
())));
_kernel
.
setArg
(
2
,
*
(
static_cast
<
cl
::
Buffer
*>
(
offset
->
buffer
())));
_kernel
.
setArg
(
3
,
*
(
static_cast
<
cl
::
Buffer
*>
(
mean
->
buffer
())));
_kernel
.
setArg
(
4
,
*
(
static_cast
<
cl
::
Buffer
*>
(
var
->
buffer
())));
_kernel
.
setArg
(
5
,
*
(
static_cast
<
cl
::
Buffer
*>
(
epsilon
->
buffer
())));
_kernel
.
setArg
(
6
,
static_cast
<
int
>
(
sample_size
));
_kernel
.
setArg
(
7
,
*
(
static_cast
<
cl
::
Buffer
*>
(
output
->
buffer
())));
_kernel
.
setArg
(
8
,
32u
,
nullptr
);
_kernel
.
setArg
(
9
,
32u
,
nullptr
);
auto
bm_kernel
=
cl
::
Kernel
(
program
,
"batch_norm"
);
uint32_t
idx
=
0
;
bm_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
const
cl
::
Buffer
*>
(
input
->
buffer
())));
bm_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
scale
->
buffer
())));
bm_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
offset
->
buffer
())));
bm_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
mean
->
buffer
())));
bm_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
var
->
buffer
())));
bm_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
epsilon
->
buffer
())));
bm_kernel
.
setArg
(
idx
++
,
gws
[
2
]);
bm_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
output
->
buffer
())));
bm_kernel
.
setArg
(
idx
++
,
lws
[
1
]
*
sizeof
(
float
),
nullptr
);
bm_kernel
.
setArg
(
idx
++
,
lws
[
1
]
*
sizeof
(
float
),
nullptr
);
MACE_CHECK
(
std
::
accumulate
(
lws
,
lws
+
3
,
1
,
std
::
multiplies
<
uint32_t
>
())
<
runtime
->
GetKernelMaxWorkGroupSize
(
bm_kernel
));
cl_int
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
_kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
n
,
channel
,
sample_size
),
cl
::
NDRange
(
1
,
1
,
128
));
bm
_kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws
[
2
]
),
cl
::
NDRange
(
lws
[
0
],
lws
[
1
],
lws
[
2
]
));
MACE_CHECK
(
error
==
CL_SUCCESS
);
}
...
...
mace/kernels/opencl/cl/batch_norm.cl
浏览文件 @
845377f2
...
...
@@ -4,7 +4,7 @@ void kernel batch_norm(global const float *input,
global
const
float
*mean,
global
const
float
*var,
global
const
float
*epsilon,
private
const
int
pixels,
private
const
u
int
pixels,
global
float
*output,
__local
float
*new_scale,
__local
float
*new_offset
)
{
...
...
@@ -23,7 +23,6 @@ void kernel batch_norm(global const float *input,
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
const
int
sample_offset
=
(
batch
*
channels
+
channel
)
*
pixels
+
pixel_offset
;
const
float
*input_ptr
=
input
+
sample_offset
;
float
*output_ptr
=
output
+
sample_offset
;
*output_ptr
=
new_scale[local_channel]
*
*input_ptr
+
new_offset[local_channel]
;
...
...
mace/ops/batch_norm.h
浏览文件 @
845377f2
...
...
@@ -17,12 +17,12 @@ class BatchNormOp : public Operator<D, T> {
:
Operator
<
D
,
T
>
(
operator_def
,
ws
),
functor_
()
{}
bool
Run
()
override
{
const
Tensor
*
input
=
this
->
Input
(
0
);
const
Tensor
*
scale
=
this
->
Input
(
1
);
const
Tensor
*
offset
=
this
->
Input
(
2
);
const
Tensor
*
mean
=
this
->
Input
(
3
);
const
Tensor
*
var
=
this
->
Input
(
4
);
const
Tensor
*
epsilon
=
this
->
Input
(
5
);
const
Tensor
*
input
=
this
->
Input
(
INPUT
);
const
Tensor
*
scale
=
this
->
Input
(
SCALE
);
const
Tensor
*
offset
=
this
->
Input
(
OFFSET
);
const
Tensor
*
mean
=
this
->
Input
(
MEAN
);
const
Tensor
*
var
=
this
->
Input
(
VAR
);
const
Tensor
*
epsilon
=
this
->
Input
(
EPSILON
);
MACE_CHECK
(
input
->
dim_size
()
==
4
,
"input must be 4-dimensional. "
,
input
->
dim_size
());
...
...
@@ -37,7 +37,7 @@ class BatchNormOp : public Operator<D, T> {
MACE_CHECK
(
epsilon
->
dim_size
()
==
0
,
"epsilon must be 0-dimensional. "
,
epsilon
->
dim_size
());
Tensor
*
output
=
this
->
Output
(
0
);
Tensor
*
output
=
this
->
Output
(
OUTPUT
);
output
->
ResizeLike
(
input
);
functor_
(
input
,
scale
,
offset
,
mean
,
var
,
epsilon
,
output
);
...
...
@@ -46,6 +46,10 @@ class BatchNormOp : public Operator<D, T> {
private:
kernels
::
BatchNormFunctor
<
D
,
T
>
functor_
;
protected:
OP_INPUT_TAGS
(
INPUT
,
SCALE
,
OFFSET
,
MEAN
,
VAR
,
EPSILON
);
OP_OUTPUT_TAGS
(
OUTPUT
);
};
}
// namespace mace
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录