Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Xiaomi
Mace
提交
c93b928a
Mace
项目概览
Xiaomi
/
Mace
通知
106
Star
40
Fork
27
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
c93b928a
编写于
10月 31, 2017
作者:
L
Liangliang He
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'batch_norm_opt' into 'master'
Vectorization batch norm opencl kernel. See merge request !83
上级
5e103649
4dd645b8
变更
5
显示空白变更内容
内联
并排
Showing
5 changed file
with
34 addition
and
18 deletion
+34
-18
mace/kernels/opencl/batch_norm_opencl.cc
mace/kernels/opencl/batch_norm_opencl.cc
+7
-4
mace/kernels/opencl/cl/batch_norm.cl
mace/kernels/opencl/cl/batch_norm.cl
+19
-8
mace/utils/BUILD
mace/utils/BUILD
+1
-0
mace/utils/tuner.h
mace/utils/tuner.h
+5
-5
mace/utils/tuner_test.cc
mace/utils/tuner_test.cc
+2
-1
未找到文件。
mace/kernels/opencl/batch_norm_opencl.cc
浏览文件 @
c93b928a
...
@@ -20,9 +20,12 @@ void BatchNormFunctor<DeviceType::OPENCL, float>::operator()(
...
@@ -20,9 +20,12 @@ void BatchNormFunctor<DeviceType::OPENCL, float>::operator()(
const
Tensor
*
epsilon
,
const
Tensor
*
epsilon
,
Tensor
*
output
)
{
Tensor
*
output
)
{
index_t
pixel_size
=
input
->
dim
(
2
)
*
input
->
dim
(
3
);
index_t
blocks
=
(
pixel_size
+
3
)
/
4
;
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
input
->
dim
(
0
)),
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
input
->
dim
(
0
)),
static_cast
<
uint32_t
>
(
input
->
dim
(
1
)),
static_cast
<
uint32_t
>
(
input
->
dim
(
1
)),
static_cast
<
uint32_t
>
(
input
->
dim
(
2
)
*
input
->
dim
(
3
)
)};
static_cast
<
uint32_t
>
(
blocks
)};
auto
runtime
=
OpenCLRuntime
::
Get
();
auto
runtime
=
OpenCLRuntime
::
Get
();
...
@@ -39,10 +42,10 @@ void BatchNormFunctor<DeviceType::OPENCL, float>::operator()(
...
@@ -39,10 +42,10 @@ void BatchNormFunctor<DeviceType::OPENCL, float>::operator()(
bm_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
mean
->
buffer
())));
bm_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
mean
->
buffer
())));
bm_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
var
->
buffer
())));
bm_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
var
->
buffer
())));
bm_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
epsilon
->
buffer
())));
bm_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
epsilon
->
buffer
())));
bm_kernel
.
setArg
(
idx
++
,
gws
[
2
]
);
bm_kernel
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
pixel_size
)
);
bm_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
output
->
buffer
())));
bm_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
output
->
buffer
())));
bm_kernel
.
setArg
(
idx
++
,
lws
[
1
]
*
sizeof
(
float
),
nullptr
);
bm_kernel
.
setArg
(
idx
++
,
lws
[
1
]
*
sizeof
(
float
)
*
4
,
nullptr
);
bm_kernel
.
setArg
(
idx
++
,
lws
[
1
]
*
sizeof
(
float
),
nullptr
);
bm_kernel
.
setArg
(
idx
++
,
lws
[
1
]
*
sizeof
(
float
)
*
4
,
nullptr
);
auto
params_generator
=
[
&
kwg_size
]()
->
std
::
vector
<
std
::
vector
<
uint32_t
>>
{
auto
params_generator
=
[
&
kwg_size
]()
->
std
::
vector
<
std
::
vector
<
uint32_t
>>
{
return
{{
1
,
1
,
64
},
return
{{
1
,
1
,
64
},
...
...
mace/kernels/opencl/cl/batch_norm.cl
浏览文件 @
c93b928a
...
@@ -6,8 +6,8 @@ void kernel batch_norm(global const float *input,
...
@@ -6,8 +6,8 @@ void kernel batch_norm(global const float *input,
global
const
float
*epsilon,
global
const
float
*epsilon,
private
const
uint
pixels,
private
const
uint
pixels,
global
float
*output,
global
float
*output,
__local
float
*new_scale,
__local
float
4
*new_scale,
__local
float
*new_offset
)
{
__local
float
4
*new_offset
)
{
const
int
batch
=
get_global_id
(
0
)
;
const
int
batch
=
get_global_id
(
0
)
;
const
int
channel
=
get_global_id
(
1
)
;
const
int
channel
=
get_global_id
(
1
)
;
const
int
channels
=
get_global_size
(
1
)
;
const
int
channels
=
get_global_size
(
1
)
;
...
@@ -16,15 +16,26 @@ void kernel batch_norm(global const float *input,
...
@@ -16,15 +16,26 @@ void kernel batch_norm(global const float *input,
const
int
local_pixel_idx
=
get_local_id
(
2
)
;
const
int
local_pixel_idx
=
get_local_id
(
2
)
;
if
(
local_pixel_idx
==
0
)
{
if
(
local_pixel_idx
==
0
)
{
new_scale[local_channel]
=
scale[channel]
*
rsqrt
(
var[channel]
+
*epsilon
)
;
new_scale[local_channel]
=
(
float4
)(
scale[channel]
*
rsqrt
(
var[channel]
+
*epsilon
)
)
;
new_offset[local_channel]
=
offset[channel]
-
mean[channel]
*
new_scale[local_channel]
;
new_offset[local_channel]
=
(
float4
)(
offset[channel]
-
mean[channel]
*
new_scale[local_channel].x
)
;
}
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
const
int
sample_offset
=
(
batch
*
channels
+
channel
)
*
pixels
+
pixel_offset
;
const
int
image_offset
=
(
batch
*
channels
+
channel
)
*
pixels
+
pixel_offset*4
;
const
float
*input_ptr
=
input
+
sample_offset
;
const
float
*input_ptr
=
input
+
image_offset
;
float
*output_ptr
=
output
+
sample_offset
;
float
*output_ptr
=
output
+
image_offset
;
*output_ptr
=
new_scale[local_channel]
*
*input_ptr
+
new_offset[local_channel]
;
const
int
end
=
(
batch
*
channels
+
channel
+
1
)
*
pixels
;
if
((
image_offset+4
)
>
end
)
{
for
(
int
i
=
image_offset
; i < end; ++i) {
*output_ptr
=
new_scale[local_channel].x
*
*input_ptr
+
new_offset[local_channel].x
;
++input_ptr
;
++output_ptr
;
}
}
else
{
float4
values
=
vload4
(
0
,
input_ptr
)
;
values
=
values
*
new_scale[local_channel]
+
new_offset[local_channel]
;
vstore4
(
values,
0
,
output_ptr
)
;
}
}
}
mace/utils/BUILD
浏览文件 @
c93b928a
...
@@ -39,6 +39,7 @@ cc_library(
...
@@ -39,6 +39,7 @@ cc_library(
copts
=
[
"-std=c++11"
],
copts
=
[
"-std=c++11"
],
deps
=
[
deps
=
[
"//mace/core"
,
"//mace/core"
,
"//mace/core:opencl_runtime"
,
],
],
)
)
...
...
mace/utils/tuner.h
浏览文件 @
c93b928a
...
@@ -33,7 +33,7 @@ class Tuner {
...
@@ -33,7 +33,7 @@ class Tuner {
const
std
::
function
<
std
::
vector
<
std
::
vector
<
param_type
>>
()
>
&
param_generator
,
const
std
::
function
<
std
::
vector
<
std
::
vector
<
param_type
>>
()
>
&
param_generator
,
const
std
::
function
<
RetType
(
const
std
::
vector
<
param_type
>
&
)
>
&
func
)
{
const
std
::
function
<
RetType
(
const
std
::
vector
<
param_type
>
&
)
>
&
func
)
{
if
(
IsTuning
())
{
if
(
IsTuning
()
&&
param_generator
!=
nullptr
)
{
// tune
// tune
std
::
vector
<
param_type
>
opt_param
=
default_param
;
std
::
vector
<
param_type
>
opt_param
=
default_param
;
RetType
res
=
Tune
<
RetType
>
(
param_generator
,
func
,
opt_param
);
RetType
res
=
Tune
<
RetType
>
(
param_generator
,
func
,
opt_param
);
...
@@ -68,7 +68,7 @@ class Tuner {
...
@@ -68,7 +68,7 @@ class Tuner {
}
}
inline
void
WriteRunParameters
()
{
inline
void
WriteRunParameters
()
{
VLOG
(
0
)
<<
path_
;
VLOG
(
1
)
<<
path_
;
if
(
path_
!=
nullptr
)
{
if
(
path_
!=
nullptr
)
{
std
::
ofstream
ofs
(
path_
,
std
::
ios
::
binary
|
std
::
ios
::
out
);
std
::
ofstream
ofs
(
path_
,
std
::
ios
::
binary
|
std
::
ios
::
out
);
if
(
ofs
.
is_open
())
{
if
(
ofs
.
is_open
())
{
...
@@ -78,14 +78,14 @@ class Tuner {
...
@@ -78,14 +78,14 @@ class Tuner {
int32_t
key_size
=
kp
.
first
.
size
();
int32_t
key_size
=
kp
.
first
.
size
();
ofs
.
write
(
reinterpret_cast
<
char
*>
(
&
key_size
),
sizeof
(
key_size
));
ofs
.
write
(
reinterpret_cast
<
char
*>
(
&
key_size
),
sizeof
(
key_size
));
ofs
.
write
(
kp
.
first
.
c_str
(),
key_size
);
ofs
.
write
(
kp
.
first
.
c_str
(),
key_size
);
VLOG
(
0
)
<<
kp
.
first
.
c_str
();
VLOG
(
1
)
<<
kp
.
first
.
c_str
();
auto
&
params
=
kp
.
second
;
auto
&
params
=
kp
.
second
;
int32_t
params_size
=
params
.
size
()
*
sizeof
(
param_type
);
int32_t
params_size
=
params
.
size
()
*
sizeof
(
param_type
);
ofs
.
write
(
reinterpret_cast
<
char
*>
(
&
params_size
),
sizeof
(
params_size
));
ofs
.
write
(
reinterpret_cast
<
char
*>
(
&
params_size
),
sizeof
(
params_size
));
for
(
auto
&
param
:
params
)
{
for
(
auto
&
param
:
params
)
{
ofs
.
write
(
reinterpret_cast
<
char
*>
(
&
param
),
sizeof
(
params_size
));
ofs
.
write
(
reinterpret_cast
<
char
*>
(
&
param
),
sizeof
(
params_size
));
VLOG
(
0
)
<<
param
;
VLOG
(
1
)
<<
param
;
}
}
}
}
ofs
.
close
();
ofs
.
close
();
...
@@ -144,7 +144,7 @@ class Tuner {
...
@@ -144,7 +144,7 @@ class Tuner {
}
}
template
<
typename
RetType
>
template
<
typename
RetType
>
inline
RetType
Tune
(
std
::
function
<
std
::
vector
<
std
::
vector
<
param_type
>>
()
>
param_generator
,
inline
RetType
Tune
(
const
std
::
function
<
std
::
vector
<
std
::
vector
<
param_type
>>
()
>
&
param_generator
,
const
std
::
function
<
RetType
(
const
std
::
vector
<
param_type
>
&
)
>
&
func
,
const
std
::
function
<
RetType
(
const
std
::
vector
<
param_type
>
&
)
>
&
func
,
std
::
vector
<
param_type
>
&
opt_params
)
{
std
::
vector
<
param_type
>
&
opt_params
)
{
RetType
res
;
RetType
res
;
...
...
mace/utils/tuner_test.cc
浏览文件 @
c93b928a
...
@@ -13,7 +13,8 @@ class TunerTest: public ::testing::Test {
...
@@ -13,7 +13,8 @@ class TunerTest: public ::testing::Test {
protected:
protected:
virtual
void
SetUp
()
{
virtual
void
SetUp
()
{
remove
(
"/data/local/tmp/mace.config"
);
remove
(
"/data/local/tmp/mace.config"
);
setenv
(
"MACE_RUN_PARAMTER_PATH"
,
"/data/local/tmp/mace.config"
,
1
);
setenv
(
"MACE_RUN_PARAMETER_PATH"
,
"/data/local/tmp/mace.config"
,
1
);
setenv
(
"MACE_TUNING"
,
"1"
,
1
);
}
}
};
};
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录