Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Xiaomi
Mace
提交
44d4903d
Mace
项目概览
Xiaomi
/
Mace
通知
107
Star
40
Fork
27
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
提交
44d4903d
编写于
3月 28, 2018
作者:
Y
yejianwu
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
remove compatible 1.1 1.2 for fully_connected
上级
bdd6ff45
变更
2
隐藏空白更改
内联
并排
Showing
2 changed file
with
11 addition
and
68 deletion
+11
-68
mace/kernels/opencl/cl/fully_connected.cl
mace/kernels/opencl/cl/fully_connected.cl
+1
-33
mace/kernels/opencl/fully_connected_opencl.cc
mace/kernels/opencl/fully_connected_opencl.cc
+10
-35
未找到文件。
mace/kernels/opencl/cl/fully_connected.cl
浏览文件 @
44d4903d
...
@@ -10,23 +10,9 @@ __kernel void fully_connected(__read_only image2d_t input,
...
@@ -10,23 +10,9 @@ __kernel void fully_connected(__read_only image2d_t input,
__private
const
int
input_height,
__private
const
int
input_height,
__private
const
int
input_width,
__private
const
int
input_width,
__private
const
int
input_channel,
__private
const
int
input_channel,
#
ifndef
USE_QUALCOMM_OPENCL_2_0
__private
const
float
relux_max_limit,
__private
const
int
global_size_dim0,
__private
const
int
global_size_dim1
)
{
#
else
__private
const
float
relux_max_limit
)
{
__private
const
float
relux_max_limit
)
{
#
endif
const
int
batch_idx
=
get_global_id
(
0
)
;
const
int
batch_idx
=
get_global_id
(
0
)
;
const
int
out_blk_idx
=
get_global_id
(
1
)
;
const
int
out_blk_idx
=
get_global_id
(
1
)
;
#
ifndef
USE_QUALCOMM_OPENCL_2_0
if
(
batch_idx
>=
global_size_dim0
|
| out_blk_idx >= global_size_dim1) {
return;
}
#endif
const
int
input_chan_blk
=
(
input_channel
+
3
)
>>
2
;
const
int
input_chan_blk
=
(
input_channel
+
3
)
>>
2
;
float4
input_value
;
float4
input_value
;
...
@@ -82,29 +68,11 @@ __kernel void fully_connected_width(__read_only image2d_t input,
...
@@ -82,29 +68,11 @@ __kernel void fully_connected_width(__read_only image2d_t input,
__private const int input_width,
__private const int input_width,
__private const int in_chan_blks,
__private const int in_chan_blks,
__private const int out_blks,
__private const int out_blks,
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const float relux_max_limit,
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2) {
#else
__private const float relux_max_limit) {
__private const float relux_max_limit) {
#endif
const int inter_out_idx = get_global_id(0);
const int inter_out_idx = get_global_id(0);
const int width_blk_idx = get_global_id(1);
const int width_blk_idx = get_global_id(1);
const int batch_out_blk_idx = get_global_id(2);
#ifndef USE_QUALCOMM_OPENCL_2_0
if (inter_out_idx >= global_size_dim0 || width_blk_idx >= global_size_dim1
|| batch_out_blk_idx >= global_size_dim2) {
return;
}
const int width_blk_count = global_size_dim1;
#else
const int width_blk_count = get_global_size(1);
const int width_blk_count = get_global_size(1);
#endif
const int batch_out_blk_idx = get_global_id(2);
const int batch_idx = batch_out_blk_idx / out_blks;
const int batch_idx = batch_out_blk_idx / out_blks;
const int out_blk_idx = batch_out_blk_idx % out_blks;
const int out_blk_idx = batch_out_blk_idx % out_blks;
...
...
mace/kernels/opencl/fully_connected_opencl.cc
浏览文件 @
44d4903d
...
@@ -24,11 +24,8 @@ void FCWXKernel(cl::Kernel *kernel,
...
@@ -24,11 +24,8 @@ void FCWXKernel(cl::Kernel *kernel,
<<
"FC width kernel only support input with 4x channel."
;
<<
"FC width kernel only support input with 4x channel."
;
MACE_CHECK_NOTNULL
(
gws
);
MACE_CHECK_NOTNULL
(
gws
);
MACE_CHECK_NOTNULL
(
lws
);
MACE_CHECK_NOTNULL
(
lws
);
auto
runtime
=
OpenCLRuntime
::
Global
();
auto
runtime
=
OpenCLRuntime
::
Global
();
const
bool
is_qualcomm_opencl200
=
IsQualcommOpenCL200
();
if
(
kernel
->
get
()
==
nullptr
)
{
if
(
kernel
->
get
()
==
nullptr
)
{
std
::
set
<
std
::
string
>
built_options
;
std
::
set
<
std
::
string
>
built_options
;
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
...
@@ -37,9 +34,6 @@ void FCWXKernel(cl::Kernel *kernel,
...
@@ -37,9 +34,6 @@ void FCWXKernel(cl::Kernel *kernel,
built_options
.
emplace
(
"-Dfully_connected_width="
+
kernel_name
);
built_options
.
emplace
(
"-Dfully_connected_width="
+
kernel_name
);
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
if
(
is_qualcomm_opencl200
)
{
built_options
.
emplace
(
"-DUSE_QUALCOMM_OPENCL_2_0"
);
}
if
(
bias
!=
nullptr
)
{
if
(
bias
!=
nullptr
)
{
built_options
.
emplace
(
"-DBIAS"
);
built_options
.
emplace
(
"-DBIAS"
);
}
}
...
@@ -81,7 +75,6 @@ void FCWXKernel(cl::Kernel *kernel,
...
@@ -81,7 +75,6 @@ void FCWXKernel(cl::Kernel *kernel,
if
(
!
IsVecEqual
(
*
prev_input_shape
,
input
->
shape
()))
{
if
(
!
IsVecEqual
(
*
prev_input_shape
,
input
->
shape
()))
{
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
output_blocks
=
RoundUpDiv4
(
output
->
dim
(
3
));
const
index_t
output_blocks
=
RoundUpDiv4
(
output
->
dim
(
3
));
(
*
gws
)[
2
]
=
static_cast
<
uint32_t
>
(
batch
*
output_blocks
);
uint32_t
idx
=
0
;
uint32_t
idx
=
0
;
kernel
->
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
...
@@ -97,22 +90,14 @@ void FCWXKernel(cl::Kernel *kernel,
...
@@ -97,22 +90,14 @@ void FCWXKernel(cl::Kernel *kernel,
kernel
->
setArg
(
idx
++
,
static_cast
<
int
>
(
RoundUpDiv4
(
input
->
dim
(
3
))));
kernel
->
setArg
(
idx
++
,
static_cast
<
int
>
(
RoundUpDiv4
(
input
->
dim
(
3
))));
kernel
->
setArg
(
idx
++
,
static_cast
<
int
>
(
output_blocks
));
kernel
->
setArg
(
idx
++
,
static_cast
<
int
>
(
output_blocks
));
kernel
->
setArg
(
idx
++
,
relux_max_limit
);
kernel
->
setArg
(
idx
++
,
relux_max_limit
);
kernel
->
setArg
(
idx
++
,
(
*
gws
)[
0
]);
kernel
->
setArg
(
idx
++
,
(
*
gws
)[
1
]);
kernel
->
setArg
(
idx
++
,
(
*
gws
)[
2
]);
*
prev_input_shape
=
input
->
shape
();
(
*
gws
)[
2
]
=
static_cast
<
uint32_t
>
(
batch
*
output_blocks
);
}
std
::
vector
<
uint32_t
>
roundup_gws
(
lws
->
size
());
*
prev_input_shape
=
input
->
shape
();
for
(
size_t
i
=
0
;
i
<
lws
->
size
();
++
i
)
{
roundup_gws
[
i
]
=
RoundUp
((
*
gws
)[
i
],
(
*
lws
)[
i
]);
}
}
cl
::
Event
event
;
cl
::
Event
event
;
cl_int
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
cl_int
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
*
kernel
,
cl
::
NullRange
,
*
kernel
,
cl
::
NullRange
,
cl
::
NDRange
((
*
gws
)[
0
],
(
*
gws
)[
1
],
(
*
gws
)[
2
]),
cl
::
NDRange
(
roundup_gws
[
0
],
roundup_gws
[
1
],
roundup_gws
[
2
]),
cl
::
NDRange
((
*
lws
)[
0
],
(
*
lws
)[
1
],
(
*
lws
)[
2
]),
nullptr
,
&
event
);
cl
::
NDRange
((
*
lws
)[
0
],
(
*
lws
)[
1
],
(
*
lws
)[
2
]),
nullptr
,
&
event
);
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
...
@@ -140,21 +125,14 @@ void FCWTXKernel(cl::Kernel *kernel,
...
@@ -140,21 +125,14 @@ void FCWTXKernel(cl::Kernel *kernel,
StatsFuture
*
future
)
{
StatsFuture
*
future
)
{
MACE_CHECK_NOTNULL
(
gws
);
MACE_CHECK_NOTNULL
(
gws
);
MACE_CHECK_NOTNULL
(
lws
);
MACE_CHECK_NOTNULL
(
lws
);
auto
runtime
=
OpenCLRuntime
::
Global
();
const
bool
is_qualcomm_opencl200
=
IsQualcommOpenCL200
();
if
(
kernel
->
get
()
==
nullptr
)
{
if
(
kernel
->
get
()
==
nullptr
)
{
auto
runtime
=
OpenCLRuntime
::
Global
();
std
::
set
<
std
::
string
>
built_options
;
std
::
set
<
std
::
string
>
built_options
;
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"fully_connected"
);
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"fully_connected"
);
built_options
.
emplace
(
"-Dfully_connected="
+
kernel_name
);
built_options
.
emplace
(
"-Dfully_connected="
+
kernel_name
);
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
if
(
is_qualcomm_opencl200
)
{
built_options
.
emplace
(
"-DUSE_QUALCOMM_OPENCL_2_0"
);
}
if
(
bias
!=
nullptr
)
{
if
(
bias
!=
nullptr
)
{
built_options
.
emplace
(
"-DBIAS"
);
built_options
.
emplace
(
"-DBIAS"
);
}
}
...
@@ -183,13 +161,6 @@ void FCWTXKernel(cl::Kernel *kernel,
...
@@ -183,13 +161,6 @@ void FCWTXKernel(cl::Kernel *kernel,
}
}
if
(
!
IsVecEqual
(
*
prev_input_shape
,
input
->
shape
()))
{
if
(
!
IsVecEqual
(
*
prev_input_shape
,
input
->
shape
()))
{
uint32_t
idx
=
0
;
uint32_t
idx
=
0
;
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
output_blocks
=
RoundUpDiv4
(
output
->
dim
(
3
));
*
gws
=
{
static_cast
<
uint32_t
>
(
batch
),
static_cast
<
uint32_t
>
(
output_blocks
),
};
kernel
->
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
weight
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
weight
->
opencl_image
()));
if
(
bias
!=
nullptr
)
{
if
(
bias
!=
nullptr
)
{
...
@@ -201,9 +172,13 @@ void FCWTXKernel(cl::Kernel *kernel,
...
@@ -201,9 +172,13 @@ void FCWTXKernel(cl::Kernel *kernel,
kernel
->
setArg
(
idx
++
,
static_cast
<
int
>
(
input
->
dim
(
3
)));
kernel
->
setArg
(
idx
++
,
static_cast
<
int
>
(
input
->
dim
(
3
)));
// FIXME handle flexable data type: half not supported
// FIXME handle flexable data type: half not supported
kernel
->
setArg
(
idx
++
,
relux_max_limit
);
kernel
->
setArg
(
idx
++
,
relux_max_limit
);
kernel
->
setArg
(
idx
++
,
(
*
gws
)[
0
]);
kernel
->
setArg
(
idx
++
,
(
*
gws
)[
1
]);
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
output_blocks
=
RoundUpDiv4
(
output
->
dim
(
3
));
*
gws
=
{
static_cast
<
uint32_t
>
(
batch
),
static_cast
<
uint32_t
>
(
output_blocks
),
};
*
prev_input_shape
=
input
->
shape
();
*
prev_input_shape
=
input
->
shape
();
}
}
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录