Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Xiaomi
Mace
提交
b061818c
Mace
项目概览
Xiaomi
/
Mace
通知
106
Star
40
Fork
27
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
b061818c
编写于
10月 30, 2017
作者:
L
Liangliang He
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'opencl' into 'master'
Update conv 1x1 opencl kernel See merge request !80
上级
1ac6b979
7b480af1
变更
4
隐藏空白更改
内联
并排
Showing
4 changed files
with
100 additions
and
113 deletions
+100
-113
mace/kernels/opencl/cl/assign_f32.cl
mace/kernels/opencl/cl/assign_f32.cl
+0
-42
mace/kernels/opencl/cl/conv_2d_1x1.cl
mace/kernels/opencl/cl/conv_2d_1x1.cl
+46
-13
mace/kernels/opencl/conv_2d_opencl_1x1.cc
mace/kernels/opencl/conv_2d_opencl_1x1.cc
+52
-58
mace/ops/conv_2d_benchmark.cc
mace/ops/conv_2d_benchmark.cc
+2
-0
未找到文件。
mace/kernels/opencl/cl/assign_f32.cl
已删除
100644 → 0
浏览文件 @
1ac6b979
/*
 * Fill a flat float buffer with a single constant.
 *
 * Work-item g (get_global_id(0)) owns the run of 16 consecutive floats that
 * starts at element g * 16. A full run is written with four vstore4 calls;
 * a shorter trailing run is written element by element, so nothing at or
 * beyond index `pixels` is touched.
 *
 *   output : destination buffer
 *   value  : constant stored into every element
 *   pixels : number of floats to fill
 */
void kernel assign_v16_f32(global float *output,
                           private const float value,
                           private const int pixels) {
  const int pixel_block = get_global_id(0);
  const int pixel_offset = pixel_block * 16;

  /* The pointer must be qualified `global`: in OpenCL C 1.x an unqualified
   * pointer declared inside a function refers to the private address space,
   * and initialising it from a global pointer is not allowed. */
  global float *output_ptr = output + pixel_offset;
  const int remains = pixels - pixel_offset;

  if (remains >= 16) {
    /* Replicate the scalar explicitly so vstore4 receives a real float4. */
    const float4 splat = (float4)(value);
    for (int i = 0; i < 4; ++i) {
      /* vstore4(data, i, p) writes four floats starting at p + 4 * i. */
      vstore4(splat, i, output_ptr);
    }
  } else {
    /* Partial (or empty, when remains <= 0) tail run. */
    for (int i = 0; i < remains; ++i) {
      output_ptr[i] = value;
    }
  }
}
/*
 * Fill every (batch, channel) plane of `output` with that channel's constant
 * taken from `values`.
 *
 * Work-item layout:
 *   dimension 0 -> batch index
 *   dimension 1 -> channel index; the channel count is read from
 *                  get_global_size(1), so the launch size in this dimension
 *                  is what the plane offset is computed from
 *   dimension 2 -> index of a 16-float run inside the plane
 *
 * A full run is written with four vstore4 calls; a shorter trailing run is
 * written element by element, so nothing at or beyond index `pixels` of the
 * plane is touched.
 *
 *   output : destination buffer, addressed as
 *            (batch * channels + channel) * pixels + offset
 *   values : one constant per channel, indexed by the channel id
 *   pixels : number of floats in one (batch, channel) plane
 */
void kernel assign_3d_v16_f32(global float *output,
                              global const float *values,
                              private const int pixels) {
  const int batch = get_global_id(0);
  const int channel = get_global_id(1);
  const int channels = get_global_size(1);
  const int pixel_block = get_global_id(2);
  const int pixel_offset = pixel_block * 16;

  const float value = values[channel];

  /* The pointer must be qualified `global`: in OpenCL C 1.x an unqualified
   * pointer declared inside a function refers to the private address space,
   * and initialising it from a global pointer is not allowed. */
  global float *output_ptr =
      output + (batch * channels + channel) * pixels + pixel_offset;
  const int remains = pixels - pixel_offset;

  if (remains >= 16) {
    /* Replicate the scalar explicitly so vstore4 receives a real float4. */
    const float4 splat = (float4)(value);
    for (int i = 0; i < 4; ++i) {
      /* vstore4(data, i, p) writes four floats starting at p + 4 * i. */
      vstore4(splat, i, output_ptr);
    }
  } else {
    /* Partial (or empty, when remains <= 0) tail run. */
    for (int i = 0; i < remains; ++i) {
      output_ptr[i] = value;
    }
  }
}
mace/kernels/opencl/cl/conv_2d_1x1.cl
浏览文件 @
b061818c
/*
*
Split
work
item
along
output
channels
and
pixels
*/
void
kernel
conv_2d_1x1_nchw
(
global
const
float
*input,
/*
n,
c,
h,
w
*/
global
const
float
*filter,
/*
o,
i,
kh,
kw
*/
global
float
*output,
/*
n,
c,
h,
w
*/
private
const
int
in_offset,
private
const
int
out_offset,
private
const
int
pixel_num,
private
const
int
in_chan_num,
private
const
int
out_chan_num
)
{
int
out_chan_blk
=
get_global_id
(
0
)
;
int
out_pixel_blk
=
get_global_id
(
1
)
;
/*
 * Reference 1x1 convolution: one work-item produces one output element.
 *
 * Work-item layout:
 *   dimension 0 -> batch index
 *   dimension 1 -> output channel; the output channel count is read from
 *                  get_global_size(1)
 *   dimension 2 -> pixel index; the per-plane pixel count is read from
 *                  get_global_size(2)
 *
 * For its (batch, channel, pixel) the work-item computes
 *   bias[channel] + sum over inc of
 *       input[(batch * input_channels + inc) * pixels + pixel]
 *     * filter[channel * input_channels + inc]
 * and stores it at output[(batch * channels + channel) * pixels + pixel].
 *
 *   input          : source tensor
 *   filter         : weights, indexed as channel * input_channels + inc
 *   bias           : one value per output channel
 *   output         : destination tensor
 *   input_channels : number of input channels to reduce over
 */
void kernel conv_2d_1x1_naive(global const float *input,  /* n, c, h, w */
                              global const float *filter, /* o, i, kh, kw */
                              global const float *bias,   /* o */
                              global float *output,       /* n, c, h, w */
                              private const int input_channels) {
  const int batch = get_global_id(0);
  const int channel = get_global_id(1);
  const int channels = get_global_size(1);
  const int pixel = get_global_id(2);
  const int pixels = get_global_size(2);

  /* Pointers into global memory carry the `global` qualifier; an unqualified
   * local pointer would refer to the private address space in OpenCL C 1.x. */
  global const float *input_ptr =
      input + batch * input_channels * pixels + pixel;
  global const float *filter_ptr = filter + channel * input_channels;

  /* Accumulate in a private register. The earlier version seeded
   * output_ptr[pixel] with the bias but then accumulated into output_ptr[0],
   * so all pixel work-items of a plane raced on element 0 and no pixel ever
   * received its result. */
  float sum = bias[channel];
  for (int inc = 0; inc < input_channels; ++inc) {
    /* Consecutive input channels of one pixel are `pixels` floats apart. */
    sum += input_ptr[inc * pixels] * filter_ptr[inc];
  }

  /* Single store to this work-item's own output element. */
  output[(batch * channels + channel) * pixels + pixel] = sum;
}
void
kernel
conv_2d_1x1_v2
(
global
const
float
*input,
/*
n,
c,
h,
w
*/
global
const
float
*filter,
/*
o,
i,
kh,
kw
*/
global
const
float
*bias,
/*
o
*/
global
float
*output,
/*
n,
c,
h,
w
*/
private
const
int
in_chan_num,
private
const
int
out_chan_num,
private
const
int
pixel_num
)
{
int
batch
=
get_global_id
(
0
)
;
int
out_chan_blk
=
get_global_id
(
1
)
;
int
out_pixel_blk
=
get_global_id
(
2
)
;
const
int
out_chan_begin
=
out_chan_blk
*
4
;
const
int
out_chan_end
=
min
(
out_chan_begin
+
4
,
out_chan_num
)
;
const
int
out_pixel_begin
=
out_pixel_blk
*
4
;
const
int
out_pixel_end
=
min
(
out_pixel_begin
+
4
,
pixel_num
)
;
const
int
in_offset
=
batch
*
in_chan_num
*
pixel_num
;
const
int
out_offset
=
batch
*
out_chan_num
*
pixel_num
;
const
float
*input_base
=
input
+
in_offset
+
out_pixel_begin
;
float
*output_base
=
output
+
out_offset
+
out_pixel_begin
;
int
pixels
=
out_pixel_end
-
out_pixel_begin
;
for
(
int
out_chan
=
out_chan_begin
; out_chan < out_chan_end; ++out_chan) {
float
bias_value
=
bias[out_chan]
;
float
*output_ptr
=
output_base
+
out_chan
*
pixel_num
;
for
(
int
p
=
0
; p < pixels; ++p) {
output_ptr[p]
=
bias_value
;
}
}
int
in_chan
=
0
;
if
(
pixels
==
4
)
{
for
(
; in_chan + 3 < in_chan_num; in_chan += 4) {
...
...
mace/kernels/opencl/conv_2d_opencl_1x1.cc
浏览文件 @
b061818c
...
...
@@ -10,49 +10,41 @@
namespace
mace
{
namespace
kernels
{
static
constexpr
index_t
kInputChannelBlockSize
=
2
;
static
constexpr
index_t
kOutputChannelBlockSize
=
4
;
void
Conv1x1Naive
(
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
Tensor
*
output
)
{
const
index_t
batch
=
output
->
shape
()[
0
];
const
index_t
channels
=
output
->
shape
()[
1
];
const
index_t
height
=
output
->
shape
()[
2
];
const
index_t
width
=
output
->
shape
()[
3
];
const
index_t
input_channels
=
input
->
shape
()[
1
];
void
AssignBias
(
Tensor
*
output
,
const
Tensor
*
bias
)
{
auto
runtime
=
OpenCLRuntime
::
Get
();
auto
program
=
runtime
->
program
();
if
(
bias
==
nullptr
)
{
auto
assign_bias
=
cl
::
KernelFunctor
<
cl
::
Buffer
,
float
,
int
>
(
program
,
"assign_v16_f32"
);
index_t
pixels
=
output
->
NumElements
();
index_t
blocks
=
(
pixels
+
15
)
/
16
;
cl_int
error
;
assign_bias
(
cl
::
EnqueueArgs
(
runtime
->
command_queue
(),
cl
::
NDRange
(
blocks
),
cl
::
NullRange
),
*
(
static_cast
<
cl
::
Buffer
*>
(
output
->
buffer
())),
0.0
f
,
static_cast
<
int
>
(
pixels
),
error
);
MACE_CHECK
(
error
==
CL_SUCCESS
);
}
else
{
auto
output_shape
=
output
->
shape
();
index_t
batch
=
output_shape
[
0
];
index_t
channels
=
output_shape
[
1
];
index_t
pixels
=
output_shape
[
2
]
*
output_shape
[
3
];
index_t
blocks
=
(
pixels
+
15
)
/
16
;
MACE_CHECK
(
channels
==
bias
->
shape
()[
0
],
"Channels mismatch"
);
auto
conv_2d
=
cl
::
KernelFunctor
<
cl
::
Buffer
,
cl
::
Buffer
,
cl
::
Buffer
,
cl
::
Buffer
,
int
,
int
>
(
program
,
"conv_2d_1x1_naive"
);
const
index_t
pixels
=
height
*
width
;
auto
assign_bias
=
cl
::
KernelFunctor
<
cl
::
Buffer
,
cl
::
Buffer
,
int
>
(
program
,
"assign_3d_v16_f32"
);
cl_int
error
;
assign_bias
(
cl
::
EnqueueArgs
(
runtime
->
command_queue
(),
cl
::
NDRange
(
batch
,
channels
,
blocks
),
cl
::
NDRange
(
1
,
8
,
128
)),
*
(
static_cast
<
cl
::
Buffer
*>
(
output
->
buffer
())),
*
(
static_cast
<
cl
::
Buffer
*>
(
bias
->
buffer
())),
static_cast
<
int
>
(
pixels
),
error
);
MACE_CHECK
(
error
==
CL_SUCCESS
);
}
}
cl_int
error
;
conv_2d
(
cl
::
EnqueueArgs
(
runtime
->
command_queue
(),
cl
::
NDRange
(
static_cast
<
int
>
(
batch
),
static_cast
<
int
>
(
channels
),
static_cast
<
int
>
(
pixels
)),
cl
::
NDRange
(
1
,
1
,
128
)),
*
(
static_cast
<
cl
::
Buffer
*>
(
input
->
buffer
())),
*
(
static_cast
<
cl
::
Buffer
*>
(
filter
->
buffer
())),
*
(
static_cast
<
cl
::
Buffer
*>
(
bias
->
buffer
())),
*
(
static_cast
<
cl
::
Buffer
*>
(
output
->
buffer
())),
static_cast
<
int
>
(
input_channels
),
error
);
MACE_CHECK
(
error
==
CL_SUCCESS
);
}
void
Conv1x1NCHW
(
const
Tensor
*
input
,
const
Tensor
*
filter
,
Tensor
*
output
)
{
void
Conv1x1V2
(
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
Tensor
*
output
)
{
const
index_t
batch
=
output
->
shape
()[
0
];
const
index_t
channels
=
output
->
shape
()[
1
];
const
index_t
height
=
output
->
shape
()[
2
];
...
...
@@ -61,25 +53,27 @@ void Conv1x1NCHW(const Tensor *input,
auto
runtime
=
OpenCLRuntime
::
Get
();
auto
program
=
runtime
->
program
();
auto
conv_2d
=
cl
::
KernelFunctor
<
cl
::
Buffer
,
cl
::
Buffer
,
cl
::
Buffer
,
int
,
int
,
int
,
int
,
int
>
(
program
,
"conv_2d_1x1_nchw"
);
const
index_t
total_pixels
=
height
*
width
;
auto
conv_2d
=
cl
::
KernelFunctor
<
cl
::
Buffer
,
cl
::
Buffer
,
cl
::
Buffer
,
cl
::
Buffer
,
int
,
int
,
int
,
int
>
(
program
,
"conv_2d_1x1_v2"
);
const
index_t
pixels
=
height
*
width
;
const
index_t
channel_blocks
=
(
channels
+
3
)
/
4
;
const
index_t
pixel_blocks
=
(
pixels
+
3
)
/
4
;
for
(
int
b
=
0
;
b
<
batch
;
++
b
)
{
int
input_offset
=
b
*
input_channels
*
total_pixels
;
int
output_offset
=
b
*
channels
*
total_pixels
;
int
chan_blk_num
=
(
channels
+
3
)
>>
2
;
// each 4 output channels
int
pixel_blk_num
=
(
total_pixels
+
3
)
>>
2
;
// each 4 pixels
cl_int
error
;
conv_2d
(
cl
::
EnqueueArgs
(
runtime
->
command_queue
(
),
cl
::
NDRange
(
chan_blk_num
,
pixel_blk_num
),
cl
::
NDRange
(
1
,
256
)),
*
(
static_cast
<
cl
::
Buffer
*>
(
in
put
->
buffer
())),
*
(
static_cast
<
cl
::
Buffer
*>
(
filter
->
buffer
())
),
*
(
static_cast
<
cl
::
Buffer
*>
(
output
->
buffer
())
),
input_offset
,
output_offset
,
total_pixels
,
input_channels
,
channels
,
error
);
MACE_CHECK
(
error
==
CL_SUCCESS
);
}
cl_int
error
;
conv_2d
(
cl
::
EnqueueArgs
(
runtime
->
command_queue
(),
cl
::
NDRange
(
static_cast
<
int
>
(
batch
),
static_cast
<
int
>
(
channel_blocks
),
static_cast
<
int
>
(
pixel_blocks
)),
cl
::
NDRange
(
1
,
1
,
256
)),
*
(
static_cast
<
cl
::
Buffer
*>
(
input
->
buffer
())
),
*
(
static_cast
<
cl
::
Buffer
*>
(
filter
->
buffer
())
),
*
(
static_cast
<
cl
::
Buffer
*>
(
bias
->
buffer
()
)),
*
(
static_cast
<
cl
::
Buffer
*>
(
out
put
->
buffer
())),
static_cast
<
int
>
(
input_channels
),
static_cast
<
int
>
(
channels
),
static_cast
<
int
>
(
pixels
),
error
);
MACE_CHECK
(
error
==
CL_SUCCESS
);
}
extern
void
Conv2dOpenclK1x1S1
(
const
Tensor
*
input
,
const
Tensor
*
filter
,
...
...
@@ -95,8 +89,8 @@ extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter,
MACE_CHECK
(
input_batch
==
batch
&&
input_height
==
height
&&
input_width
==
width
);
AssignBias
(
output
,
bias
);
Conv1x1
NCHW
(
input
,
filter
,
output
);
// Conv1x1Naive(input, filter, bias, output
);
Conv1x1
V2
(
input
,
filter
,
bias
,
output
);
};
}
// namespace kernels
...
...
mace/ops/conv_2d_benchmark.cc
浏览文件 @
b061818c
...
...
@@ -46,11 +46,13 @@ static void Conv2d(int iters,
// Warm-up
for
(
int
i
=
0
;
i
<
5
;
++
i
)
{
net
.
RunOp
(
D
);
net
.
Sync
();
}
mace
::
testing
::
StartTiming
();
while
(
iters
--
)
{
net
.
RunOp
(
D
);
net
.
Sync
();
}
}
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录