Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
野马c
Mace
提交
b469a945
Mace
项目概览
野马c
/
Mace
与 Fork 源项目一致
Fork自
Xiaomi / Mace
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
b469a945
编写于
11月 03, 2017
作者:
L
liuqi
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Finish depthwise 3x3 conv with stride 2.
上级
a7290142
变更
6
隐藏空白更改
内联
并排
Showing
6 changed file
with
133 addition
and
97 deletion
+133
-97
mace/kernels/opencl/cl/conv_2d_3x3.cl
mace/kernels/opencl/cl/conv_2d_3x3.cl
+5
-12
mace/kernels/opencl/cl/conv_helper.h
mace/kernels/opencl/cl/conv_helper.h
+15
-0
mace/kernels/opencl/cl/depthwise_conv_3x3.cl
mace/kernels/opencl/cl/depthwise_conv_3x3.cl
+49
-44
mace/kernels/opencl/conv_2d_opencl_3x3.cc
mace/kernels/opencl/conv_2d_opencl_3x3.cc
+14
-14
mace/kernels/opencl/depthwise_conv_opencl.cc
mace/kernels/opencl/depthwise_conv_opencl.cc
+3
-1
mace/kernels/opencl/depthwise_conv_opencl_3x3.cc
mace/kernels/opencl/depthwise_conv_opencl_3x3.cc
+47
-26
未找到文件。
mace/kernels/opencl/cl/conv_2d_3x3.cl
浏览文件 @
b469a945
float4
conv1x3_s1
(
const
float
*input_ptr,
const
float
*filter_ptr
)
;
float4
conv1x3_s2
(
const
float
*input_ptr,
const
float
*filter_ptr
)
;
float
conv3x3
(
const
float
*input_ptr,
const
float
*filter_ptr,
const
int
row_width
)
;
#
include
<conv_helper.h>
void
kernel
conv_2d_3x3
(
global
const
float
*input,
global
const
float
*filter,
global
const
float
*bias,
...
...
@@ -18,9 +11,9 @@ void kernel conv_2d_3x3(global const float *input,
private
const
uint
out_width,
private
const
uint
stride_h,
private
const
uint
stride_w
)
{
int
batch
=
get_global_id
(
0
)
;
int
out_chan_blk
=
get_global_id
(
1
)
;
int
out_pixel_blk
=
get_global_id
(
2
)
;
const
int
batch
=
get_global_id
(
0
)
;
const
int
out_chan_blk
=
get_global_id
(
1
)
;
const
int
out_pixel_blk
=
get_global_id
(
2
)
;
const
uint
in_pixel
=
in_height
*
in_width
;
const
uint
out_pixel
=
out_height
*
out_width
;
...
...
@@ -43,10 +36,10 @@ void kernel conv_2d_3x3(global const float *input,
uint
pixels
=
out_pixel_end
-
out_pixel_begin
;
for
(
uint
i
=
out_chan_begin
; i < out_chan_end; ++i) {
float4
res
=
(
float4
)
bias[i]
;
float
*output_ptr
=
output_base
+
i
*
out_pixel
;
const
float
*filter_base
=
filter
+
i
*
in_chan_num
*
9
;
if
(
pixels
==
4
)
{
float4
res
=
(
float4
)
bias[i]
;
for
(
uint
in_chan_idx
=
0
; in_chan_idx < in_chan_num; ++in_chan_idx) {
const
float*
input_ptr
=
input_base
+
in_chan_idx
*
in_pixel
;
const
float*
filter_ptr
=
filter_base
+
in_chan_idx
*
9
;
...
...
mace/kernels/opencl/cl/conv_helper.h
0 → 100644
浏览文件 @
b469a945
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_KERNELS_OPENCL_CL_CONV_HELPER_H_
#define MACE_KERNELS_OPENCL_CL_CONV_HELPER_H_
float4
conv1x3_s1
(
const
float
*
input_ptr
,
const
float
*
filter_ptr
);
float4
conv1x3_s2
(
const
float
*
input_ptr
,
const
float
*
filter_ptr
);
float
conv3x3
(
const
float
*
input_ptr
,
const
float
*
filter_ptr
,
const
int
row_width
);
#endif // MACE_KERNELS_OPENCL_CL_CONV_HELPER_H_
mace/kernels/opencl/cl/depthwise_conv_3x3.cl
浏览文件 @
b469a945
float4
conv1x3_s1
(
const
float
*input_ptr,
const
float
*filter_ptr
)
;
float
conv3x3
(
const
float
*input_ptr,
const
float
*filter_ptr,
const
int
row_width
)
;
#
include
<conv_helper.h>
//TODO
merge
the
depthwise
with
conv
3x3
to
remove
duplicate
code.
void
kernel
depthwise_conv_3x3
(
global
const
float
*input,
/*
n,
c,
h,
w
*/
global
const
float
*filter,
/*
m,
i,
kh,
kw
*/
global
const
float
*bias,
/*
o
*/
global
float
*output,
/*
n,
c,
h,
w
*/
private
const
uint
in_chan_num,
private
const
uint
out_chan_num,
private
const
uint
in_height,
private
const
uint
in_width,
private
const
uint
out_height,
private
const
uint
out_width,
private
const
uint
stride_h,
private
const
uint
stride_w
)
{
const
int
batch
=
get_global_id
(
0
)
;
const
int
out_chan_blk
=
get_global_id
(
1
)
;
const
int
out_pixel_blk
=
get_global_id
(
2
)
;
void
kernel
depthwise_conv_3x3_s1
(
global
const
float
*input,
/*
n,
c,
h,
w
*/
global
const
float
*filter,
/*
m,
i,
kh,
kw
*/
global
const
float
*bias,
/*
o
*/
global
float
*output,
/*
n,
c,
h,
w
*/
private
const
int
in_chan_num,
private
const
int
out_chan_num,
private
const
int
in_height,
private
const
int
in_width,
private
const
int
out_height,
private
const
int
out_width
)
{
int
batch
=
get_global_id
(
0
)
;
int
out_chan_blk
=
get_global_id
(
1
)
;
int
out_pixel_blk
=
get_global_id
(
2
)
;
const
uint
in_pixel
=
in_height
*
in_width
;
const
uint
out_pixel
=
out_height
*
out_width
;
const
uint
multiplier
=
out_chan_num
/
in_chan_num
;
const
int
in_pixel
=
in_height
*
in_width
;
const
int
out_pixel
=
out_height
*
out_width
;
const
int
multiplier
=
out_chan_num
/
in_chan_num
;
const
uint
round_out_width
=
(
out_width
+
3
)
/
4
;
const
uint
out_pixel_height
=
out_pixel_blk
/
round_
out_width
;
const
uint
out_pixel_width
=
out_pixel_blk
%
round_out_width
;
const
int
round_out_width
=
(
out_width
+
3
)
/
4
;
const
int
out_pixel_height
=
out_pixel_blk
/
round_out_width
;
const
int
out_pixel_width
=
out_pixel_blk
%
round_out_width
;
const
uint
out_chan_begin
=
out_chan_blk
*
4
;
const
uint
out_chan_end
=
min
(
out_chan_begin
+
4
,
out_chan_num
)
;
const
uint
out_pixel_begin
=
out_pixel_height
*
out_width
+
out_pixel_width
*
4
;
const
uint
out_pixel_end
=
min
(
out_pixel_begin
+
4
,
(
out_pixel_height
+
1
)
*
out_width
)
;
const
uint
in_pixel_begin
=
out_pixel_height
*
stride_h
*
in_width
+
out_pixel_width
*
stride_w
*
4
;
const
int
out_chan_begin
=
out_chan_blk
*
4
;
const
int
out_chan_end
=
min
(
out_chan_begin
+
4
,
out_chan_num
)
;
const
int
out_pixel_begin
=
out_pixel_height
*
out_width
+
out_pixel_width
*
4
;
const
int
out_pixel_end
=
min
(
out_pixel_begin
+
4
,
(
out_pixel_height
+
1
)
*
out_width
)
;
const
int
in_pixel_begin
=
out_pixel_height
*
in_width
+
out_pixel_width
*
4
;
const
int
in_offset
=
batch
*
in_chan_num
*
in_pixel
;
const
int
out_offset
=
batch
*
out_chan_num
*
out_pixel
;
const
uint
in_offset
=
batch
*
in_chan_num
*
in_pixel
;
const
uint
out_offset
=
batch
*
out_chan_num
*
out_pixel
;
const
float
*input_base
=
input
+
in_offset
+
in_pixel_begin
;
float
*output_base
=
output
+
out_offset
+
out_pixel_begin
;
int
pixels
=
out_pixel_end
-
out_pixel_begin
;
u
int
pixels
=
out_pixel_end
-
out_pixel_begin
;
for
(
int
i
=
out_chan_begin
; i < out_chan_end; ++i) {
for
(
u
int
i
=
out_chan_begin
; i < out_chan_end; ++i) {
float
bias_value
=
bias[i]
;
const
float
*input_ptr
=
input_base
+
(
i
/
multiplier
)
*
in_pixel
;
const
float
*filter_ptr
=
filter
+
i
*
9
;
float
*output_ptr
=
output_base
+
i
*
out_pixel
;
if
(
pixels
<
4
)
{
for
(
int
out_idx
=
0
; out_idx < pixels; ++out_idx) {
output_ptr[out_idx]
=
bias_value
;
output_ptr[out_idx]
+=
conv3x3
(
input_ptr,
filter_ptr,
in_width
)
;
input_ptr
+=
1
;
if
(
pixels
==
4
)
{
float4
res
=
(
float4
)
bias[i]
;
if
(
stride_w
==
1
)
{
res
+=
conv1x3_s1
(
input_ptr
+
0
*
in_width,
filter_ptr
+
0
*
3
)
;
res
+=
conv1x3_s1
(
input_ptr
+
1
*
in_width,
filter_ptr
+
1
*
3
)
;
res
+=
conv1x3_s1
(
input_ptr
+
2
*
in_width,
filter_ptr
+
2
*
3
)
;
}
else
{
res
+=
conv1x3_s2
(
input_ptr
+
0
*
in_width,
filter_ptr
+
0
*
3
)
;
res
+=
conv1x3_s2
(
input_ptr
+
1
*
in_width,
filter_ptr
+
1
*
3
)
;
res
+=
conv1x3_s2
(
input_ptr
+
2
*
in_width,
filter_ptr
+
2
*
3
)
;
}
}
else
{
float4
res
=
(
float4
)
bias_value
;
res
+=
conv1x3_s1
(
input_ptr
+
0
*
in_width,
filter_ptr
+
0
*
3
)
;
res
+=
conv1x3_s1
(
input_ptr
+
1
*
in_width,
filter_ptr
+
1
*
3
)
;
res
+=
conv1x3_s1
(
input_ptr
+
2
*
in_width,
filter_ptr
+
2
*
3
)
;
vstore4
(
res,
0
,
output_ptr
)
;
}
else
{
for
(
uint
p
=
0
; p < pixels; ++p) {
float
res
=
bias[i]
;
res
+=
conv3x3
(
input_ptr,
filter_ptr,
in_width
)
;
output_ptr[p]
=
res
;
input_ptr
+=
stride_w
;
}
}
}
...
...
mace/kernels/opencl/conv_2d_opencl_3x3.cc
浏览文件 @
b469a945
...
...
@@ -22,21 +22,21 @@ static void InnerConv2dK3x3S12(const Tensor *input, const Tensor *filter,
auto
runtime
=
OpenCLRuntime
::
Get
();
auto
program
=
runtime
->
program
();
auto
bm
_kernel
=
cl
::
Kernel
(
program
,
"conv_2d_3x3"
);
auto
conv
_kernel
=
cl
::
Kernel
(
program
,
"conv_2d_3x3"
);
uint32_t
idx
=
0
;
bm
_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
const
cl
::
Buffer
*>
(
input
->
buffer
())));
bm
_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
const
cl
::
Buffer
*>
(
filter
->
buffer
())));
bm
_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
const
cl
::
Buffer
*>
(
bias
->
buffer
())));
bm
_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
output
->
buffer
())));
bm
_kernel
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
input
->
dim
(
1
)));
bm
_kernel
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
channels
));
bm
_kernel
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
input
->
dim
(
2
)));
bm
_kernel
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
input
->
dim
(
3
)));
bm
_kernel
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
height
));
bm
_kernel
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
width
));
bm
_kernel
.
setArg
(
idx
++
,
stride
);
bm
_kernel
.
setArg
(
idx
++
,
stride
);
conv
_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
const
cl
::
Buffer
*>
(
input
->
buffer
())));
conv
_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
const
cl
::
Buffer
*>
(
filter
->
buffer
())));
conv
_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
const
cl
::
Buffer
*>
(
bias
->
buffer
())));
conv
_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
output
->
buffer
())));
conv
_kernel
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
input
->
dim
(
1
)));
conv
_kernel
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
channels
));
conv
_kernel
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
input
->
dim
(
2
)));
conv
_kernel
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
input
->
dim
(
3
)));
conv
_kernel
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
height
));
conv
_kernel
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
width
));
conv
_kernel
.
setArg
(
idx
++
,
stride
);
conv
_kernel
.
setArg
(
idx
++
,
stride
);
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
output
->
dim
(
0
)),
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
pixel_blocks
)};
...
...
@@ -44,7 +44,7 @@ static void InnerConv2dK3x3S12(const Tensor *input, const Tensor *filter,
static_cast
<
uint32_t
>
(
1
),
static_cast
<
uint32_t
>
(
256
)};
cl_int
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
bm
_kernel
,
cl
::
NullRange
,
conv
_kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws
[
2
]),
cl
::
NDRange
(
lws
[
0
],
lws
[
1
],
lws
[
2
]));
MACE_CHECK
(
error
==
CL_SUCCESS
);
...
...
mace/kernels/opencl/depthwise_conv_opencl.cc
浏览文件 @
b469a945
...
...
@@ -10,6 +10,8 @@ namespace kernels {
extern
void
DepthwiseConvOpenclK3x3S1
(
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
Tensor
*
output
);
extern
void
DepthwiseConvOpenclK3x3S2
(
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
Tensor
*
output
);
template
<
>
void
DepthwiseConv2dFunctor
<
DeviceType
::
OPENCL
,
float
>::
operator
()(
const
Tensor
*
input
,
const
Tensor
*
filter
,
...
...
@@ -21,7 +23,7 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor
static
const
Conv2dOpenclFunction
selector
[
5
][
2
]
=
{
{
nullptr
,
nullptr
},
{
nullptr
,
nullptr
},
{
DepthwiseConvOpenclK3x3S1
,
nullptr
},
{
DepthwiseConvOpenclK3x3S1
,
DepthwiseConvOpenclK3x3S2
},
{
nullptr
,
nullptr
},
{
nullptr
,
nullptr
}};
...
...
mace/kernels/opencl/depthwise_conv_opencl_3x3.cc
浏览文件 @
b469a945
...
...
@@ -9,10 +9,11 @@
namespace
mace
{
namespace
kernels
{
extern
void
DepthwiseConvOpenclK3x3S1
(
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
Tensor
*
output
)
{
static
void
InnerDepthwiseConvOpenclK3x3S12
(
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
const
uint32_t
stride
,
Tensor
*
output
)
{
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
channels
=
output
->
dim
(
1
);
const
index_t
height
=
output
->
dim
(
2
);
...
...
@@ -24,33 +25,53 @@ extern void DepthwiseConvOpenclK3x3S1(const Tensor *input,
const
index_t
input_width
=
input
->
dim
(
3
);
MACE_CHECK
(
input_batch
==
batch
);
auto
runtime
=
OpenCLRuntime
::
Get
();
auto
program
=
runtime
->
program
();
auto
conv_2d
=
cl
::
KernelFunctor
<
cl
::
Buffer
,
cl
::
Buffer
,
cl
::
Buffer
,
cl
::
Buffer
,
int
,
int
,
int
,
int
,
int
,
int
,
int
>
(
program
,
"depthwise_conv_3x3_s1"
);
const
index_t
pixels
=
height
*
width
;
const
index_t
channel_blocks
=
(
channels
+
3
)
/
4
;
const
index_t
pixel_blocks
=
(
width
+
3
)
/
4
*
height
;
cl_int
error
;
conv_2d
(
cl
::
EnqueueArgs
(
runtime
->
command_queue
(),
cl
::
NDRange
(
static_cast
<
int
>
(
batch
),
static_cast
<
int
>
(
channel_blocks
),
static_cast
<
int
>
(
pixel_blocks
)),
cl
::
NDRange
(
1
,
1
,
256
)),
*
(
static_cast
<
cl
::
Buffer
*>
(
input
->
buffer
())),
*
(
static_cast
<
cl
::
Buffer
*>
(
filter
->
buffer
())),
*
(
static_cast
<
cl
::
Buffer
*>
(
bias
->
buffer
())),
*
(
static_cast
<
cl
::
Buffer
*>
(
output
->
buffer
())),
static_cast
<
int
>
(
input_channels
),
static_cast
<
int
>
(
channels
),
static_cast
<
int
>
(
input_height
),
static_cast
<
int
>
(
input_width
),
static_cast
<
int
>
(
height
),
static_cast
<
int
>
(
width
),
error
);
auto
runtime
=
OpenCLRuntime
::
Get
();
auto
program
=
runtime
->
program
();
auto
conv_kernel
=
cl
::
Kernel
(
program
,
"depthwise_conv_3x3"
);
uint32_t
idx
=
0
;
conv_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
const
cl
::
Buffer
*>
(
input
->
buffer
())));
conv_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
const
cl
::
Buffer
*>
(
filter
->
buffer
())));
conv_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
const
cl
::
Buffer
*>
(
bias
->
buffer
())));
conv_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
output
->
buffer
())));
conv_kernel
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
input
->
dim
(
1
)));
conv_kernel
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
channels
));
conv_kernel
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
input
->
dim
(
2
)));
conv_kernel
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
input
->
dim
(
3
)));
conv_kernel
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
height
));
conv_kernel
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
width
));
conv_kernel
.
setArg
(
idx
++
,
stride
);
conv_kernel
.
setArg
(
idx
++
,
stride
);
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
output
->
dim
(
0
)),
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
pixel_blocks
)};
const
uint32_t
lws
[
3
]
=
{
static_cast
<
uint32_t
>
(
1
),
static_cast
<
uint32_t
>
(
1
),
static_cast
<
uint32_t
>
(
256
)};
cl_int
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
conv_kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws
[
2
]),
cl
::
NDRange
(
lws
[
0
],
lws
[
1
],
lws
[
2
]));
MACE_CHECK
(
error
==
CL_SUCCESS
);
}
extern
void
DepthwiseConvOpenclK3x3S1
(
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
Tensor
*
output
)
{
InnerDepthwiseConvOpenclK3x3S12
(
input
,
filter
,
bias
,
1
,
output
);
};
extern
void
DepthwiseConvOpenclK3x3S2
(
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
Tensor
*
output
)
{
InnerDepthwiseConvOpenclK3x3S12
(
input
,
filter
,
bias
,
2
,
output
);
};
}
// namespace kernels
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录