Xiaomi / Mace

Commit ff5d176e
Add depthwise conv2d opencl kernel

Authored Jan 24, 2018 by Liangliang He
Parent: 0279acae
Changes: 26 changed files with 852 additions and 482 deletions (+852, -482)
mace/kernels/conv_2d.h                             +4    -4
mace/kernels/depthwise_conv2d.h                    +113  -54
mace/kernels/opencl/REAEMD.md                      +12   -0
mace/kernels/opencl/buffer_to_image.cc             +4    -1
mace/kernels/opencl/cl/buffer_to_image.cl          +50   -0
mace/kernels/opencl/cl/conv_2d.cl                  +9    -14
mace/kernels/opencl/cl/conv_2d_1x1.cl              +1    -1
mace/kernels/opencl/cl/conv_2d_3x3.cl              +3    -2
mace/kernels/opencl/cl/depthwise_conv2d.cl         +129  -0
mace/kernels/opencl/cl/depthwise_conv_3x3.cl       +0    -126
mace/kernels/opencl/conv_2d_opencl_general.cc      +8    -7
mace/kernels/opencl/depthwise_conv_opencl.cc       +141  -37
mace/kernels/opencl/depthwise_conv_opencl_3x3.cc   +0    -98
mace/kernels/opencl/helper.cc                      +23   -8
mace/kernels/opencl/helper.h                       +4    -3
mace/ops/buffer_to_image.h                         +1    -1
mace/ops/buffer_to_image_test.cc                   +6    -6
mace/ops/conv_2d_benchmark.cc                      +1    -1
mace/ops/conv_2d_test.cc                           +8    -8
mace/ops/core_test.cc                              +2    -2
mace/ops/depthwise_conv2d.cc                       +6    -0
mace/ops/depthwise_conv2d.h                        +11   -24
mace/ops/depthwise_conv2d_test.cc                  +256  -47
mace/ops/depthwise_conv_2d_benchmark.cc            +50   -28
mace/ops/fused_conv_2d_test.cc                     +9    -9
mace/ops/image_to_buffer.h                         +1    -1
mace/kernels/conv_2d.h  (+4, -4)

@@ -21,15 +21,15 @@ struct Conv2dFunctorBase {
                     const float relux_max_limit,
                     const float prelu_alpha)
       : strides_(strides),
-        dilations_(dilations),
         paddings_(paddings),
+        dilations_(dilations),
         activation_(activation),
         relux_max_limit_(relux_max_limit),
         prelu_alpha_(prelu_alpha) {}

   const int *strides_;    // [stride_h, stride_w]
-  const int *dilations_;  // [dilation_h, dilation_w]
   const Padding paddings_;
+  const int *dilations_;  // [dilation_h, dilation_w]
   const ActivationType activation_;
   const float relux_max_limit_;
   const float prelu_alpha_;

@@ -50,8 +50,8 @@ struct Conv2dFunctor : Conv2dFunctorBase {
                   relux_max_limit,
                   prelu_alpha) {}

-  void operator()(const Tensor *input,
-                  const Tensor *filter,
+  void operator()(const Tensor *input,   // NHWC
+                  const Tensor *filter,  // HWIO
                   const Tensor *bias,
                   Tensor *output,
                   StatsFuture *future) {
mace/kernels/depthwise_conv2d.h  (+113, -54)

@@ -2,28 +2,57 @@
 // Copyright (c) 2017 XiaoMi All rights reserved.
 //

-#ifndef MACE_KERNELS_DEPTHWISE_CONV_H_
-#define MACE_KERNELS_DEPTHWISE_CONV_H_
+#ifndef MACE_KERNELS_DEPTHWISE_CONV2D_H_
+#define MACE_KERNELS_DEPTHWISE_CONV2D_H_

-#include "mace/core/future.h"
 #include "mace/core/common.h"
-#include "mace/kernels/conv_pool_2d_util.h"
+#include "mace/core/future.h"
 #include "mace/core/public/mace.h"
+#include "mace/kernels/conv_pool_2d_util.h"

 namespace mace {
 namespace kernels {

+struct DepthwiseConv2dFunctorBase {
+  DepthwiseConv2dFunctorBase(const int *strides,
+                             const Padding padding,
+                             const int *dilations,
+                             const ActivationType activation,
+                             const float relux_max_limit,
+                             const float prelu_alpha)
+      : strides_(strides),
+        padding_(padding),
+        dilations_(dilations),
+        activation_(activation),
+        relux_max_limit_(relux_max_limit),
+        prelu_alpha_(prelu_alpha) {}
+
+  const int *strides_;    // [stride_h, stride_w]
+  const Padding padding_;
+  const int *dilations_;  // [dilation_h, dilation_w]
+  const ActivationType activation_;
+  const float relux_max_limit_;
+  const float prelu_alpha_;
+};
+
 template <DeviceType D, typename T>
-struct DepthwiseConv2dFunctor {
-  DepthwiseConv2dFunctor() {}
+struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase {
   DepthwiseConv2dFunctor(const int *strides,
-                         const std::vector<int> &paddings,
-                         const int *dilations)
-      : strides_(strides), paddings_(paddings), dilations_(dilations) {}
+                         const Padding padding,
+                         const int *dilations,
+                         const ActivationType activation,
+                         const float relux_max_limit,
+                         const float prelu_alpha)
+      : DepthwiseConv2dFunctorBase(strides,
+                                   padding,
+                                   dilations,
+                                   activation,
+                                   relux_max_limit,
+                                   prelu_alpha) {}

-  void operator()(const Tensor *input,   // NCHW
-                  const Tensor *filter,  // c_out, c_in, kernel_h, kernel_w
-                  const Tensor *bias,    // c_out
+  void operator()(const Tensor *input,   // NHWC
+                  const Tensor *filter,  // HWIM
+                  const Tensor *bias,    // O
                   Tensor *output,
                   StatsFuture *future) {
     MACE_CHECK_NOTNULL(input);

@@ -31,18 +60,36 @@ struct DepthwiseConv2dFunctor {
     MACE_CHECK_NOTNULL(bias);
     MACE_CHECK_NOTNULL(output);

+    // Create a fake conv_2d filter to calculate the paddings and output size
+    std::vector<index_t> fake_filter_shape(4);
+    fake_filter_shape[0] = filter->shape()[0];
+    fake_filter_shape[1] = filter->shape()[1];
+    fake_filter_shape[3] = filter->shape()[2] * filter->shape()[3];
+    fake_filter_shape[2] = 1;
+
+    std::vector<index_t> output_shape(4);
+    std::vector<int> paddings(2);
+    kernels::CalcNHWCPaddingAndOutputSize(
+        input->shape().data(), fake_filter_shape.data(), dilations_,
+        strides_, padding_, output_shape.data(), paddings.data());
+    auto input_shape = fake_filter_shape;
+    output->Resize(output_shape);
+
     index_t batch = output->dim(0);
-    index_t channels = output->dim(1);
-    index_t height = output->dim(2);
-    index_t width = output->dim(3);
+    index_t height = output->dim(1);
+    index_t width = output->dim(2);
+    index_t channels = output->dim(3);

     index_t input_batch = input->dim(0);
-    index_t input_channels = input->dim(1);
-    index_t input_height = input->dim(2);
-    index_t input_width = input->dim(3);
+    index_t input_height = input->dim(1);
+    index_t input_width = input->dim(2);
+    index_t input_channels = input->dim(3);

-    index_t kernel_h = filter->dim(2);
-    index_t kernel_w = filter->dim(3);
+    index_t kernel_h = filter->dim(0);
+    index_t kernel_w = filter->dim(1);
+    index_t multiplier = filter->dim(3);
+    MACE_CHECK(filter->dim(2) == input_channels, filter->dim(2), "!=", input_channels);
+    MACE_CHECK(channels == input_channels * multiplier);

     int stride_h = strides_[0];
     int stride_w = strides_[1];

@@ -53,13 +100,12 @@ struct DepthwiseConv2dFunctor {
     MACE_CHECK(batch == input_batch, "Input/Output batch size mismatch");

     // The left-upper most offset of the padded input
-    int padded_h_start = 0 - paddings_[0] / 2;
-    int padded_w_start = 0 - paddings_[1] / 2;
-    index_t padded_h_stop = input_height + paddings_[0] - paddings_[0] / 2;
-    index_t padded_w_stop = input_width + paddings_[1] - paddings_[1] / 2;
+    int padded_h_start = 0 - paddings[0] / 2;
+    int padded_w_start = 0 - paddings[1] / 2;
+    index_t padded_h_stop = input_height + paddings[0] - paddings[0] / 2;
+    index_t padded_w_stop = input_width + paddings[1] - paddings[1] / 2;

-    index_t kernel_size = kernel_h * kernel_w;
-    index_t multiplier = filter->dim(0);
+    const index_t kernel_size = kernel_h * kernel_w;

     Tensor::MappingGuard input_mapper(input);
     Tensor::MappingGuard filter_mapper(filter);

@@ -67,20 +113,22 @@ struct DepthwiseConv2dFunctor {
     Tensor::MappingGuard output_mapper(output);
     const T *input_ptr = input->data<T>();
     const T *filter_ptr = filter->data<T>();
     const T *bias_ptr = bias->data<T>();
     T *output_ptr = output->mutable_data<T>();

 #pragma omp parallel for collapse(2)
     for (int n = 0; n < batch; ++n) {
-      for (int c = 0; c < channels; ++c) {
-        T bias_channel = bias_ptr ? bias_ptr[c] : 0;
-        for (int h = 0; h < height; ++h) {
-          for (int w = 0; w < width; ++w) {
-            index_t offset = n * channels * height * width +
-                             c * height * width + h * width + w;
+      for (int h = 0; h < height; ++h) {
+        for (int w = 0; w < width; ++w) {
+          for (int c = 0; c < channels; ++c) {
+            const index_t inc = c / multiplier;
+            const index_t m = c % multiplier;
+            T bias_channel = bias_ptr ? bias_ptr[c] : 0;
+            index_t offset = n * height * width * channels +
+                             h * width * channels + w * channels + c;
             output_ptr[offset] = bias_channel;
             T sum = 0;
-            const T *filter_base = filter_ptr + c * kernel_size;
+            const T *filter_base = filter_ptr + inc * multiplier + m;
             for (int kh = 0; kh < kernel_h; ++kh) {
               for (int kw = 0; kw < kernel_w; ++kw) {
                 int inh = padded_h_start + h * stride_h + dilation_h * kh;

@@ -92,12 +140,12 @@ struct DepthwiseConv2dFunctor {
                            "Out of range read from input: ", inh, ", ", inw);
                 } else {
                   index_t input_offset =
-                      n * input_channels * input_height * input_width +
-                      (c / multiplier) * input_height * input_width +
-                      inh * input_width + inw;
-                  sum += input_ptr[input_offset] * *filter_base;
+                      n * input_height * input_width * input_channels +
+                      inh * input_width * input_channels +
+                      inw * input_channels + inc;
+                  sum += input_ptr[input_offset] * filter_base[0];  // HWIM
                 }
-                ++filter_base;
+                filter_base += input_channels * multiplier;
               }
             }
             output_ptr[offset] += sum;

@@ -106,10 +154,6 @@ struct DepthwiseConv2dFunctor {
           }
         }
       }
     }
   }
-
-  const int *strides_;         // [stride_h, stride_w]
-  std::vector<int> paddings_;  // [padding_h, padding_w]
-  const int *dilations_;       // [dilation_h, dilation_w]
 };

 template <>

@@ -120,15 +164,30 @@ void DepthwiseConv2dFunctor<DeviceType::NEON, float>::operator()(
     Tensor *output,
     StatsFuture *future);

-template <>
-void DepthwiseConv2dFunctor<DeviceType::OPENCL, float>::operator()(
-    const Tensor *input,
-    const Tensor *filter,
-    const Tensor *bias,
-    Tensor *output,
-    StatsFuture *future);
+template <typename T>
+struct DepthwiseConv2dFunctor<DeviceType::OPENCL, T>
+    : DepthwiseConv2dFunctorBase {
+  DepthwiseConv2dFunctor(const int *strides,
+                         const Padding padding,
+                         const int *dilations,
+                         const ActivationType activation,
+                         const float relux_max_limit,
+                         const float prelu_alpha)
+      : DepthwiseConv2dFunctorBase(strides,
+                                   padding,
+                                   dilations,
+                                   activation,
+                                   relux_max_limit,
+                                   prelu_alpha) {}
+
+  void operator()(const Tensor *input,
+                  const Tensor *filter,
+                  const Tensor *bias,
+                  Tensor *output,
+                  StatsFuture *future);
+};

-}  //  namespace kernels
-}  //  namespace mace
+}  // namespace kernels
+}  // namespace mace

-#endif  // MACE_KERNELS_DEPTHWISE_CONV_H_
+#endif  // MACE_KERNELS_DEPTHWISE_CONV2D_H_
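
For reference, a minimal standalone sketch of the "fake conv_2d filter" trick used in the CPU functor above (not part of the commit; the index_t alias is an assumption): a depthwise HWIM filter is presented to CalcNHWCPaddingAndOutputSize as an HWIO filter of shape (H, W, 1, I * M), since only the spatial extent and the output-channel count matter for the padding and output-size math.

    // Sketch only: mirrors the fake_filter_shape assignments in the hunk above.
    #include <cstdint>
    #include <vector>

    using index_t = int64_t;  // assumption: stands in for mace's index_t

    std::vector<index_t> FakeConv2dFilterShape(const std::vector<index_t> &hwim) {
      std::vector<index_t> fake(4);
      fake[0] = hwim[0];            // kernel height
      fake[1] = hwim[1];            // kernel width
      fake[2] = 1;                  // pretend a single input channel
      fake[3] = hwim[2] * hwim[3];  // output channels = in_channels * multiplier
      return fake;
    }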
mace/kernels/opencl/REAEMD.md  (new file, +12)

OpenCL Image Storage Layout
===

Input/Output
---

Conv2D Filter
---

Depthwise Conv2D Filter
---
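
Only the section titles of the new README are visible in this view. As a rough illustration, and assuming the shapes computed by CalImage2DShape in mace/kernels/opencl/helper.cc from this same commit, the three layouts correspond to these 2D image extents:

    // Sketch only, derived from helper.cc in this commit; not the README's own text.
    #include <cstdint>
    #include <vector>

    using index_t = int64_t;  // assumption: stands in for mace's index_t

    // Input/Output: NHWC tensor -> image of size [(C + 3) / 4 * W, N * H]
    std::vector<size_t> InOutImageShape(index_t n, index_t h, index_t w, index_t c) {
      return {static_cast<size_t>((c + 3) / 4 * w), static_cast<size_t>(n * h)};
    }

    // Conv2D filter: HWIO -> image of size [H * W * RoundUp<4>(I), (O + 3) / 4]
    std::vector<size_t> Conv2dFilterImageShape(index_t h, index_t w, index_t i, index_t o) {
      return {static_cast<size_t>(h * w * ((i + 3) / 4 * 4)),
              static_cast<size_t>((o + 3) / 4)};
    }

    // Depthwise Conv2D filter: HWIM -> image of size [H * W * M, (I + 3) / 4]
    std::vector<size_t> DepthwiseFilterImageShape(index_t h, index_t w, index_t i, index_t m) {
      return {static_cast<size_t>(h * w * m), static_cast<size_t>((i + 3) / 4)};
    }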
mace/kernels/opencl/buffer_to_image.cc  (+4, -1)

@@ -27,9 +27,12 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
   string kernel_name;
   switch (type) {
-    case FILTER:
+    case CONV2D_FILTER:
       kernel_name = i2b_ ? "filter_image_to_buffer" : "filter_buffer_to_image";
       break;
+    case DW_CONV2D_FILTER:
+      kernel_name = i2b_ ? "dw_filter_image_to_buffer" : "dw_filter_buffer_to_image";
+      break;
     case IN_OUT:
       kernel_name = i2b_ ? "in_out_image_to_buffer" : "in_out_buffer_to_image";
       break;
mace/kernels/opencl/cl/buffer_to_image.cl  (+50, -0)

@@ -72,6 +72,56 @@ __kernel void filter_image_to_buffer(__global DATA_TYPE *output, /* h, w, ic, oc */
   }
 }

+__kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, ic, m */
+                                        __private const int filter_w,
+                                        __private const int in_channel,
+                                        __private const int multiplier,
+                                        __write_only image2d_t output) { /* ic%4 * kh * kw * m, ic/4 */
+  const int w = get_global_id(0);
+  const int h = get_global_id(1);
+  DATA_TYPE4 values = 0;
+  if (multiplier == 1) {
+    const int in_channel_idx = h << 2;
+    const int h_idx = w / filter_w;
+    const int w_idx = w % filter_w;
+
+    const int offset = mad24(mad24(h_idx, filter_w, w_idx),
+                             in_channel,
+                             in_channel_idx);
+
+    const int size = in_channel - in_channel_idx;
+    if (in_channel_idx < in_channel) {
+      if (size < 4) {
+        switch (size) {
+          case 3:
+            values.z = *(input + offset + 2);
+          case 2:
+            values.y = *(input + offset + 1);
+          case 1:
+            values.x = *(input + offset);
+        }
+      } else {
+        values = vload4(0, input + offset);
+      }
+    }
+  } else {
+    const int in_channel_idx = h << 2;
+    const int m = w % multiplier;
+    const int hw_idx = w / multiplier;
+    const int h_idx = hw_idx / filter_w;
+    const int w_idx = hw_idx % filter_w;
+    const int offset = mad24(mad24(mad24(h_idx, filter_w, w_idx),
+                                   in_channel,
+                                   in_channel_idx),
+                             multiplier,
+                             m);
+    // TODO support multiplier > 1
+  }
+
+  int2 coord = (int2)(w, h);
+  WRITE_IMAGET(output, coord, values);
+}
+
 __kernel void in_out_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
                                      __private const int height,
                                      __private const int width,
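
For orientation, a small host-side sketch (mine, not in the commit) of the buffer offset that dw_filter_buffer_to_image reads for image coordinate (w, h) in the multiplier == 1 branch: the filter buffer is laid out as (kh, kw, in_channel, multiplier), so the nested mad24 calls compute ((kh_idx * filter_w + kw_idx) * in_channel) + in_channel_idx.

    // Sketch only: the integer math behind the kernel's mad24() offset, multiplier == 1.
    #include <cassert>

    int DwFilterBufferOffset(int w, int h, int filter_w, int in_channel) {
      const int in_channel_idx = h << 2;  // each image row packs four input channels
      const int kh_idx = w / filter_w;    // kernel row
      const int kw_idx = w % filter_w;    // kernel column
      return (kh_idx * filter_w + kw_idx) * in_channel + in_channel_idx;
    }

    int main() {
      // 3x3 filter, 8 input channels: image coord (4, 1) -> kernel (1, 1), channels 4..7
      assert(DwFilterBufferOffset(4, 1, 3, 8) == (1 * 3 + 1) * 8 + 4);
      return 0;
    }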
mace/kernels/opencl/cl/conv_2d.cl  (+9, -14)

 #include <common.h>

 __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
-                      __read_only image2d_t filter, /* cout%4 * cin * kw * kh, cout/4 */
+                      __read_only image2d_t filter, /* cout%4 * cin * kh * kw, cout/4 */
 #ifdef BIAS
                       __read_only image2d_t bias, /* cout%4 * cout/4 */
 #endif

@@ -15,6 +15,7 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                       __private const int out_width,
                       __private const int filter_height,
                       __private const int filter_width,
+                      __private const int stride,
                       __private const int padding_top,
                       __private const int padding_left,
                       __private const int dilation_h,

@@ -38,19 +39,12 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
   DATA_TYPE4 out3 = 0;
 #endif

-#if STRIDE == 1
-  int in_width0 = out_w_blk - padding_left;
-  int in_width1 = in_width0 + out_w_blks;
-  int in_width2 = in_width1 + out_w_blks;
-  int in_width3 = in_width2 + out_w_blks;
-  const int height_idx = (out_hb % out_height) - padding_top;
-#else
-  int in_width0 = (out_w_blk << 1) - padding_left;
-  int in_width1 = ((out_w_blk + out_w_blks) << 1) - padding_left;
-  int in_width2 = ((out_w_blk + (out_w_blks << 1)) << 1) - padding_left;
-  int in_width3 = ((out_w_blk + (out_w_blks << 1) + out_w_blks) << 1) - padding_left;
-  const int height_idx = ((out_hb % out_height) << 1) - padding_top;
-#endif
+  int in_width_stride = mul24(out_w_blks, stride);
+  int in_width0 = mad24(out_w_blk, stride, -padding_left);
+  int in_width1 = in_width0 + in_width_stride;
+  int in_width2 = in_width1 + in_width_stride;
+  int in_width3 = in_width2 + in_width_stride;
+  const int height_idx = mad24((out_hb % out_height), stride, -padding_top);

   const int batch_idx = mul24((out_hb / out_height), in_height);
   const int rounded_in_ch_x_filter_width = mul24(rounded_in_ch, filter_width);

@@ -61,6 +55,7 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
     const int in_idx = mul24(in_ch_blk, in_width);
     int filter_x_part0 = in_ch_blk << 2;
     for (short hb_idx = 0; hb_idx < filter_height; ++hb_idx) {
+      // TODO(heliangliang) optimize out these muls
       int in_hb_value = height_idx + mul24(hb_idx, dilation_h);
       in_hb_value = select(in_hb_value + batch_idx,
                            -1,
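
A quick standalone check (not from the commit) that the new stride-generic indexing in conv_2d.cl reproduces the removed stride-2 special case: mad24(out_w_blk, stride, -padding_left) with stride == 2 equals the old (out_w_blk << 1) - padding_left, and stepping by in_width_stride = out_w_blks * stride reproduces the old "(out_w_blk + out_w_blks) << 1" spacing.

    // Sketch only: compares the removed stride-2 path with the new stride-generic math.
    #include <cassert>

    int main() {
      const int stride = 2, padding_left = 1, out_w_blks = 8;
      for (int out_w_blk = 0; out_w_blk < out_w_blks; ++out_w_blk) {
        int old_w0 = (out_w_blk << 1) - padding_left;                 // removed code path
        int old_w1 = ((out_w_blk + out_w_blks) << 1) - padding_left;
        int new_w0 = out_w_blk * stride - padding_left;               // mad24(out_w_blk, stride, -padding_left)
        int new_w1 = new_w0 + out_w_blks * stride;                    // in_width0 + in_width_stride
        assert(old_w0 == new_w0 && old_w1 == new_w1);
      }
      return 0;
    }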
mace/kernels/opencl/cl/conv_2d_1x1.cl  (+1, -1)

@@ -37,7 +37,7 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
   w.z = w.y + out_w_blks;
   w.w = w.z + out_w_blks;
   int out_hb_idx = (out_hb % height);
-#else
+#elif STRIDE == 2
   w.x = out_w_blk << 1;
   w.y = (out_w_blk + out_w_blks) << 1;
   w.z = (out_w_blk + (out_w_blks << 1)) << 1;
mace/kernels/opencl/cl/conv_2d_3x3.cl  (+3, -2)

 #include <common.h>

 __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
-                          __read_only image2d_t filter, /* cout%4 * cin * kw * kh, cout/4 */
+                          __read_only image2d_t filter, /* cout%4 * cin * kh * kw, cout/4 */
 #ifdef BIAS
                           __read_only image2d_t bias, /* cout%4 * cout/4 */
 #endif

@@ -45,7 +45,7 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
   int in_width3 = in_width2 + out_w_blks;
   int in_width4 = in_width3 + out_w_blks;
   const int height_idx = (out_hb % out_height) - padding_top;
-#else
+#elif STRIDE == 2
   int in_width0 = (out_w_blk << 1) - padding_left;
   int in_width1 = ((out_w_blk + out_w_blks) << 1) - padding_left;
   int in_width2 = ((out_w_blk + (out_w_blks << 1)) << 1) - padding_left;

@@ -63,6 +63,7 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
     const int in_idx = mul24(in_ch_blk, in_width);
     int filter_x_part0 = in_ch_blk << 2;
     for (short hb_idx = 0; hb_idx < 3; ++hb_idx) {
+      // TODO(heliangliang) optimize out these muls
       int in_hb_value = height_idx + mul24(hb_idx, dilation_h);
       in_hb_value = select(in_hb_value + batch_idx,
                            -1,
mace/kernels/opencl/cl/depthwise_conv2d.cl  (new file, +129)

#include <common.h>

// Only multiplier = 1 is supported
__kernel void depthwise_conv2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                               __read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */
#ifdef BIAS
                               __read_only image2d_t bias, /* cout%4 * cout/4 */
#endif
                               __write_only image2d_t output,
                               __private const DATA_TYPE relux_max_limit,
                               __private const DATA_TYPE prelu_alpha,
                               __private const short in_height,
                               __private const short in_width,
                               __private const short in_ch_blks,
                               __private const short out_height,
                               __private const short out_width,
                               __private const short filter_height,
                               __private const short filter_width,
                               __private const short padding_top,
                               __private const short padding_left,
                               __private const short dilation_h,
                               __private const short dilation_w) {
  const short out_ch_blk = get_global_id(0);
  const short out_w_blk = get_global_id(1);
  const short out_w_blks = get_global_size(1);
  const short out_hb = get_global_id(2);
  const short rounded_in_ch = in_ch_blks << 2;
  const short in_ch_blk = out_ch_blk; // multiplier = 1

#ifdef BIAS
  DATA_TYPE4 out0 = READ_IMAGET(bias, SAMPLER, (int2)(out_ch_blk, 0));
  DATA_TYPE4 out1 = out0;
  DATA_TYPE4 out2 = out0;
  DATA_TYPE4 out3 = out0;
#else
  DATA_TYPE4 out0 = 0;
  DATA_TYPE4 out1 = 0;
  DATA_TYPE4 out2 = 0;
  DATA_TYPE4 out3 = 0;
#endif

  const short out_h = out_hb % out_height;
#if STRIDE == 1
  const short in_width0 = out_w_blk - padding_left;
  const short in_width1 = in_width0 + out_w_blks;
  const short in_width2 = in_width1 + out_w_blks;
  const short in_width3 = in_width2 + out_w_blks;
  const short height_idx = out_h - padding_top;
#elif STRIDE == 2
  int in_width0 = (out_w_blk << 1) - padding_left;
  int in_width1 = ((out_w_blk + out_w_blks) << 1) - padding_left;
  int in_width2 = ((out_w_blk + (out_w_blks << 1)) << 1) - padding_left;
  int in_width3 = ((out_w_blk + (out_w_blks << 1) + out_w_blks) << 1) - padding_left;
  int in_width4 = ((out_w_blk + (out_w_blks << 2)) << 1) - padding_left;
  const int height_idx = (out_h << 1) - padding_top;
#else
  const short in_width_stride = mul24(out_w_blks, STRIDE);
  const short in_width0 = mad24(out_w_blk, STRIDE, -padding_left);
  const short in_width1 = in_width0 + in_width_stride;
  const short in_width2 = in_width1 + in_width_stride;
  const short in_width3 = in_width2 + in_width_stride;
  const short height_idx = mad24(out_h, STRIDE, -padding_top);
#endif

  const short batch_idx = mul24((out_hb / out_height), in_height);
  const short rounded_in_ch_x_filter_width = mul24(rounded_in_ch, filter_width);

  const short in_idx = mul24(in_ch_blk, in_width);
  short filter_idx = 0;
  short in_hb_idx = height_idx;
  for (short filter_h_idx = 0; filter_h_idx < filter_height; ++filter_h_idx) {
    short in_hb = select(in_hb_idx + batch_idx,
                         -1,
                         (in_hb_idx < 0 || in_hb_idx >= in_height));
    short in_w_idx = 0;
    for (short filter_w_idx = 0; filter_w_idx < filter_width; ++filter_w_idx) {
      short in_w;
      DATA_TYPE4 in0, in1, in2, in3;
#define READ_INPUT(i)                                     \
      in_w = in_w_idx + in_width##i;                      \
      in_w = select(in_idx + in_w,                        \
                    -1,                                   \
                    (in_w < 0 || in_w >= in_width));      \
      in##i = READ_IMAGET(input, SAMPLER, (int2)(in_w, in_hb));

      READ_INPUT(0);
      READ_INPUT(1);
      READ_INPUT(2);
      READ_INPUT(3);
#undef READ_INPUT

      DATA_TYPE4 weights = READ_IMAGET(filter, SAMPLER,
                                       (int2)(filter_idx, in_ch_blk));
      out0 = mad(in0, weights, out0);
      out1 = mad(in1, weights, out1);
      out2 = mad(in2, weights, out2);
      out3 = mad(in3, weights, out3);

      ++filter_idx;
      in_w_idx += dilation_w;
    }
    in_hb_idx += dilation_h;
  }

#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_PRELU) || defined(USE_TANH) || defined(USE_SIGMOID)
  out0 = do_activation(out0, relux_max_limit, prelu_alpha);
  out1 = do_activation(out1, relux_max_limit, prelu_alpha);
  out2 = do_activation(out2, relux_max_limit, prelu_alpha);
  out3 = do_activation(out3, relux_max_limit, prelu_alpha);
#endif

  const short out_x_base = mul24(out_ch_blk, out_width);
  short w = out_w_blk;
  WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out0);

  w += out_w_blks;
  if (w >= out_width) return;
  WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out1);

  w += out_w_blks;
  if (w >= out_width) return;
  WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out2);

  w += out_w_blks;
  if (w >= out_width) return;
  WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out3);
}
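
To make the indexing above concrete, here is a small host-side sketch (assumptions mine) of how one work-item maps its global id to output coordinates, given the {channel_blocks, width_blocks, height * batch} global work size used by the caller in depthwise_conv_opencl.cc: dimension 2 folds batch and output row together, and each work-item writes up to four pixels along the width, spaced out_w_blks apart.

    // Sketch only: mirrors the kernel's out_hb/out_h/out_w_blk handling on the host.
    #include <cstdio>

    int main() {
      const int out_height = 32, out_width = 32, width_blocks = (out_width + 3) / 4;
      // One hypothetical work-item: values that would come from get_global_id(0..2).
      const int out_ch_blk = 1, out_w_blk = 3, out_hb = 40;
      const int batch = out_hb / out_height;  // which image in the batch
      const int out_h = out_hb % out_height;  // output row
      // The kernel writes out0..out3 at widths out_w_blk, +width_blocks, +2*width_blocks, ...
      for (int i = 0, w = out_w_blk; i < 4 && w < out_width; ++i, w += width_blocks) {
        std::printf("batch=%d h=%d w=%d channel_block=%d\n", batch, out_h, w, out_ch_blk);
      }
      return 0;
    }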
mace/kernels/opencl/cl/depthwise_conv_3x3.cl  (deleted, 100644 -> 0, -126)

#include <common.h>

VEC_DATA_TYPE(DATA_TYPE,4) conv1x3_s1(const DATA_TYPE *input_ptr,
                                      const DATA_TYPE *filter_ptr) {
  VEC_DATA_TYPE(DATA_TYPE,4) row0 = vload4(0, input_ptr);
  VEC_DATA_TYPE(DATA_TYPE,2) input1 = vload2(0, input_ptr+4);
  VEC_DATA_TYPE(DATA_TYPE,4) row1 = (VEC_DATA_TYPE(DATA_TYPE,4))(row0.s123, input1.s0);
  VEC_DATA_TYPE(DATA_TYPE,4) row2 = (VEC_DATA_TYPE(DATA_TYPE,4))(row0.s23, input1.s01);
  VEC_DATA_TYPE(DATA_TYPE,3) filter_values = vload3(0, filter_ptr);
  return (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s0 * row0 +
         (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s1 * row1 +
         (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s2 * row2;
}

VEC_DATA_TYPE(DATA_TYPE,4) conv1x3_s2(const DATA_TYPE *input_ptr,
                                      const DATA_TYPE *filter_ptr) {
  VEC_DATA_TYPE(DATA_TYPE,8) input = vload8(0, input_ptr);
  VEC_DATA_TYPE(DATA_TYPE,4) row0 = input.even;
  VEC_DATA_TYPE(DATA_TYPE,4) row1 = input.odd;
  VEC_DATA_TYPE(DATA_TYPE,4) row2 = (VEC_DATA_TYPE(DATA_TYPE,4))(row0.s123, input_ptr[8]);
  VEC_DATA_TYPE(DATA_TYPE,3) filter_values = vload3(0, filter_ptr);
  return (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s0 * row0 +
         (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s1 * row1 +
         (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s2 * row2;
}

// Supported data type: half/float
DATA_TYPE conv3x3(const DATA_TYPE *input_ptr,
                  const DATA_TYPE *filter_ptr,
                  const int row_width) {
  VEC_DATA_TYPE(DATA_TYPE,3) input_value = vload3(0, input_ptr);
  VEC_DATA_TYPE(DATA_TYPE,3) filter_value = vload3(0, filter_ptr);
  VEC_DATA_TYPE(DATA_TYPE,3) res = input_value * filter_value;

  input_ptr += row_width;
  input_value = vload3(0, input_ptr);
  filter_value = vload3(1, filter_ptr);
  res += input_value * filter_value;

  input_ptr += row_width;
  input_value = vload3(0, input_ptr);
  filter_value = vload3(2, filter_ptr);
  res += input_value * filter_value;

  return res.s0 + res.s1 + res.s2;
}

// TODO merge the depthwise with conv 3x3 to remove duplicate code.
__kernel void depthwise_conv_3x3(__global const DATA_TYPE *input, /* n, c, h, w */
                                 __global const DATA_TYPE *filter, /* m, i, kh, kw */
#ifdef BIAS
                                 __global const DATA_TYPE *bias, /* o */
#endif
                                 __global DATA_TYPE *output, /* n, c, h, w */
                                 __private const int in_chan_num,
                                 __private const int out_chan_num,
                                 __private const int in_height,
                                 __private const int in_width,
                                 __private const int out_height,
                                 __private const int out_width) {
  int batch = get_global_id(0);
  int out_chan_blk = get_global_id(1);
  int out_pixel_blk = get_global_id(2);

  const int in_pixel = in_height * in_width;
  const int out_pixel = out_height * out_width;
  const int multiplier = out_chan_num / in_chan_num;

  const int round_out_width = (out_width + 3) / 4;
  const int out_pixel_height = out_pixel_blk / round_out_width;
  const int out_pixel_width = out_pixel_blk % round_out_width;

  const int out_chan_begin = out_chan_blk * 4;
  const int out_chan_end = min(out_chan_begin + 4, out_chan_num);
  const int out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4;
  const int out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width);
#ifdef STRIDE_1
  const int in_pixel_begin = out_pixel_height * in_width + out_pixel_width * 4;
#else
  const int in_pixel_begin = out_pixel_height * 2 * in_width + out_pixel_width * 2 * 4;
#endif

  const int in_offset = batch * in_chan_num * in_pixel;
  const int out_offset = batch * out_chan_num * out_pixel;
  const DATA_TYPE *input_base = input + in_offset + in_pixel_begin;
  DATA_TYPE *output_base = output + out_offset + out_pixel_begin;

  const int pixels = out_pixel_end - out_pixel_begin;

  for (int i = out_chan_begin; i < out_chan_end; ++i) {
    const DATA_TYPE *input_ptr = input_base + (i / multiplier) * in_pixel;
    const DATA_TYPE *filter_ptr = filter + i * 9;
    DATA_TYPE *output_ptr = output_base + i * out_pixel;

    if (pixels == 4) {
#ifdef BIAS
      VEC_DATA_TYPE(DATA_TYPE,4) res = (VEC_DATA_TYPE(DATA_TYPE,4))bias[i];
#else
      VEC_DATA_TYPE(DATA_TYPE,4) res = 0;
#endif /* defined(BIAS) */

#ifdef STRIDE_1
      res += conv1x3_s1(input_ptr + 0 * in_width, filter_ptr + 0 * 3);
      res += conv1x3_s1(input_ptr + 1 * in_width, filter_ptr + 1 * 3);
      res += conv1x3_s1(input_ptr + 2 * in_width, filter_ptr + 2 * 3);
#else
      res += conv1x3_s2(input_ptr + 0 * in_width, filter_ptr + 0 * 3);
      res += conv1x3_s2(input_ptr + 1 * in_width, filter_ptr + 1 * 3);
      res += conv1x3_s2(input_ptr + 2 * in_width, filter_ptr + 2 * 3);
#endif
      vstore4(res, 0, output_ptr);
    } else {
      for (int p = 0; p < pixels; ++p) {
#ifdef BIAS
        DATA_TYPE res = bias[i];
#else
        DATA_TYPE res = 0;
#endif
        res += conv3x3(input_ptr, filter_ptr, in_width);
        output_ptr[p] = res;
#ifdef STRIDE_1
        input_ptr += 1;
#else
        input_ptr += 2;
#endif
      }
    }
  }
}
mace/kernels/opencl/conv_2d_opencl_general.cc  (+8, -7)

@@ -81,13 +81,14 @@ void Conv2dOpencl(const Tensor *input,
                         *(static_cast<const cl::Image2D *>(output->buffer())));
   conv_2d_kernel.setArg(idx++, relux_max_limit);
   conv_2d_kernel.setArg(idx++, prelu_alpha);
-  conv_2d_kernel.setArg(idx++, static_cast<int>(input->dim(1)));
-  conv_2d_kernel.setArg(idx++, static_cast<int>(input->dim(2)));
-  conv_2d_kernel.setArg(idx++, static_cast<int>(input_channel_blocks));
-  conv_2d_kernel.setArg(idx++, static_cast<int>(height));
-  conv_2d_kernel.setArg(idx++, static_cast<int>(width));
-  conv_2d_kernel.setArg(idx++, static_cast<int>(filter->dim(0)));
-  conv_2d_kernel.setArg(idx++, static_cast<int>(filter->dim(1)));
+  conv_2d_kernel.setArg(idx++, static_cast<uint32_t>(input->dim(1)));
+  conv_2d_kernel.setArg(idx++, static_cast<uint32_t>(input->dim(2)));
+  conv_2d_kernel.setArg(idx++, static_cast<uint32_t>(input_channel_blocks));
+  conv_2d_kernel.setArg(idx++, static_cast<uint32_t>(height));
+  conv_2d_kernel.setArg(idx++, static_cast<uint32_t>(width));
+  conv_2d_kernel.setArg(idx++, static_cast<uint32_t>(filter->dim(0)));
+  conv_2d_kernel.setArg(idx++, static_cast<uint32_t>(filter->dim(1)));
+  conv_2d_kernel.setArg(idx++, static_cast<uint32_t>(stride));
   conv_2d_kernel.setArg(idx++, padding[0] / 2);
   conv_2d_kernel.setArg(idx++, padding[1] / 2);
   conv_2d_kernel.setArg(idx++, dilations[0]);
mace/kernels/opencl/depthwise_conv_opencl.cc  (+141, -37)

@@ -2,60 +2,164 @@
Removed: the extern declarations of DepthwiseConvOpenclK3x3S1/DepthwiseConvOpenclK3x3S2, the Conv2dOpenclFunction typedef and kernel-size x stride selector table, and the fallback path that padded the input with ConstructInputWithPadding before dispatching. The file now reads:

//
// Copyright (c) 2017 XiaoMi All rights reserved.
//

#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/activation.h"
#include "mace/kernels/depthwise_conv2d.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/tuner.h"

namespace mace {
namespace kernels {

void DepthwiseConv2d(const Tensor *input,   // NHWC
                     const Tensor *filter,  // HWIM
                     const Tensor *bias,
                     const int stride,
                     const int *paddings,
                     const int *dilations,
                     const ActivationType activation,
                     const float relux_max_limit,
                     const float prelu_alpha,
                     const DataType dt,
                     Tensor *output,
                     StatsFuture *future) {
  const index_t batch = output->dim(0);
  const index_t height = output->dim(1);
  const index_t width = output->dim(2);
  const index_t channels = output->dim(3);

  const index_t input_batch = input->dim(0);
  const index_t input_height = input->dim(1);
  const index_t input_width = input->dim(2);
  const index_t input_channels = input->dim(3);

  const index_t filter_height = filter->dim(0);
  const index_t filter_width = filter->dim(1);
  const index_t multiplier = filter->dim(3);
  MACE_CHECK(multiplier == 1, "Multiplier > 1 not supported");
  MACE_CHECK(multiplier * input_channels == channels);
  MACE_CHECK(filter->dim(2) == input_channels, filter->dim(2), "!=", input_channels);

  const index_t channel_blocks = RoundUpDiv4(channels);
  const index_t input_channel_blocks = RoundUpDiv4(input_channels);
  const index_t width_blocks = RoundUpDiv4(width);

  auto runtime = OpenCLRuntime::Global();
  std::set<std::string> built_options;
  std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d");
  built_options.emplace("-Ddepthwise_conv2d=" + kernel_name);
  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
  built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
  built_options.emplace(bias != nullptr ? "-DBIAS" : "");
  built_options.emplace("-DSTRIDE=" + ToString(stride));
  switch (activation) {
    case NOOP:
      break;
    case RELU:
      built_options.emplace("-DUSE_RELU");
      break;
    case RELUX:
      built_options.emplace("-DUSE_RELUX");
      break;
    case PRELU:
      built_options.emplace("-DUSE_PRELU");
      break;
    case TANH:
      built_options.emplace("-DUSE_TANH");
      break;
    case SIGMOID:
      built_options.emplace("-DUSE_SIGMOID");
      break;
    defeult:
      LOG(FATAL) << "Unknown activation type: " << activation;
  }

  auto dw_conv2d_kernel =
      runtime->BuildKernel("depthwise_conv2d", kernel_name, built_options);

  uint32_t idx = 0;
  dw_conv2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
  dw_conv2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(filter->buffer())));
  if (bias != nullptr) {
    dw_conv2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(bias->buffer())));
  }
  dw_conv2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(output->buffer())));
  dw_conv2d_kernel.setArg(idx++, relux_max_limit);
  dw_conv2d_kernel.setArg(idx++, prelu_alpha);
  dw_conv2d_kernel.setArg(idx++, static_cast<short>(input_height));
  dw_conv2d_kernel.setArg(idx++, static_cast<short>(input_width));
  dw_conv2d_kernel.setArg(idx++, static_cast<short>(input_channel_blocks));
  dw_conv2d_kernel.setArg(idx++, static_cast<short>(height));
  dw_conv2d_kernel.setArg(idx++, static_cast<short>(width));
  dw_conv2d_kernel.setArg(idx++, static_cast<short>(filter_height));
  dw_conv2d_kernel.setArg(idx++, static_cast<short>(filter_width));
  dw_conv2d_kernel.setArg(idx++, static_cast<short>(paddings[0] / 2));
  dw_conv2d_kernel.setArg(idx++, static_cast<short>(paddings[1] / 2));
  dw_conv2d_kernel.setArg(idx++, static_cast<short>(dilations[0]));
  dw_conv2d_kernel.setArg(idx++, static_cast<short>(dilations[1]));

  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(width_blocks),
                           static_cast<uint32_t>(height * batch)};
  std::vector<uint32_t> lws = {8, 16, 8, 1};
  std::string tuning_key = Concat("depthwise_conv2d_ocl_kernel_", activation, batch,
                                  height, width, channels, multiplier);
  TuningOrRun3DKernel(dw_conv2d_kernel, tuning_key, gws, lws, future);
}

template <typename T>
void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
                                                               const Tensor *filter,
                                                               const Tensor *bias,
                                                               Tensor *output,
                                                               StatsFuture *future) {
  index_t kernel_h = filter->dim(2);
  index_t kernel_w = filter->dim(3);
  if (strides_[0] != strides_[1]) {
    LOG(WARNING) << "OpenCL depthwise conv2d kernel with "
                 << "filter" << kernel_h << "x" << kernel_w << ","
                 << " stride " << strides_[0] << "x" << strides_[1]
                 << " is not implemented yet, using slow version";
    // TODO(heliangliang) The CPU/NEON kernel should map the buffer
    DepthwiseConv2dFunctor<DeviceType::CPU, float>(
        strides_, padding_, dilations_, activation_, relux_max_limit_,
        prelu_alpha_)(input, filter, bias, output, future);
    return;
  }

  // Create a fake conv_2d filter to calculate the paddings and output size
  std::vector<index_t> fake_filter_shape(4);
  fake_filter_shape[0] = filter->shape()[0];
  fake_filter_shape[1] = filter->shape()[1];
  fake_filter_shape[3] = filter->shape()[2] * filter->shape()[3];
  fake_filter_shape[2] = 1;

  std::vector<index_t> output_shape(4);
  std::vector<int> paddings(2);
  kernels::CalcNHWCPaddingAndOutputSize(
      input->shape().data(), fake_filter_shape.data(), dilations_, strides_,
      padding_, output_shape.data(), paddings.data());

  std::vector<size_t> output_image_shape;
  CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape);
  output->ResizeImage(output_shape, output_image_shape);

  DepthwiseConv2d(input, filter, bias, strides_[0], paddings.data(), dilations_,
                  activation_, relux_max_limit_, prelu_alpha_,
                  DataTypeToEnum<T>::value, output, future);
}

template struct DepthwiseConv2dFunctor<DeviceType::OPENCL, float>;
template struct DepthwiseConv2dFunctor<DeviceType::OPENCL, half>;

}  // namespace kernels
}  // namespace mace
mace/kernels/opencl/depthwise_conv_opencl_3x3.cc  (deleted, 100644 -> 0, -98)

//
// Copyright (c) 2017 XiaoMi All rights reserved.
//

#include "mace/core/common.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/opencl/helper.h"

namespace mace {
namespace kernels {

static void InnerDepthwiseConvOpenclK3x3S12(const Tensor *input,
                                            const Tensor *filter,
                                            const Tensor *bias,
                                            const uint32_t stride,
                                            Tensor *output,
                                            StatsFuture *future) {
  const index_t batch = output->dim(0);
  const index_t channels = output->dim(1);
  const index_t height = output->dim(2);
  const index_t width = output->dim(3);

  const index_t input_batch = input->dim(0);
  const index_t input_channels = input->dim(1);
  const index_t input_height = input->dim(2);
  const index_t input_width = input->dim(3);

  MACE_CHECK(input_batch == batch);

  const index_t pixels = height * width;
  const index_t channel_blocks = (channels + 3) / 4;
  const index_t pixel_blocks = (width + 3) / 4 * height;

  auto runtime = OpenCLRuntime::Global();
  std::set<std::string> built_options;
  std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv_3x3");
  built_options.emplace("-Ddepthwise_conv_3x3=" + kernel_name);
  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(input->dtype()));
  built_options.emplace(stride == 1 ? "-DSTRIDE_1" : "");
  built_options.emplace(bias != nullptr ? "-DBIAS" : "");
  auto conv_kernel = runtime->BuildKernel("depthwise_conv_3x3", kernel_name, built_options);

  uint32_t idx = 0;
  conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
  conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(filter->buffer())));
  if (bias != nullptr) {
    conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(bias->buffer())));
  }
  conv_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
  conv_kernel.setArg(idx++, static_cast<int32_t>(input->dim(1)));
  conv_kernel.setArg(idx++, static_cast<int32_t>(channels));
  conv_kernel.setArg(idx++, static_cast<int32_t>(input->dim(2)));
  conv_kernel.setArg(idx++, static_cast<int32_t>(input->dim(3)));
  conv_kernel.setArg(idx++, static_cast<int32_t>(height));
  conv_kernel.setArg(idx++, static_cast<int32_t>(width));

  const uint32_t gws[3] = {static_cast<uint32_t>(output->dim(0)),
                           static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(pixel_blocks)};
  const uint32_t lws[3] = {static_cast<uint32_t>(1),
                           static_cast<uint32_t>(1),
                           static_cast<uint32_t>(256)};
  cl::Event event;
  cl_int error = runtime->command_queue().enqueueNDRangeKernel(
      conv_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
      cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
  MACE_CHECK(error == CL_SUCCESS);
  if (future != nullptr) {
    future->wait_fn = [runtime, event](CallStats *stats) {
      event.wait();
      if (stats != nullptr) {
        runtime->GetCallStats(event, stats);
      }
    };
  }
}

extern void DepthwiseConvOpenclK3x3S1(const Tensor *input,
                                      const Tensor *filter,
                                      const Tensor *bias,
                                      Tensor *output,
                                      StatsFuture *future) {
  InnerDepthwiseConvOpenclK3x3S12(input, filter, bias, 1, output, future);
};

extern void DepthwiseConvOpenclK3x3S2(const Tensor *input,
                                      const Tensor *filter,
                                      const Tensor *bias,
                                      Tensor *output,
                                      StatsFuture *future) {
  InnerDepthwiseConvOpenclK3x3S12(input, filter, bias, 2, output, future);
};

}  // namespace kernels
}  // namespace mace
mace/kernels/opencl/helper.cc  (+23, -8)

@@ -9,7 +9,7 @@
 namespace mace {
 namespace kernels {

-// [(c+3)/4*W, N * H]
+// [(C + 3) / 4 * W, N * H]
 void CalInOutputImageShape(const std::vector<index_t> &shape, /* NHWC */
                            std::vector<size_t> &image_shape) {
   MACE_CHECK(shape.size() == 4);

@@ -18,13 +18,22 @@ void CalInOutputImageShape(const std::vector<index_t> &shape, /* NHWC */
   image_shape[1] = shape[0] * shape[1];
 }

-// [H * W * RoundUp<4>(Ic), (Oc + 3) / 4]
-void CalFilterImageShape(const std::vector<index_t> &shape, /* HWIO */
-                         std::vector<size_t> &image_shape) {
+// [RoundUp<4>(Ic) * H * W, (Oc + 3) / 4]
+void CalConv2dFilterImageShape(const std::vector<index_t> &shape, /* HWIO */
+                               std::vector<size_t> &image_shape) {
   MACE_CHECK(shape.size() == 4);
   image_shape.resize(2);
   image_shape[0] = shape[0] * shape[1] * RoundUp<index_t>(shape[2], 4);
-  image_shape[1] = RoundUpDiv4(shape.back());
+  image_shape[1] = RoundUpDiv4(shape[3]);
+}
+
+// [H * W * M, (Ic + 3) / 4]
+void CalDepthwiseConv2dFilterImageShape(const std::vector<index_t> &shape, /* HWIM */
+                                        std::vector<size_t> &image_shape) {
+  MACE_CHECK(shape.size() == 4);
+  image_shape.resize(2);
+  image_shape[0] = shape[0] * shape[1] * shape[3];
+  image_shape[1] = RoundUpDiv4(shape[2]);
 }

 // [(size + 3) / 4, 1]

@@ -40,11 +49,17 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
                      const BufferType type,
                      std::vector<size_t> &image_shape) {
   switch (type) {
-    case FILTER:
-      CalFilterImageShape(shape, image_shape);
+    case CONV2D_FILTER:
+      CalConv2dFilterImageShape(shape, image_shape);
+      break;
+    case DW_CONV2D_FILTER:
+      CalDepthwiseConv2dFilterImageShape(shape, image_shape);
       break;
     case IN_OUT:
       CalInOutputImageShape(shape, image_shape);
       break;
     case ARGUMENT:
       CalArgImageShape(shape, image_shape);
       break;
     default:
       LOG(FATAL) << "Mace not supported yet.";
   }
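
A worked example (mine, not part of the commit) of the new CalDepthwiseConv2dFilterImageShape formula: a 3x3 depthwise filter over 16 input channels with multiplier 1 (HWIM shape 3x3x16x1) maps to a [3*3*1, (16+3)/4] = [9, 4] image, noticeably smaller than the [H * W * RoundUp<4>(I), (O + 3) / 4] layout a regular conv2d filter of the same spatial size would get.

    // Sketch only: plugs concrete numbers into the formula added above.
    #include <cassert>
    #include <cstddef>

    int main() {
      const int H = 3, W = 3, I = 16, M = 1;    // HWIM depthwise filter
      const std::size_t image_w = H * W * M;    // image_shape[0]
      const std::size_t image_h = (I + 3) / 4;  // image_shape[1]
      assert(image_w == 9 && image_h == 4);
      return 0;
    }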
mace/kernels/opencl/helper.h  (+4, -3)

@@ -17,9 +17,10 @@ namespace kernels {
 const float kMaxKernelExeTime = 1000.0;  // microseconds

 enum BufferType {
-  FILTER = 0,
-  IN_OUT = 1,
-  ARGUMENT = 2
+  CONV2D_FILTER = 0,
+  DW_CONV2D_FILTER = 1,
+  IN_OUT = 2,
+  ARGUMENT = 3
 };

 void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
mace/ops/buffer_to_image.h  (+1, -1)

@@ -20,7 +20,7 @@ class BufferToImageOp: public Operator<D, T> {
     const Tensor *input_tensor = this->Input(INPUT);
     kernels::BufferType type =
         static_cast<kernels::BufferType>(OperatorBase::GetSingleArgument<int>(
-            "buffer_type", static_cast<int>(kernels::FILTER)));
+            "buffer_type", static_cast<int>(kernels::CONV2D_FILTER)));
     Tensor *output = this->Output(OUTPUT);

     functor_(const_cast<Tensor *>(input_tensor), type, output, future);
mace/ops/buffer_to_image_test.cc  (+6, -6)

@@ -75,27 +75,27 @@ TEST(BufferToImageTest, InputLarge) {
 }

 TEST(BufferToImageTest, Filter1x1Small) {
-  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::FILTER, {1, 1, 3, 5});
+  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER, {1, 1, 3, 5});
 }

 TEST(BufferToImageTest, Filter1x1Media) {
-  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::FILTER, {1, 1, 13, 17});
+  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER, {1, 1, 13, 17});
 }

 TEST(BufferToImageTest, Filter1x1Large) {
-  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::FILTER, {1, 1, 128, 512});
+  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER, {1, 1, 128, 512});
 }

 TEST(BufferToImageTest, Filter3x3Small) {
-  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::FILTER, {3, 3, 3, 5});
+  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER, {3, 3, 3, 5});
 }

 TEST(BufferToImageTest, Filter3x3Meida) {
-  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::FILTER, {3, 3, 13, 17});
+  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER, {3, 3, 13, 17});
 }

 TEST(BufferToImageTest, Filter3x3Large) {
-  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::FILTER, {3, 3, 128, 256});
+  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER, {3, 3, 128, 256});
 }

 template <DeviceType D, typename T>
mace/ops/conv_2d_benchmark.cc  (+1, -1)

@@ -36,7 +36,7 @@ static void Conv2d(int iters,
   BufferToImage<D, T>(net, "Input", "InputImage",
                       kernels::BufferType::IN_OUT);
   BufferToImage<D, T>(net, "Filter", "FilterImage",
-                      kernels::BufferType::FILTER);
+                      kernels::BufferType::CONV2D_FILTER);
   BufferToImage<D, T>(net, "Bias", "BiasImage",
                       kernels::BufferType::ARGUMENT);

   OpDefBuilder("Conv2D", "Conv2dTest")
mace/ops/conv_2d_test.cc  (view file @ ff5d176e)

@@ -102,7 +102,7 @@ void TestNHWCSimple3x3VALID() {
   BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-  BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+  BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER);
   BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
   OpDefBuilder("Conv2D", "Conv2dTest")
 ...
@@ -159,7 +159,7 @@ void TestNHWCSimple3x3SAME() {
   BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-  BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+  BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER);
   BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
   OpDefBuilder("Conv2D", "Conv2dTest")
 ...
@@ -264,7 +264,7 @@ void TestNHWCSimple3x3WithoutBias() {
   BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-  BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+  BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER);
   OpDefBuilder("Conv2D", "Conv2dTest")
       .Input("InputImage")
 ...
@@ -371,7 +371,7 @@ static void TestNHWCCombined3x3() {
   BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-  BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+  BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER);
   BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
 ...
@@ -444,7 +444,7 @@ void TestConv1x1() {
   BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-  BufferToImage<D, float>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+  BufferToImage<D, float>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER);
   BufferToImage<D, float>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
 ...
@@ -535,7 +535,7 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
   BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-  BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+  BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER);
   BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
 ...
@@ -628,7 +628,7 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
   BufferToImage<D, half>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-  BufferToImage<D, half>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+  BufferToImage<D, half>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER);
   BufferToImage<D, half>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
 ...
@@ -759,7 +759,7 @@ static void TestDilationConvNxN(const std::vector<index_t> &shape, const int dil
   // run on gpu
   BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-  BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+  BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER);
   BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
   OpDefBuilder("Conv2D", "Conv2dTest")
 ...
mace/ops/core_test.cc  (view file @ ff5d176e)

@@ -15,7 +15,7 @@ TEST(CoreTest, INIT_MODE) {
   OpDefBuilder("BufferToImage", "BufferToImageTest")
       .Input("Input")
       .Output("B2IOutput")
-      .AddIntArg("buffer_type", kernels::BufferType::FILTER)
+      .AddIntArg("buffer_type", kernels::BufferType::CONV2D_FILTER)
       .AddIntArg("mode", static_cast<int>(NetMode::INIT))
       .Finalize(&op_defs[op_defs.size() - 1]);
 ...
@@ -33,7 +33,7 @@ TEST(CoreTest, INIT_MODE) {
   OpDefBuilder("ImageToBuffer", "ImageToBufferTest")
       .Input("B2IOutput")
       .Output("Output")
-      .AddIntArg("buffer_type", kernels::BufferType::FILTER)
+      .AddIntArg("buffer_type", kernels::BufferType::CONV2D_FILTER)
       .Finalize(&op_defs[op_defs.size() - 1]);

   NetDef net_def;
 ...
mace/ops/depthwise_conv2d.cc  (view file @ ff5d176e)

@@ -26,6 +26,12 @@ void Register_DepthwiseConv2d(OperatorRegistry *op_registry) {
                         .TypeConstraint<float>("T")
                         .Build(),
                     DepthwiseConv2dOp<DeviceType::OPENCL, float>);
+  REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d")
+                                     .Device(DeviceType::OPENCL)
+                                     .TypeConstraint<half>("T")
+                                     .Build(),
+                    DepthwiseConv2dOp<DeviceType::OPENCL, half>);
 }

 }  // namespace mace
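With the half registration in place, a graph picks the OpenCL half kernel through the "T" argument, exactly as the tests later in this commit do. A minimal sketch (it only rearranges OpDefBuilder calls that already appear in this diff; nothing else is assumed):

    OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
        .Input("InputImage")
        .Input("FilterImage")
        .Input("BiasImage")
        .Output("OutputImage")
        .AddIntsArg("strides", {1, 1})
        .AddIntArg("padding", Padding::VALID)
        .AddIntsArg("dilations", {1, 1})
        // Selects the DeviceType::OPENCL + half registration added above.
        .AddIntArg("T", static_cast<int>(DataTypeToEnum<half>::value))
        .Finalize(net.NewOperatorDef());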
mace/ops/depthwise_conv2d.h  (view file @ ff5d176e)

@@ -2,8 +2,8 @@
 // Copyright (c) 2017 XiaoMi All rights reserved.
 //

-#ifndef MACE_OPS_DEPTHWISE_CONV_H_
-#define MACE_OPS_DEPTHWISE_CONV_H_
+#ifndef MACE_OPS_DEPTHWISE_CONV2D_H_
+#define MACE_OPS_DEPTHWISE_CONV2D_H_

 #include <memory>
 ...
@@ -18,10 +18,13 @@ template <DeviceType D, typename T>
 class DepthwiseConv2dOp : public ConvPool2dOpBase<D, T> {
  public:
   DepthwiseConv2dOp(const OperatorDef &op_def, Workspace *ws)
-      : ConvPool2dOpBase<D, T>(op_def, ws) {
-    functor_.strides_ = this->strides_.data();
-    functor_.dilations_ = this->dilations_.data();
-  }
+      : ConvPool2dOpBase<D, T>(op_def, ws),
+        functor_(this->strides_.data(),
+                 this->padding_,
+                 this->dilations_.data(),
+                 kernels::ActivationType::NOOP,
+                 0.0f,
+                 0.0f) {}

   bool Run(StatsFuture *future) override {
     const Tensor *input = this->Input(INPUT);
 ...
@@ -31,23 +34,7 @@ class DepthwiseConv2dOp : public ConvPool2dOpBase<D, T> {
       bias = this->Input(BIAS);
     }
     Tensor *output = this->Output(OUTPUT);

-    // resize filter shape.
-    std::vector<index_t> filter_shape(filter->shape().begin(), filter->shape().end());
-    filter_shape[0] *= filter_shape[1];
-    filter_shape[1] = 1;
-
-    std::vector<index_t> output_shape(4);
-    std::vector<int> paddings(2);
-    kernels::CalcPaddingAndOutputSize(
-        input->shape().data(), filter_shape.data(), this->dilations_.data(),
-        this->strides_.data(), this->padding_, output_shape.data(), paddings.data());
-    output->Resize(output_shape);
-
-    functor_.paddings_ = paddings;
-
     functor_(input, filter, bias, output, future);
     return true;
   }
 ...
@@ -59,6 +46,6 @@ class DepthwiseConv2dOp : public ConvPool2dOpBase<D, T> {
   OP_OUTPUT_TAGS(OUTPUT);
 };

-}  //  namespace mace
+}  // namespace mace

-#endif  // MACE_OPS_DEPTHWISE_CONV_H_
+#endif  // MACE_OPS_DEPTHWISE_CONV2D_H_
mace/ops/depthwise_conv2d_test.cc  (view file @ ff5d176e)

@@ -7,45 +7,200 @@
 using namespace mace;

+namespace {
+
 class DepthwiseConv2dOpTest : public OpsTestBase {};

-template <DeviceType D>
+template <DeviceType D, typename T>
 void SimpleValidTest() {
   testing::internal::LogToStderr();
   // Construct graph
   OpsTestNet net;
-  OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
-      .Input("Input")
-      .Input("Filter")
-      .Input("Bias")
-      .Output("Output")
-      .AddIntsArg("strides", {1, 1})
-      .AddIntArg("padding", Padding::VALID)
-      .AddIntsArg("dilations", {1, 1})
-      .Finalize(net.NewOperatorDef());

   // Add input data
-  net.AddInputFromArray<D, float>(
-      "Input", {1, 2, 2, 3},
-      {1, 3, 5, 7, 9, 11, 2, 4, 6, 8, 10, 12});
-  net.AddInputFromArray<D, float>(
-      "Filter", {2, 2, 2, 2},
-      {1.0f, 5.0f, 9.0f, 13.0f, 2.0f, 6.0f, 10.0f, 14.0f,
-       3.0f, 7.0f, 11.0f, 15.0f, 4.0f, 8.0f, 12.0f, 16.0f});
-  net.AddInputFromArray<D, float>("Bias", {4}, {.1f, .2f, .3f, .4f});
-
-  // Run
-  net.RunOp(D);
+  net.AddInputFromArray<D, float>(
+      "Input", {1, 3, 3, 2},
+      {1, 2, 2, 4, 3, 6, 4, 8, 5, 10, 6, 12, 7, 14, 8, 16, 9, 18});
+  net.AddInputFromArray<D, float>(
+      "Filter", {2, 2, 2, 1},
+      {1.0f, 2.0f, 2.0f, 4.0f, 3.0f, 6.0f, 4.0f, 8.0f});
+  net.AddInputFromArray<D, float>("Bias", {2}, {.1f, .2f});
+
+  if (D == DeviceType::OPENCL) {
+    BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    BufferToImage<D, T>(net, "Filter", "FilterImage",
+                        kernels::BufferType::DW_CONV2D_FILTER);
+    BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
+
+    OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
+        .Input("InputImage")
+        .Input("FilterImage")
+        .Input("BiasImage")
+        .Output("OutputImage")
+        .AddIntsArg("strides", {1, 1})
+        .AddIntArg("padding", Padding::VALID)
+        .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+        .Finalize(net.NewOperatorDef());
+    net.RunOp(D);
+
+    // Transfer output
+    ImageToBuffer<D, T>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
+  } else {
+    OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
+        .Input("Input")
+        .Input("Filter")
+        .Input("Bias")
+        .Output("Output")
+        .AddIntsArg("strides", {1, 1})
+        .AddIntArg("padding", Padding::VALID)
+        .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+        .Finalize(net.NewOperatorDef());
+    // Run
+    net.RunOp(D);
+  }

   // Check
-  auto expected = CreateTensor<float>(
-      {1, 4, 1, 2},
-      {196.1f, 252.1f, 216.2f, 280.2f, 272.3f, 344.3f, 296.4f, 376.4f});
-
-  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
+  auto expected = CreateTensor<T>(
+      {1, 2, 2, 2},
+      {37.1f, 148.2f, 47.1f, 188.2f, 67.1f, 268.2f, 77.1f, 308.2f});
+
+  ExpectTensorNear<T>(*expected, *net.GetOutput("Output"), 1e-5);
 }

-TEST_F(DepthwiseConv2dOpTest, SimpleCPU) { SimpleValidTest<DeviceType::CPU>(); }
+TEST_F(DepthwiseConv2dOpTest, SimpleCPU) {
+  SimpleValidTest<DeviceType::CPU, float>();
+}
+
+TEST_F(DepthwiseConv2dOpTest, SimpleOpenCL) {
+  SimpleValidTest<DeviceType::OPENCL, float>();
+}
+
+TEST_F(DepthwiseConv2dOpTest, SimpleOpenCLHalf) {
+  SimpleValidTest<DeviceType::OPENCL, half>();
+}
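As a sanity check on the new NHWC input / HWIM filter layout, the expected values above can be reproduced by hand from the literals in SimpleValidTest. At output position (0, 0), the 2x2 window of input channel 0 is {1, 2, 4, 5} and the channel-0 filter taps are {1, 2, 3, 4}:

    1*1 + 2*2 + 4*3 + 5*4 + 0.1 = 37.1

and for channel 1 the window {2, 4, 8, 10} with filter taps {2, 4, 6, 8} gives

    2*2 + 4*4 + 8*6 + 10*8 + 0.2 = 148.2

which matches the first two entries of the expected tensor.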
+template <DeviceType D, typename T>
+void ComplexValidTest() {
+  testing::internal::LogToStderr();
+  // Construct graph
+  OpsTestNet net;
+
+  // Add input data
+  net.AddInputFromArray<D, float>(
+      "Input", {1, 10, 10, 3},
+      {0.0, 0.01, 0.02, 0.03, ..., 2.97, 2.98, 2.99});   // 300 values, 0.00 to 2.99 in steps of 0.01
+  net.AddInputFromArray<D, float>(
+      "Filter", {5, 5, 3, 1},
+      {0.0, 0.01, 0.02, 0.03, ..., 0.72, 0.73, 0.74});   // 75 values, 0.00 to 0.74 in steps of 0.01
+  net.AddInputFromArray<D, float>("Bias", {6}, {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f});
+
+  if (D == DeviceType::OPENCL) {
+    BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    BufferToImage<D, T>(net, "Filter", "FilterImage",
+                        kernels::BufferType::DW_CONV2D_FILTER);
+    BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
+
+    OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
+        .Input("InputImage")
+        .Input("FilterImage")
+        .Input("BiasImage")
+        .Output("OutputImage")
+        .AddIntsArg("strides", {2, 2})
+        .AddIntArg("padding", Padding::SAME)
+        .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+        .Finalize(net.NewOperatorDef());
+    net.RunOp(D);
+
+    // Transfer output
+    ImageToBuffer<D, T>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
+  } else {
+    OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
+        .Input("Input")
+        .Input("Filter")
+        .Input("Bias")
+        .Output("Output")
+        .AddIntsArg("strides", {2, 2})
+        .AddIntArg("padding", Padding::SAME)
+        .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+        .Finalize(net.NewOperatorDef());
+    // Run
+    net.RunOp(D);
+  }
+
+  // Check
+  auto expected = CreateTensor<T>(
+      {1, 5, 5, 3},
+      {4.48200035,  4.63479996,  4.79079962,  5.85899973,  6.05599976,
+       6.25699997,  6.38100004,  6.59000015,  6.80300045,  6.90299988,
+       7.1239996,   7.34899998,  4.03559971,  4.16820002,  4.30319977,
+       8.90999985,  9.1760006,   9.44599915,  11.20499992, 11.54500103,
+       11.89000034, 11.74499989, 12.09999943, 12.46000004, 12.28499985,
+       12.65500069, 13.03000069, 7.00200033,  7.22399998,  7.44900036,
+       13.4100008,  13.79599953, 14.18599987, 16.60500145, 17.09499741,
+       17.59000015, 17.14500046, 17.65000153, 18.15999794, 17.68499947,
+       18.20499992, 18.72999954, 9.97200012,  10.28399944, 10.59899998,
+       17.90999985, 18.41600037, 18.92599869, 22.00500107, 22.64500046,
+       23.28999901, 22.54500008, 23.19999886, 23.8599987,  23.0850029,
+       23.75500107, 24.43000031, 12.94200039, 13.34400082, 13.7489996,
+       6.97500038,  7.29659986,  7.62060022,  8.32049942,  8.72700024,
+       9.13650036,  8.5095005,   8.92500019,  9.34349918,  8.69849968,
+       9.12300014,  9.55049992,  4.55220032,  4.80690002,  5.06340027});
+
+  ExpectTensorNear<T>(*expected, *net.GetOutput("Output"), 0.2);
+}
+
+TEST_F(DepthwiseConv2dOpTest, ComplexCPU) {
+  ComplexValidTest<DeviceType::CPU, float>();
+}
+
+TEST_F(DepthwiseConv2dOpTest, ComplexOpenCL) {
+  ComplexValidTest<DeviceType::OPENCL, float>();
+}
+
+TEST_F(DepthwiseConv2dOpTest, ComplexOpenCLHalf) {
+  ComplexValidTest<DeviceType::OPENCL, half>();
+}
-template <DeviceType D>
+template <DeviceType D, typename T>
 void TestNxNS12(const index_t height, const index_t width) {
   testing::internal::LogToStderr();
   auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
 ...
@@ -53,11 +208,18 @@ void TestNxNS12(const index_t height, const index_t width) {
     srand(time(NULL));

     // generate random input
-    index_t batch = 1;
-    index_t input_channels = 3;
-    index_t multiplier = 2;
+    index_t batch = 1 + rand() % 5;
+    index_t input_channels = 3 + rand() % 16;
+    index_t multiplier = 1;

     // Construct graph
     OpsTestNet net;
+
+    // Add input data
+    net.AddRandomInput<D, float>("Input",
+                                 {batch, height, width, input_channels});
+    net.AddRandomInput<D, float>(
+        "Filter", {kernel_h, kernel_w, input_channels, multiplier});
+    net.AddRandomInput<D, float>("Bias", {multiplier * input_channels});
+
     OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
         .Input("Input")
         .Input("Filter")
 ...
@@ -66,24 +228,56 @@ void TestNxNS12(const index_t height, const index_t width) {
         .AddIntsArg("strides", {stride_h, stride_w})
         .AddIntArg("padding", type)
         .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<float>::value))
         .Finalize(net.NewOperatorDef());

-    // Add input data
-    net.AddRandomInput<D, float>("Input",
-                                 {batch, input_channels, height, width});
-    net.AddRandomInput<D, float>(
-        "Filter", {multiplier, input_channels, kernel_h, kernel_w});
-    net.AddRandomInput<D, float>("Bias", {multiplier * input_channels});
-    // Run on device
-    net.RunOp(D);
+    // Run on cpu
+    net.RunOp();

     // Check
     Tensor expected;
     expected.Copy(*net.GetOutput("Output"));

-    // run cpu
-    net.RunOp();
-    ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 1e-3);
+    if (D == DeviceType::OPENCL) {
+      BufferToImage<D, T>(net, "Input", "InputImage",
+                          kernels::BufferType::IN_OUT);
+      BufferToImage<D, T>(net, "Filter", "FilterImage",
+                          kernels::BufferType::DW_CONV2D_FILTER);
+      BufferToImage<D, T>(net, "Bias", "BiasImage",
+                          kernels::BufferType::ARGUMENT);
+
+      OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
+          .Input("InputImage")
+          .Input("FilterImage")
+          .Input("BiasImage")
+          .Output("OutputImage")
+          .AddIntsArg("strides", {stride_h, stride_w})
+          .AddIntArg("padding", type)
+          .AddIntsArg("dilations", {1, 1})
+          .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+          .Finalize(net.NewOperatorDef());
+      net.RunOp(D);
+
+      // Transfer output
+      ImageToBuffer<D, float>(net, "OutputImage", "DeviceOutput",
+                              kernels::BufferType::IN_OUT);
+    } else {
+      OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
+          .Input("Input")
+          .Input("Filter")
+          .Input("Bias")
+          .Output("DeviceOutput")
+          .AddIntsArg("strides", {stride_h, stride_w})
+          .AddIntArg("padding", type)
+          .AddIntsArg("dilations", {1, 1})
+          .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+          .Finalize(net.NewOperatorDef());
+      // Run
+      net.RunOp(D);
+    }
+
+    // Check
+    ExpectTensorNear<float>(expected, *net.GetOutput("DeviceOutput"), 0.1);
   };

   for (int kernel_size : {3}) {
 ...
@@ -96,32 +290,47 @@ void TestNxNS12(const index_t height, const index_t width) {
 #if __ARM_NEON
 TEST_F(DepthwiseConv2dOpTest, NeonSimpleNxNS12) {
-  TestNxNS12<DeviceType::NEON>(4, 4);
+  TestNxNS12<DeviceType::NEON, float>(4, 4);
 }
 #endif

 TEST_F(DepthwiseConv2dOpTest, OpenCLSimpleNxNS12) {
-  TestNxNS12<DeviceType::OPENCL>(4, 4);
+  TestNxNS12<DeviceType::OPENCL, float>(4, 4);
 }

+TEST_F(DepthwiseConv2dOpTest, OpenCLSimpleNxNS12Half) {
+  TestNxNS12<DeviceType::OPENCL, half>(4, 4);
+}
+
 #if __ARM_NEON
 TEST_F(DepthwiseConv2dOpTest, NeonAlignedNxNS12) {
-  TestNxNS12<DeviceType::NEON>(64, 64);
-  TestNxNS12<DeviceType::NEON>(128, 128);
+  TestNxNS12<DeviceType::NEON, float>(64, 64);
+  TestNxNS12<DeviceType::NEON, float>(128, 128);
 }
 #endif

 TEST_F(DepthwiseConv2dOpTest, OpenCLAlignedNxNS12) {
-  TestNxNS12<DeviceType::OPENCL>(64, 64);
-  TestNxNS12<DeviceType::OPENCL>(128, 128);
+  TestNxNS12<DeviceType::OPENCL, float>(64, 64);
+  TestNxNS12<DeviceType::OPENCL, float>(128, 128);
 }

+TEST_F(DepthwiseConv2dOpTest, OpenCLAlignedNxNS12Half) {
+  TestNxNS12<DeviceType::OPENCL, half>(64, 64);
+  TestNxNS12<DeviceType::OPENCL, half>(128, 128);
+}
+
 #if __ARM_NEON
 TEST_F(DepthwiseConv2dOpTest, NeonUnalignedNxNS12) {
-  TestNxNS12<DeviceType::NEON>(107, 113);
+  TestNxNS12<DeviceType::NEON, float>(107, 113);
 }
 #endif

 TEST_F(DepthwiseConv2dOpTest, OpenCLUnalignedNxNS12) {
-  TestNxNS12<DeviceType::OPENCL>(107, 113);
+  TestNxNS12<DeviceType::OPENCL, float>(107, 113);
 }
+
+TEST_F(DepthwiseConv2dOpTest, OpenCLUnalignedNxNS12Half) {
+  TestNxNS12<DeviceType::OPENCL, half>(107, 113);
+}
+
+}  // namespace
mace/ops/depthwise_conv_2d_benchmark.cc  (view file @ ff5d176e)

@@ -14,44 +14,65 @@ namespace mace {
 template <DeviceType D, typename T>
 static void DepthwiseConv2d(int iters,
                             int batch,
-                            int channels,
+                            int input_channels,
                             int height,
                             int width,
                             int kernel_h,
                             int kernel_w,
                             int stride,
                             Padding padding,
-                            int output_channels) {
+                            int multiplier) {
   mace::testing::StopTiming();

   OpsTestNet net;
-  OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2dTest")
-      .Input("Input")
-      .Input("Filter")
-      .Input("Bias")
-      .Output("Output")
-      .AddIntsArg("strides", {stride, stride})
-      .AddIntArg("padding", padding)
-      .AddIntsArg("dilations", {1, 1})
-      .Finalize(net.NewOperatorDef());

   // Add input data
-  net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
-  net.AddRandomInput<D, float>("Filter",
-                               {output_channels, channels, kernel_h, kernel_w});
-  net.AddRandomInput<D, float>("Bias", {output_channels * channels});
+  net.AddRandomInput<D, float>("Input", {batch, height, width, input_channels});
+  net.AddRandomInput<D, float>(
+      "Filter", {kernel_h, kernel_w, input_channels, multiplier});
+  net.AddRandomInput<D, float>("Bias", {input_channels * multiplier});
+
+  if (D == DeviceType::OPENCL) {
+    BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    BufferToImage<D, T>(net, "Filter", "FilterImage",
+                        kernels::BufferType::DW_CONV2D_FILTER);
+    BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
+    OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2dTest")
+        .Input("InputImage")
+        .Input("FilterImage")
+        .Input("BiasImage")
+        .Output("Output")
+        .AddIntsArg("strides", {stride, stride})
+        .AddIntArg("padding", padding)
+        .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+        .Finalize(net.NewOperatorDef());
+  } else {
+    OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2dTest")
+        .Input("Input")
+        .Input("Filter")
+        .Input("Bias")
+        .Output("Output")
+        .AddIntsArg("strides", {stride, stride})
+        .AddIntArg("padding", padding)
+        .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+        .Finalize(net.NewOperatorDef());
+  }

   // Warm-up
-  for (int i = 0; i < 5; ++i) {
-    net.RunOp(D);
-  }
-  net.Sync();
+  for (int i = 0; i < 2; ++i) {
+    net.RunOp(D);
+    net.Sync();
+  }

   mace::testing::StartTiming();
   while (iters--) {
     net.RunOp(D);
-  }
-  net.Sync();
+    net.Sync();
+  }
 }

 #define BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, P, OC, TYPE, \
 ...
@@ -68,20 +89,21 @@ static void DepthwiseConv2d(int iters,
   BENCHMARK( \
     BM_DEPTHWISE_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE)

-#define BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, OC, TYPE)              \
-  BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, CPU);        \
+#define BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, OC, TYPE) \
+  BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, CPU); \
   BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, OPENCL);

-BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 2, float);
-BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 2, float);
-BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 2, float);
-BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 2, float);
+BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 1, float);
+BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 1, float);
+BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 1, float);
+BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 1, float);
 BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 1, float);
 BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 1, SAME, 1, float);
-BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 2, float);
-BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 2, float);
-BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 2, float);
-BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 2, float);
+BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 1, float);
+BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 1, float);
+BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 1, float);
+BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 1, float);
 BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 1, float);
 BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, SAME, 1, float);

 }  // namespace mace
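For reference, each invocation above fans out into one CPU and one OpenCL benchmark. A rough sketch of the expansion implied by the two macros (the body of BM_DEPTHWISE_CONV_2D_MACRO is elided in this view, so the exact generated code is an assumption; the benchmark names follow the ## concatenation shown):

    BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 1, float);
    // expands, roughly, into two registered benchmarks:
    //   BM_DEPTHWISE_2D_1_64_32_32_K3x3S1_VALID_1_float_CPU
    //   BM_DEPTHWISE_2D_1_64_32_32_K3x3S1_VALID_1_float_OPENCL
    // each driving DepthwiseConv2d<DEVICE, float> with the same shape arguments.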
mace/ops/fused_conv_2d_test.cc  (view file @ ff5d176e)

@@ -26,7 +26,7 @@ void TestNHWCSimple3x3VALID() {
   BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-  BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+  BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER);
   BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
   OpDefBuilder("FusedConv2D", "FusedConv2dTest")
 ...
@@ -83,7 +83,7 @@ void TestNHWCSimple3x3SAME() {
   BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-  BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+  BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER);
   BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
   OpDefBuilder("FusedConv2D", "FusedConv2dTest")
 ...
@@ -151,7 +151,7 @@ void TestNHWCSimple3x3WithoutBias() {
   BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-  BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+  BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER);
   OpDefBuilder("FusedConv2D", "FusedConv2dTest")
       .Input("InputImage")
 ...
@@ -220,7 +220,7 @@ void TestConv1x1() {
   BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-  BufferToImage<D, float>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+  BufferToImage<D, float>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER);
   BufferToImage<D, float>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
 ...
@@ -311,7 +311,7 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
   BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-  BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+  BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER);
   BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
 ...
@@ -397,7 +397,7 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &shape) {
   BufferToImage<D, half>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-  BufferToImage<D, half>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+  BufferToImage<D, half>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER);
   BufferToImage<D, half>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
 ...
@@ -475,7 +475,7 @@ static void TestGeneralConvNxNS12(const std::vector<index_t> &image_shape,
   BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-  BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+  BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER);
   BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
 ...
@@ -551,7 +551,7 @@ static void TestAtrousConvNxN(const std::vector<index_t> &shape, const int dilat
   // run on gpu
   BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-  BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+  BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER);
   BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
   OpDefBuilder("FusedConv2D", "FusedConv2dTest")
 ...
@@ -633,7 +633,7 @@ static void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
   // run on gpu
   BufferToImage<D, half>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-  BufferToImage<D, half>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+  BufferToImage<D, half>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER);
   BufferToImage<D, half>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
   OpDefBuilder("FusedConv2D", "FusedConv2dTest")
 ...
mace/ops/image_to_buffer.h  (view file @ ff5d176e)

@@ -21,7 +21,7 @@ class ImageToBufferOp: public Operator<D, T> {
     Tensor *output = this->Output(OUTPUT);

     kernels::BufferType type =
         static_cast<kernels::BufferType>(OperatorBase::GetSingleArgument<int>(
-            "buffer_type", static_cast<int>(kernels::FILTER)));
+            "buffer_type", static_cast<int>(kernels::CONV2D_FILTER)));
     functor_(output, type, const_cast<Tensor *>(input_tensor), future);
     return true;
   }
 ...
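The default above only matters when the "buffer_type" argument is omitted; the tests in this commit always set it explicitly. A minimal usage sketch, mirroring the core_test.cc hunk earlier in this diff (the op and argument names are taken from that hunk, not invented here):

    OpDefBuilder("ImageToBuffer", "ImageToBufferTest")
        .Input("B2IOutput")
        .Output("Output")
        .AddIntArg("buffer_type", kernels::BufferType::CONV2D_FILTER)
        .Finalize(&op_defs[op_defs.size() - 1]);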