Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Xiaomi
Mace
提交
2460a946
Mace
项目概览
Xiaomi
/
Mace
通知
107
Star
40
Fork
27
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
提交
2460a946
编写于
3月 19, 2018
作者:
L
liuqi
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Support arbitrary input size.
上级
9bda14f3
变更
36
显示空白变更内容
内联
并排
Showing
36 changed file
with
225 addition
and
120 deletion
+225
-120
mace/kernels/activation.h
mace/kernels/activation.h
+1
-0
mace/kernels/addn.h
mace/kernels/addn.h
+1
-0
mace/kernels/batch_norm.h
mace/kernels/batch_norm.h
+1
-0
mace/kernels/bias_add.h
mace/kernels/bias_add.h
+1
-0
mace/kernels/channel_shuffle.h
mace/kernels/channel_shuffle.h
+1
-0
mace/kernels/concat.h
mace/kernels/concat.h
+1
-0
mace/kernels/conv_2d.h
mace/kernels/conv_2d.h
+1
-0
mace/kernels/depthwise_conv2d.h
mace/kernels/depthwise_conv2d.h
+1
-0
mace/kernels/eltwise.h
mace/kernels/eltwise.h
+1
-0
mace/kernels/fully_connected.h
mace/kernels/fully_connected.h
+1
-0
mace/kernels/opencl/activation_opencl.cc
mace/kernels/opencl/activation_opencl.cc
+5
-0
mace/kernels/opencl/addn.cc
mace/kernels/opencl/addn.cc
+15
-9
mace/kernels/opencl/batch_norm_opencl.cc
mace/kernels/opencl/batch_norm_opencl.cc
+4
-1
mace/kernels/opencl/bias_add_opencl.cc
mace/kernels/opencl/bias_add_opencl.cc
+3
-0
mace/kernels/opencl/channel_shuffle.cc
mace/kernels/opencl/channel_shuffle.cc
+8
-4
mace/kernels/opencl/concat.cc
mace/kernels/opencl/concat.cc
+5
-1
mace/kernels/opencl/conv_2d_opencl.cc
mace/kernels/opencl/conv_2d_opencl.cc
+7
-4
mace/kernels/opencl/conv_2d_opencl_1x1.cc
mace/kernels/opencl/conv_2d_opencl_1x1.cc
+5
-0
mace/kernels/opencl/conv_2d_opencl_3x3.cc
mace/kernels/opencl/conv_2d_opencl_3x3.cc
+5
-1
mace/kernels/opencl/conv_2d_opencl_general.cc
mace/kernels/opencl/conv_2d_opencl_general.cc
+5
-1
mace/kernels/opencl/depthwise_conv_opencl.cc
mace/kernels/opencl/depthwise_conv_opencl.cc
+16
-15
mace/kernels/opencl/eltwise_opencl.cc
mace/kernels/opencl/eltwise_opencl.cc
+3
-0
mace/kernels/opencl/fully_connected_opencl.cc
mace/kernels/opencl/fully_connected_opencl.cc
+20
-7
mace/kernels/opencl/helper.h
mace/kernels/opencl/helper.h
+7
-0
mace/kernels/opencl/matmul.cc
mace/kernels/opencl/matmul.cc
+9
-10
mace/kernels/opencl/pooling_opencl.cc
mace/kernels/opencl/pooling_opencl.cc
+32
-26
mace/kernels/opencl/resize_bilinear_opencl.cc
mace/kernels/opencl/resize_bilinear_opencl.cc
+18
-14
mace/kernels/opencl/softmax_opencl.cc
mace/kernels/opencl/softmax_opencl.cc
+3
-0
mace/kernels/opencl/space_to_batch_opencl.cc
mace/kernels/opencl/space_to_batch_opencl.cc
+4
-0
mace/kernels/opencl/winograd_transform.cc
mace/kernels/opencl/winograd_transform.cc
+31
-23
mace/kernels/pooling.h
mace/kernels/pooling.h
+1
-0
mace/kernels/resize_bilinear.h
mace/kernels/resize_bilinear.h
+1
-0
mace/kernels/softmax.h
mace/kernels/softmax.h
+1
-0
mace/kernels/space_to_batch.h
mace/kernels/space_to_batch.h
+1
-0
mace/kernels/winograd_transform.h
mace/kernels/winograd_transform.h
+2
-0
tools/wino_conv.py
tools/wino_conv.py
+4
-4
未找到文件。
mace/kernels/activation.h
浏览文件 @
2460a946
...
@@ -152,6 +152,7 @@ class ActivationFunctor<DeviceType::OPENCL, T> {
...
@@ -152,6 +152,7 @@ class ActivationFunctor<DeviceType::OPENCL, T> {
T
relux_max_limit_
;
T
relux_max_limit_
;
cl
::
Kernel
kernel_
;
cl
::
Kernel
kernel_
;
std
::
string
tuning_key_prefix_
;
std
::
string
tuning_key_prefix_
;
std
::
vector
<
index_t
>
input_shape_
;
};
};
}
// namespace kernels
}
// namespace kernels
...
...
mace/kernels/addn.h
浏览文件 @
2460a946
...
@@ -91,6 +91,7 @@ struct AddNFunctor<DeviceType::OPENCL, T> {
...
@@ -91,6 +91,7 @@ struct AddNFunctor<DeviceType::OPENCL, T> {
StatsFuture
*
future
);
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
cl
::
Kernel
kernel_
;
std
::
vector
<
index_t
>
input_shape_
;
};
};
}
// namespace kernels
}
// namespace kernels
...
...
mace/kernels/batch_norm.h
浏览文件 @
2460a946
...
@@ -156,6 +156,7 @@ struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase {
...
@@ -156,6 +156,7 @@ struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase {
Tensor
*
output
,
Tensor
*
output
,
StatsFuture
*
future
);
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
cl
::
Kernel
kernel_
;
std
::
vector
<
index_t
>
input_shape_
;
};
};
}
// namepsace kernels
}
// namepsace kernels
...
...
mace/kernels/bias_add.h
浏览文件 @
2460a946
...
@@ -62,6 +62,7 @@ struct BiasAddFunctor<DeviceType::OPENCL, T> {
...
@@ -62,6 +62,7 @@ struct BiasAddFunctor<DeviceType::OPENCL, T> {
Tensor
*
output
,
Tensor
*
output
,
StatsFuture
*
future
);
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
cl
::
Kernel
kernel_
;
std
::
vector
<
index_t
>
input_shape_
;
};
};
}
// namepsace kernels
}
// namepsace kernels
...
...
mace/kernels/channel_shuffle.h
浏览文件 @
2460a946
...
@@ -55,6 +55,7 @@ struct ChannelShuffleFunctor<DeviceType::OPENCL, T> {
...
@@ -55,6 +55,7 @@ struct ChannelShuffleFunctor<DeviceType::OPENCL, T> {
cl
::
Kernel
kernel_
;
cl
::
Kernel
kernel_
;
const
int
groups_
;
const
int
groups_
;
std
::
vector
<
index_t
>
input_shape_
;
};
};
}
// namespace kernels
}
// namespace kernels
...
...
mace/kernels/concat.h
浏览文件 @
2460a946
...
@@ -83,6 +83,7 @@ struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase {
...
@@ -83,6 +83,7 @@ struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase {
Tensor
*
output
,
Tensor
*
output
,
StatsFuture
*
future
);
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
cl
::
Kernel
kernel_
;
std
::
vector
<
index_t
>
input_shape_
;
};
};
}
// namepsace kernels
}
// namepsace kernels
...
...
mace/kernels/conv_2d.h
浏览文件 @
2460a946
...
@@ -401,6 +401,7 @@ struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase {
...
@@ -401,6 +401,7 @@ struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase {
StatsFuture
*
future
);
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
cl
::
Kernel
kernel_
;
std
::
vector
<
index_t
>
input_shape_
;
};
};
}
// namespace kernels
}
// namespace kernels
...
...
mace/kernels/depthwise_conv2d.h
浏览文件 @
2460a946
...
@@ -439,6 +439,7 @@ struct DepthwiseConv2dFunctor<DeviceType::OPENCL, T>
...
@@ -439,6 +439,7 @@ struct DepthwiseConv2dFunctor<DeviceType::OPENCL, T>
StatsFuture
*
future
);
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
cl
::
Kernel
kernel_
;
std
::
vector
<
index_t
>
input_shape_
;
};
};
}
// namespace kernels
}
// namespace kernels
...
...
mace/kernels/eltwise.h
浏览文件 @
2460a946
...
@@ -94,6 +94,7 @@ struct EltwiseFunctor<DeviceType::OPENCL, T> : EltwiseFunctorBase {
...
@@ -94,6 +94,7 @@ struct EltwiseFunctor<DeviceType::OPENCL, T> : EltwiseFunctorBase {
StatsFuture
*
future
);
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
cl
::
Kernel
kernel_
;
std
::
vector
<
index_t
>
input_shape_
;
};
};
}
// namespace kernels
}
// namespace kernels
...
...
mace/kernels/fully_connected.h
浏览文件 @
2460a946
...
@@ -90,6 +90,7 @@ struct FullyConnectedFunctor<DeviceType::OPENCL, T> : FullyConnectedBase {
...
@@ -90,6 +90,7 @@ struct FullyConnectedFunctor<DeviceType::OPENCL, T> : FullyConnectedBase {
cl
::
Kernel
kernel_
;
cl
::
Kernel
kernel_
;
std
::
vector
<
uint32_t
>
gws_
;
std
::
vector
<
uint32_t
>
gws_
;
std
::
vector
<
uint32_t
>
lws_
;
std
::
vector
<
uint32_t
>
lws_
;
std
::
vector
<
index_t
>
input_shape_
;
};
};
}
// namespace kernels
}
// namespace kernels
...
...
mace/kernels/opencl/activation_opencl.cc
浏览文件 @
2460a946
...
@@ -58,6 +58,9 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
...
@@ -58,6 +58,9 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
LOG
(
FATAL
)
<<
"Unknown activation type: "
<<
activation_
;
LOG
(
FATAL
)
<<
"Unknown activation type: "
<<
activation_
;
}
}
kernel_
=
runtime
->
BuildKernel
(
"activation"
,
kernel_name
,
built_options
);
kernel_
=
runtime
->
BuildKernel
(
"activation"
,
kernel_name
,
built_options
);
}
if
(
!
IsVecEqual
(
input_shape_
,
input
->
shape
()))
{
int
idx
=
0
;
int
idx
=
0
;
kernel_
.
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
if
(
activation_
==
PRELU
)
{
if
(
activation_
==
PRELU
)
{
...
@@ -66,6 +69,8 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
...
@@ -66,6 +69,8 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
}
}
kernel_
.
setArg
(
idx
++
,
static_cast
<
float
>
(
relux_max_limit_
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
float
>
(
relux_max_limit_
));
kernel_
.
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
input_shape_
=
input
->
shape
();
}
}
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
...
...
mace/kernels/opencl/addn.cc
浏览文件 @
2460a946
...
@@ -32,15 +32,6 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
...
@@ -32,15 +32,6 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
MACE_CHECK
(
channels
==
input_tensors
[
i
]
->
dim
(
3
));
MACE_CHECK
(
channels
==
input_tensors
[
i
]
->
dim
(
3
));
}
}
std
::
vector
<
index_t
>
output_shape
=
input_tensors
[
0
]
->
shape
();
std
::
vector
<
size_t
>
output_image_shape
;
CalImage2DShape
(
output_shape
,
BufferType
::
IN_OUT_CHANNEL
,
output_image_shape
);
output_tensor
->
ResizeImage
(
output_shape
,
output_image_shape
);
const
index_t
channel_blocks
=
RoundUpDiv4
(
channels
);
const
index_t
width_pixels
=
channel_blocks
*
width
;
const
index_t
batch_height_pixels
=
batch
*
height
;
if
(
kernel_
.
get
()
==
nullptr
)
{
if
(
kernel_
.
get
()
==
nullptr
)
{
if
(
input_tensors
.
size
()
>
4
)
{
if
(
input_tensors
.
size
()
>
4
)
{
MACE_NOT_IMPLEMENTED
;
MACE_NOT_IMPLEMENTED
;
...
@@ -55,11 +46,26 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
...
@@ -55,11 +46,26 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
built_options
.
emplace
(
MakeString
(
"-DINPUT_NUM="
,
input_tensors
.
size
()));
built_options
.
emplace
(
MakeString
(
"-DINPUT_NUM="
,
input_tensors
.
size
()));
kernel_
=
runtime
->
BuildKernel
(
"addn"
,
kernel_name
,
built_options
);
kernel_
=
runtime
->
BuildKernel
(
"addn"
,
kernel_name
,
built_options
);
}
std
::
vector
<
index_t
>
output_shape
=
input_tensors
[
0
]
->
shape
();
const
index_t
channel_blocks
=
RoundUpDiv4
(
channels
);
const
index_t
width_pixels
=
channel_blocks
*
width
;
const
index_t
batch_height_pixels
=
batch
*
height
;
if
(
!
IsVecEqual
(
input_shape_
,
input_tensors
[
0
]
->
shape
()))
{
std
::
vector
<
size_t
>
output_image_shape
;
CalImage2DShape
(
output_shape
,
BufferType
::
IN_OUT_CHANNEL
,
output_image_shape
);
output_tensor
->
ResizeImage
(
output_shape
,
output_image_shape
);
uint32_t
idx
=
0
;
uint32_t
idx
=
0
;
for
(
auto
input
:
input_tensors
)
{
for
(
auto
input
:
input_tensors
)
{
kernel_
.
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
}
}
kernel_
.
setArg
(
idx
++
,
*
(
output_tensor
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
output_tensor
->
opencl_image
()));
input_shape_
=
input_tensors
[
0
]
->
shape
();
}
}
const
uint32_t
gws
[
2
]
=
{
static_cast
<
uint32_t
>
(
width_pixels
),
const
uint32_t
gws
[
2
]
=
{
static_cast
<
uint32_t
>
(
width_pixels
),
...
...
mace/kernels/opencl/batch_norm_opencl.cc
浏览文件 @
2460a946
...
@@ -61,7 +61,8 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
...
@@ -61,7 +61,8 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
}
}
kernel_
=
runtime
->
BuildKernel
(
"batch_norm"
,
kernel_name
,
built_options
);
kernel_
=
runtime
->
BuildKernel
(
"batch_norm"
,
kernel_name
,
built_options
);
}
if
(
!
IsVecEqual
(
input_shape_
,
input
->
shape
()))
{
uint32_t
idx
=
0
;
uint32_t
idx
=
0
;
kernel_
.
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
scale
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
scale
->
opencl_image
()));
...
@@ -73,6 +74,8 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
...
@@ -73,6 +74,8 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
}
}
kernel_
.
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
relux_max_limit_
);
kernel_
.
setArg
(
idx
++
,
relux_max_limit_
);
input_shape_
=
input
->
shape
();
}
}
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
...
...
mace/kernels/opencl/bias_add_opencl.cc
浏览文件 @
2460a946
...
@@ -33,10 +33,13 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
...
@@ -33,10 +33,13 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
kernel_
=
runtime
->
BuildKernel
(
"bias_add"
,
kernel_name
,
built_options
);
kernel_
=
runtime
->
BuildKernel
(
"bias_add"
,
kernel_name
,
built_options
);
}
if
(
!
IsVecEqual
(
input_shape_
,
input
->
shape
()))
{
uint32_t
idx
=
0
;
uint32_t
idx
=
0
;
kernel_
.
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
bias
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
bias
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
input_shape_
=
input
->
shape
();
}
}
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
...
...
mace/kernels/opencl/channel_shuffle.cc
浏览文件 @
2460a946
...
@@ -13,7 +13,8 @@ namespace mace {
...
@@ -13,7 +13,8 @@ namespace mace {
namespace
kernels
{
namespace
kernels
{
template
<
typename
T
>
template
<
typename
T
>
void
ChannelShuffleFunctor
<
DeviceType
::
OPENCL
,
T
>::
operator
()(
const
Tensor
*
input
,
void
ChannelShuffleFunctor
<
DeviceType
::
OPENCL
,
T
>::
operator
()(
const
Tensor
*
input
,
Tensor
*
output
,
Tensor
*
output
,
StatsFuture
*
future
)
{
StatsFuture
*
future
)
{
output
->
ResizeLike
(
input
);
output
->
ResizeLike
(
input
);
...
@@ -39,12 +40,15 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *inpu
...
@@ -39,12 +40,15 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *inpu
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
kernel_
=
runtime
->
BuildKernel
(
"channel_shuffle"
,
kernel_name
,
built_options
);
kernel_
=
runtime
->
BuildKernel
(
"channel_shuffle"
,
kernel_name
,
built_options
);
}
if
(
!
IsVecEqual
(
input_shape_
,
input
->
shape
()))
{
uint32_t
idx
=
0
;
uint32_t
idx
=
0
;
kernel_
.
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
groups_
);
kernel_
.
setArg
(
idx
++
,
groups_
);
kernel_
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
channels_per_group
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
channels_per_group
));
kernel_
.
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
input_shape_
=
input
->
shape
();
}
}
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
group_channel_blocks
),
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
group_channel_blocks
),
static_cast
<
uint32_t
>
(
width
),
static_cast
<
uint32_t
>
(
width
),
...
...
mace/kernels/opencl/concat.cc
浏览文件 @
2460a946
...
@@ -15,6 +15,7 @@ static void Concat2(cl::Kernel *kernel,
...
@@ -15,6 +15,7 @@ static void Concat2(cl::Kernel *kernel,
const
Tensor
*
input0
,
const
Tensor
*
input0
,
const
Tensor
*
input1
,
const
Tensor
*
input1
,
const
DataType
dt
,
const
DataType
dt
,
std
::
vector
<
index_t
>
*
prev_input_shape
,
Tensor
*
output
,
Tensor
*
output
,
StatsFuture
*
future
)
{
StatsFuture
*
future
)
{
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
batch
=
output
->
dim
(
0
);
...
@@ -41,6 +42,8 @@ static void Concat2(cl::Kernel *kernel,
...
@@ -41,6 +42,8 @@ static void Concat2(cl::Kernel *kernel,
}
}
*
kernel
=
runtime
->
BuildKernel
(
"concat"
,
kernel_name
,
built_options
);
*
kernel
=
runtime
->
BuildKernel
(
"concat"
,
kernel_name
,
built_options
);
}
if
(
!
IsVecEqual
(
*
prev_input_shape
,
input0
->
shape
()))
{
uint32_t
idx
=
0
;
uint32_t
idx
=
0
;
kernel
->
setArg
(
idx
++
,
kernel
->
setArg
(
idx
++
,
*
(
static_cast
<
const
cl
::
Image2D
*>
(
input0
->
opencl_image
())));
*
(
static_cast
<
const
cl
::
Image2D
*>
(
input0
->
opencl_image
())));
...
@@ -49,6 +52,7 @@ static void Concat2(cl::Kernel *kernel,
...
@@ -49,6 +52,7 @@ static void Concat2(cl::Kernel *kernel,
kernel
->
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
input0
->
dim
(
3
)));
kernel
->
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
input0
->
dim
(
3
)));
kernel
->
setArg
(
idx
++
,
kernel
->
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Image2D
*>
(
output
->
opencl_image
())));
*
(
static_cast
<
cl
::
Image2D
*>
(
output
->
opencl_image
())));
*
prev_input_shape
=
input0
->
shape
();
}
}
const
uint32_t
gws
[
3
]
=
{
const
uint32_t
gws
[
3
]
=
{
...
@@ -142,7 +146,7 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(
...
@@ -142,7 +146,7 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(
switch
(
inputs_count
)
{
switch
(
inputs_count
)
{
case
2
:
case
2
:
Concat2
(
&
kernel_
,
input_list
[
0
],
input_list
[
1
],
DataTypeToEnum
<
T
>::
value
,
Concat2
(
&
kernel_
,
input_list
[
0
],
input_list
[
1
],
DataTypeToEnum
<
T
>::
value
,
output
,
future
);
&
input_shape_
,
output
,
future
);
break
;
break
;
default:
default:
if
(
divisible_four
)
{
if
(
divisible_four
)
{
...
...
mace/kernels/opencl/conv_2d_opencl.cc
浏览文件 @
2460a946
...
@@ -18,6 +18,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
...
@@ -18,6 +18,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
const
ActivationType
activation
,
const
ActivationType
activation
,
const
float
relux_max_limit
,
const
float
relux_max_limit
,
const
DataType
dt
,
const
DataType
dt
,
std
::
vector
<
index_t
>
*
prev_input_shape
,
Tensor
*
output
,
Tensor
*
output
,
StatsFuture
*
future
);
StatsFuture
*
future
);
...
@@ -31,6 +32,7 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
...
@@ -31,6 +32,7 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
const
ActivationType
activation
,
const
ActivationType
activation
,
const
float
relux_max_limit
,
const
float
relux_max_limit
,
const
DataType
dt
,
const
DataType
dt
,
std
::
vector
<
index_t
>
*
prev_input_shape
,
Tensor
*
output
,
Tensor
*
output
,
StatsFuture
*
future
);
StatsFuture
*
future
);
...
@@ -44,6 +46,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
...
@@ -44,6 +46,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
const
ActivationType
activation
,
const
ActivationType
activation
,
const
float
relux_max_limit
,
const
float
relux_max_limit
,
const
DataType
dt
,
const
DataType
dt
,
std
::
vector
<
index_t
>
*
prev_input_shape
,
Tensor
*
output
,
Tensor
*
output
,
StatsFuture
*
future
);
StatsFuture
*
future
);
...
@@ -57,8 +60,8 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
...
@@ -57,8 +60,8 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
cl
::
Kernel
*
kernel
,
const
Tensor
*
input
,
const
Tensor
*
filter
,
cl
::
Kernel
*
kernel
,
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
const
int
stride
,
const
int
*
padding
,
const
Tensor
*
bias
,
const
int
stride
,
const
int
*
padding
,
const
int
*
dilations
,
const
ActivationType
activation
,
const
int
*
dilations
,
const
ActivationType
activation
,
const
float
relux_max_limit
,
const
DataType
dt
,
Tensor
*
output
,
const
float
relux_max_limit
,
const
DataType
dt
,
StatsFuture
*
future
);
std
::
vector
<
index_t
>
*
input_shape
,
Tensor
*
output
,
StatsFuture
*
future
);
// Selection matrix: kernel_size x stride_size
// Selection matrix: kernel_size x stride_size
static
const
Conv2dOpenclFunction
selector
[
5
]
=
{
static
const
Conv2dOpenclFunction
selector
[
5
]
=
{
Conv2dOpenclK1x1
,
nullptr
,
Conv2dOpenclK3x3
,
nullptr
,
nullptr
};
Conv2dOpenclK1x1
,
nullptr
,
Conv2dOpenclK3x3
,
nullptr
,
nullptr
};
...
@@ -97,11 +100,11 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
...
@@ -97,11 +100,11 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
auto
conv2d_func
=
selector
[
kernel_h
-
1
];
auto
conv2d_func
=
selector
[
kernel_h
-
1
];
conv2d_func
(
&
kernel_
,
input
,
filter
,
bias
,
strides_
[
0
],
paddings
.
data
(),
conv2d_func
(
&
kernel_
,
input
,
filter
,
bias
,
strides_
[
0
],
paddings
.
data
(),
dilations_
,
activation_
,
relux_max_limit_
,
dilations_
,
activation_
,
relux_max_limit_
,
DataTypeToEnum
<
T
>::
value
,
output
,
future
);
DataTypeToEnum
<
T
>::
value
,
&
input_shape_
,
output
,
future
);
}
else
{
}
else
{
Conv2dOpencl
(
&
kernel_
,
input
,
filter
,
bias
,
strides_
[
0
],
paddings
.
data
(),
Conv2dOpencl
(
&
kernel_
,
input
,
filter
,
bias
,
strides_
[
0
],
paddings
.
data
(),
dilations_
,
activation_
,
relux_max_limit_
,
dilations_
,
activation_
,
relux_max_limit_
,
DataTypeToEnum
<
T
>::
value
,
output
,
future
);
DataTypeToEnum
<
T
>::
value
,
&
input_shape_
,
output
,
future
);
}
}
}
}
...
...
mace/kernels/opencl/conv_2d_opencl_1x1.cc
浏览文件 @
2460a946
...
@@ -20,6 +20,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
...
@@ -20,6 +20,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
const
ActivationType
activation
,
const
ActivationType
activation
,
const
float
relux_max_limit
,
const
float
relux_max_limit
,
const
DataType
dt
,
const
DataType
dt
,
std
::
vector
<
index_t
>
*
prev_input_shape
,
Tensor
*
output
,
Tensor
*
output
,
StatsFuture
*
future
)
{
StatsFuture
*
future
)
{
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
batch
=
output
->
dim
(
0
);
...
@@ -68,6 +69,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
...
@@ -68,6 +69,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
auto
runtime
=
OpenCLRuntime
::
Global
();
auto
runtime
=
OpenCLRuntime
::
Global
();
*
kernel
=
runtime
->
BuildKernel
(
"conv_2d_1x1"
,
kernel_name
,
built_options
);
*
kernel
=
runtime
->
BuildKernel
(
"conv_2d_1x1"
,
kernel_name
,
built_options
);
}
if
(
!
IsVecEqual
(
*
prev_input_shape
,
input
->
shape
()))
{
uint32_t
idx
=
0
;
uint32_t
idx
=
0
;
kernel
->
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
filter
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
filter
->
opencl_image
()));
...
@@ -83,6 +86,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
...
@@ -83,6 +86,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
kernel
->
setArg
(
idx
++
,
static_cast
<
int
>
(
height
));
kernel
->
setArg
(
idx
++
,
static_cast
<
int
>
(
height
));
kernel
->
setArg
(
idx
++
,
static_cast
<
int
>
(
width
));
kernel
->
setArg
(
idx
++
,
static_cast
<
int
>
(
width
));
kernel
->
setArg
(
idx
++
,
stride
);
kernel
->
setArg
(
idx
++
,
stride
);
*
prev_input_shape
=
input
->
shape
();
}
}
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
...
...
mace/kernels/opencl/conv_2d_opencl_3x3.cc
浏览文件 @
2460a946
...
@@ -22,6 +22,7 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
...
@@ -22,6 +22,7 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
const
ActivationType
activation
,
const
ActivationType
activation
,
const
float
relux_max_limit
,
const
float
relux_max_limit
,
const
DataType
dt
,
const
DataType
dt
,
std
::
vector
<
index_t
>
*
prev_input_shape
,
Tensor
*
output
,
Tensor
*
output
,
StatsFuture
*
future
)
{
StatsFuture
*
future
)
{
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
batch
=
output
->
dim
(
0
);
...
@@ -62,7 +63,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
...
@@ -62,7 +63,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
auto
runtime
=
OpenCLRuntime
::
Global
();
auto
runtime
=
OpenCLRuntime
::
Global
();
*
kernel
=
runtime
->
BuildKernel
(
"conv_2d_3x3"
,
kernel_name
,
built_options
);
*
kernel
=
runtime
->
BuildKernel
(
"conv_2d_3x3"
,
kernel_name
,
built_options
);
}
if
(
!
IsVecEqual
(
*
prev_input_shape
,
input
->
shape
()))
{
uint32_t
idx
=
0
;
uint32_t
idx
=
0
;
kernel
->
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
filter
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
filter
->
opencl_image
()));
...
@@ -81,6 +83,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
...
@@ -81,6 +83,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
kernel
->
setArg
(
idx
++
,
padding
[
1
]
/
2
);
kernel
->
setArg
(
idx
++
,
padding
[
1
]
/
2
);
kernel
->
setArg
(
idx
++
,
dilations
[
0
]);
kernel
->
setArg
(
idx
++
,
dilations
[
0
]);
kernel
->
setArg
(
idx
++
,
dilations
[
1
]);
kernel
->
setArg
(
idx
++
,
dilations
[
1
]);
*
prev_input_shape
=
input
->
shape
();
}
}
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
...
...
mace/kernels/opencl/conv_2d_opencl_general.cc
浏览文件 @
2460a946
...
@@ -22,6 +22,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
...
@@ -22,6 +22,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
const
ActivationType
activation
,
const
ActivationType
activation
,
const
float
relux_max_limit
,
const
float
relux_max_limit
,
const
DataType
dt
,
const
DataType
dt
,
std
::
vector
<
index_t
>
*
prev_input_shape
,
Tensor
*
output
,
Tensor
*
output
,
StatsFuture
*
future
)
{
StatsFuture
*
future
)
{
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
batch
=
output
->
dim
(
0
);
...
@@ -62,7 +63,8 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
...
@@ -62,7 +63,8 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
auto
runtime
=
OpenCLRuntime
::
Global
();
auto
runtime
=
OpenCLRuntime
::
Global
();
*
kernel
=
runtime
->
BuildKernel
(
"conv_2d"
,
kernel_name
,
built_options
);
*
kernel
=
runtime
->
BuildKernel
(
"conv_2d"
,
kernel_name
,
built_options
);
}
if
(
!
IsVecEqual
(
*
prev_input_shape
,
input
->
shape
()))
{
uint32_t
idx
=
0
;
uint32_t
idx
=
0
;
kernel
->
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
filter
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
filter
->
opencl_image
()));
...
@@ -83,6 +85,8 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
...
@@ -83,6 +85,8 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
kernel
->
setArg
(
idx
++
,
padding
[
1
]
/
2
);
kernel
->
setArg
(
idx
++
,
padding
[
1
]
/
2
);
kernel
->
setArg
(
idx
++
,
dilations
[
0
]);
kernel
->
setArg
(
idx
++
,
dilations
[
0
]);
kernel
->
setArg
(
idx
++
,
dilations
[
1
]);
kernel
->
setArg
(
idx
++
,
dilations
[
1
]);
*
prev_input_shape
=
input
->
shape
();
}
}
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
...
...
mace/kernels/opencl/depthwise_conv_opencl.cc
浏览文件 @
2460a946
...
@@ -21,6 +21,7 @@ void DepthwiseConv2d(cl::Kernel *kernel,
...
@@ -21,6 +21,7 @@ void DepthwiseConv2d(cl::Kernel *kernel,
const
ActivationType
activation
,
const
ActivationType
activation
,
const
float
relux_max_limit
,
const
float
relux_max_limit
,
const
DataType
dt
,
const
DataType
dt
,
std
::
vector
<
index_t
>
*
prev_input_shape
,
Tensor
*
output
,
Tensor
*
output
,
StatsFuture
*
future
)
{
StatsFuture
*
future
)
{
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
batch
=
output
->
dim
(
0
);
...
@@ -35,17 +36,6 @@ void DepthwiseConv2d(cl::Kernel *kernel,
...
@@ -35,17 +36,6 @@ void DepthwiseConv2d(cl::Kernel *kernel,
const
index_t
input_channel_blocks
=
RoundUpDiv4
(
input_channels
);
const
index_t
input_channel_blocks
=
RoundUpDiv4
(
input_channels
);
const
index_t
width_blocks
=
RoundUpDiv4
(
width
);
const
index_t
width_blocks
=
RoundUpDiv4
(
width
);
if
(
kernel
->
get
()
==
nullptr
)
{
if
(
kernel
->
get
()
==
nullptr
)
{
const
index_t
input_batch
=
input
->
dim
(
0
);
const
index_t
input_height
=
input
->
dim
(
1
);
const
index_t
input_width
=
input
->
dim
(
2
);
const
index_t
filter_height
=
filter
->
dim
(
0
);
const
index_t
filter_width
=
filter
->
dim
(
1
);
MACE_CHECK
(
multiplier
==
1
,
"Multiplier > 1 not supported"
);
MACE_CHECK
(
multiplier
*
input_channels
==
channels
);
MACE_CHECK
(
filter
->
dim
(
2
)
==
input_channels
,
filter
->
dim
(
2
),
"!="
,
input_channels
);
auto
runtime
=
OpenCLRuntime
::
Global
();
auto
runtime
=
OpenCLRuntime
::
Global
();
std
::
set
<
std
::
string
>
built_options
;
std
::
set
<
std
::
string
>
built_options
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"depthwise_conv2d"
);
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"depthwise_conv2d"
);
...
@@ -80,6 +70,18 @@ void DepthwiseConv2d(cl::Kernel *kernel,
...
@@ -80,6 +70,18 @@ void DepthwiseConv2d(cl::Kernel *kernel,
*
kernel
=
*
kernel
=
runtime
->
BuildKernel
(
"depthwise_conv2d"
,
kernel_name
,
built_options
);
runtime
->
BuildKernel
(
"depthwise_conv2d"
,
kernel_name
,
built_options
);
}
if
(
!
IsVecEqual
(
*
prev_input_shape
,
input
->
shape
()))
{
const
index_t
input_batch
=
input
->
dim
(
0
);
const
index_t
input_height
=
input
->
dim
(
1
);
const
index_t
input_width
=
input
->
dim
(
2
);
const
index_t
filter_height
=
filter
->
dim
(
0
);
const
index_t
filter_width
=
filter
->
dim
(
1
);
MACE_CHECK
(
multiplier
==
1
,
"Multiplier > 1 not supported"
);
MACE_CHECK
(
multiplier
*
input_channels
==
channels
);
MACE_CHECK
(
filter
->
dim
(
2
)
==
input_channels
,
filter
->
dim
(
2
),
"!="
,
input_channels
);
uint32_t
idx
=
0
;
uint32_t
idx
=
0
;
kernel
->
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
...
@@ -102,6 +104,7 @@ void DepthwiseConv2d(cl::Kernel *kernel,
...
@@ -102,6 +104,7 @@ void DepthwiseConv2d(cl::Kernel *kernel,
kernel
->
setArg
(
idx
++
,
static_cast
<
short
>
(
dilations
[
0
]));
kernel
->
setArg
(
idx
++
,
static_cast
<
short
>
(
dilations
[
0
]));
kernel
->
setArg
(
idx
++
,
static_cast
<
short
>
(
dilations
[
1
]));
kernel
->
setArg
(
idx
++
,
static_cast
<
short
>
(
dilations
[
1
]));
}
}
*
prev_input_shape
=
input
->
shape
();
}
}
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
...
@@ -120,9 +123,7 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(
...
@@ -120,9 +123,7 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(
const
Tensor
*
bias
,
const
Tensor
*
bias
,
Tensor
*
output
,
Tensor
*
output
,
StatsFuture
*
future
)
{
StatsFuture
*
future
)
{
typedef
void
(
*
Conv2dOpenclFunction
)(
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
Tensor
*
output
,
StatsFuture
*
future
);
index_t
kernel_h
=
filter
->
dim
(
2
);
index_t
kernel_h
=
filter
->
dim
(
2
);
index_t
kernel_w
=
filter
->
dim
(
3
);
index_t
kernel_w
=
filter
->
dim
(
3
);
if
(
strides_
[
0
]
!=
strides_
[
1
])
{
if
(
strides_
[
0
]
!=
strides_
[
1
])
{
...
@@ -163,7 +164,7 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(
...
@@ -163,7 +164,7 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(
DepthwiseConv2d
(
&
kernel_
,
input
,
filter
,
bias
,
strides_
[
0
],
paddings
.
data
(),
DepthwiseConv2d
(
&
kernel_
,
input
,
filter
,
bias
,
strides_
[
0
],
paddings
.
data
(),
dilations_
,
activation_
,
relux_max_limit_
,
dilations_
,
activation_
,
relux_max_limit_
,
DataTypeToEnum
<
T
>::
value
,
output
,
future
);
DataTypeToEnum
<
T
>::
value
,
&
input_shape_
,
output
,
future
);
}
}
template
struct
DepthwiseConv2dFunctor
<
DeviceType
::
OPENCL
,
float
>;
template
struct
DepthwiseConv2dFunctor
<
DeviceType
::
OPENCL
,
float
>;
...
...
mace/kernels/opencl/eltwise_opencl.cc
浏览文件 @
2460a946
...
@@ -36,6 +36,8 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
...
@@ -36,6 +36,8 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
if
(
!
coeff_
.
empty
())
built_options
.
emplace
(
"-DCOEFF_SUM"
);
if
(
!
coeff_
.
empty
())
built_options
.
emplace
(
"-DCOEFF_SUM"
);
kernel_
=
runtime
->
BuildKernel
(
"eltwise"
,
kernel_name
,
built_options
);
kernel_
=
runtime
->
BuildKernel
(
"eltwise"
,
kernel_name
,
built_options
);
}
if
(
!
IsVecEqual
(
input_shape_
,
input0
->
shape
()))
{
uint32_t
idx
=
0
;
uint32_t
idx
=
0
;
kernel_
.
setArg
(
idx
++
,
*
(
input0
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
input0
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
input1
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
input1
->
opencl_image
()));
...
@@ -44,6 +46,7 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
...
@@ -44,6 +46,7 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
kernel_
.
setArg
(
idx
++
,
coeff_
[
1
]);
kernel_
.
setArg
(
idx
++
,
coeff_
[
1
]);
}
}
kernel_
.
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
input_shape_
=
input0
->
shape
();
}
}
const
uint32_t
gws
[
2
]
=
{
static_cast
<
uint32_t
>
(
width_pixels
),
const
uint32_t
gws
[
2
]
=
{
static_cast
<
uint32_t
>
(
width_pixels
),
...
...
mace/kernels/opencl/fully_connected_opencl.cc
浏览文件 @
2460a946
...
@@ -13,6 +13,7 @@ void FCWXKernel(cl::Kernel *kernel,
...
@@ -13,6 +13,7 @@ void FCWXKernel(cl::Kernel *kernel,
const
Tensor
*
input
,
const
Tensor
*
input
,
const
Tensor
*
weight
,
const
Tensor
*
weight
,
const
Tensor
*
bias
,
const
Tensor
*
bias
,
std
::
vector
<
index_t
>
*
prev_input_shape
,
Tensor
*
output
,
Tensor
*
output
,
const
ActivationType
activation
,
const
ActivationType
activation
,
std
::
vector
<
uint32_t
>
&
gws
,
std
::
vector
<
uint32_t
>
&
gws
,
...
@@ -67,6 +68,11 @@ void FCWXKernel(cl::Kernel *kernel,
...
@@ -67,6 +68,11 @@ void FCWXKernel(cl::Kernel *kernel,
const
uint32_t
inter_local_blks
=
kwg_size
/
(
gws
[
0
]
*
gws
[
1
]);
const
uint32_t
inter_local_blks
=
kwg_size
/
(
gws
[
0
]
*
gws
[
1
]);
lws
=
{
gws
[
0
],
gws
[
1
],
inter_local_blks
};
lws
=
{
gws
[
0
],
gws
[
1
],
inter_local_blks
};
}
if
(
!
IsVecEqual
(
*
prev_input_shape
,
input
->
shape
()))
{
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
output_blocks
=
RoundUpDiv4
(
output
->
dim
(
3
));
uint32_t
idx
=
0
;
uint32_t
idx
=
0
;
kernel
->
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
weight
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
weight
->
opencl_image
()));
...
@@ -80,6 +86,10 @@ void FCWXKernel(cl::Kernel *kernel,
...
@@ -80,6 +86,10 @@ void FCWXKernel(cl::Kernel *kernel,
kernel
->
setArg
(
idx
++
,
static_cast
<
int
>
(
RoundUpDiv4
(
input
->
dim
(
3
))));
kernel
->
setArg
(
idx
++
,
static_cast
<
int
>
(
RoundUpDiv4
(
input
->
dim
(
3
))));
kernel
->
setArg
(
idx
++
,
static_cast
<
int
>
(
output_blocks
));
kernel
->
setArg
(
idx
++
,
static_cast
<
int
>
(
output_blocks
));
kernel
->
setArg
(
idx
++
,
relux_max_limit
);
kernel
->
setArg
(
idx
++
,
relux_max_limit
);
gws
[
2
]
=
static_cast
<
uint32_t
>
(
batch
*
output_blocks
);
*
prev_input_shape
=
input
->
shape
();
}
}
cl
::
Event
event
;
cl
::
Event
event
;
cl_int
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
cl_int
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
...
@@ -103,6 +113,7 @@ void FCWTXKernel(cl::Kernel *kernel,
...
@@ -103,6 +113,7 @@ void FCWTXKernel(cl::Kernel *kernel,
const
Tensor
*
input
,
const
Tensor
*
input
,
const
Tensor
*
weight
,
const
Tensor
*
weight
,
const
Tensor
*
bias
,
const
Tensor
*
bias
,
std
::
vector
<
index_t
>
*
prev_input_shape
,
Tensor
*
output
,
Tensor
*
output
,
const
ActivationType
activation
,
const
ActivationType
activation
,
std
::
vector
<
uint32_t
>
&
gws
,
std
::
vector
<
uint32_t
>
&
gws
,
...
@@ -141,6 +152,9 @@ void FCWTXKernel(cl::Kernel *kernel,
...
@@ -141,6 +152,9 @@ void FCWTXKernel(cl::Kernel *kernel,
*
kernel
=
*
kernel
=
runtime
->
BuildKernel
(
"fully_connected"
,
kernel_name
,
built_options
);
runtime
->
BuildKernel
(
"fully_connected"
,
kernel_name
,
built_options
);
lws
=
{
16
,
64
,
1
};
}
if
(
!
IsVecEqual
(
*
prev_input_shape
,
input
->
shape
()))
{
uint32_t
idx
=
0
;
uint32_t
idx
=
0
;
kernel
->
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
weight
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
weight
->
opencl_image
()));
...
@@ -155,14 +169,13 @@ void FCWTXKernel(cl::Kernel *kernel,
...
@@ -155,14 +169,13 @@ void FCWTXKernel(cl::Kernel *kernel,
kernel
->
setArg
(
idx
++
,
relux_max_limit
);
kernel
->
setArg
(
idx
++
,
relux_max_limit
);
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
output_size
=
output
->
dim
(
3
);
const
index_t
output_blocks
=
RoundUpDiv4
(
output
->
dim
(
3
));
const
index_t
output_blocks
=
RoundUpDiv4
(
output_size
);
gws
=
{
gws
=
{
static_cast
<
uint32_t
>
(
batch
),
static_cast
<
uint32_t
>
(
output_blocks
),
static_cast
<
uint32_t
>
(
batch
),
static_cast
<
uint32_t
>
(
output_blocks
),
};
};
lws
=
{
16
,
64
,
1
};
*
prev_input_shape
=
input
->
shape
();
}
}
std
::
stringstream
ss
;
std
::
stringstream
ss
;
...
@@ -185,10 +198,10 @@ void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()(
...
@@ -185,10 +198,10 @@ void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()(
output
->
ResizeImage
(
output_shape
,
output_image_shape
);
output
->
ResizeImage
(
output_shape
,
output_image_shape
);
if
(
weight_type_
==
BufferType
::
WEIGHT_HEIGHT
)
{
if
(
weight_type_
==
BufferType
::
WEIGHT_HEIGHT
)
{
FCWTXKernel
<
T
>
(
&
kernel_
,
input
,
weight
,
bias
,
output
,
FCWTXKernel
<
T
>
(
&
kernel_
,
input
,
weight
,
bias
,
&
input_shape_
,
output
,
activation_
,
gws_
,
lws_
,
relux_max_limit_
,
future
);
activation_
,
gws_
,
lws_
,
relux_max_limit_
,
future
);
}
else
{
}
else
{
FCWXKernel
<
T
>
(
&
kernel_
,
input
,
weight
,
bias
,
output
,
FCWXKernel
<
T
>
(
&
kernel_
,
input
,
weight
,
bias
,
&
input_shape_
,
output
,
activation_
,
gws_
,
lws_
,
relux_max_limit_
,
future
);
activation_
,
gws_
,
lws_
,
relux_max_limit_
,
future
);
}
}
};
};
...
...
mace/kernels/opencl/helper.h
浏览文件 @
2460a946
...
@@ -71,6 +71,13 @@ inline bool LimitKernelTime() {
...
@@ -71,6 +71,13 @@ inline bool LimitKernelTime() {
return
flag
!=
nullptr
&&
strlen
(
flag
)
==
1
&&
flag
[
0
]
==
'1'
;
return
flag
!=
nullptr
&&
strlen
(
flag
)
==
1
&&
flag
[
0
]
==
'1'
;
}
}
/// Element-wise equality test for two vectors.
///
/// @param input0 first vector to compare.
/// @param input1 second vector to compare.
/// @return true when both vectors have the same length and every pair of
///         corresponding elements compares equal with `==`; false otherwise.
template <typename T>
bool IsVecEqual(const std::vector<T> &input0,
                const std::vector<T> &input1) {
  // std::vector's operator== compares sizes first and then the elements
  // pairwise, which is the same check spelled out with std::equal.
  return input0 == input1;
}
namespace
{
namespace
{
template
<
typename
T
>
template
<
typename
T
>
void
AppendToStream
(
std
::
stringstream
*
ss
,
const
std
::
string
&
delimiter
,
T
v
)
{
void
AppendToStream
(
std
::
stringstream
*
ss
,
const
std
::
string
&
delimiter
,
T
v
)
{
...
...
mace/kernels/opencl/matmul.cc
浏览文件 @
2460a946
...
@@ -36,7 +36,7 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
...
@@ -36,7 +36,7 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
kernel_
=
runtime
->
BuildKernel
(
"matmul"
,
kernel_name
,
built_options
);
kernel_
=
runtime
->
BuildKernel
(
"matmul"
,
kernel_name
,
built_options
);
}
uint32_t
idx
=
0
;
uint32_t
idx
=
0
;
kernel_
.
setArg
(
idx
++
,
*
(
A
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
A
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
B
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
B
->
opencl_image
()));
...
@@ -46,7 +46,6 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
...
@@ -46,7 +46,6 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
kernel_
.
setArg
(
idx
++
,
static_cast
<
int
>
(
A
->
dim
(
2
)));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int
>
(
A
->
dim
(
2
)));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int
>
(
height_blocks
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int
>
(
height_blocks
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int
>
(
RoundUpDiv4
(
A
->
dim
(
2
))));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int
>
(
RoundUpDiv4
(
A
->
dim
(
2
))));
}
const
uint32_t
gws
[
2
]
=
{
const
uint32_t
gws
[
2
]
=
{
static_cast
<
uint32_t
>
(
width_blocks
),
static_cast
<
uint32_t
>
(
width_blocks
),
...
...
mace/kernels/opencl/pooling_opencl.cc
浏览文件 @
2460a946
...
@@ -17,31 +17,6 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
...
@@ -17,31 +17,6 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
StatsFuture
*
future
)
{
StatsFuture
*
future
)
{
MACE_CHECK
(
dilations_
[
0
]
==
1
&&
dilations_
[
1
]
==
1
)
MACE_CHECK
(
dilations_
[
0
]
==
1
&&
dilations_
[
1
]
==
1
)
<<
"Pooling opencl kernel not support dilation yet"
;
<<
"Pooling opencl kernel not support dilation yet"
;
std
::
vector
<
index_t
>
output_shape
(
4
);
std
::
vector
<
index_t
>
filter_shape
=
{
kernels_
[
0
],
kernels_
[
1
],
input
->
dim
(
3
),
input
->
dim
(
3
)};
std
::
vector
<
int
>
paddings
(
2
);
if
(
paddings_
.
empty
())
{
kernels
::
CalcNHWCPaddingAndOutputSize
(
input
->
shape
().
data
(),
filter_shape
.
data
(),
dilations_
,
strides_
,
padding_type_
,
output_shape
.
data
(),
paddings
.
data
());
}
else
{
paddings
=
paddings_
;
CalcOutputSize
(
input
->
shape
().
data
(),
filter_shape
.
data
(),
paddings_
.
data
(),
dilations_
,
strides_
,
RoundType
::
CEIL
,
output_shape
.
data
());
}
std
::
vector
<
size_t
>
output_image_shape
;
CalImage2DShape
(
output_shape
,
BufferType
::
IN_OUT_CHANNEL
,
output_image_shape
);
output
->
ResizeImage
(
output_shape
,
output_image_shape
);
index_t
batch
=
output
->
dim
(
0
);
index_t
out_height
=
output
->
dim
(
1
);
index_t
out_width
=
output
->
dim
(
2
);
index_t
channels
=
output
->
dim
(
3
);
index_t
channel_blocks
=
(
channels
+
3
)
/
4
;
if
(
kernel_
.
get
()
==
nullptr
)
{
if
(
kernel_
.
get
()
==
nullptr
)
{
const
DataType
dt
=
DataTypeToEnum
<
T
>::
value
;
const
DataType
dt
=
DataTypeToEnum
<
T
>::
value
;
...
@@ -62,18 +37,49 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
...
@@ -62,18 +37,49 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
}
}
kernel_
=
runtime
->
BuildKernel
(
"pooling"
,
kernel_name
,
built_options
);
kernel_
=
runtime
->
BuildKernel
(
"pooling"
,
kernel_name
,
built_options
);
}
if
(
!
IsVecEqual
(
input_shape_
,
input
->
shape
()))
{
std
::
vector
<
index_t
>
output_shape
(
4
);
std
::
vector
<
index_t
>
filter_shape
=
{
kernels_
[
0
],
kernels_
[
1
],
input
->
dim
(
3
),
input
->
dim
(
3
)};
std
::
vector
<
int
>
paddings
(
2
);
if
(
paddings_
.
empty
())
{
kernels
::
CalcNHWCPaddingAndOutputSize
(
input
->
shape
().
data
(),
filter_shape
.
data
(),
dilations_
,
strides_
,
padding_type_
,
output_shape
.
data
(),
paddings
.
data
());
}
else
{
paddings
=
paddings_
;
CalcOutputSize
(
input
->
shape
().
data
(),
filter_shape
.
data
(),
paddings_
.
data
(),
dilations_
,
strides_
,
RoundType
::
CEIL
,
output_shape
.
data
());
}
std
::
vector
<
size_t
>
output_image_shape
;
CalImage2DShape
(
output_shape
,
BufferType
::
IN_OUT_CHANNEL
,
output_image_shape
);
output
->
ResizeImage
(
output_shape
,
output_image_shape
);
uint32_t
idx
=
0
;
uint32_t
idx
=
0
;
kernel_
.
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
input
->
dim
(
1
)));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
input
->
dim
(
1
)));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
input
->
dim
(
2
)));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
input
->
dim
(
2
)));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
out
_height
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
out
put
->
dim
(
1
)
));
kernel_
.
setArg
(
idx
++
,
paddings
[
0
]
/
2
);
kernel_
.
setArg
(
idx
++
,
paddings
[
0
]
/
2
);
kernel_
.
setArg
(
idx
++
,
paddings
[
1
]
/
2
);
kernel_
.
setArg
(
idx
++
,
paddings
[
1
]
/
2
);
kernel_
.
setArg
(
idx
++
,
strides_
[
0
]);
kernel_
.
setArg
(
idx
++
,
strides_
[
0
]);
kernel_
.
setArg
(
idx
++
,
kernels_
[
0
]);
kernel_
.
setArg
(
idx
++
,
kernels_
[
0
]);
kernel_
.
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
input_shape_
=
input
->
shape
();
}
}
index_t
batch
=
output
->
dim
(
0
);
index_t
out_height
=
output
->
dim
(
1
);
index_t
out_width
=
output
->
dim
(
2
);
index_t
channels
=
output
->
dim
(
3
);
index_t
channel_blocks
=
(
channels
+
3
)
/
4
;
const
uint32_t
gws
[
3
]
=
{
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
out_width
),
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
out_width
),
static_cast
<
uint32_t
>
(
batch
*
out_height
),
static_cast
<
uint32_t
>
(
batch
*
out_height
),
...
...
mace/kernels/opencl/resize_bilinear_opencl.cc
浏览文件 @
2460a946
...
@@ -24,6 +24,19 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
...
@@ -24,6 +24,19 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
const
index_t
out_height
=
out_height_
;
const
index_t
out_height
=
out_height_
;
const
index_t
out_width
=
out_width_
;
const
index_t
out_width
=
out_width_
;
if
(
kernel_
.
get
()
==
nullptr
)
{
auto
runtime
=
OpenCLRuntime
::
Global
();
std
::
set
<
std
::
string
>
built_options
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"resize_bilinear_nocache"
);
built_options
.
emplace
(
"-Dresize_bilinear_nocache="
+
kernel_name
);
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
kernel_
=
runtime
->
BuildKernel
(
"resize_bilinear"
,
kernel_name
,
built_options
);
}
if
(
!
IsVecEqual
(
input_shape_
,
input
->
shape
()))
{
MACE_CHECK
(
out_height
>
0
&&
out_width
>
0
);
MACE_CHECK
(
out_height
>
0
&&
out_width
>
0
);
std
::
vector
<
index_t
>
output_shape
{
batch
,
out_height
,
out_width
,
channels
};
std
::
vector
<
index_t
>
output_shape
{
batch
,
out_height
,
out_width
,
channels
};
...
@@ -32,23 +45,11 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
...
@@ -32,23 +45,11 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
output_image_shape
);
output_image_shape
);
output
->
ResizeImage
(
output_shape
,
output_image_shape
);
output
->
ResizeImage
(
output_shape
,
output_image_shape
);
if
(
kernel_
.
get
()
==
nullptr
)
{
float
height_scale
=
float
height_scale
=
CalculateResizeScale
(
in_height
,
out_height
,
align_corners_
);
CalculateResizeScale
(
in_height
,
out_height
,
align_corners_
);
float
width_scale
=
float
width_scale
=
CalculateResizeScale
(
in_width
,
out_width
,
align_corners_
);
CalculateResizeScale
(
in_width
,
out_width
,
align_corners_
);
auto
runtime
=
OpenCLRuntime
::
Global
();
std
::
set
<
std
::
string
>
built_options
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"resize_bilinear_nocache"
);
built_options
.
emplace
(
"-Dresize_bilinear_nocache="
+
kernel_name
);
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
kernel_
=
runtime
->
BuildKernel
(
"resize_bilinear"
,
kernel_name
,
built_options
);
uint32_t
idx
=
0
;
uint32_t
idx
=
0
;
kernel_
.
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
...
@@ -57,6 +58,9 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
...
@@ -57,6 +58,9 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
in_height
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
in_height
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
in_width
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
in_width
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
out_height
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
out_height
));
input_shape_
=
input
->
shape
();
}
}
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
...
...
mace/kernels/opencl/softmax_opencl.cc
浏览文件 @
2460a946
...
@@ -34,11 +34,14 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
...
@@ -34,11 +34,14 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
kernel_
=
runtime
->
BuildKernel
(
"softmax"
,
kernel_name
,
built_options
);
kernel_
=
runtime
->
BuildKernel
(
"softmax"
,
kernel_name
,
built_options
);
}
if
(
!
IsVecEqual
(
input_shape_
,
logits
->
shape
()))
{
uint32_t
idx
=
0
;
uint32_t
idx
=
0
;
kernel_
.
setArg
(
idx
++
,
*
(
logits
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
logits
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int
>
(
channels
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int
>
(
channels
));
kernel_
.
setArg
(
idx
++
,
remain_channels
);
kernel_
.
setArg
(
idx
++
,
remain_channels
);
kernel_
.
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
input_shape_
=
logits
->
shape
();
}
}
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
width
),
static_cast
<
uint32_t
>
(
width
),
...
...
mace/kernels/opencl/space_to_batch_opencl.cc
浏览文件 @
2460a946
...
@@ -43,6 +43,8 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
...
@@ -43,6 +43,8 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
kernel_
=
kernel_
=
runtime
->
BuildKernel
(
"space_to_batch"
,
kernel_name
,
built_options
);
runtime
->
BuildKernel
(
"space_to_batch"
,
kernel_name
,
built_options
);
}
if
(
!
IsVecEqual
(
space_shape_
,
space_tensor
->
shape
()))
{
uint32_t
idx
=
0
;
uint32_t
idx
=
0
;
if
(
b2s_
)
{
if
(
b2s_
)
{
kernel_
.
setArg
(
idx
++
,
*
(
batch_tensor
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
batch_tensor
->
opencl_image
()));
...
@@ -59,6 +61,8 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
...
@@ -59,6 +61,8 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
space_tensor
->
dim
(
2
)));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
space_tensor
->
dim
(
2
)));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
batch_tensor
->
dim
(
1
)));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
batch_tensor
->
dim
(
1
)));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
batch_tensor
->
dim
(
2
)));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
batch_tensor
->
dim
(
2
)));
space_shape_
=
space_tensor
->
shape
();
}
}
const
uint32_t
chan_blk
=
RoundUpDiv4
<
uint32_t
>
(
batch_tensor
->
dim
(
3
));
const
uint32_t
chan_blk
=
RoundUpDiv4
<
uint32_t
>
(
batch_tensor
->
dim
(
3
));
...
...
mace/kernels/opencl/winograd_transform.cc
浏览文件 @
2460a946
...
@@ -14,6 +14,21 @@ namespace kernels {
...
@@ -14,6 +14,21 @@ namespace kernels {
template
<
typename
T
>
template
<
typename
T
>
void
WinogradTransformFunctor
<
DeviceType
::
OPENCL
,
T
>::
operator
()(
void
WinogradTransformFunctor
<
DeviceType
::
OPENCL
,
T
>::
operator
()(
const
Tensor
*
input_tensor
,
Tensor
*
output_tensor
,
StatsFuture
*
future
)
{
const
Tensor
*
input_tensor
,
Tensor
*
output_tensor
,
StatsFuture
*
future
)
{
if
(
kernel_
.
get
()
==
nullptr
)
{
std
::
string
obfuscated_kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"winograd_transform_2x2"
);
std
::
set
<
std
::
string
>
built_options
;
built_options
.
emplace
(
"-Dwinograd_transform_2x2="
+
obfuscated_kernel_name
);
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
DataTypeToEnum
<
T
>::
value
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
DataTypeToEnum
<
T
>::
value
));
auto
runtime
=
OpenCLRuntime
::
Global
();
kernel_
=
runtime
->
BuildKernel
(
"winograd_transform"
,
obfuscated_kernel_name
,
built_options
);
}
std
::
vector
<
index_t
>
output_shape
(
4
);
std
::
vector
<
index_t
>
output_shape
(
4
);
std
::
vector
<
index_t
>
filter_shape
=
{
3
,
3
,
input_tensor
->
dim
(
3
),
1
};
std
::
vector
<
index_t
>
filter_shape
=
{
3
,
3
,
input_tensor
->
dim
(
3
),
1
};
std
::
vector
<
int
>
paddings
(
2
);
std
::
vector
<
int
>
paddings
(
2
);
...
@@ -27,29 +42,16 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
...
@@ -27,29 +42,16 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
paddings_
.
data
(),
dilations_
.
data
(),
strides_
.
data
(),
paddings_
.
data
(),
dilations_
.
data
(),
strides_
.
data
(),
RoundType
::
FLOOR
,
output_shape
.
data
());
RoundType
::
FLOOR
,
output_shape
.
data
());
}
}
const
index_t
round_h
=
(
output_shape
[
1
]
+
1
)
/
2
;
const
index_t
round_h
=
(
output_shape
[
1
]
+
1
)
/
2
;
const
index_t
round_w
=
(
output_shape
[
2
]
+
1
)
/
2
;
const
index_t
round_w
=
(
output_shape
[
2
]
+
1
)
/
2
;
const
index_t
out_width
=
input_tensor
->
dim
(
0
)
*
round_h
*
round_w
;
const
index_t
out_width
=
input_tensor
->
dim
(
0
)
*
round_h
*
round_w
;
if
(
!
IsVecEqual
(
input_shape_
,
input_tensor
->
shape
()))
{
output_shape
=
{
16
,
input_tensor
->
dim
(
3
),
out_width
,
1
};
output_shape
=
{
16
,
input_tensor
->
dim
(
3
),
out_width
,
1
};
std
::
vector
<
size_t
>
image_shape
;
std
::
vector
<
size_t
>
image_shape
;
CalImage2DShape
(
output_shape
,
BufferType
::
IN_OUT_HEIGHT
,
image_shape
);
CalImage2DShape
(
output_shape
,
BufferType
::
IN_OUT_HEIGHT
,
image_shape
);
output_tensor
->
ResizeImage
(
output_shape
,
image_shape
);
output_tensor
->
ResizeImage
(
output_shape
,
image_shape
);
if
(
kernel_
.
get
()
==
nullptr
)
{
std
::
string
obfuscated_kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"winograd_transform_2x2"
);
std
::
set
<
std
::
string
>
built_options
;
built_options
.
emplace
(
"-Dwinograd_transform_2x2="
+
obfuscated_kernel_name
);
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
DataTypeToEnum
<
T
>::
value
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
DataTypeToEnum
<
T
>::
value
));
auto
runtime
=
OpenCLRuntime
::
Global
();
kernel_
=
runtime
->
BuildKernel
(
"winograd_transform"
,
obfuscated_kernel_name
,
built_options
);
uint32_t
idx
=
0
;
uint32_t
idx
=
0
;
kernel_
.
setArg
(
idx
++
,
*
(
input_tensor
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
input_tensor
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
output_tensor
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
output_tensor
->
opencl_image
()));
...
@@ -60,6 +62,8 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
...
@@ -60,6 +62,8 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
kernel_
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
round_w
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
round_w
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
paddings
[
0
]
/
2
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
paddings
[
0
]
/
2
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
paddings
[
1
]
/
2
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
paddings
[
1
]
/
2
));
input_shape_
=
input_tensor
->
shape
();
}
}
const
uint32_t
gws
[
2
]
=
{
const
uint32_t
gws
[
2
]
=
{
...
@@ -79,11 +83,6 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
...
@@ -79,11 +83,6 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
const
Tensor
*
bias
,
const
Tensor
*
bias
,
Tensor
*
output_tensor
,
Tensor
*
output_tensor
,
StatsFuture
*
future
)
{
StatsFuture
*
future
)
{
std
::
vector
<
index_t
>
output_shape
=
{
batch_
,
height_
,
width_
,
input_tensor
->
dim
(
1
)};
std
::
vector
<
size_t
>
image_shape
;
CalImage2DShape
(
output_shape
,
BufferType
::
IN_OUT_CHANNEL
,
image_shape
);
output_tensor
->
ResizeImage
(
output_shape
,
image_shape
);
if
(
kernel_
.
get
()
==
nullptr
)
{
if
(
kernel_
.
get
()
==
nullptr
)
{
std
::
string
obfuscated_kernel_name
=
std
::
string
obfuscated_kernel_name
=
...
@@ -121,6 +120,13 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
...
@@ -121,6 +120,13 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
auto
runtime
=
OpenCLRuntime
::
Global
();
auto
runtime
=
OpenCLRuntime
::
Global
();
kernel_
=
runtime
->
BuildKernel
(
"winograd_transform"
,
obfuscated_kernel_name
,
kernel_
=
runtime
->
BuildKernel
(
"winograd_transform"
,
obfuscated_kernel_name
,
built_options
);
built_options
);
}
if
(
!
IsVecEqual
(
input_shape_
,
input_tensor
->
shape
()))
{
std
::
vector
<
index_t
>
output_shape
=
{
batch_
,
height_
,
width_
,
input_tensor
->
dim
(
1
)};
std
::
vector
<
size_t
>
image_shape
;
CalImage2DShape
(
output_shape
,
BufferType
::
IN_OUT_CHANNEL
,
image_shape
);
output_tensor
->
ResizeImage
(
output_shape
,
image_shape
);
const
uint32_t
round_h
=
(
height_
+
1
)
/
2
;
const
uint32_t
round_h
=
(
height_
+
1
)
/
2
;
const
uint32_t
round_w
=
(
width_
+
1
)
/
2
;
const
uint32_t
round_w
=
(
width_
+
1
)
/
2
;
...
@@ -139,6 +145,8 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
...
@@ -139,6 +145,8 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
kernel_
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
round_h
*
round_w
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
round_h
*
round_w
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
round_w
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
round_w
));
kernel_
.
setArg
(
idx
++
,
relux_max_limit_
);
kernel_
.
setArg
(
idx
++
,
relux_max_limit_
);
input_shape_
=
input_tensor
->
shape
();
}
}
const
uint32_t
gws
[
2
]
=
{
const
uint32_t
gws
[
2
]
=
{
...
...
mace/kernels/pooling.h
浏览文件 @
2460a946
...
@@ -182,6 +182,7 @@ struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
...
@@ -182,6 +182,7 @@ struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
StatsFuture
*
future
);
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
cl
::
Kernel
kernel_
;
std
::
vector
<
index_t
>
input_shape_
;
};
};
}
// namespace kernels
}
// namespace kernels
...
...
mace/kernels/resize_bilinear.h
浏览文件 @
2460a946
...
@@ -172,6 +172,7 @@ struct ResizeBilinearFunctor<DeviceType::OPENCL, T>
...
@@ -172,6 +172,7 @@ struct ResizeBilinearFunctor<DeviceType::OPENCL, T>
void
operator
()(
const
Tensor
*
input
,
Tensor
*
output
,
StatsFuture
*
future
);
void
operator
()(
const
Tensor
*
input
,
Tensor
*
output
,
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
cl
::
Kernel
kernel_
;
std
::
vector
<
index_t
>
input_shape_
;
};
};
}
// namespace kernels
}
// namespace kernels
...
...
mace/kernels/softmax.h
浏览文件 @
2460a946
...
@@ -57,6 +57,7 @@ struct SoftmaxFunctor<DeviceType::OPENCL, T> {
...
@@ -57,6 +57,7 @@ struct SoftmaxFunctor<DeviceType::OPENCL, T> {
void
operator
()(
const
Tensor
*
logits
,
Tensor
*
output
,
StatsFuture
*
future
);
void
operator
()(
const
Tensor
*
logits
,
Tensor
*
output
,
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
cl
::
Kernel
kernel_
;
std
::
vector
<
index_t
>
input_shape_
;
};
};
}
// namespace kernels
}
// namespace kernels
...
...
mace/kernels/space_to_batch.h
浏览文件 @
2460a946
...
@@ -54,6 +54,7 @@ struct SpaceToBatchFunctor<DeviceType::OPENCL, T> : SpaceToBatchFunctorBase {
...
@@ -54,6 +54,7 @@ struct SpaceToBatchFunctor<DeviceType::OPENCL, T> : SpaceToBatchFunctorBase {
StatsFuture
*
future
);
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
cl
::
Kernel
kernel_
;
std
::
vector
<
index_t
>
space_shape_
;
};
};
}
// namespace kernels
}
// namespace kernels
...
...
mace/kernels/winograd_transform.h
浏览文件 @
2460a946
...
@@ -49,6 +49,7 @@ struct WinogradTransformFunctor<DeviceType::OPENCL, T>
...
@@ -49,6 +49,7 @@ struct WinogradTransformFunctor<DeviceType::OPENCL, T>
void
operator
()(
const
Tensor
*
input
,
Tensor
*
output
,
StatsFuture
*
future
);
void
operator
()(
const
Tensor
*
input
,
Tensor
*
output
,
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
cl
::
Kernel
kernel_
;
std
::
vector
<
index_t
>
input_shape_
;
};
};
struct
WinogradInverseTransformFunctorBase
{
struct
WinogradInverseTransformFunctorBase
{
...
@@ -105,6 +106,7 @@ struct WinogradInverseTransformFunctor<DeviceType::OPENCL, T>
...
@@ -105,6 +106,7 @@ struct WinogradInverseTransformFunctor<DeviceType::OPENCL, T>
StatsFuture
*
future
);
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
cl
::
Kernel
kernel_
;
std
::
vector
<
index_t
>
input_shape_
;
};
};
}
// namespace kernels
}
// namespace kernels
...
...
tools/wino_conv.py
浏览文件 @
2460a946
...
@@ -96,7 +96,7 @@ def output_shape(input_shape, filter_shape):
...
@@ -96,7 +96,7 @@ def output_shape(input_shape, filter_shape):
return
out_shape
return
out_shape
def
winog_conv
(
m
,
r
,
input
,
filter
):
def
winog
rad
_conv
(
m
,
r
,
input
,
filter
):
alpha
=
m
+
r
-
1
alpha
=
m
+
r
-
1
print
'Winograd(m = %d, r = %d, tile size=%d'
%
(
m
,
r
,
alpha
)
print
'Winograd(m = %d, r = %d, tile size=%d'
%
(
m
,
r
,
alpha
)
alpha_square
=
alpha
*
alpha
alpha_square
=
alpha
*
alpha
...
@@ -194,14 +194,14 @@ def main():
...
@@ -194,14 +194,14 @@ def main():
# filter.tofile("filter_in")
# filter.tofile("filter_in")
for
i
in
[
2
,
4
,
6
]:
for
i
in
[
2
,
4
,
6
]:
print
"==========f(%d,3)=========="
%
i
print
"==========f(%d,3)=========="
%
i
winog
_out
=
winog
_conv
(
i
,
3
,
input
,
filter
)
winog
rad_out
=
winograd
_conv
(
i
,
3
,
input
,
filter
)
res
=
np
.
allclose
(
tf_out
,
winog_out
)
res
=
np
.
allclose
(
tf_out
,
winog
rad
_out
)
if
res
:
if
res
:
print
"=========Pass========="
print
"=========Pass========="
else
:
else
:
print
"=========Failed======="
print
"=========Failed======="
print
"TF: "
,
tf_out
print
"TF: "
,
tf_out
print
"Winograd: "
,
winog_out
print
"Winograd: "
,
winog
rad
_out
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录