Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Xiaomi
Mace
提交
02cadd13
Mace
项目概览
Xiaomi
/
Mace
通知
106
Star
40
Fork
27
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
02cadd13
编写于
6月 11, 2018
作者:
刘
刘琦
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'wino-6x6-gpu' into 'master'
Add Winograd 6x6 kernel. See merge request !516
上级
087b477e
b2019687
变更
21
展开全部
隐藏空白更改
内联
并排
Showing
21 changed files
with
1710 additions
and
181 deletions
+1710
-181
mace/kernels/buffer_to_image.h
mace/kernels/buffer_to_image.h
+8
-4
mace/kernels/image_to_buffer.h
mace/kernels/image_to_buffer.h
+8
-4
mace/kernels/opencl/buffer_to_image.cc
mace/kernels/opencl/buffer_to_image.cc
+10
-5
mace/kernels/opencl/cl/buffer_to_image.cl
mace/kernels/opencl/cl/buffer_to_image.cl
+331
-2
mace/kernels/opencl/cl/winograd_transform.cl
mace/kernels/opencl/cl/winograd_transform.cl
+665
-8
mace/kernels/opencl/helper.cc
mace/kernels/opencl/helper.cc
+14
-8
mace/kernels/opencl/helper.h
mace/kernels/opencl/helper.h
+4
-2
mace/kernels/opencl/image_to_buffer.cc
mace/kernels/opencl/image_to_buffer.cc
+8
-4
mace/kernels/opencl/winograd_transform.cc
mace/kernels/opencl/winograd_transform.cc
+114
-27
mace/kernels/winograd_transform.h
mace/kernels/winograd_transform.h
+22
-12
mace/ops/buffer_to_image.h
mace/ops/buffer_to_image.h
+2
-1
mace/ops/conv_2d_benchmark.cc
mace/ops/conv_2d_benchmark.cc
+5
-0
mace/ops/image_to_buffer.h
mace/ops/image_to_buffer.h
+2
-1
mace/ops/ops_test_util.h
mace/ops/ops_test_util.h
+21
-12
mace/ops/winograd_convolution_benchmark.cc
mace/ops/winograd_convolution_benchmark.cc
+134
-0
mace/ops/winograd_convolution_test.cc
mace/ops/winograd_convolution_test.cc
+122
-46
mace/ops/winograd_inverse_transform.h
mace/ops/winograd_inverse_transform.h
+2
-1
mace/ops/winograd_transform.h
mace/ops/winograd_transform.h
+3
-1
mace/ops/winograd_transform_benchmark.cc
mace/ops/winograd_transform_benchmark.cc
+191
-32
mace/python/tools/converter_tool/base_converter.py
mace/python/tools/converter_tool/base_converter.py
+1
-0
mace/python/tools/converter_tool/transformer.py
mace/python/tools/converter_tool/transformer.py
+43
-11
未找到文件。
mace/kernels/buffer_to_image.h
浏览文件 @
02cadd13
...
...
@@ -25,14 +25,17 @@ namespace mace {
namespace
kernels
{
struct
BufferToImageFunctorBase
{
BufferToImageFunctorBase
()
:
kernel_error_
(
nullptr
)
{}
explicit
BufferToImageFunctorBase
(
const
int
wino_blk_size
)
:
kernel_error_
(
nullptr
),
wino_blk_size_
(
wino_blk_size
)
{}
std
::
unique_ptr
<
BufferBase
>
kernel_error_
;
const
int
wino_blk_size_
;
};
template
<
DeviceType
D
,
typename
T
>
struct
BufferToImageFunctor
:
BufferToImageFunctorBase
{
BufferToImageFunctor
()
{}
explicit
BufferToImageFunctor
(
const
int
wino_blk_size
)
:
BufferToImageFunctorBase
(
wino_blk_size
)
{}
MaceStatus
operator
()(
const
Tensor
*
input
,
const
BufferType
type
,
Tensor
*
output
,
...
...
@@ -48,7 +51,8 @@ struct BufferToImageFunctor : BufferToImageFunctorBase {
template
<
typename
T
>
struct
BufferToImageFunctor
<
DeviceType
::
GPU
,
T
>
:
BufferToImageFunctorBase
{
BufferToImageFunctor
()
{}
explicit
BufferToImageFunctor
(
const
int
wino_blk_size
)
:
BufferToImageFunctorBase
(
wino_blk_size
)
{}
MaceStatus
operator
()(
const
Tensor
*
input
,
const
BufferType
type
,
Tensor
*
output
,
...
...
mace/kernels/image_to_buffer.h
浏览文件 @
02cadd13
...
...
@@ -25,14 +25,17 @@ namespace mace {
namespace
kernels
{
struct
ImageToBufferFunctorBase
{
ImageToBufferFunctorBase
()
:
kernel_error_
(
nullptr
)
{}
explicit
ImageToBufferFunctorBase
(
const
int
wino_blk_size
)
:
kernel_error_
(
nullptr
),
wino_blk_size_
(
wino_blk_size
)
{}
std
::
unique_ptr
<
BufferBase
>
kernel_error_
;
const
int
wino_blk_size_
;
};
template
<
DeviceType
D
,
typename
T
>
struct
ImageToBufferFunctor
:
ImageToBufferFunctorBase
{
ImageToBufferFunctor
()
{}
explicit
ImageToBufferFunctor
(
const
int
wino_blk_size
)
:
ImageToBufferFunctorBase
(
wino_blk_size
)
{}
MaceStatus
operator
()(
const
Tensor
*
input
,
const
BufferType
type
,
Tensor
*
output
,
...
...
@@ -48,7 +51,8 @@ struct ImageToBufferFunctor : ImageToBufferFunctorBase {
template
<
typename
T
>
struct
ImageToBufferFunctor
<
DeviceType
::
GPU
,
T
>
:
ImageToBufferFunctorBase
{
ImageToBufferFunctor
()
{}
explicit
ImageToBufferFunctor
(
const
int
wino_blk_size
)
:
ImageToBufferFunctorBase
(
wino_blk_size
)
{}
MaceStatus
operator
()(
const
Tensor
*
input
,
const
BufferType
type
,
Tensor
*
output
,
...
...
mace/kernels/opencl/buffer_to_image.cc
浏览文件 @
02cadd13
...
...
@@ -26,9 +26,10 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
Tensor
*
image
,
StatsFuture
*
future
)
{
std
::
vector
<
size_t
>
image_shape
;
CalImage2DShape
(
buffer
->
shape
(),
type
,
&
image_shape
);
CalImage2DShape
(
buffer
->
shape
(),
type
,
&
image_shape
,
wino_blk_size_
);
if
(
type
==
WINOGRAD_FILTER
)
{
std
::
vector
<
index_t
>
new_shape
=
CalWinogradShape
(
buffer
->
shape
(),
type
);
std
::
vector
<
index_t
>
new_shape
=
CalWinogradShape
(
buffer
->
shape
(),
type
,
wino_blk_size_
);
MACE_RETURN_IF_ERROR
(
image
->
ResizeImage
(
new_shape
,
image_shape
));
}
else
{
MACE_RETURN_IF_ERROR
(
image
->
ResizeImage
(
buffer
->
shape
(),
image_shape
));
...
...
@@ -62,10 +63,14 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
case
WEIGHT_WIDTH
:
kernel_name
=
"weight_width_buffer_to_image"
;
break
;
case
WINOGRAD_FILTER
:
gws
[
1
]
/=
16
;
kernel_name
=
"winograd_filter_buffer_to_image"
;
case
WINOGRAD_FILTER
:
{
std
::
stringstream
ss_tmp
;
gws
[
1
]
/=
(
wino_blk_size_
+
2
)
*
(
wino_blk_size_
+
2
);
ss_tmp
<<
"winograd_filter_buffer_to_image_"
<<
wino_blk_size_
<<
"x"
<<
wino_blk_size_
;
kernel_name
=
ss_tmp
.
str
();
break
;
}
}
auto
runtime
=
OpenCLRuntime
::
Global
();
...
...
mace/kernels/opencl/cl/buffer_to_image.cl
浏览文件 @
02cadd13
...
...
@@ -617,7 +617,7 @@ __kernel void weight_width_image_to_buffer(KERNEL_ERROR_PARAMS
}
// only support 3x3 now
__kernel void winograd_filter_buffer_to_image(KERNEL_ERROR_PARAMS
__kernel void winograd_filter_buffer_to_image
_2x2
(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global const DATA_TYPE *input, //Oc, Ic, H, W
__private const int input_offset,
...
...
@@ -724,7 +724,7 @@ __kernel void winograd_filter_buffer_to_image(KERNEL_ERROR_PARAMS
}
// only support 3x3 now
__kernel void winograd_filter_image_to_buffer(KERNEL_ERROR_PARAMS
__kernel void winograd_filter_image_to_buffer
_2x2
(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global DATA_TYPE *output, //Oc, Ic, H, W
__private const int height,
...
...
@@ -765,3 +765,332 @@ __kernel void winograd_filter_image_to_buffer(KERNEL_ERROR_PARAMS
offset += height * width;
}
}
// only supports 3x3 filters for now
// Transforms a 3x3 convolution filter (OIHW buffer) into the Winograd
// domain for the 6x6-output-tile variant, writing the result as an image
// of (6 + 2)^2 = 64 rows, each out_channels tall.
// Each work-item handles one output channel (h) and 4 consecutive input
// channels (w << 2), packed into the four lanes of a DATA_TYPE4.
// NOTE(review): a, b, c equal -2/9, 1/90, 1/180 — these look like the
// G-matrix constants of the standard F(6, 3) filter transform; confirm
// against the matching winograd_transform_6x6 kernel.
// Fix: the PROCESS macro previously ended with a trailing backslash,
// which folded "PROCESS(0);" into the macro body; since macro expansion
// is non-recursive, the literal token PROCESS(0) would survive in every
// expansion and break compilation. The backslash is removed.
__kernel void winograd_filter_buffer_to_image_6x6(KERNEL_ERROR_PARAMS
                                                  GLOBAL_WORK_GROUP_SIZE_DIM2
                                                  __global const DATA_TYPE *input, //Oc, Ic, H, W
                                                  __private const int input_offset,
                                                  __private const int in_channels,
                                                  __private const int height,
                                                  __private const int width,
                                                  __write_only image2d_t output) {
  int w = get_global_id(0);
  int h = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
  if (w >= global_size_dim0 || h >= global_size_dim1) {
    return;
  }
  const int out_channels = global_size_dim1;
#else
  const int out_channels = get_global_size(1);
#endif
  const int out_channel_idx = h;
  const int in_channel_idx = w << 2;
  // Start of this (output channel, 4 input channels) group in the buffer.
  const int offset = input_offset + (out_channel_idx * in_channels + in_channel_idx) * height * width;
  // 4 channels x 9 filter taps, clamped for the ragged tail of channels.
  const int length = min((in_channels - in_channel_idx) * 9, 36);
  DATA_TYPE in[36] = {0};
  DATA_TYPE4 tt0, tt1, t1;
  // tu0..tu7: the 8 transformed rows, one entry per original filter column.
  DATA_TYPE4 tu0[3], tu1[3], tu2[3], tu3[3], tu4[3], tu5[3], tu6[3], tu7[3];
  const float a = -0.222222222f;  // -2/9
  const float b = 0.011111111f;   // 1/90
  const float c = 0.005555556f;   // 1/180
#pragma unroll
  for (short i = 0; i < length; ++i) {
    in[i] = *(input + offset + i);
  }
  // Column transform, filter column 0 (taps at rows 0/1/2 of each channel).
  tu0[0] = (DATA_TYPE4)(in[0], in[9], in[18], in[27]);
  t1 = (DATA_TYPE4)(in[3], in[12], in[21], in[30]);
  tu7[0] = (DATA_TYPE4)(in[6], in[15], in[24], in[33]);
  tt0 = tu0[0] + tu7[0];
  tt1 = t1;
  tu1[0] = mad(tt0 + tt1, a, 0);
  tu2[0] = mad(tt0 - tt1, a, 0);
  tt0 = mad(tu7[0], 4, tu0[0]);
  tt1 = mad(t1, 2, 0);
  tu3[0] = mad(tt0 + tt1, b, 0);
  tu4[0] = mad(tt0 - tt1, b, 0);
  tt0 = mad(tu0[0], 4, tu7[0]);
  tt1 = mad(t1, 2, 0);
  tu5[0] = mad(tt0 + tt1, c, 0);
  tu6[0] = mad(tt0 - tt1, c, 0);
  // Column transform, filter column 1.
  tu0[1] = (DATA_TYPE4)(in[1], in[10], in[19], in[28]);
  t1 = (DATA_TYPE4)(in[4], in[13], in[22], in[31]);
  tu7[1] = (DATA_TYPE4)(in[7], in[16], in[25], in[34]);
  tt0 = tu0[1] + tu7[1];
  tt1 = t1;
  tu1[1] = mad(tt0 + tt1, a, 0);
  tu2[1] = mad(tt0 - tt1, a, 0);
  tt0 = mad(tu7[1], 4, tu0[1]);
  tt1 = mad(t1, 2, 0);
  tu3[1] = mad(tt0 + tt1, b, 0);
  tu4[1] = mad(tt0 - tt1, b, 0);
  tt0 = mad(tu0[1], 4, tu7[1]);
  tt1 = mad(t1, 2, 0);
  tu5[1] = mad(tt0 + tt1, c, 0);
  tu6[1] = mad(tt0 - tt1, c, 0);
  // Column transform, filter column 2.
  tu0[2] = (DATA_TYPE4)(in[2], in[11], in[20], in[29]);
  t1 = (DATA_TYPE4)(in[5], in[14], in[23], in[32]);
  tu7[2] = (DATA_TYPE4)(in[8], in[17], in[26], in[35]);
  tt0 = tu0[2] + tu7[2];
  tt1 = t1;
  tu1[2] = mad(tt0 + tt1, a, 0);
  tu2[2] = mad(tt0 - tt1, a, 0);
  tt0 = mad(tu7[2], 4, tu0[2]);
  tt1 = mad(t1, 2, 0);
  tu3[2] = mad(tt0 + tt1, b, 0);
  tu4[2] = mad(tt0 - tt1, b, 0);
  tt0 = mad(tu0[2], 4, tu7[2]);
  tt1 = mad(t1, 2, 0);
  tu5[2] = mad(tt0 + tt1, c, 0);
  tu6[2] = mad(tt0 - tt1, c, 0);
// Row transform for one transformed row i: emits 8 image rows, stepping
// h by out_channels after each write so rows are interleaved per channel.
#define PROCESS(i)                           \
  t1 = tu##i[0];                             \
  WRITE_IMAGET(output, (int2)(w, h), t1);    \
  h += out_channels;                         \
  tt0 = tu##i[0] + tu##i[2];                 \
  tt1 = tu##i[1];                            \
  t1 = mad(tt0 + tt1, a, 0);                 \
  WRITE_IMAGET(output, (int2)(w, h), t1);    \
  h += out_channels;                         \
  t1 = mad(tt0 - tt1, a, 0);                 \
  WRITE_IMAGET(output, (int2)(w, h), t1);    \
  h += out_channels;                         \
  tt0 = mad(tu##i[2], 4, tu##i[0]);          \
  tt1 = mad(tu##i[1], 2, 0);                 \
  t1 = mad(tt0 + tt1, b, 0);                 \
  WRITE_IMAGET(output, (int2)(w, h), t1);    \
  h += out_channels;                         \
  t1 = mad(tt0 - tt1, b, 0);                 \
  WRITE_IMAGET(output, (int2)(w, h), t1);    \
  h += out_channels;                         \
  tt0 = mad(tu##i[0], 4, tu##i[2]);          \
  tt1 = mad(tu##i[1], 2, 0);                 \
  t1 = mad(tt0 + tt1, c, 0);                 \
  WRITE_IMAGET(output, (int2)(w, h), t1);    \
  h += out_channels;                         \
  t1 = mad(tt0 - tt1, c, 0);                 \
  WRITE_IMAGET(output, (int2)(w, h), t1);    \
  h += out_channels;                         \
  t1 = tu##i[2];                             \
  WRITE_IMAGET(output, (int2)(w, h), t1);    \
  h += out_channels;
  PROCESS(0);
  PROCESS(1);
  PROCESS(2);
  PROCESS(3);
  PROCESS(4);
  PROCESS(5);
  PROCESS(6);
  PROCESS(7);
#undef PROCESS
}
// Inverse copy of winograd_filter_buffer_to_image_6x6: reads the 64-row
// Winograd-domain filter image and writes it back into a linear buffer.
// Each work-item covers 4 consecutive buffer entries per image row; a
// ragged tail (fewer than 4 valid entries) is written lane by lane.
__kernel void winograd_filter_image_to_buffer_6x6(KERNEL_ERROR_PARAMS
                                                  GLOBAL_WORK_GROUP_SIZE_DIM2
                                                  __global DATA_TYPE *output, //Oc, Ic, H, W
                                                  __private const int height,
                                                  __private const int width,
                                                  __private const int channel,
                                                  __read_only image2d_t input) {
  const int w = get_global_id(0);
  const int h = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
  if (w >= global_size_dim0 || h >= global_size_dim1) {
    return;
  }
#endif
  const int col_base = w << 2;          // 4 buffer elements per work-item
  const int remain = width - col_base;  // valid lanes at this position
  int buf_pos = h * width + col_base;
  int2 sample_pos = (int2)(w, h);
  // (6 + 2)^2 = 64 Winograd rows per filter; rows are `height` apart in
  // the image and height * width apart in the buffer.
  for (short row = 0; row < 64; ++row) {
    DATA_TYPE4 pix = READ_IMAGET(input, SAMPLER, sample_pos);
    if (remain >= 4) {
      vstore4(pix, 0, output + buf_pos);
    } else {
      if (remain > 2) output[buf_pos + 2] = pix.z;
      if (remain > 1) output[buf_pos + 1] = pix.y;
      if (remain > 0) output[buf_pos] = pix.x;
    }
    sample_pos.y += height;
    buf_pos += height * width;
  }
}
// only supports 3x3 filters for now
// Transforms a 3x3 convolution filter (OIHW buffer) into the Winograd
// domain for the 4x4-output-tile variant, writing the result as an image
// of (4 + 2)^2 = 36 rows, each out_channels tall.
// Each work-item handles one output channel (h) and 4 consecutive input
// channels (w << 2), packed into the four lanes of a DATA_TYPE4.
// NOTE(review): a, b, c equal 1/4, -1/6, 1/24 — these look like the
// G-matrix constants of the standard F(4, 3) filter transform; confirm
// against the matching winograd_transform_4x4 kernel.
// Fix: the PROCESS macro previously ended with a trailing backslash,
// which folded "PROCESS(0);" into the macro body; since macro expansion
// is non-recursive, the literal token PROCESS(0) would survive in every
// expansion and break compilation. The backslash is removed.
__kernel void winograd_filter_buffer_to_image_4x4(KERNEL_ERROR_PARAMS
                                                  GLOBAL_WORK_GROUP_SIZE_DIM2
                                                  __global const DATA_TYPE *input, //Oc, Ic, H, W
                                                  __private const int input_offset,
                                                  __private const int in_channels,
                                                  __private const int height,
                                                  __private const int width,
                                                  __write_only image2d_t output) {
  int w = get_global_id(0);
  int h = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
  if (w >= global_size_dim0 || h >= global_size_dim1) {
    return;
  }
  const int out_channels = global_size_dim1;
#else
  const int out_channels = get_global_size(1);
#endif
  const int out_channel_idx = h;
  const int in_channel_idx = w << 2;
  // Start of this (output channel, 4 input channels) group in the buffer.
  const int offset = input_offset + (out_channel_idx * in_channels + in_channel_idx) * height * width;
  // 4 channels x 9 filter taps, clamped for the ragged tail of channels.
  const int length = min((in_channels - in_channel_idx) * 9, 36);
  DATA_TYPE in[36] = {0};
  DATA_TYPE4 tt0, tt1, tt2;
  // tu0..tu5: the 6 transformed rows, one entry per original filter column.
  DATA_TYPE4 tu0[3], tu1[3], tu2[3], tu3[3], tu4[3], tu5[3];
  const float a = 0.25f;           // 1/4
  const float b = -0.166666667f;   // -1/6
  const float c = 0.041666667f;    // 1/24
#pragma unroll
  for (short i = 0; i < length; ++i) {
    in[i] = *(input + offset + i);
  }
  // Column transform, filter column 0 (taps at rows 0/1/2 of each channel).
  tt0 = (DATA_TYPE4)(in[0], in[9], in[18], in[27]);
  tt1 = (DATA_TYPE4)(in[3], in[12], in[21], in[30]);
  tt2 = (DATA_TYPE4)(in[6], in[15], in[24], in[33]);
  tu0[0] = mad(tt0, a, 0);
  tu1[0] = mad((tt0 + tt1 + tt2), b, 0);
  tu2[0] = mad((tt0 - tt1 + tt2), b, 0);
  tt0 = mad(tt2, 4, tt0);
  tu3[0] = mad(mad(tt1, 2, tt0), c, 0);
  tu4[0] = mad(mad(tt1, -2, tt0), c, 0);
  tu5[0] = tt2;
  // Column transform, filter column 1.
  tt0 = (DATA_TYPE4)(in[1], in[10], in[19], in[28]);
  tt1 = (DATA_TYPE4)(in[4], in[13], in[22], in[31]);
  tt2 = (DATA_TYPE4)(in[7], in[16], in[25], in[34]);
  tu0[1] = mad(tt0, a, 0);
  tu1[1] = mad((tt0 + tt1 + tt2), b, 0);
  tu2[1] = mad((tt0 - tt1 + tt2), b, 0);
  tt0 = mad(tt2, 4, tt0);
  tu3[1] = mad(mad(tt1, 2, tt0), c, 0);
  tu4[1] = mad(mad(tt1, -2, tt0), c, 0);
  tu5[1] = tt2;
  // Column transform, filter column 2.
  tt0 = (DATA_TYPE4)(in[2], in[11], in[20], in[29]);
  tt1 = (DATA_TYPE4)(in[5], in[14], in[23], in[32]);
  tt2 = (DATA_TYPE4)(in[8], in[17], in[26], in[35]);
  tu0[2] = mad(tt0, a, 0);
  tu1[2] = mad((tt0 + tt1 + tt2), b, 0);
  tu2[2] = mad((tt0 - tt1 + tt2), b, 0);
  tt0 = mad(tt2, 4, tt0);
  tu3[2] = mad(mad(tt1, 2, tt0), c, 0);
  tu4[2] = mad(mad(tt1, -2, tt0), c, 0);
  tu5[2] = tt2;
// Row transform for one transformed row i: emits 6 image rows, stepping
// h by out_channels after each write so rows are interleaved per channel.
#define PROCESS(i)                           \
  tt2 = mad(tu##i[0], a, 0);                 \
  WRITE_IMAGET(output, (int2)(w, h), tt2);   \
  h += out_channels;                         \
  tt0 = tu##i[1];                            \
  tt1 = tu##i[0] + tu##i[2];                 \
  tt2 = mad((tt0 + tt1), b, 0);              \
  WRITE_IMAGET(output, (int2)(w, h), tt2);   \
  h += out_channels;                         \
  tt2 = mad(tt1 - tt0, b, 0);                \
  WRITE_IMAGET(output, (int2)(w, h), tt2);   \
  h += out_channels;                         \
  tt0 = mad(tu##i[2], 4, tu##i[0]);          \
  tt1 = 2 * tu##i[1];                        \
  tt2 = mad(tt0 + tt1, c, 0);                \
  WRITE_IMAGET(output, (int2)(w, h), tt2);   \
  h += out_channels;                         \
  tt2 = mad(tt0 - tt1, c, 0);                \
  WRITE_IMAGET(output, (int2)(w, h), tt2);   \
  h += out_channels;                         \
  tt2 = tu##i[2];                            \
  WRITE_IMAGET(output, (int2)(w, h), tt2);   \
  h += out_channels;
  PROCESS(0);
  PROCESS(1);
  PROCESS(2);
  PROCESS(3);
  PROCESS(4);
  PROCESS(5);
#undef PROCESS
}
// Inverse copy of winograd_filter_buffer_to_image_4x4: reads the 36-row
// Winograd-domain filter image and writes it back into a linear buffer.
// Each work-item covers 4 consecutive buffer entries per image row; a
// ragged tail (fewer than 4 valid entries) is written lane by lane.
__kernel void winograd_filter_image_to_buffer_4x4(KERNEL_ERROR_PARAMS
                                                  GLOBAL_WORK_GROUP_SIZE_DIM2
                                                  __global DATA_TYPE *output, //Oc, Ic, H, W
                                                  __private const int height,
                                                  __private const int width,
                                                  __private const int channel,
                                                  __read_only image2d_t input) {
  const int w = get_global_id(0);
  const int h = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
  if (w >= global_size_dim0 || h >= global_size_dim1) {
    return;
  }
#endif
  const int col_base = w << 2;          // 4 buffer elements per work-item
  const int remain = width - col_base;  // valid lanes at this position
  int buf_pos = h * width + col_base;
  int2 sample_pos = (int2)(w, h);
  // (4 + 2)^2 = 36 Winograd rows per filter; rows are `height` apart in
  // the image and height * width apart in the buffer.
  for (short row = 0; row < 36; ++row) {
    DATA_TYPE4 pix = READ_IMAGET(input, SAMPLER, sample_pos);
    if (remain >= 4) {
      vstore4(pix, 0, output + buf_pos);
    } else {
      if (remain > 2) output[buf_pos + 2] = pix.z;
      if (remain > 1) output[buf_pos + 1] = pix.y;
      if (remain > 0) output[buf_pos] = pix.x;
    }
    sample_pos.y += height;
    buf_pos += height * width;
  }
}
\ No newline at end of file
mace/kernels/opencl/cl/winograd_transform.cl
浏览文件 @
02cadd13
此差异已折叠。
点击以展开。
mace/kernels/opencl/helper.cc
浏览文件 @
02cadd13
...
...
@@ -66,13 +66,15 @@ void CalArgImageShape(const std::vector<index_t> &shape,
// [ (Ic + 3) / 4, 16 * Oc]
void
CalWinogradFilterImageShape
(
const
std
::
vector
<
index_t
>
&
shape
,
/* Oc, Ic, H, W*/
std
::
vector
<
size_t
>
*
image_shape
)
{
std
::
vector
<
size_t
>
*
image_shape
,
const
int
blk_size
)
{
MACE_CHECK
(
shape
.
size
()
==
4
);
image_shape
->
resize
(
2
);
(
*
image_shape
)[
0
]
=
RoundUpDiv4
(
shape
[
1
]);
(
*
image_shape
)[
1
]
=
(
shape
[
0
]
<<
4
);
(
*
image_shape
)[
1
]
=
(
shape
[
0
]
*
(
blk_size
+
2
)
*
(
blk_size
+
2
)
);
}
// [W * C, N * RoundUp<4>(H)]
void
CalInOutHeightImageShape
(
const
std
::
vector
<
index_t
>
&
shape
,
/* NHWC */
std
::
vector
<
size_t
>
*
image_shape
)
{
...
...
@@ -120,7 +122,8 @@ void CalWeightWidthImageShape(const std::vector<index_t> &shape, /* OIHW */
void
CalImage2DShape
(
const
std
::
vector
<
index_t
>
&
shape
,
/* NHWC */
const
BufferType
type
,
std
::
vector
<
size_t
>
*
image_shape
)
{
std
::
vector
<
size_t
>
*
image_shape
,
const
int
wino_block_size
)
{
MACE_CHECK_NOTNULL
(
image_shape
);
switch
(
type
)
{
case
CONV2D_FILTER
:
...
...
@@ -142,7 +145,7 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
CalInOutWidthImageShape
(
shape
,
image_shape
);
break
;
case
WINOGRAD_FILTER
:
CalWinogradFilterImageShape
(
shape
,
image_shape
);
CalWinogradFilterImageShape
(
shape
,
image_shape
,
wino_block_size
);
break
;
case
WEIGHT_HEIGHT
:
CalWeightHeightImageShape
(
shape
,
image_shape
);
...
...
@@ -156,12 +159,15 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
}
std
::
vector
<
index_t
>
CalWinogradShape
(
const
std
::
vector
<
index_t
>
&
shape
,
const
BufferType
type
)
{
const
BufferType
type
,
const
int
wino_blk_size
)
{
if
(
type
==
WINOGRAD_FILTER
)
{
return
{
16
,
shape
[
0
],
shape
[
1
]};
return
{
(
wino_blk_size
+
2
)
*
(
wino_blk_size
+
2
)
,
shape
[
0
],
shape
[
1
]};
}
else
if
(
type
==
IN_OUT_HEIGHT
)
{
index_t
out_width
=
shape
[
0
]
*
((
shape
[
1
]
-
1
)
/
2
)
*
((
shape
[
2
]
-
1
)
/
2
);
return
{
16
,
shape
[
3
],
out_width
};
index_t
out_width
=
shape
[
0
]
*
((
shape
[
1
]
+
wino_blk_size
-
1
)
/
wino_blk_size
)
*
((
shape
[
2
]
+
wino_blk_size
-
1
)
/
wino_blk_size
);
return
{(
wino_blk_size
+
2
)
*
(
wino_blk_size
+
2
),
shape
[
3
],
out_width
};
}
else
{
LOG
(
FATAL
)
<<
"Mace not supported yet."
;
return
std
::
vector
<
index_t
>
();
...
...
mace/kernels/opencl/helper.h
浏览文件 @
02cadd13
...
...
@@ -46,10 +46,12 @@ enum BufferType {
void
CalImage2DShape
(
const
std
::
vector
<
index_t
>
&
shape
,
/* NHWC */
const
BufferType
type
,
std
::
vector
<
size_t
>
*
image_shape
);
std
::
vector
<
size_t
>
*
image_shape
,
const
int
wino_blk_size
=
2
);
std
::
vector
<
index_t
>
CalWinogradShape
(
const
std
::
vector
<
index_t
>
&
shape
,
const
BufferType
type
);
const
BufferType
type
,
const
int
wino_blk_size
=
2
);
std
::
string
DtToCLCMDDt
(
const
DataType
dt
);
...
...
mace/kernels/opencl/image_to_buffer.cc
浏览文件 @
02cadd13
...
...
@@ -26,7 +26,7 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
Tensor
*
buffer
,
StatsFuture
*
future
)
{
std
::
vector
<
size_t
>
image_shape
;
CalImage2DShape
(
image
->
shape
(),
type
,
&
image_shape
);
CalImage2DShape
(
image
->
shape
(),
type
,
&
image_shape
,
wino_blk_size_
);
MACE_RETURN_IF_ERROR
(
buffer
->
Resize
(
image
->
shape
()));
uint32_t
gws
[
2
]
=
{
static_cast
<
uint32_t
>
(
image_shape
[
0
]),
...
...
@@ -45,10 +45,14 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
case
IN_OUT_HEIGHT
:
kernel_name
=
"in_out_height_image_to_buffer"
;
break
;
case
WINOGRAD_FILTER
:
gws
[
1
]
/=
16
;
kernel_name
=
"winograd_filter_image_to_buffer"
;
case
WINOGRAD_FILTER
:
{
std
::
stringstream
ss_tmp
;
gws
[
1
]
/=
(
wino_blk_size_
+
2
)
*
(
wino_blk_size_
+
2
);
ss_tmp
<<
"winograd_filter_image_to_buffer_"
<<
wino_blk_size_
<<
"x"
<<
wino_blk_size_
;
kernel_name
=
ss_tmp
.
str
();
break
;
}
case
WEIGHT_HEIGHT
:
kernel_name
=
"weight_height_image_to_buffer"
;
break
;
...
...
mace/kernels/opencl/winograd_transform.cc
浏览文件 @
02cadd13
...
...
@@ -27,10 +27,24 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
auto
runtime
=
OpenCLRuntime
::
Global
();
if
(
kernel_
.
get
()
==
nullptr
)
{
std
::
string
obfuscated_kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"winograd_transform_2x2"
);
std
::
string
obfuscated_kernel_name
;
std
::
set
<
std
::
string
>
built_options
;
built_options
.
emplace
(
"-Dwinograd_transform_2x2="
+
obfuscated_kernel_name
);
if
(
wino_blk_size_
==
6
)
{
obfuscated_kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"winograd_transform_6x6"
);
built_options
.
emplace
(
"-Dwinograd_transform_6x6="
+
obfuscated_kernel_name
);
}
else
if
(
wino_blk_size_
==
4
)
{
obfuscated_kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"winograd_transform_4x4"
);
built_options
.
emplace
(
"-Dwinograd_transform_4x4="
+
obfuscated_kernel_name
);
}
else
{
obfuscated_kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"winograd_transform_2x2"
);
built_options
.
emplace
(
"-Dwinograd_transform_2x2="
+
obfuscated_kernel_name
);
}
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
DataTypeToEnum
<
T
>::
value
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
...
...
@@ -66,15 +80,28 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
paddings_
.
data
(),
dilations_
.
data
(),
strides_
.
data
(),
RoundType
::
FLOOR
,
output_shape
.
data
());
}
const
index_t
round_h
=
(
output_shape
[
1
]
+
1
)
/
2
;
const
index_t
round_w
=
(
output_shape
[
2
]
+
1
)
/
2
;
const
index_t
round_h
=
(
output_shape
[
1
]
+
wino_blk_size_
-
1
)
/
wino_blk_size_
;
const
index_t
round_w
=
(
output_shape
[
2
]
+
wino_blk_size_
-
1
)
/
wino_blk_size_
;
const
index_t
out_width
=
input_tensor
->
dim
(
0
)
*
round_h
*
round_w
;
const
uint32_t
gws
[
2
]
=
{
static_cast
<
uint32_t
>
(
out_width
),
static_cast
<
uint32_t
>
(
RoundUpDiv4
(
input_tensor
->
dim
(
3
)))};
const
float
round_hw_r
=
1.
f
/
static_cast
<
float
>
(
round_h
*
round_w
);
const
float
round_w_r
=
1.
f
/
static_cast
<
float
>
(
round_w
);
const
index_t
blk_sqr
=
(
wino_blk_size_
+
2
)
*
(
wino_blk_size_
+
2
);
uint32_t
gws
[
2
];
if
(
wino_blk_size_
==
6
)
{
gws
[
0
]
=
static_cast
<
uint32_t
>
(
out_width
)
*
(
wino_blk_size_
+
2
);
gws
[
1
]
=
static_cast
<
uint32_t
>
(
RoundUpDiv4
(
input_tensor
->
dim
(
3
)))
*
(
wino_blk_size_
+
2
);
}
else
{
gws
[
0
]
=
static_cast
<
uint32_t
>
(
out_width
);
gws
[
1
]
=
static_cast
<
uint32_t
>
(
RoundUpDiv4
(
input_tensor
->
dim
(
3
)));
}
if
(
!
IsVecEqual
(
input_shape_
,
input_tensor
->
shape
()))
{
output_shape
=
{
16
,
input_tensor
->
dim
(
3
),
out_width
};
output_shape
=
{
blk_sqr
,
input_tensor
->
dim
(
3
),
out_width
};
std
::
vector
<
size_t
>
image_shape
;
CalImage2DShape
(
output_shape
,
BufferType
::
IN_OUT_HEIGHT
,
&
image_shape
);
MACE_RETURN_IF_ERROR
(
output_tensor
->
ResizeImage
(
output_shape
,
image_shape
));
...
...
@@ -94,24 +121,66 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
kernel_
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
input_tensor
->
dim
(
2
)));
kernel_
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
input_tensor
->
dim
(
3
)));
kernel_
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
round_h
*
round_w
));
kernel_
.
setArg
(
idx
++
,
round_hw_r
);
kernel_
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
round_w
));
kernel_
.
setArg
(
idx
++
,
round_w_r
);
kernel_
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
paddings
[
0
]
/
2
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
paddings
[
1
]
/
2
));
input_shape_
=
input_tensor
->
shape
();
}
const
std
::
vector
<
uint32_t
>
lws
=
{
kwg_size_
/
8
,
8
,
0
};
std
::
string
tuning_key
=
Concat
(
"winograd_transform_kernel"
,
output_tensor
->
dim
(
0
),
output_tensor
->
dim
(
1
),
output_tensor
->
dim
(
2
));
TuningOrRun2DKernel
(
kernel_
,
tuning_key
,
gws
,
lws
,
future
);
if
(
wino_blk_size_
==
6
)
{
const
std
::
vector
<
uint32_t
>
lws
=
{
static_cast
<
uint32_t
>
(
wino_blk_size_
+
2
),
static_cast
<
uint32_t
>
(
wino_blk_size_
+
2
),
0
};
cl
::
Event
event
;
cl_int
error
;
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
kernel_
,
cl
::
NullRange
,
cl
::
NDRange
(
gws
[
0
],
gws
[
1
]),
cl
::
NDRange
(
lws
[
0
],
lws
[
1
]),
nullptr
,
&
event
);
}
else
{
std
::
vector
<
uint32_t
>
roundup_gws
(
2
,
0
);
roundup_gws
[
0
]
=
RoundUp
(
gws
[
0
],
lws
[
0
]);
roundup_gws
[
1
]
=
RoundUp
(
gws
[
1
],
lws
[
1
]);
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
kernel_
,
cl
::
NullRange
,
cl
::
NDRange
(
roundup_gws
[
0
],
roundup_gws
[
1
]),
cl
::
NDRange
(
lws
[
0
],
lws
[
1
]),
nullptr
,
&
event
);
}
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_error_
->
Map
(
nullptr
);
char
*
kerror_code
=
kernel_error_
->
mutable_data
<
char
>
();
MACE_CHECK
(
*
kerror_code
==
0
)
<<
"Kernel error code: "
<<
*
kerror_code
;
kernel_error_
->
UnMap
();
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_error_
->
Map
(
nullptr
);
char
*
kerror_code
=
kernel_error_
->
mutable_data
<
char
>
();
MACE_CHECK
(
*
kerror_code
==
0
)
<<
"Kernel error code: "
<<
*
kerror_code
;
kernel_error_
->
UnMap
();
}
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
if
(
future
!=
nullptr
)
{
future
->
wait_fn
=
[
runtime
,
event
](
CallStats
*
stats
)
{
event
.
wait
();
if
(
stats
!=
nullptr
)
{
runtime
->
GetCallStats
(
event
,
stats
);
}
};
}
}
else
{
const
std
::
vector
<
uint32_t
>
lws
=
{
kwg_size_
/
8
,
8
,
0
};
std
::
string
tuning_key
=
Concat
(
"winograd_transform_kernel"
,
output_tensor
->
dim
(
0
),
output_tensor
->
dim
(
1
),
output_tensor
->
dim
(
2
));
TuningOrRun2DKernel
(
kernel_
,
tuning_key
,
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_error_
->
Map
(
nullptr
);
char
*
kerror_code
=
kernel_error_
->
mutable_data
<
char
>
();
MACE_CHECK
(
*
kerror_code
==
0
)
<<
"Kernel error code: "
<<
*
kerror_code
;
kernel_error_
->
UnMap
();
}
}
return
MACE_SUCCESS
;
...
...
@@ -126,11 +195,25 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
auto
runtime
=
OpenCLRuntime
::
Global
();
if
(
kernel_
.
get
()
==
nullptr
)
{
std
::
string
obfuscated_kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"winograd_inverse_transform_2x2"
);
std
::
string
obfuscated_kernel_name
;
std
::
set
<
std
::
string
>
built_options
;
built_options
.
emplace
(
"-Dwinograd_inverse_transform_2x2="
+
obfuscated_kernel_name
);
if
(
wino_blk_size_
==
6
)
{
obfuscated_kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"winograd_inverse_transform_6x6"
);
built_options
.
emplace
(
"-Dwinograd_inverse_transform_6x6="
+
obfuscated_kernel_name
);
}
else
if
(
wino_blk_size_
==
4
)
{
obfuscated_kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"winograd_inverse_transform_4x4"
);
built_options
.
emplace
(
"-Dwinograd_inverse_transform_4x4="
+
obfuscated_kernel_name
);
}
else
{
obfuscated_kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"winograd_inverse_transform_2x2"
);
built_options
.
emplace
(
"-Dwinograd_inverse_transform_2x2="
+
obfuscated_kernel_name
);
}
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
DataTypeToEnum
<
T
>::
value
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
...
...
@@ -187,8 +270,12 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
CalImage2DShape
(
output_shape
,
BufferType
::
IN_OUT_CHANNEL
,
&
image_shape
);
MACE_RETURN_IF_ERROR
(
output_tensor
->
ResizeImage
(
output_shape
,
image_shape
));
const
uint32_t
round_h
=
(
height_
+
1
)
/
2
;
const
uint32_t
round_w
=
(
width_
+
1
)
/
2
;
const
index_t
round_h
=
(
height_
+
wino_blk_size_
-
1
)
/
wino_blk_size_
;
const
index_t
round_w
=
(
width_
+
wino_blk_size_
-
1
)
/
wino_blk_size_
;
const
float
round_hw_r
=
1.
f
/
static_cast
<
float
>
(
round_h
*
round_w
);
const
float
round_w_r
=
1.
f
/
static_cast
<
float
>
(
round_w
);
uint32_t
idx
=
0
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_
.
setArg
(
idx
++
,
...
...
@@ -210,12 +297,13 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
kernel_
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
output_shape
[
1
]));
kernel_
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
output_shape
[
2
]));
kernel_
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
round_h
*
round_w
));
kernel_
.
setArg
(
idx
++
,
round_hw_r
);
kernel_
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
round_w
));
kernel_
.
setArg
(
idx
++
,
round_w_r
);
kernel_
.
setArg
(
idx
++
,
relux_max_limit_
);
input_shape_
=
input_tensor
->
shape
();
}
const
std
::
vector
<
uint32_t
>
lws
=
{
kwg_size_
/
8
,
8
,
0
};
std
::
string
tuning_key
=
Concat
(
"winograd_inverse_transform_kernel"
,
output_tensor
->
dim
(
0
),
...
...
@@ -229,7 +317,6 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
MACE_CHECK
(
*
kerror_code
==
0
)
<<
"Kernel error code: "
<<
*
kerror_code
;
kernel_error_
->
UnMap
();
}
return
MACE_SUCCESS
;
}
...
...
mace/kernels/winograd_transform.h
浏览文件 @
02cadd13
...
...
@@ -32,23 +32,27 @@ namespace kernels {
struct
WinogradTransformFunctorBase
{
WinogradTransformFunctorBase
(
const
Padding
&
padding_type
,
const
std
::
vector
<
int
>
&
paddings
)
const
std
::
vector
<
int
>
&
paddings
,
const
int
block_size
)
:
strides_
({
1
,
1
}),
dilations_
({
1
,
1
}),
padding_type_
(
padding_type
),
paddings_
(
paddings
)
{}
paddings_
(
paddings
),
wino_blk_size_
(
block_size
)
{}
const
std
::
vector
<
int
>
strides_
;
// [stride_h, stride_w]
const
std
::
vector
<
int
>
dilations_
;
// [dilation_h, dilation_w]
Padding
padding_type_
;
std
::
vector
<
int
>
paddings_
;
const
int
wino_blk_size_
;
};
template
<
DeviceType
D
,
typename
T
>
struct
WinogradTransformFunctor
:
WinogradTransformFunctorBase
{
WinogradTransformFunctor
(
const
Padding
&
padding_type
,
const
std
::
vector
<
int
>
&
paddings
)
:
WinogradTransformFunctorBase
(
padding_type
,
paddings
)
{}
const
std
::
vector
<
int
>
&
paddings
,
const
int
block_size
)
:
WinogradTransformFunctorBase
(
padding_type
,
paddings
,
block_size
)
{}
MaceStatus
operator
()(
const
Tensor
*
input
,
Tensor
*
output
,
...
...
@@ -66,8 +70,9 @@ template<typename T>
struct
WinogradTransformFunctor
<
DeviceType
::
GPU
,
T
>
:
WinogradTransformFunctorBase
{
WinogradTransformFunctor
(
const
Padding
&
padding_type
,
const
std
::
vector
<
int
>
&
paddings
)
:
WinogradTransformFunctorBase
(
padding_type
,
paddings
)
{}
const
std
::
vector
<
int
>
&
paddings
,
const
int
block_size
)
:
WinogradTransformFunctorBase
(
padding_type
,
paddings
,
block_size
)
{}
MaceStatus
operator
()(
const
Tensor
*
input
,
Tensor
*
output
,
...
...
@@ -85,16 +90,19 @@ struct WinogradInverseTransformFunctorBase {
const
int
height
,
const
int
width
,
const
ActivationType
activation
,
const
float
relux_max_limit
)
const
float
relux_max_limit
,
const
int
block_size
)
:
batch_
(
batch
),
height_
(
height
),
width_
(
width
),
activation_
(
activation
),
relux_max_limit_
(
relux_max_limit
)
{}
relux_max_limit_
(
relux_max_limit
),
wino_blk_size_
(
block_size
)
{}
const
int
batch_
;
const
int
height_
;
const
int
width_
;
const
int
wino_blk_size_
;
const
ActivationType
activation_
;
const
float
relux_max_limit_
;
};
...
...
@@ -105,9 +113,10 @@ struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase {
const
int
height
,
const
int
width
,
const
ActivationType
activation
,
const
float
relux_max_limit
)
const
float
relux_max_limit
,
const
int
block_size
)
:
WinogradInverseTransformFunctorBase
(
batch
,
height
,
width
,
activation
,
relux_max_limit
)
{}
batch
,
height
,
width
,
activation
,
relux_max_limit
,
block_size
)
{}
MaceStatus
operator
()(
const
Tensor
*
input
,
const
Tensor
*
bias
,
...
...
@@ -130,9 +139,10 @@ struct WinogradInverseTransformFunctor<DeviceType::GPU, T>
const
int
height
,
const
int
width
,
const
ActivationType
activation
,
const
float
relux_max_limit
)
const
float
relux_max_limit
,
const
int
block_size
)
:
WinogradInverseTransformFunctorBase
(
batch
,
height
,
width
,
activation
,
relux_max_limit
)
{}
batch
,
height
,
width
,
activation
,
relux_max_limit
,
block_size
)
{}
MaceStatus
operator
()(
const
Tensor
*
input
,
const
Tensor
*
bias
,
...
...
mace/ops/buffer_to_image.h
浏览文件 @
02cadd13
...
...
@@ -25,7 +25,8 @@ template <DeviceType D, typename T>
class
BufferToImageOp
:
public
Operator
<
D
,
T
>
{
public:
BufferToImageOp
(
const
OperatorDef
&
op_def
,
Workspace
*
ws
)
:
Operator
<
D
,
T
>
(
op_def
,
ws
)
{}
:
Operator
<
D
,
T
>
(
op_def
,
ws
),
functor_
(
OperatorBase
::
GetOptionalArg
<
int
>
(
"wino_block_size"
,
2
))
{}
MaceStatus
Run
(
StatsFuture
*
future
)
override
{
const
Tensor
*
input_tensor
=
this
->
Input
(
INPUT
);
...
...
mace/ops/conv_2d_benchmark.cc
浏览文件 @
02cadd13
...
...
@@ -175,6 +175,11 @@ MACE_BM_CONV_2D(1, 160, 17, 17, 7, 1, 1, 1, SAME, 192);
MACE_BM_CONV_2D
(
1
,
32
,
256
,
256
,
1
,
15
,
1
,
1
,
SAME
,
2
);
MACE_BM_CONV_2D
(
1
,
32
,
256
,
256
,
15
,
1
,
1
,
1
,
SAME
,
2
);
MACE_BM_CONV_2D
(
1
,
64
,
64
,
64
,
15
,
1
,
1
,
1
,
SAME
,
2
);
MACE_BM_CONV_2D
(
1
,
3
,
128
,
128
,
3
,
3
,
1
,
1
,
SAME
,
16
);
MACE_BM_CONV_2D
(
1
,
3
,
256
,
256
,
3
,
3
,
1
,
1
,
SAME
,
16
);
MACE_BM_CONV_2D
(
1
,
3
,
64
,
64
,
3
,
3
,
1
,
1
,
SAME
,
16
);
}
// namespace test
}
// namespace ops
}
// namespace mace
mace/ops/image_to_buffer.h
浏览文件 @
02cadd13
...
...
@@ -25,7 +25,8 @@ template <DeviceType D, typename T>
class
ImageToBufferOp
:
public
Operator
<
D
,
T
>
{
public:
ImageToBufferOp
(
const
OperatorDef
&
op_def
,
Workspace
*
ws
)
:
Operator
<
D
,
T
>
(
op_def
,
ws
)
{}
:
Operator
<
D
,
T
>
(
op_def
,
ws
),
functor_
(
OperatorBase
::
GetOptionalArg
<
int
>
(
"wino_block_size"
,
2
))
{}
MaceStatus
Run
(
StatsFuture
*
future
)
override
{
const
Tensor
*
input
=
this
->
Input
(
INPUT
);
...
...
mace/ops/ops_test_util.h
浏览文件 @
02cadd13
...
...
@@ -339,6 +339,11 @@ class OpsTestNet {
return
&
op_defs_
[
op_defs_
.
size
()
-
1
];
}
OperatorDef
*
AddNewOperatorDef
()
{
op_defs_
.
emplace_back
(
OperatorDef
());
return
&
op_defs_
[
op_defs_
.
size
()
-
1
];
}
Workspace
*
ws
()
{
return
&
ws_
;
}
bool
Setup
(
DeviceType
device
)
{
...
...
@@ -630,15 +635,17 @@ template <DeviceType D, typename T>
void
BufferToImage
(
OpsTestNet
*
net
,
const
std
::
string
&
input_name
,
const
std
::
string
&
output_name
,
const
kernels
::
BufferType
type
)
{
const
kernels
::
BufferType
type
,
const
int
wino_block_size
=
2
)
{
MACE_CHECK_NOTNULL
(
net
);
OpDefBuilder
(
"BufferToImage"
,
"BufferToImageTest"
)
.
Input
(
input_name
)
.
Output
(
output_name
)
.
AddIntArg
(
"buffer_type"
,
type
)
.
AddIntArg
(
"T"
,
static_cast
<
int
>
(
DataTypeToEnum
<
T
>::
value
))
.
Finalize
(
net
->
NewOperatorDef
());
.
Input
(
input_name
)
.
Output
(
output_name
)
.
AddIntArg
(
"buffer_type"
,
type
)
.
AddIntArg
(
"wino_block_size"
,
wino_block_size
)
.
AddIntArg
(
"T"
,
static_cast
<
int
>
(
DataTypeToEnum
<
T
>::
value
))
.
Finalize
(
net
->
NewOperatorDef
());
// Run
net
->
RunOp
(
D
);
...
...
@@ -650,15 +657,17 @@ template <DeviceType D, typename T>
void
ImageToBuffer
(
OpsTestNet
*
net
,
const
std
::
string
&
input_name
,
const
std
::
string
&
output_name
,
const
kernels
::
BufferType
type
)
{
const
kernels
::
BufferType
type
,
const
int
wino_block_size
=
2
)
{
MACE_CHECK_NOTNULL
(
net
);
OpDefBuilder
(
"ImageToBuffer"
,
"ImageToBufferTest"
)
.
Input
(
input_name
)
.
Output
(
output_name
)
.
AddIntArg
(
"buffer_type"
,
type
)
.
AddIntArg
(
"T"
,
static_cast
<
int
>
(
DataTypeToEnum
<
T
>::
value
))
.
Finalize
(
net
->
NewOperatorDef
());
.
Input
(
input_name
)
.
Output
(
output_name
)
.
AddIntArg
(
"buffer_type"
,
type
)
.
AddIntArg
(
"wino_block_size"
,
wino_block_size
)
.
AddIntArg
(
"T"
,
static_cast
<
int
>
(
DataTypeToEnum
<
T
>::
value
))
.
Finalize
(
net
->
NewOperatorDef
());
// Run
net
->
RunOp
(
D
);
...
...
mace/ops/winograd_convolution_benchmark.cc
0 → 100644
浏览文件 @
02cadd13
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/operator.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/kernels/conv_pool_2d_util.h"
#include "mace/ops/ops_test_util.h"
namespace
mace
{
namespace
ops
{
namespace
test
{
namespace
{
template
<
DeviceType
D
,
typename
T
>
void
BMWinogradConvolution
(
int
iters
,
int
batch
,
int
height
,
int
width
,
int
in_channels
,
int
out_channels
,
int
block_size
)
{
mace
::
testing
::
StopTiming
();
OpsTestNet
net
;
net
.
AddRandomInput
<
D
,
float
>
(
"Input"
,
{
batch
,
height
,
width
,
in_channels
});
net
.
AddRandomInput
<
D
,
float
>
(
"Filter"
,
{
out_channels
,
in_channels
,
3
,
3
});
net
.
AddRandomInput
<
D
,
T
>
(
"Bias"
,
{
out_channels
});
BufferToImage
<
D
,
T
>
(
&
net
,
"Input"
,
"InputImage"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
BufferToImage
<
D
,
T
>
(
&
net
,
"Filter"
,
"FilterImage"
,
kernels
::
BufferType
::
CONV2D_FILTER
);
BufferToImage
<
D
,
T
>
(
&
net
,
"Bias"
,
"BiasImage"
,
kernels
::
BufferType
::
ARGUMENT
);
// Winograd convolution
// transform filter
BufferToImage
<
D
,
T
>
(
&
net
,
"Filter"
,
"WinoFilter"
,
kernels
::
BufferType
::
WINOGRAD_FILTER
,
block_size
);
// transform input
OpDefBuilder
(
"WinogradTransform"
,
"WinogradTransformTest"
)
.
Input
(
"InputImage"
)
.
Output
(
"WinoInput"
)
.
AddIntArg
(
"padding"
,
Padding
::
SAME
)
.
AddIntArg
(
"wino_block_size"
,
block_size
)
.
AddIntArg
(
"T"
,
static_cast
<
int
>
(
DataTypeToEnum
<
T
>::
value
))
.
Finalize
(
net
.
AddNewOperatorDef
());
// MatMul
OpDefBuilder
(
"MatMul"
,
"MatMulTest"
)
.
Input
(
"WinoFilter"
)
.
Input
(
"WinoInput"
)
.
Output
(
"WinoGemm"
)
.
AddIntArg
(
"T"
,
static_cast
<
int
>
(
DataTypeToEnum
<
T
>::
value
))
.
Finalize
(
net
.
AddNewOperatorDef
());
// Inverse transform
OpDefBuilder
(
"WinogradInverseTransform"
,
"WinogradInverseTransformTest"
)
.
Input
(
"WinoGemm"
)
.
Input
(
"BiasImage"
)
.
AddIntArg
(
"batch"
,
batch
)
.
AddIntArg
(
"height"
,
height
)
.
AddIntArg
(
"width"
,
width
)
.
AddIntArg
(
"wino_block_size"
,
block_size
)
.
Output
(
"OutputImage"
)
.
AddIntArg
(
"T"
,
static_cast
<
int
>
(
DataTypeToEnum
<
T
>::
value
))
.
Finalize
(
net
.
AddNewOperatorDef
());
net
.
Setup
(
D
);
// Warm-up
for
(
int
i
=
0
;
i
<
5
;
++
i
)
{
net
.
Run
();
}
net
.
Sync
();
mace
::
testing
::
StartTiming
();
while
(
iters
--
)
{
net
.
Run
();
}
net
.
Sync
();
}
}
// namespace
#define MACE_BM_WINOGRAD_CONV_MACRO(N, H, W, IC, OC, M, TYPE, DEVICE) \
static void MACE_BM_WINOGRAD_CONV_##N##_##H##_##W##_##IC##_##OC##_##M##_##\
TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * IC * H * W; \
const int64_t macc = \
static_cast<int64_t>(iters) * N * OC * H * W * (3 * 3 * IC + 1); \
mace::testing::MaccProcessed(macc); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMWinogradConvolution<DEVICE, TYPE>(iters, N, H, W, IC, OC, M); \
} \
MACE_BENCHMARK( \
MACE_BM_WINOGRAD_CONV_##N##_##H##_##W##_##IC##_##OC##_##M##_##TYPE##_##DEVICE)
#define MACE_BM_WINOGRAD_CONV(N, H, W, IC, OC, M) \
MACE_BM_WINOGRAD_CONV_MACRO(N, H, W, IC, OC, M, half, GPU);
MACE_BM_WINOGRAD_CONV
(
1
,
64
,
64
,
3
,
16
,
2
);
MACE_BM_WINOGRAD_CONV
(
1
,
128
,
128
,
3
,
16
,
2
);
MACE_BM_WINOGRAD_CONV
(
1
,
256
,
256
,
3
,
16
,
2
);
MACE_BM_WINOGRAD_CONV
(
1
,
64
,
64
,
3
,
16
,
4
);
MACE_BM_WINOGRAD_CONV
(
1
,
128
,
128
,
3
,
16
,
4
);
MACE_BM_WINOGRAD_CONV
(
1
,
256
,
256
,
3
,
16
,
4
);
MACE_BM_WINOGRAD_CONV
(
1
,
64
,
64
,
3
,
16
,
6
);
MACE_BM_WINOGRAD_CONV
(
1
,
128
,
128
,
3
,
16
,
6
);
MACE_BM_WINOGRAD_CONV
(
1
,
256
,
256
,
3
,
16
,
6
);
MACE_BM_WINOGRAD_CONV
(
1
,
28
,
28
,
256
,
256
,
2
);
MACE_BM_WINOGRAD_CONV
(
1
,
28
,
28
,
256
,
256
,
4
);
MACE_BM_WINOGRAD_CONV
(
1
,
28
,
28
,
256
,
256
,
6
);
MACE_BM_WINOGRAD_CONV
(
1
,
56
,
56
,
256
,
256
,
2
);
MACE_BM_WINOGRAD_CONV
(
1
,
56
,
56
,
256
,
256
,
4
);
MACE_BM_WINOGRAD_CONV
(
1
,
56
,
56
,
256
,
256
,
6
);
MACE_BM_WINOGRAD_CONV
(
1
,
128
,
128
,
128
,
256
,
2
);
MACE_BM_WINOGRAD_CONV
(
1
,
128
,
128
,
128
,
256
,
4
);
MACE_BM_WINOGRAD_CONV
(
1
,
128
,
128
,
128
,
256
,
6
);
MACE_BM_WINOGRAD_CONV
(
1
,
256
,
256
,
256
,
256
,
2
);
MACE_BM_WINOGRAD_CONV
(
1
,
256
,
256
,
256
,
256
,
4
);
MACE_BM_WINOGRAD_CONV
(
1
,
256
,
256
,
256
,
256
,
6
);
}
// namespace test
}
// namespace ops
}
// namespace mace
mace/ops/winograd_convolution_test.cc
浏览文件 @
02cadd13
...
...
@@ -25,27 +25,6 @@ namespace test {
class
WinogradConvlutionTest
:
public
OpsTestBase
{};
namespace
{
void
TransposeFilter
(
const
std
::
vector
<
float
>
&
input
,
const
std
::
vector
<
index_t
>
&
input_shape
,
std
::
vector
<
float
>
*
output
)
{
MACE_CHECK_NOTNULL
(
output
);
output
->
resize
(
input
.
size
());
const
float
*
input_ptr
=
input
.
data
();
for
(
index_t
h
=
0
;
h
<
input_shape
[
0
];
++
h
)
{
for
(
index_t
w
=
0
;
w
<
input_shape
[
1
];
++
w
)
{
for
(
index_t
oc
=
0
;
oc
<
input_shape
[
2
];
++
oc
)
{
for
(
index_t
ic
=
0
;
ic
<
input_shape
[
3
];
++
ic
)
{
int
offset
=
((
oc
*
input_shape
[
3
]
+
ic
)
*
input_shape
[
0
]
+
h
)
*
input_shape
[
1
]
+
w
;
(
*
output
)[
offset
]
=
*
input_ptr
;
++
input_ptr
;
}
}
}
}
}
template
<
DeviceType
D
,
typename
T
>
void
WinogradConvolution
(
const
index_t
batch
,
...
...
@@ -53,7 +32,8 @@ void WinogradConvolution(const index_t batch,
const
index_t
width
,
const
index_t
in_channels
,
const
index_t
out_channels
,
const
Padding
padding
)
{
const
Padding
padding
,
const
int
block_size
)
{
// srand(time(NULL));
// Construct graph
...
...
@@ -91,13 +71,13 @@ void WinogradConvolution(const index_t batch,
// Winograd convolution
// transform filter
BufferToImage
<
D
,
T
>
(
&
net
,
"Filter"
,
"WinoFilter"
,
kernels
::
BufferType
::
WINOGRAD_FILTER
);
kernels
::
BufferType
::
WINOGRAD_FILTER
,
block_size
);
// transform input
OpDefBuilder
(
"WinogradTransform"
,
"WinogradTransformTest"
)
.
Input
(
"InputImage"
)
.
Output
(
"WinoInput"
)
.
AddIntArg
(
"padding"
,
padding
)
.
AddIntArg
(
"wino_block_size"
,
block_size
)
.
AddIntArg
(
"T"
,
static_cast
<
int
>
(
DataTypeToEnum
<
T
>::
value
))
.
Finalize
(
net
.
NewOperatorDef
());
...
...
@@ -121,6 +101,7 @@ void WinogradConvolution(const index_t batch,
.
AddIntArg
(
"batch"
,
batch
)
.
AddIntArg
(
"height"
,
output_shape
[
1
])
.
AddIntArg
(
"width"
,
output_shape
[
2
])
.
AddIntArg
(
"wino_block_size"
,
block_size
)
.
Output
(
"WinoOutputImage"
)
.
AddIntArg
(
"T"
,
static_cast
<
int
>
(
DataTypeToEnum
<
T
>::
value
))
.
Finalize
(
net
.
NewOperatorDef
());
...
...
@@ -139,22 +120,67 @@ void WinogradConvolution(const index_t batch,
}
}
// namespace
TEST_F
(
WinogradConvlutionTest
,
AlignedConvolution
)
{
WinogradConvolution
<
DeviceType
::
GPU
,
float
>
(
1
,
32
,
32
,
32
,
16
,
Padding
::
VALID
);
WinogradConvolution
<
DeviceType
::
GPU
,
float
>
(
1
,
32
,
32
,
32
,
16
,
Padding
::
SAME
);
TEST_F
(
WinogradConvlutionTest
,
AlignedConvolutionM2
)
{
WinogradConvolution
<
DeviceType
::
GPU
,
float
>
(
1
,
32
,
32
,
3
,
3
,
Padding
::
VALID
,
2
);
WinogradConvolution
<
DeviceType
::
GPU
,
float
>
(
1
,
32
,
32
,
3
,
3
,
Padding
::
SAME
,
2
);
}
TEST_F
(
WinogradConvlutionTest
,
UnAlignedConvolutionM2
)
{
WinogradConvolution
<
DeviceType
::
GPU
,
float
>
(
1
,
61
,
67
,
31
,
37
,
Padding
::
VALID
,
2
);
WinogradConvolution
<
DeviceType
::
GPU
,
float
>
(
1
,
61
,
67
,
37
,
31
,
Padding
::
SAME
,
2
);
}
TEST_F
(
WinogradConvlutionTest
,
BatchConvolutionM2
)
{
WinogradConvolution
<
DeviceType
::
GPU
,
float
>
(
3
,
64
,
64
,
32
,
32
,
Padding
::
VALID
,
2
);
WinogradConvolution
<
DeviceType
::
GPU
,
float
>
(
5
,
61
,
67
,
37
,
31
,
Padding
::
SAME
,
2
);
}
TEST_F
(
WinogradConvlutionTest
,
AlignedConvolutionM6
)
{
WinogradConvolution
<
DeviceType
::
GPU
,
float
>
(
1
,
32
,
32
,
3
,
3
,
Padding
::
VALID
,
6
);
WinogradConvolution
<
DeviceType
::
GPU
,
float
>
(
1
,
32
,
32
,
3
,
3
,
Padding
::
SAME
,
6
);
}
TEST_F
(
WinogradConvlutionTest
,
UnAlignedConvolutionM6
)
{
WinogradConvolution
<
DeviceType
::
GPU
,
float
>
(
1
,
61
,
67
,
31
,
37
,
Padding
::
VALID
,
6
);
WinogradConvolution
<
DeviceType
::
GPU
,
float
>
(
1
,
61
,
67
,
37
,
31
,
Padding
::
SAME
,
6
);
}
TEST_F
(
WinogradConvlutionTest
,
BatchConvolutionM6
)
{
WinogradConvolution
<
DeviceType
::
GPU
,
float
>
(
3
,
64
,
64
,
32
,
32
,
Padding
::
VALID
,
6
);
WinogradConvolution
<
DeviceType
::
GPU
,
float
>
(
5
,
61
,
67
,
37
,
31
,
Padding
::
SAME
,
6
);
}
TEST_F
(
WinogradConvlutionTest
,
AlignedConvolutionM4
)
{
WinogradConvolution
<
DeviceType
::
GPU
,
float
>
(
1
,
32
,
32
,
3
,
3
,
Padding
::
VALID
,
4
);
WinogradConvolution
<
DeviceType
::
GPU
,
float
>
(
1
,
32
,
32
,
3
,
3
,
Padding
::
SAME
,
4
);
}
TEST_F
(
WinogradConvlutionTest
,
UnAlignedConvolution
)
{
TEST_F
(
WinogradConvlutionTest
,
UnAlignedConvolution
M4
)
{
WinogradConvolution
<
DeviceType
::
GPU
,
float
>
(
1
,
61
,
67
,
31
,
37
,
Padding
::
VALID
);
WinogradConvolution
<
DeviceType
::
GPU
,
float
>
(
1
,
61
,
67
,
37
,
31
,
Padding
::
SAME
);
Padding
::
VALID
,
4
);
WinogradConvolution
<
DeviceType
::
GPU
,
float
>
(
1
,
61
,
67
,
37
,
31
,
Padding
::
SAME
,
4
);
}
TEST_F
(
WinogradConvlutionTest
,
BatchConvolution
)
{
TEST_F
(
WinogradConvlutionTest
,
BatchConvolution
M4
)
{
WinogradConvolution
<
DeviceType
::
GPU
,
float
>
(
3
,
64
,
64
,
32
,
32
,
Padding
::
VALID
);
WinogradConvolution
<
DeviceType
::
GPU
,
float
>
(
5
,
61
,
67
,
37
,
31
,
Padding
::
SAME
);
Padding
::
VALID
,
4
);
WinogradConvolution
<
DeviceType
::
GPU
,
float
>
(
5
,
61
,
67
,
37
,
31
,
Padding
::
SAME
,
4
);
}
namespace
{
...
...
@@ -164,7 +190,8 @@ void WinogradConvolutionWithPad(const index_t batch,
const
index_t
width
,
const
index_t
in_channels
,
const
index_t
out_channels
,
const
int
padding
)
{
const
int
padding
,
const
int
block_size
)
{
// srand(time(NULL));
// Construct graph
...
...
@@ -202,14 +229,14 @@ void WinogradConvolutionWithPad(const index_t batch,
// Winograd convolution
// transform filter
BufferToImage
<
D
,
T
>
(
&
net
,
"Filter"
,
"WinoFilter"
,
kernels
::
BufferType
::
WINOGRAD_FILTER
);
kernels
::
BufferType
::
WINOGRAD_FILTER
,
block_size
);
// transform input
OpDefBuilder
(
"WinogradTransform"
,
"WinogradTransformTest"
)
.
Input
(
"InputImage"
)
.
Output
(
"WinoInput"
)
.
AddIntArg
(
"T"
,
static_cast
<
int
>
(
DataTypeToEnum
<
T
>::
value
))
.
AddIntsArg
(
"padding_values"
,
{
padding
,
padding
})
.
AddIntArg
(
"wino_block_size"
,
block_size
)
.
Finalize
(
net
.
NewOperatorDef
());
// Run on opencl
...
...
@@ -232,6 +259,7 @@ void WinogradConvolutionWithPad(const index_t batch,
.
AddIntArg
(
"batch"
,
batch
)
.
AddIntArg
(
"height"
,
output_shape
[
1
])
.
AddIntArg
(
"width"
,
output_shape
[
2
])
.
AddIntArg
(
"wino_block_size"
,
block_size
)
.
AddIntArg
(
"T"
,
static_cast
<
int
>
(
DataTypeToEnum
<
T
>::
value
))
.
Output
(
"WinoOutputImage"
)
.
Finalize
(
net
.
NewOperatorDef
());
...
...
@@ -250,19 +278,67 @@ void WinogradConvolutionWithPad(const index_t batch,
}
}
// namespace
TEST_F
(
WinogradConvlutionTest
,
AlignedConvolutionWithPad
)
{
WinogradConvolutionWithPad
<
DeviceType
::
GPU
,
float
>
(
1
,
32
,
32
,
32
,
16
,
1
);
WinogradConvolutionWithPad
<
DeviceType
::
GPU
,
half
>
(
1
,
32
,
32
,
32
,
16
,
2
);
TEST_F
(
WinogradConvlutionTest
,
AlignedConvolutionM2WithPad
)
{
WinogradConvolutionWithPad
<
DeviceType
::
GPU
,
float
>
(
1
,
32
,
32
,
32
,
16
,
1
,
2
);
WinogradConvolutionWithPad
<
DeviceType
::
GPU
,
half
>
(
1
,
32
,
32
,
32
,
16
,
2
,
2
);
}
TEST_F
(
WinogradConvlutionTest
,
UnAlignedConvolutionM2WithPad
)
{
WinogradConvolutionWithPad
<
DeviceType
::
GPU
,
float
>
(
1
,
61
,
67
,
31
,
37
,
1
,
2
);
WinogradConvolutionWithPad
<
DeviceType
::
GPU
,
half
>
(
1
,
61
,
67
,
37
,
31
,
2
,
2
);
}
TEST_F
(
WinogradConvlutionTest
,
BatchConvolutionWithM2Pad
)
{
WinogradConvolutionWithPad
<
DeviceType
::
GPU
,
float
>
(
3
,
64
,
64
,
32
,
32
,
1
,
2
);
WinogradConvolutionWithPad
<
DeviceType
::
GPU
,
half
>
(
5
,
61
,
67
,
37
,
31
,
2
,
2
);
}
TEST_F
(
WinogradConvlutionTest
,
AlignedConvolutionM6WithPad
)
{
WinogradConvolutionWithPad
<
DeviceType
::
GPU
,
float
>
(
1
,
32
,
32
,
32
,
16
,
1
,
6
);
WinogradConvolutionWithPad
<
DeviceType
::
GPU
,
half
>
(
1
,
32
,
32
,
32
,
16
,
2
,
6
);
}
TEST_F
(
WinogradConvlutionTest
,
UnAlignedConvolutionM6WithPad
)
{
WinogradConvolutionWithPad
<
DeviceType
::
GPU
,
float
>
(
1
,
61
,
67
,
31
,
37
,
1
,
6
);
WinogradConvolutionWithPad
<
DeviceType
::
GPU
,
half
>
(
1
,
61
,
67
,
37
,
31
,
2
,
6
);
}
TEST_F
(
WinogradConvlutionTest
,
BatchConvolutionWithM6Pad
)
{
WinogradConvolutionWithPad
<
DeviceType
::
GPU
,
float
>
(
3
,
64
,
64
,
32
,
32
,
1
,
6
);
// WinogradConvolutionWithPad<DeviceType::GPU, half>(5, 61, 67, 37, 31,
// 2, 6);
}
TEST_F
(
WinogradConvlutionTest
,
AlignedConvolutionM4WithPad
)
{
WinogradConvolutionWithPad
<
DeviceType
::
GPU
,
float
>
(
1
,
32
,
32
,
32
,
16
,
1
,
4
);
WinogradConvolutionWithPad
<
DeviceType
::
GPU
,
half
>
(
1
,
32
,
32
,
32
,
16
,
2
,
4
);
}
TEST_F
(
WinogradConvlutionTest
,
UnAlignedConvolutionWithPad
)
{
WinogradConvolutionWithPad
<
DeviceType
::
GPU
,
float
>
(
1
,
61
,
67
,
31
,
37
,
1
);
WinogradConvolutionWithPad
<
DeviceType
::
GPU
,
half
>
(
1
,
61
,
67
,
37
,
31
,
2
);
TEST_F
(
WinogradConvlutionTest
,
UnAlignedConvolutionM4WithPad
)
{
WinogradConvolutionWithPad
<
DeviceType
::
GPU
,
float
>
(
1
,
61
,
67
,
31
,
37
,
1
,
4
);
WinogradConvolutionWithPad
<
DeviceType
::
GPU
,
half
>
(
1
,
61
,
67
,
37
,
31
,
2
,
4
);
}
TEST_F
(
WinogradConvlutionTest
,
BatchConvolutionWithPad
)
{
WinogradConvolutionWithPad
<
DeviceType
::
GPU
,
float
>
(
3
,
64
,
64
,
32
,
32
,
1
);
WinogradConvolutionWithPad
<
DeviceType
::
GPU
,
half
>
(
5
,
61
,
67
,
37
,
31
,
2
);
TEST_F
(
WinogradConvlutionTest
,
BatchConvolutionWithM4Pad
)
{
WinogradConvolutionWithPad
<
DeviceType
::
GPU
,
float
>
(
3
,
64
,
64
,
32
,
32
,
1
,
4
);
WinogradConvolutionWithPad
<
DeviceType
::
GPU
,
half
>
(
5
,
61
,
67
,
37
,
31
,
2
,
4
);
}
}
// namespace test
...
...
mace/ops/winograd_inverse_transform.h
浏览文件 @
02cadd13
...
...
@@ -36,7 +36,8 @@ class WinogradInverseTransformOp : public Operator<D, T> {
kernels
::
StringToActivationType
(
OperatorBase
::
GetOptionalArg
<
std
::
string
>
(
"activation"
,
"NOOP"
)),
OperatorBase
::
GetOptionalArg
<
float
>
(
"max_limit"
,
0.0
f
))
{}
OperatorBase
::
GetOptionalArg
<
float
>
(
"max_limit"
,
0.0
f
),
OperatorBase
::
GetOptionalArg
<
int
>
(
"wino_block_size"
,
2
))
{}
MaceStatus
Run
(
StatsFuture
*
future
)
override
{
const
Tensor
*
input_tensor
=
this
->
Input
(
INPUT
);
...
...
mace/ops/winograd_transform.h
浏览文件 @
02cadd13
...
...
@@ -30,7 +30,9 @@ class WinogradTransformOp : public Operator<D, T> {
:
Operator
<
D
,
T
>
(
op_def
,
ws
),
functor_
(
static_cast
<
Padding
>
(
OperatorBase
::
GetOptionalArg
<
int
>
(
"padding"
,
static_cast
<
int
>
(
VALID
))),
OperatorBase
::
GetRepeatedArgs
<
int
>
(
"padding_values"
))
{}
OperatorBase
::
GetRepeatedArgs
<
int
>
(
"padding_values"
),
OperatorBase
::
GetOptionalArg
<
int
>
(
"wino_block_size"
,
2
))
{}
MaceStatus
Run
(
StatsFuture
*
future
)
override
{
const
Tensor
*
input_tensor
=
this
->
Input
(
INPUT
);
...
...
mace/ops/winograd_transform_benchmark.cc
浏览文件 @
02cadd13
...
...
@@ -23,7 +23,7 @@ namespace test {
namespace
{
template
<
DeviceType
D
,
typename
T
>
void
BMWinogradTransform
(
int
iters
,
int
batch
,
int
height
,
int
width
,
int
channels
)
{
int
iters
,
int
batch
,
int
height
,
int
width
,
int
channels
,
int
block_size
)
{
mace
::
testing
::
StopTiming
();
OpsTestNet
net
;
...
...
@@ -35,50 +35,60 @@ void BMWinogradTransform(
.
Input
(
"InputImage"
)
.
Output
(
"OutputImage"
)
.
AddIntArg
(
"T"
,
static_cast
<
int
>
(
DataTypeToEnum
<
T
>::
value
))
.
AddIntArg
(
"block_size"
,
block_size
)
.
Finalize
(
net
.
NewOperatorDef
());
net
.
Setup
(
D
);
// Warm-up
for
(
int
i
=
0
;
i
<
5
;
++
i
)
{
net
.
Run
Op
(
D
);
net
.
Run
(
);
}
net
.
Sync
();
mace
::
testing
::
StartTiming
();
while
(
iters
--
)
{
net
.
Run
Op
(
D
);
net
.
Run
(
);
}
net
.
Sync
();
}
}
// namespace
#define MACE_BM_WINO
GRAD_TRANSFORM_MACRO(N, H, W, C, TYPE, DEVICE)
\
static void
\
MACE_BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE(
\
#define MACE_BM_WINO
_TRANSFORM_MACRO(N, H, W, C, M, TYPE, DEVICE)
\
static void
MACE_BM_WINO_TRANSFORM_##N##_##H##_##W##_##C##_##M##_##TYPE##_##
\
DEVICE(
\
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMWinogradTransform<DEVICE, TYPE>(iters, N, H, W, C
);
\
BMWinogradTransform<DEVICE, TYPE>(iters, N, H, W, C
, M);
\
} \
MACE_BENCHMARK( \
MACE_BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C
##_##TYPE##_##DEVICE)
MACE_BM_WINO_TRANSFORM_##N##_##H##_##W##_##C##_##M
##_##TYPE##_##DEVICE)
#define MACE_BM_WINO
GRAD_TRANSFORM(N, H, W, C
) \
MACE_BM_WINO
GRAD_TRANSFORM_MACRO(N, H, W, C
, half, GPU);
#define MACE_BM_WINO
_TRANSFORM(N, H, W, C, M
) \
MACE_BM_WINO
_TRANSFORM_MACRO(N, H, W, C, M
, half, GPU);
MACE_BM_WINOGRAD_TRANSFORM
(
1
,
16
,
16
,
128
);
MACE_BM_WINOGRAD_TRANSFORM
(
1
,
64
,
64
,
128
);
MACE_BM_WINOGRAD_TRANSFORM
(
1
,
128
,
128
,
128
);
MACE_BM_WINO_TRANSFORM
(
1
,
128
,
128
,
3
,
2
);
MACE_BM_WINO_TRANSFORM
(
1
,
256
,
256
,
3
,
2
);
MACE_BM_WINO_TRANSFORM
(
1
,
64
,
64
,
3
,
2
);
MACE_BM_WINO_TRANSFORM
(
1
,
128
,
128
,
3
,
4
);
MACE_BM_WINO_TRANSFORM
(
1
,
256
,
256
,
3
,
4
);
MACE_BM_WINO_TRANSFORM
(
1
,
64
,
64
,
3
,
4
);
MACE_BM_WINO_TRANSFORM
(
1
,
128
,
128
,
3
,
6
);
MACE_BM_WINO_TRANSFORM
(
1
,
256
,
256
,
3
,
6
);
MACE_BM_WINO_TRANSFORM
(
1
,
64
,
64
,
3
,
6
);
namespace
{
template
<
DeviceType
D
,
typename
T
>
void
BMWinogradInverseTransform
(
int
iters
,
int
batch
,
int
height
,
int
width
,
int
channels
)
{
int
iters
,
int
batch
,
int
height
,
int
width
,
int
channels
,
int
block_size
)
{
mace
::
testing
::
StopTiming
();
index_t
p
=
batch
*
((
height
+
1
)
/
2
)
*
((
width
+
1
)
/
2
);
index_t
p
=
batch
*
((
height
+
block_size
-
1
)
/
block_size
)
*
((
width
+
block_size
-
1
)
/
block_size
);
OpsTestNet
net
;
net
.
AddRandomInput
<
D
,
float
>
(
"Input"
,
{
16
,
channels
,
p
,
1
});
net
.
AddRandomInput
<
D
,
float
>
(
"Input"
,
{(
block_size
+
2
)
*
(
block_size
+
2
),
channels
,
p
,
1
});
BufferToImage
<
D
,
T
>
(
&
net
,
"Input"
,
"InputImage"
,
kernels
::
BufferType
::
IN_OUT_HEIGHT
);
...
...
@@ -87,42 +97,191 @@ void BMWinogradInverseTransform(
.
AddIntArg
(
"batch"
,
batch
)
.
AddIntArg
(
"height"
,
height
)
.
AddIntArg
(
"width"
,
width
)
.
AddIntArg
(
"block_size"
,
block_size
)
.
Output
(
"OutputImage"
)
.
AddIntArg
(
"T"
,
static_cast
<
int
>
(
DataTypeToEnum
<
T
>::
value
))
.
Finalize
(
net
.
NewOperatorDef
());
net
.
Setup
(
D
);
// Warm-up
for
(
int
i
=
0
;
i
<
5
;
++
i
)
{
net
.
Run
Op
(
D
);
net
.
Run
(
);
}
net
.
Sync
();
mace
::
testing
::
StartTiming
();
while
(
iters
--
)
{
net
.
Run
Op
(
D
);
net
.
Run
(
);
}
net
.
Sync
();
}
}
// namespace
#define MACE_BM_WINOGRAD_INVERSE_TRANSFORM_MACRO(N, H, W, C, TYPE, DEVICE) \
static void \
MACE_BM_WINOGRAD_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE(\
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
#define MACE_BM_WINO_INVERSE_TRANSFORM_MACRO(N, H, W, C, M, TYPE, DEVICE) \
static void \
MACE_BM_WINO_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##M##_##TYPE##_\
##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMWinogradInverseTransform<DEVICE, TYPE>(iters, N, H, W, C, M); \
} \
MACE_BENCHMARK( \
MACE_BM_WINO_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##M##_##TYPE##_##\
DEVICE)
#define MACE_BM_WINO_INVERSE_TRANSFORM(N, H, W, C, M) \
MACE_BM_WINO_INVERSE_TRANSFORM_MACRO(N, H, W, C, M, half, GPU);
MACE_BM_WINO_INVERSE_TRANSFORM
(
1
,
126
,
126
,
16
,
2
);
MACE_BM_WINO_INVERSE_TRANSFORM
(
1
,
62
,
62
,
16
,
2
);
MACE_BM_WINO_INVERSE_TRANSFORM
(
1
,
254
,
254
,
16
,
2
);
MACE_BM_WINO_INVERSE_TRANSFORM
(
1
,
126
,
126
,
16
,
4
);
MACE_BM_WINO_INVERSE_TRANSFORM
(
1
,
62
,
62
,
16
,
4
);
MACE_BM_WINO_INVERSE_TRANSFORM
(
1
,
254
,
254
,
16
,
4
);
MACE_BM_WINO_INVERSE_TRANSFORM
(
1
,
126
,
126
,
16
,
6
);
MACE_BM_WINO_INVERSE_TRANSFORM
(
1
,
62
,
62
,
16
,
6
);
MACE_BM_WINO_INVERSE_TRANSFORM
(
1
,
254
,
254
,
16
,
6
);
namespace
{
template
<
DeviceType
D
,
typename
T
>
void
WinoFilterBufferToImage
(
int
iters
,
int
out_channel
,
int
in_channel
,
int
height
,
int
width
,
int
wino_block_size
)
{
mace
::
testing
::
StopTiming
();
OpsTestNet
net
;
// Add input data
net
.
AddRandomInput
<
D
,
T
>
(
"Input"
,
{
out_channel
,
in_channel
,
height
,
width
});
OpDefBuilder
(
"BufferToImage"
,
"BufferToImageTest"
)
.
Input
(
"Input"
)
.
Output
(
"Output"
)
.
AddIntArg
(
"buffer_type"
,
kernels
::
BufferType
::
WINOGRAD_FILTER
)
.
AddIntArg
(
"wino_block_size"
,
wino_block_size
)
.
AddIntArg
(
"T"
,
static_cast
<
int
>
(
DataTypeToEnum
<
T
>::
value
))
.
Finalize
(
net
.
NewOperatorDef
());
// Warm-up
net
.
Setup
(
D
);
for
(
int
i
=
0
;
i
<
5
;
++
i
)
{
net
.
Run
();
}
net
.
Sync
();
mace
::
testing
::
StartTiming
();
while
(
iters
--
)
{
net
.
Run
();
}
net
.
Sync
();
}
}
// namespace
#define MACE_BM_WINO_B2I_MACRO(O, I, H, W, M, TYPE, DEVICE) \
static void MACE_BM_WINO_B2I_##O##_##I##_##H##_##W##_##M##_##TYPE##_##DEVICE(\
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * O * I * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
WinoFilterBufferToImage<DEVICE, TYPE>(iters, O, I, H, W, M); \
} \
MACE_BENCHMARK(\
MACE_BM_WINO_B2I_##O##_##I##_##H##_##W##_##M##_##TYPE##_##DEVICE)
#define MACE_BM_WINO_B2I(O, I, H, W, M) \
MACE_BM_WINO_B2I_MACRO(O, I, H, W, M, half, GPU);
MACE_BM_WINO_B2I
(
16
,
3
,
3
,
3
,
2
);
MACE_BM_WINO_B2I
(
16
,
3
,
3
,
3
,
4
);
MACE_BM_WINO_B2I
(
16
,
3
,
3
,
3
,
6
);
MACE_BM_WINO_B2I
(
32
,
3
,
3
,
3
,
2
);
MACE_BM_WINO_B2I
(
32
,
3
,
3
,
3
,
4
);
MACE_BM_WINO_B2I
(
32
,
3
,
3
,
3
,
6
);
MACE_BM_WINO_B2I
(
128
,
3
,
3
,
3
,
2
);
MACE_BM_WINO_B2I
(
128
,
3
,
3
,
3
,
4
);
MACE_BM_WINO_B2I
(
128
,
3
,
3
,
3
,
6
);
MACE_BM_WINO_B2I
(
256
,
3
,
3
,
3
,
2
);
MACE_BM_WINO_B2I
(
256
,
3
,
3
,
3
,
4
);
MACE_BM_WINO_B2I
(
256
,
3
,
3
,
3
,
6
);
namespace
{
template
<
DeviceType
D
,
typename
T
>
void
WinoMatMulBenchmark
(
int
iters
,
int
out_channels
,
int
in_channels
,
int
height
,
int
width
,
int
block_size
)
{
mace
::
testing
::
StopTiming
();
OpsTestNet
net
;
const
int
batch
=
(
block_size
+
2
)
*
(
block_size
+
2
);
const
index_t
round_h
=
(
height
+
block_size
-
1
)
/
block_size
;
const
index_t
round_w
=
(
width
+
block_size
-
1
)
/
block_size
;
const
index_t
out_width
=
round_h
*
round_w
;
// Add input data
net
.
AddRandomInput
<
D
,
float
>
(
"A"
,
{
batch
,
out_channels
,
in_channels
,
1
});
net
.
AddRandomInput
<
D
,
float
>
(
"B"
,
{
batch
,
in_channels
,
out_width
,
1
});
if
(
D
==
DeviceType
::
GPU
)
{
BufferToImage
<
D
,
T
>
(
&
net
,
"A"
,
"AImage"
,
kernels
::
BufferType
::
IN_OUT_WIDTH
);
BufferToImage
<
D
,
T
>
(
&
net
,
"B"
,
"BImage"
,
kernels
::
BufferType
::
IN_OUT_HEIGHT
);
OpDefBuilder
(
"MatMul"
,
"MatMulBM"
)
.
Input
(
"AImage"
)
.
Input
(
"BImage"
)
.
Output
(
"Output"
)
.
AddIntArg
(
"T"
,
static_cast
<
int
>
(
DataTypeToEnum
<
T
>::
value
))
.
Finalize
(
net
.
NewOperatorDef
());
}
else
{
OpDefBuilder
(
"MatMul"
,
"MatMulBM"
)
.
Input
(
"A"
)
.
Input
(
"B"
)
.
Output
(
"Output"
)
.
Finalize
(
net
.
NewOperatorDef
());
}
net
.
Setup
(
D
);
// Warm-up
for
(
int
i
=
0
;
i
<
5
;
++
i
)
{
net
.
Run
();
}
net
.
Sync
();
mace
::
testing
::
StartTiming
();
while
(
iters
--
)
{
net
.
Run
();
}
net
.
Sync
();
}
}
// namespace
#define MACE_BM_WINO_MATMUL_MACRO(OC, IC, H, W, M, TYPE, DEVICE) \
static void MACE_BM_WINO_MATMUL_##OC##_##IC##_##H##_##W##_##M##_##TYPE##_##\
DEVICE(int iters) { \
const int64_t macc = static_cast<int64_t>(iters) * OC * IC * H * W; \
const int64_t tot = static_cast<int64_t>(iters) * OC * (IC * H + H * W); \
mace::testing::MaccProcessed(macc); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMWinogradInverseTransform<DEVICE, TYPE>(iters, N, H, W, C);
\
WinoMatMulBenchmark<DEVICE, TYPE>(iters, OC, IC, H, W, M);
\
} \
MACE_BENCHMARK(
\
MACE_BM_WINO
GRAD_INVERSE_TRANSFORM_##N##_##H##_##W##_##C
##_##TYPE##_##DEVICE)
MACE_BENCHMARK(\
MACE_BM_WINO
_MATMUL_##OC##_##IC##_##H##_##W##_##M
##_##TYPE##_##DEVICE)
#define MACE_BM_WINO
GRAD_INVERSE_TRANSFORM(N, H, W, C)
\
MACE_BM_WINO
GRAD_INVERSE_TRANSFORM_MACRO(N, H, W, C
, half, GPU);
#define MACE_BM_WINO
_MATMUL(OC, IC, H, W, M)
\
MACE_BM_WINO
_MATMUL_MACRO(OC, IC, H, W, M
, half, GPU);
MACE_BM_WINOGRAD_INVERSE_TRANSFORM
(
1
,
14
,
14
,
32
);
MACE_BM_WINOGRAD_INVERSE_TRANSFORM
(
1
,
62
,
62
,
32
);
MACE_BM_WINOGRAD_INVERSE_TRANSFORM
(
1
,
126
,
126
,
32
);
MACE_BM_WINO_MATMUL
(
16
,
3
,
128
,
128
,
2
);
MACE_BM_WINO_MATMUL
(
16
,
3
,
128
,
128
,
4
);
MACE_BM_WINO_MATMUL
(
16
,
3
,
128
,
128
,
6
);
MACE_BM_WINO_MATMUL
(
32
,
3
,
256
,
256
,
2
);
MACE_BM_WINO_MATMUL
(
32
,
3
,
256
,
256
,
4
);
MACE_BM_WINO_MATMUL
(
32
,
3
,
256
,
256
,
6
);
}
// namespace test
}
// namespace ops
...
...
mace/python/tools/converter_tool/base_converter.py
浏览文件 @
02cadd13
...
...
@@ -140,6 +140,7 @@ class MaceKeyword(object):
mace_winograd_filter_transformed
=
'is_filter_transformed'
mace_device
=
'device'
mace_value_str
=
'value'
mace_wino_block_size
=
'wino_block_size'
class
TransformerRule
(
Enum
):
...
...
mace/python/tools/converter_tool/transformer.py
浏览文件 @
02cadd13
...
...
@@ -31,6 +31,7 @@ from mace.python.tools.converter_tool.base_converter import TransformerRule
from
mace.python.tools.convert_util
import
mace_check
OPENCL_IMAGE_MAX_SIZE
=
16384
DEFAULT_GPU_WINO_BLK_SIZE
=
4
class
OpenCLBufferType
(
enum
.
Enum
):
...
...
@@ -111,6 +112,7 @@ class Transformer(base_converter.ConverterInterface):
self
.
_option
=
option
self
.
_model
=
model
self
.
_gpu_wino_blk
=
DEFAULT_GPU_WINO_BLK_SIZE
self
.
_ops
=
{}
self
.
_consts
=
{}
...
...
@@ -482,21 +484,36 @@ class Transformer(base_converter.ConverterInterface):
if
filter_height
!=
3
or
filter_width
!=
3
or
strides
[
0
]
>
1
\
or
strides
[
1
]
>
1
or
dilations
[
0
]
>
1
or
dilations
[
1
]
>
1
:
return
False
width
=
batch
*
((
out_height
+
1
)
/
2
)
*
((
out_width
+
1
)
/
2
)
return
(
16
*
in_channels
<
OPENCL_IMAGE_MAX_SIZE
)
and
\
(
16
*
out_channels
<
OPENCL_IMAGE_MAX_SIZE
)
and
\
(
width
<
OPENCL_IMAGE_MAX_SIZE
)
self
.
_gpu_wino_blk
=
DEFAULT_GPU_WINO_BLK_SIZE
block_size
=
self
.
_gpu_wino_blk
blk_sqr
=
(
block_size
+
2
)
*
(
block_size
+
2
)
width
=
\
batch
*
((
out_height
+
block_size
-
1
)
/
block_size
)
*
\
((
out_width
+
block_size
-
1
)
/
block_size
)
if
blk_sqr
*
in_channels
>
OPENCL_IMAGE_MAX_SIZE
\
or
blk_sqr
*
out_channels
>
OPENCL_IMAGE_MAX_SIZE
\
or
width
>
OPENCL_IMAGE_MAX_SIZE
:
self
.
_gpu_wino_blk
=
2
block_size
=
self
.
_gpu_wino_blk
blk_sqr
=
(
block_size
+
2
)
*
(
block_size
+
2
)
width
=
\
batch
*
((
out_height
+
block_size
-
1
)
/
block_size
)
*
\
((
out_width
+
block_size
-
1
)
/
block_size
)
return
(
blk_sqr
*
in_channels
<=
OPENCL_IMAGE_MAX_SIZE
)
and
\
(
blk_sqr
*
out_channels
<=
OPENCL_IMAGE_MAX_SIZE
)
and
\
(
width
<=
OPENCL_IMAGE_MAX_SIZE
)
def
transform_gpu_winograd
(
self
):
"""Only gpu needs winograd transform."""
net
=
self
.
_model
filter_format
=
self
.
filter_format
()
if
self
.
_option
.
device
==
DeviceType
.
GPU
.
value
:
for
op
in
net
.
op
:
if
op
.
type
==
MaceOp
.
Conv2D
.
name
\
and
self
.
check_if_gpu_use_winograd_conv
(
op
):
print
(
"Transform gpu winograd %s(%s)"
%
(
op
.
name
,
op
.
type
))
block_size
=
self
.
_gpu_wino_blk
blk_sqr
=
(
block_size
+
2
)
*
(
block_size
+
2
)
output_shape
=
op
.
output_shape
[
0
].
dims
filter
=
self
.
_consts
[
op
.
input
[
1
]]
filter_shape
=
filter
.
dims
...
...
@@ -515,10 +532,15 @@ class Transformer(base_converter.ConverterInterface):
wt_op
.
input
.
extend
([
op
.
input
[
0
]])
wt_op
.
output
.
extend
([
wt_op
.
name
])
wt_output_shape
=
wt_op
.
output_shape
.
add
()
wt_output_width
=
batch
*
(
(
out_height
+
1
)
/
2
)
*
((
out_width
+
1
)
/
2
)
wt_output_width
=
\
batch
*
((
out_height
+
block_size
-
1
)
/
block_size
)
*
\
((
out_width
+
block_size
-
1
)
/
block_size
)
wt_output_shape
.
dims
.
extend
(
[
16
,
in_channels
,
wt_output_width
])
[
blk_sqr
,
in_channels
,
wt_output_width
])
blk_size_arg
=
wt_op
.
arg
.
add
()
blk_size_arg
.
name
=
MaceKeyword
.
mace_wino_block_size
blk_size_arg
.
i
=
block_size
if
ConverterUtil
.
get_arg
(
op
,
MaceKeyword
.
mace_padding_str
)
\
...
...
@@ -543,7 +565,7 @@ class Transformer(base_converter.ConverterInterface):
matmul_op
.
output
.
extend
([
matmul_op
.
name
])
matmul_output_shape
=
matmul_op
.
output_shape
.
add
()
matmul_output_shape
.
dims
.
extend
(
[
16
,
out_channels
,
wt_output_width
])
[
blk_sqr
,
out_channels
,
wt_output_width
])
arg
=
matmul_op
.
arg
.
add
()
arg
.
name
=
MaceKeyword
.
mace_winograd_filter_transformed
...
...
@@ -570,6 +592,9 @@ class Transformer(base_converter.ConverterInterface):
width_arg
=
iwt_op
.
arg
.
add
()
width_arg
.
name
=
'width'
width_arg
.
i
=
out_width
blk_size_arg
=
iwt_op
.
arg
.
add
()
blk_size_arg
.
name
=
MaceKeyword
.
mace_wino_block_size
blk_size_arg
.
i
=
block_size
ConverterUtil
.
add_data_format_arg
(
iwt_op
,
data_format
)
filter_data
=
np
.
array
(
filter
.
float_data
).
reshape
(
...
...
@@ -872,6 +897,13 @@ class Transformer(base_converter.ConverterInterface):
arg
.
name
=
MaceKeyword
.
mace_mode
arg
.
i
=
0
if
input_type
==
OpenCLBufferType
.
WINOGRAD_FILTER
:
blk_sqr
=
op
.
output_shape
[
0
].
dims
[
0
]
wino_blk
=
int
(
np
.
sqrt
(
blk_sqr
))
-
2
wino_arg
=
op_def
.
arg
.
add
()
wino_arg
.
name
=
MaceKeyword
.
mace_wino_block_size
wino_arg
.
i
=
wino_blk
op
.
input
[
input_idx
]
=
output_name
def
transform_buffer_image
(
self
):
...
...
@@ -1002,8 +1034,8 @@ class Transformer(base_converter.ConverterInterface):
def
transform_global_conv_to_fc
(
self
):
"""Transform global conv to fc should be placed after transposing
input/output and filter"""
if
self
.
_option
.
device
==
DeviceType
.
GPU
.
value
:
return
False
#
if self._option.device == DeviceType.GPU.value:
#
return False
net
=
self
.
_model
for
op
in
net
.
op
:
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录