Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Xiaomi
Mace
提交
8a480d62
Mace
项目概览
Xiaomi
/
Mace
通知
106
Star
40
Fork
27
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
8a480d62
编写于
4月 18, 2018
作者:
W
wuchenghui
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
move cpu winograd filter transform to graph converter
上级
971a0bc4
变更
8
隐藏空白更改
内联
并排
Showing
8 changed file
with
389 addition
and
258 deletion
+389
-258
mace/kernels/arm/conv_2d.cc
mace/kernels/arm/conv_2d.cc
+56
-23
mace/kernels/arm/conv_winograd.cc
mace/kernels/arm/conv_winograd.cc
+166
-173
mace/kernels/arm/conv_winograd.h
mace/kernels/arm/conv_winograd.h
+11
-3
mace/kernels/conv_2d.h
mace/kernels/conv_2d.h
+5
-2
mace/ops/conv_2d.h
mace/ops/conv_2d.h
+2
-0
mace/ops/fused_conv_2d.h
mace/ops/fused_conv_2d.h
+2
-0
mace/python/tools/caffe_converter_lib.py
mace/python/tools/caffe_converter_lib.py
+81
-45
mace/python/tools/tf_converter_lib.py
mace/python/tools/tf_converter_lib.py
+66
-12
未找到文件。
mace/kernels/arm/conv_2d.cc
浏览文件 @
8a480d62
...
...
@@ -15,10 +15,6 @@
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/arm/conv_winograd.h"
// winograd is always superior to neon impl during benchmark
#define USE_WINOGRAD 1
#define WINOGRAD_OUT_TILE_SIZE 6
namespace
mace
{
namespace
kernels
{
...
...
@@ -109,11 +105,21 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
MACE_CHECK_NOTNULL
(
filter
);
MACE_CHECK_NOTNULL
(
output
);
std
::
vector
<
index_t
>
filter_shape
(
4
);
if
(
is_filter_transformed_
)
{
// TOC -> OIHW
filter_shape
[
0
]
=
filter
->
dim
(
1
);
filter_shape
[
1
]
=
filter
->
dim
(
2
);
filter_shape
[
2
]
=
filter_shape
[
3
]
=
3
;
}
else
{
filter_shape
=
filter
->
shape
();
}
std
::
vector
<
index_t
>
output_shape
(
4
);
std
::
vector
<
int
>
paddings
(
2
);
if
(
paddings_
.
empty
())
{
CalcNCHWPaddingAndOutputSize
(
input
->
shape
().
data
(),
filter
->
shape
()
.
data
(),
filter
_shape
.
data
(),
dilations_
,
strides_
,
padding_type_
,
...
...
@@ -121,7 +127,7 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
paddings
.
data
());
}
else
{
paddings
=
paddings_
;
CalcNCHWOutputSize
(
input
->
shape
().
data
(),
filter
->
shape
()
.
data
(),
CalcNCHWOutputSize
(
input
->
shape
().
data
(),
filter
_shape
.
data
(),
paddings_
.
data
(),
dilations_
,
strides_
,
RoundType
::
FLOOR
,
output_shape
.
data
());
}
...
...
@@ -138,10 +144,10 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
index_t
input_height
=
input
->
dim
(
2
);
index_t
input_width
=
input
->
dim
(
3
);
index_t
filter_h
=
filter
->
dim
(
2
)
;
index_t
filter_w
=
filter
->
dim
(
3
)
;
MACE_CHECK
(
filter
->
dim
(
0
)
==
channels
,
filter
->
dim
(
0
)
,
" != "
,
channels
);
MACE_CHECK
(
filter
->
dim
(
1
)
==
input_channels
,
filter
->
dim
(
1
)
,
" != "
,
index_t
filter_h
=
filter
_shape
[
2
]
;
index_t
filter_w
=
filter
_shape
[
3
]
;
MACE_CHECK
(
filter
_shape
[
0
]
==
channels
,
filter_shape
[
0
]
,
" != "
,
channels
);
MACE_CHECK
(
filter
_shape
[
1
]
==
input_channels
,
filter_shape
[
1
]
,
" != "
,
input_channels
);
index_t
stride_h
=
strides_
[
0
];
...
...
@@ -171,9 +177,9 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
std
::
function
<
void
(
const
float
*
input
,
float
*
output
)
>
conv_func
;
bool
use_winograd
=
USE_WINOGRAD
&&
filter_h
==
3
&&
filter_w
==
3
bool
use_winograd
=
is_filter_transformed_
||
(
filter_h
==
3
&&
filter_w
==
3
&&
stride_h
==
1
&&
stride_w
==
1
&&
dilation_h
==
1
&&
dilation_w
==
1
&&
input_channels
>=
8
&&
channels
>=
8
;
&&
input_channels
>=
8
&&
channels
>=
8
)
;
bool
use_neon_3x3_s1
=
filter_h
==
3
&&
filter_w
==
3
&&
stride_h
==
1
&&
stride_w
==
1
&&
dilation_h
==
1
&&
dilation_w
==
1
;
bool
use_neon_3x3_s2
=
filter_h
==
3
&&
filter_w
==
3
...
...
@@ -185,10 +191,17 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
std
::
vector
<
index_t
>
transformed_output_shape
;
std
::
vector
<
index_t
>
transformed_filter_shape
;
// When size of input feature map is bigger than 16x16,
// set winograd out tile size to 6 to get higher performance.
index_t
winograd_out_tile_size
=
2
;
if
(
input_height
>
16
&&
input_width
>
16
)
{
winograd_out_tile_size
=
6
;
}
if
(
use_winograd
)
{
extra_output_height
=
RoundUp
<
index_t
>
(
height
,
WINOGRAD_OUT_TILE_SIZE
);
extra_output_height
=
RoundUp
<
index_t
>
(
height
,
winograd_out_tile_size
);
extra_input_height
=
std
::
max
(
padded_input_height
,
extra_output_height
+
2
);
extra_output_width
=
RoundUp
<
index_t
>
(
width
,
WINOGRAD_OUT_TILE_SIZE
);
extra_output_width
=
RoundUp
<
index_t
>
(
width
,
winograd_out_tile_size
);
extra_input_width
=
std
::
max
(
padded_input_width
,
extra_output_width
+
2
);
if
(
extra_input_height
!=
padded_input_height
)
{
pad_bottom
+=
(
extra_input_height
-
padded_input_height
);
...
...
@@ -197,11 +210,11 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
pad_right
+=
(
extra_input_width
-
padded_input_width
);
}
index_t
tile_height_count
=
extra_output_height
/
WINOGRAD_OUT_TILE_SIZE
;
index_t
tile_width_count
=
extra_output_width
/
WINOGRAD_OUT_TILE_SIZE
;
index_t
tile_height_count
=
extra_output_height
/
winograd_out_tile_size
;
index_t
tile_width_count
=
extra_output_width
/
winograd_out_tile_size
;
index_t
tile_count
=
tile_height_count
*
tile_width_count
;
index_t
in_tile_area
=
(
WINOGRAD_OUT_TILE_SIZE
+
2
)
*
(
WINOGRAD_OUT_TILE_SIZE
+
2
);
(
winograd_out_tile_size
+
2
)
*
(
winograd_out_tile_size
+
2
);
transformed_input_shape
.
insert
(
transformed_input_shape
.
end
(),
{
in_tile_area
,
batch
,
input_channels
,
...
...
@@ -281,25 +294,45 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
if
(
use_winograd
)
{
transformed_input
.
Resize
(
transformed_input_shape
);
transformed_output
.
Resize
(
transformed_output_shape
);
if
(
!
is_filter_transformed_
)
{
const
float
*
transformed_filter_ptr
;
if
(
transformed_filter_
.
dim_size
()
==
0
)
{
transformed_filter_
.
Resize
(
transformed_filter_shape
);
if
(
is_filter_transformed_
)
{
transformed_filter_ptr
=
filter_data
;
}
else
{
switch
(
winograd_out_tile_size
)
{
case
2
:
TransformFilter4x4
(
filter_data
,
filter_shape
[
1
],
filter_shape
[
0
],
transformed_filter_
.
mutable_data
<
float
>
());
break
;
case
6
:
TransformFilter8x8
(
filter_data
,
filter_shape
[
1
],
filter_shape
[
0
],
transformed_filter_
.
mutable_data
<
float
>
());
break
;
default:
MACE_NOT_IMPLEMENTED
;
}
transformed_filter_ptr
=
transformed_filter_
.
data
<
float
>
();
}
}
else
{
transformed_filter_ptr
=
transformed_filter_
.
data
<
float
>
();
}
conv_func
=
[
&
](
const
float
*
pad_input
,
float
*
pad_output
)
{
WinoGradConv3x3s1
(
pad_input
,
filter_data
,
transformed_filter_ptr
,
batch
,
extra_input_height
,
extra_input_width
,
input_channels
,
channels
,
WINOGRAD_OUT_TILE_SIZE
,
winograd_out_tile_size
,
transformed_input
.
mutable_data
<
float
>
(),
transformed_filter_
.
mutable_data
<
float
>
(),
transformed_output
.
mutable_data
<
float
>
(),
is_filter_transformed_
,
pad_output
);
is_filter_transformed_
=
true
;
};
}
else
if
(
use_neon_3x3_s1
)
{
conv_func
=
[
=
](
const
float
*
pad_input
,
float
*
pad_output
)
{
...
...
mace/kernels/arm/conv_winograd.cc
浏览文件 @
8a480d62
...
...
@@ -224,153 +224,6 @@ void TransformInput8x8(const float *input,
}
}
// OCHW => TOC
// no need to optimize, it will exist in converter
void
TransformFilter4x4
(
const
float
*
filter
,
const
index_t
in_channels
,
const
index_t
out_channels
,
float
*
output
)
{
const
index_t
stride
=
out_channels
*
in_channels
;
#pragma omp parallel for collapse(2)
for
(
index_t
m
=
0
;
m
<
out_channels
;
++
m
)
{
for
(
index_t
c
=
0
;
c
<
in_channels
;
++
c
)
{
float
g0
,
g1
,
g2
,
g3
,
g4
,
g5
,
g6
,
g7
,
g8
;
float
s0
,
s1
,
s2
,
s3
,
s4
,
s5
,
s6
,
s7
,
s8
,
s9
,
s10
,
s11
,
s12
,
s13
,
s14
,
s15
;
// load filter
index_t
filter_offset
=
(
m
*
in_channels
+
c
)
*
9
;
g0
=
filter
[
filter_offset
];
g1
=
filter
[
filter_offset
+
1
];
g2
=
filter
[
filter_offset
+
2
];
g3
=
filter
[
filter_offset
+
3
];
g4
=
filter
[
filter_offset
+
4
];
g5
=
filter
[
filter_offset
+
5
];
g6
=
filter
[
filter_offset
+
6
];
g7
=
filter
[
filter_offset
+
7
];
g8
=
filter
[
filter_offset
+
8
];
// s = G * g * GT
s0
=
g0
;
s1
=
(
g0
+
g2
+
g1
)
*
0.5
f
;
s2
=
(
g0
+
g2
-
g1
)
*
0.5
f
;
s3
=
g2
;
s4
=
(
g0
+
g6
+
g3
)
*
0.5
f
;
s5
=
((
g0
+
g6
+
g3
)
+
(
g2
+
g8
+
g5
)
+
(
g1
+
g7
+
g4
))
*
0.25
f
;
s6
=
((
g0
+
g6
+
g3
)
+
(
g2
+
g8
+
g5
)
-
(
g1
+
g7
+
g4
))
*
0.25
f
;
s7
=
(
g2
+
g8
+
g5
)
*
0.5
f
;
s8
=
(
g0
+
g6
-
g3
)
*
0.5
f
;
s9
=
((
g0
+
g6
-
g3
)
+
(
g2
+
g8
-
g5
)
+
(
g1
+
g7
-
g4
))
*
0.25
f
;
s10
=
((
g0
+
g6
-
g3
)
+
(
g2
+
g8
-
g5
)
-
(
g1
+
g7
-
g4
))
*
0.25
f
;
s11
=
(
g2
+
g8
-
g5
)
*
0.5
f
;
s12
=
g6
;
s13
=
(
g6
+
g8
+
g7
)
*
0.5
f
;
s14
=
(
g6
+
g8
-
g7
)
*
0.5
f
;
s15
=
g8
;
// store output
index_t
output_offset
=
m
*
in_channels
+
c
;
output
[
output_offset
+
0
*
stride
]
=
s0
;
output
[
output_offset
+
1
*
stride
]
=
s1
;
output
[
output_offset
+
2
*
stride
]
=
s2
;
output
[
output_offset
+
3
*
stride
]
=
s3
;
output
[
output_offset
+
4
*
stride
]
=
s4
;
output
[
output_offset
+
5
*
stride
]
=
s5
;
output
[
output_offset
+
6
*
stride
]
=
s6
;
output
[
output_offset
+
7
*
stride
]
=
s7
;
output
[
output_offset
+
8
*
stride
]
=
s8
;
output
[
output_offset
+
9
*
stride
]
=
s9
;
output
[
output_offset
+
10
*
stride
]
=
s10
;
output
[
output_offset
+
11
*
stride
]
=
s11
;
output
[
output_offset
+
12
*
stride
]
=
s12
;
output
[
output_offset
+
13
*
stride
]
=
s13
;
output
[
output_offset
+
14
*
stride
]
=
s14
;
output
[
output_offset
+
15
*
stride
]
=
s15
;
}
}
}
// OCHW => TOC
// no need to optimize, it will exist in converter
/**
* G =
⎡ 1 0 0 ⎤
⎢ ⎥
⎢-2/9 -2/9 -2/9 ⎥
⎢ ⎥
⎢-2/9 2/9 -2/9 ⎥
⎢ ⎥
⎢1/90 1/45 2/45 ⎥
⎢ ⎥
⎢1/90 -1/45 2/45 ⎥
⎢ ⎥
⎢1/45 1/90 1/180⎥
⎢ ⎥
⎢1/45 -1/90 1/180⎥
⎢ ⎥
⎣ 0 0 1 ⎦
*
* @param filter
* @param in_channels
* @param out_channels
* @param output
*/
void
TransformFilter8x8
(
const
float
*
filter
,
const
index_t
in_channels
,
const
index_t
out_channels
,
float
*
output
)
{
const
index_t
stride
=
out_channels
*
in_channels
;
const
float
G
[
8
][
3
]
=
{
{
1.0
f
,
0.0
f
,
0.0
f
},
{
-
2.0
f
/
9
,
-
2.0
f
/
9
,
-
2.0
f
/
9
},
{
-
2.0
f
/
9
,
2.0
f
/
9
,
-
2.0
f
/
9
},
{
1.0
f
/
90
,
1.0
f
/
45
,
2.0
f
/
45
},
{
1.0
f
/
90
,
-
1.0
f
/
45
,
2.0
f
/
45
},
{
1.0
f
/
45
,
1.0
f
/
90
,
1.0
f
/
180
},
{
1.0
f
/
45
,
-
1.0
f
/
90
,
1.0
f
/
180
},
{
0.0
f
,
0.0
f
,
1.0
f
}
};
#pragma omp parallel for collapse(2)
for
(
index_t
m
=
0
;
m
<
out_channels
;
++
m
)
{
for
(
index_t
c
=
0
;
c
<
in_channels
;
++
c
)
{
// load filter
index_t
filter_offset
=
(
m
*
in_channels
+
c
)
*
9
;
float
g0
,
g1
,
g2
,
g3
,
g4
,
g5
,
g6
,
g7
,
g8
;
g0
=
filter
[
filter_offset
];
g1
=
filter
[
filter_offset
+
1
];
g2
=
filter
[
filter_offset
+
2
];
g3
=
filter
[
filter_offset
+
3
];
g4
=
filter
[
filter_offset
+
4
];
g5
=
filter
[
filter_offset
+
5
];
g6
=
filter
[
filter_offset
+
6
];
g7
=
filter
[
filter_offset
+
7
];
g8
=
filter
[
filter_offset
+
8
];
float
s
[
3
][
8
];
for
(
int
i
=
0
;
i
<
8
;
++
i
)
{
s
[
0
][
i
]
=
g0
*
G
[
i
][
0
]
+
g1
*
G
[
i
][
1
]
+
g2
*
G
[
i
][
2
];
s
[
1
][
i
]
=
g3
*
G
[
i
][
0
]
+
g4
*
G
[
i
][
1
]
+
g5
*
G
[
i
][
2
];
s
[
2
][
i
]
=
g6
*
G
[
i
][
0
]
+
g7
*
G
[
i
][
1
]
+
g8
*
G
[
i
][
2
];
}
// store output
index_t
output_offset
=
m
*
in_channels
+
c
;
for
(
int
i
=
0
;
i
<
8
;
++
i
)
{
for
(
int
j
=
0
;
j
<
8
;
++
j
)
{
output
[
output_offset
+
(
i
*
8
+
j
)
*
stride
]
=
G
[
i
][
0
]
*
s
[
0
][
j
]
+
G
[
i
][
1
]
*
s
[
1
][
j
]
+
G
[
i
][
2
]
*
s
[
2
][
j
];
}
}
}
}
}
// TOC * TNCB => TNOB
void
BatchGemm
(
const
float
*
input
,
const
float
*
filter
,
...
...
@@ -581,8 +434,156 @@ void TransformOutput8x8(const float *input,
}
}
// namespace
// OCHW => TOC
// no need to optimize, it will exist in converter
void
TransformFilter4x4
(
const
float
*
filter
,
const
index_t
in_channels
,
const
index_t
out_channels
,
float
*
output
)
{
const
index_t
stride
=
out_channels
*
in_channels
;
#pragma omp parallel for collapse(2)
for
(
index_t
m
=
0
;
m
<
out_channels
;
++
m
)
{
for
(
index_t
c
=
0
;
c
<
in_channels
;
++
c
)
{
float
g0
,
g1
,
g2
,
g3
,
g4
,
g5
,
g6
,
g7
,
g8
;
float
s0
,
s1
,
s2
,
s3
,
s4
,
s5
,
s6
,
s7
,
s8
,
s9
,
s10
,
s11
,
s12
,
s13
,
s14
,
s15
;
// load filter
index_t
filter_offset
=
(
m
*
in_channels
+
c
)
*
9
;
g0
=
filter
[
filter_offset
];
g1
=
filter
[
filter_offset
+
1
];
g2
=
filter
[
filter_offset
+
2
];
g3
=
filter
[
filter_offset
+
3
];
g4
=
filter
[
filter_offset
+
4
];
g5
=
filter
[
filter_offset
+
5
];
g6
=
filter
[
filter_offset
+
6
];
g7
=
filter
[
filter_offset
+
7
];
g8
=
filter
[
filter_offset
+
8
];
// s = G * g * GT
s0
=
g0
;
s1
=
(
g0
+
g2
+
g1
)
*
0.5
f
;
s2
=
(
g0
+
g2
-
g1
)
*
0.5
f
;
s3
=
g2
;
s4
=
(
g0
+
g6
+
g3
)
*
0.5
f
;
s5
=
((
g0
+
g6
+
g3
)
+
(
g2
+
g8
+
g5
)
+
(
g1
+
g7
+
g4
))
*
0.25
f
;
s6
=
((
g0
+
g6
+
g3
)
+
(
g2
+
g8
+
g5
)
-
(
g1
+
g7
+
g4
))
*
0.25
f
;
s7
=
(
g2
+
g8
+
g5
)
*
0.5
f
;
s8
=
(
g0
+
g6
-
g3
)
*
0.5
f
;
s9
=
((
g0
+
g6
-
g3
)
+
(
g2
+
g8
-
g5
)
+
(
g1
+
g7
-
g4
))
*
0.25
f
;
s10
=
((
g0
+
g6
-
g3
)
+
(
g2
+
g8
-
g5
)
-
(
g1
+
g7
-
g4
))
*
0.25
f
;
s11
=
(
g2
+
g8
-
g5
)
*
0.5
f
;
s12
=
g6
;
s13
=
(
g6
+
g8
+
g7
)
*
0.5
f
;
s14
=
(
g6
+
g8
-
g7
)
*
0.5
f
;
s15
=
g8
;
// store output
index_t
output_offset
=
m
*
in_channels
+
c
;
output
[
output_offset
+
0
*
stride
]
=
s0
;
output
[
output_offset
+
1
*
stride
]
=
s1
;
output
[
output_offset
+
2
*
stride
]
=
s2
;
output
[
output_offset
+
3
*
stride
]
=
s3
;
output
[
output_offset
+
4
*
stride
]
=
s4
;
output
[
output_offset
+
5
*
stride
]
=
s5
;
output
[
output_offset
+
6
*
stride
]
=
s6
;
output
[
output_offset
+
7
*
stride
]
=
s7
;
output
[
output_offset
+
8
*
stride
]
=
s8
;
output
[
output_offset
+
9
*
stride
]
=
s9
;
output
[
output_offset
+
10
*
stride
]
=
s10
;
output
[
output_offset
+
11
*
stride
]
=
s11
;
output
[
output_offset
+
12
*
stride
]
=
s12
;
output
[
output_offset
+
13
*
stride
]
=
s13
;
output
[
output_offset
+
14
*
stride
]
=
s14
;
output
[
output_offset
+
15
*
stride
]
=
s15
;
}
}
}
// OCHW => TOC
// no need to optimize, it will exist in converter
/**
* G =
⎡ 1 0 0 ⎤
⎢ ⎥
⎢-2/9 -2/9 -2/9 ⎥
⎢ ⎥
⎢-2/9 2/9 -2/9 ⎥
⎢ ⎥
⎢1/90 1/45 2/45 ⎥
⎢ ⎥
⎢1/90 -1/45 2/45 ⎥
⎢ ⎥
⎢1/45 1/90 1/180⎥
⎢ ⎥
⎢1/45 -1/90 1/180⎥
⎢ ⎥
⎣ 0 0 1 ⎦
*
* @param filter
* @param in_channels
* @param out_channels
* @param output
*/
void
TransformFilter8x8
(
const
float
*
filter
,
const
index_t
in_channels
,
const
index_t
out_channels
,
float
*
output
)
{
const
index_t
stride
=
out_channels
*
in_channels
;
const
float
G
[
8
][
3
]
=
{
{
1.0
f
,
0.0
f
,
0.0
f
},
{
-
2.0
f
/
9
,
-
2.0
f
/
9
,
-
2.0
f
/
9
},
{
-
2.0
f
/
9
,
2.0
f
/
9
,
-
2.0
f
/
9
},
{
1.0
f
/
90
,
1.0
f
/
45
,
2.0
f
/
45
},
{
1.0
f
/
90
,
-
1.0
f
/
45
,
2.0
f
/
45
},
{
1.0
f
/
45
,
1.0
f
/
90
,
1.0
f
/
180
},
{
1.0
f
/
45
,
-
1.0
f
/
90
,
1.0
f
/
180
},
{
0.0
f
,
0.0
f
,
1.0
f
}
};
#pragma omp parallel for collapse(2)
for
(
index_t
m
=
0
;
m
<
out_channels
;
++
m
)
{
for
(
index_t
c
=
0
;
c
<
in_channels
;
++
c
)
{
// load filter
index_t
filter_offset
=
(
m
*
in_channels
+
c
)
*
9
;
float
g0
,
g1
,
g2
,
g3
,
g4
,
g5
,
g6
,
g7
,
g8
;
g0
=
filter
[
filter_offset
];
g1
=
filter
[
filter_offset
+
1
];
g2
=
filter
[
filter_offset
+
2
];
g3
=
filter
[
filter_offset
+
3
];
g4
=
filter
[
filter_offset
+
4
];
g5
=
filter
[
filter_offset
+
5
];
g6
=
filter
[
filter_offset
+
6
];
g7
=
filter
[
filter_offset
+
7
];
g8
=
filter
[
filter_offset
+
8
];
float
s
[
3
][
8
];
for
(
int
i
=
0
;
i
<
8
;
++
i
)
{
s
[
0
][
i
]
=
g0
*
G
[
i
][
0
]
+
g1
*
G
[
i
][
1
]
+
g2
*
G
[
i
][
2
];
s
[
1
][
i
]
=
g3
*
G
[
i
][
0
]
+
g4
*
G
[
i
][
1
]
+
g5
*
G
[
i
][
2
];
s
[
2
][
i
]
=
g6
*
G
[
i
][
0
]
+
g7
*
G
[
i
][
1
]
+
g8
*
G
[
i
][
2
];
}
// store output
index_t
output_offset
=
m
*
in_channels
+
c
;
for
(
int
i
=
0
;
i
<
8
;
++
i
)
{
for
(
int
j
=
0
;
j
<
8
;
++
j
)
{
output
[
output_offset
+
(
i
*
8
+
j
)
*
stride
]
=
G
[
i
][
0
]
*
s
[
0
][
j
]
+
G
[
i
][
1
]
*
s
[
1
][
j
]
+
G
[
i
][
2
]
*
s
[
2
][
j
];
}
}
}
}
}
void
WinoGradConv3x3s1
(
const
float
*
input
,
const
float
*
filter
,
const
float
*
transformed_
filter
,
const
index_t
batch
,
const
index_t
in_height
,
const
index_t
in_width
,
...
...
@@ -590,9 +591,7 @@ void WinoGradConv3x3s1(const float *input,
const
index_t
out_channels
,
const
int
out_tile_size
,
float
*
transformed_input
,
float
*
transformed_filter
,
float
*
transformed_output
,
bool
is_filter_transformed
,
float
*
output
)
{
index_t
out_height
=
in_height
-
2
;
index_t
out_width
=
in_width
-
2
;
...
...
@@ -624,26 +623,6 @@ void WinoGradConv3x3s1(const float *input,
default:
MACE_NOT_IMPLEMENTED
;
}
// TODO(liyin): put it in model converter, but do not worry, it is fast and
// will only do once
if
(
!
is_filter_transformed
)
{
switch
(
out_tile_size
)
{
case
2
:
TransformFilter4x4
(
filter
,
in_channels
,
out_channels
,
transformed_filter
);
break
;
case
6
:
TransformFilter8x8
(
filter
,
in_channels
,
out_channels
,
transformed_filter
);
break
;
default:
MACE_NOT_IMPLEMENTED
;
}
}
BatchGemm
(
transformed_input
,
transformed_filter
,
batch
,
...
...
@@ -703,8 +682,24 @@ void WinoGradConv3x3s1(const float *input,
float
*
transformed_filter
=
new
float
[
transformed_filter_size
];
// TOC
float
*
transformed_output
=
new
float
[
transformed_output_size
];
switch
(
out_tile_size
)
{
case
2
:
TransformFilter4x4
(
filter
,
in_channels
,
out_channels
,
transformed_filter
);
break
;
case
6
:
TransformFilter8x8
(
filter
,
in_channels
,
out_channels
,
transformed_filter
);
break
;
default:
MACE_NOT_IMPLEMENTED
;
}
WinoGradConv3x3s1
(
input
,
filter
,
transformed_
filter
,
batch
,
in_height
,
in_width
,
...
...
@@ -712,9 +707,7 @@ void WinoGradConv3x3s1(const float *input,
out_channels
,
out_tile_size
,
transformed_input
,
transformed_filter
,
transformed_output
,
false
,
output
);
delete
[]
transformed_input
;
...
...
mace/kernels/arm/conv_winograd.h
浏览文件 @
8a480d62
...
...
@@ -24,6 +24,16 @@
namespace
mace
{
namespace
kernels
{
void
TransformFilter4x4
(
const
float
*
filter
,
const
index_t
in_channels
,
const
index_t
out_channels
,
float
*
output
);
void
TransformFilter8x8
(
const
float
*
filter
,
const
index_t
in_channels
,
const
index_t
out_channels
,
float
*
output
);
void
WinoGradConv3x3s1
(
const
float
*
input
,
const
float
*
filter
,
const
index_t
batch
,
...
...
@@ -35,7 +45,7 @@ void WinoGradConv3x3s1(const float *input,
float
*
output
);
void
WinoGradConv3x3s1
(
const
float
*
input
,
const
float
*
filter
,
const
float
*
transformed_
filter
,
const
index_t
batch
,
const
index_t
in_height
,
const
index_t
in_width
,
...
...
@@ -43,9 +53,7 @@ void WinoGradConv3x3s1(const float *input,
const
index_t
out_channels
,
const
int
out_tile_size
,
float
*
transformed_input
,
float
*
transformed_filter
,
float
*
transformed_output
,
bool
is_filter_transformed
,
float
*
output
);
void
ConvRef3x3s1
(
const
float
*
input
,
...
...
mace/kernels/conv_2d.h
浏览文件 @
8a480d62
...
...
@@ -308,6 +308,7 @@ struct Conv2dFunctor : Conv2dFunctorBase {
const
int
*
dilations
,
const
ActivationType
activation
,
const
float
relux_max_limit
,
const
bool
is_filter_transformed
,
ScratchBuffer
*
scratch
)
:
Conv2dFunctorBase
(
strides
,
padding_type
,
...
...
@@ -317,7 +318,7 @@ struct Conv2dFunctor : Conv2dFunctorBase {
relux_max_limit
)
{}
void
operator
()(
const
Tensor
*
input
,
// NHWC
const
Tensor
*
filter
,
// HWOI
const
Tensor
*
filter
,
// HWOI
or TOI
const
Tensor
*
bias
,
Tensor
*
output
,
StatsFuture
*
future
)
{
...
...
@@ -434,6 +435,7 @@ struct Conv2dFunctor<DeviceType::NEON, float> : Conv2dFunctorBase {
const
int
*
dilations
,
const
ActivationType
activation
,
const
float
relux_max_limit
,
const
bool
is_filter_transformed
,
ScratchBuffer
*
scratch
)
:
Conv2dFunctorBase
(
strides
,
padding_type
,
...
...
@@ -441,7 +443,7 @@ struct Conv2dFunctor<DeviceType::NEON, float> : Conv2dFunctorBase {
dilations
,
activation
,
relux_max_limit
),
is_filter_transformed_
(
false
),
is_filter_transformed_
(
is_filter_transformed
),
scratch_
(
scratch
)
{}
void
operator
()(
const
Tensor
*
input
,
...
...
@@ -463,6 +465,7 @@ struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase {
const
int
*
dilations
,
const
ActivationType
activation
,
const
float
relux_max_limit
,
const
bool
is_filter_transformed
,
ScratchBuffer
*
scratch
)
:
Conv2dFunctorBase
(
strides
,
padding_type
,
...
...
mace/ops/conv_2d.h
浏览文件 @
8a480d62
...
...
@@ -35,6 +35,8 @@ class Conv2dOp : public ConvPool2dOpBase<D, T> {
this
->
dilations_
.
data
(),
kernels
::
ActivationType
::
NOOP
,
0.0
f
,
static_cast
<
bool
>
(
OperatorBase
::
GetSingleArgument
<
int
>
(
"is_filter_transformed"
,
false
)),
ws
->
GetScratchBuffer
(
D
))
{}
bool
Run
(
StatsFuture
*
future
)
override
{
...
...
mace/ops/fused_conv_2d.h
浏览文件 @
8a480d62
...
...
@@ -38,6 +38,8 @@ class FusedConv2dOp : public ConvPool2dOpBase<D, T> {
OperatorBase
::
GetSingleArgument
<
std
::
string
>
(
"activation"
,
"NOOP"
)),
OperatorBase
::
GetSingleArgument
<
float
>
(
"max_limit"
,
0.0
f
),
static_cast
<
bool
>
(
OperatorBase
::
GetSingleArgument
<
int
>
(
"is_filter_transformed"
,
false
)),
ws
->
GetScratchBuffer
(
D
))
{}
bool
Run
(
StatsFuture
*
future
)
override
{
...
...
mace/python/tools/caffe_converter_lib.py
浏览文件 @
8a480d62
...
...
@@ -374,6 +374,10 @@ class CaffeConverter(object):
return
pad
,
stride
,
kernel
def
convert_conv2d
(
self
,
op
):
use_winograd
=
False
if
self
.
device
==
'neon'
:
use_winograd
=
self
.
check_winograd_conv
(
op
)
param
=
op
.
layer
.
convolution_param
is_depthwise
=
False
if
param
.
HasField
(
'group'
):
...
...
@@ -394,7 +398,11 @@ class CaffeConverter(object):
else
:
# OIHW -> HWOI
weight_data
=
op
.
data
[
0
].
transpose
((
2
,
3
,
0
,
1
))
self
.
add_tensor
(
weight_tensor_name
,
weight_data
)
if
self
.
device
==
'neon'
and
use_winograd
:
self
.
convert_winograd_conv_filter_neon
(
op
,
op_def
)
else
:
self
.
add_tensor
(
weight_tensor_name
,
weight_data
)
if
self
.
device
==
'gpu'
:
buffer_type
=
"DW_CONV2D_FILTER"
\
...
...
@@ -438,7 +446,7 @@ class CaffeConverter(object):
op
.
output_shape_map
[
op
.
layer
.
top
[
0
]]
=
output_shape
if
len
(
self
.
ops_map
[
final_op
.
name
].
children
)
==
1
and
\
self
.
ops_map
[
final_op
.
name
].
children
[
0
].
type
\
self
.
ops_map
[
final_op
.
name
].
children
[
0
].
type
\
in
activation_name_map
:
activation_op
=
self
.
ops_map
[
final_op
.
name
].
children
[
0
]
if
not
is_depthwise
:
...
...
@@ -455,15 +463,18 @@ class CaffeConverter(object):
self
.
net_def
.
op
.
extend
([
op_def
])
def
check_winograd_conv
(
self
,
op
):
# TODO: support winograd conv on neon
if
self
.
device
==
'neon'
:
return
False
param
=
op
.
layer
.
convolution_param
filter_shape
=
np
.
asarray
(
op
.
data
[
0
].
shape
)
if
self
.
device
!=
'neon'
:
filter_shape
=
filter_shape
[[
2
,
3
,
0
,
1
]]
# OIHW -> HWOI
paddings
,
strides
,
_
=
self
.
add_stride_pad_kernel_arg
(
param
,
None
)
if
param
.
HasField
(
'group'
):
if
param
.
group
==
op
.
data
[
0
].
shape
[
0
]
and
op
.
data
[
0
].
shape
[
1
]
==
1
:
return
False
# Depthwise conv not support winograd
else
:
raise
Exception
(
"Mace do not support group convolution yet"
)
dilations
=
[
1
,
1
]
if
len
(
param
.
dilation
)
>
0
:
if
len
(
param
.
dilation
)
==
1
:
...
...
@@ -476,23 +487,60 @@ class CaffeConverter(object):
op
.
get_single_parent
().
output_shape_map
[
op
.
layer
.
bottom
[
0
]],
filter_shape
,
paddings
,
strides
,
dilations
,
math
.
floor
,
input_format
)
width
=
output_shape
[
0
]
*
((
output_shape
[
1
]
+
1
)
/
2
)
*
((
output_shape
[
2
]
+
1
)
/
2
)
if
self
.
winograd
and
dilations
[
0
]
==
1
and
\
(
dilations
[
0
]
==
dilations
[
1
])
and
\
(
strides
[
0
]
==
1
)
and
(
strides
[
0
]
==
strides
[
1
]):
if
self
.
device
==
'gpu'
:
width
=
output_shape
[
0
]
*
((
output_shape
[
1
]
+
1
)
/
2
)
*
\
((
output_shape
[
2
]
+
1
)
/
2
)
return
filter_shape
[
0
]
==
3
and
\
(
filter_shape
[
0
]
==
filter_shape
[
1
])
and
\
(
16
*
filter_shape
[
2
]
<
OPENCL_IMAGE_MAX_SIZE
)
and
\
(
16
*
filter_shape
[
3
]
<
OPENCL_IMAGE_MAX_SIZE
)
and
\
(
width
<
OPENCL_IMAGE_MAX_SIZE
)
filter_shape
[
0
]
==
filter_shape
[
1
]
and
\
(
16
*
filter_shape
[
2
]
<
OPENCL_IMAGE_MAX_SIZE
)
and
\
(
16
*
filter_shape
[
3
]
<
OPENCL_IMAGE_MAX_SIZE
)
and
\
(
width
<
OPENCL_IMAGE_MAX_SIZE
)
elif
self
.
device
==
'neon'
:
return
filter_shape
[
2
]
==
3
and
(
filter_shape
[
2
]
==
filter_shape
[
3
])
return
filter_shape
[
2
]
==
3
and
\
filter_shape
[
2
]
==
filter_shape
[
3
]
and
\
filter_shape
[
0
]
>=
8
and
filter_shape
[
1
]
>=
8
return
False
def
convert_winograd_conv
(
self
,
op
):
def
convert_winograd_conv_filter_neon
(
self
,
op
,
op_def
):
# Add filter
weight_tensor_name
=
op
.
name
+
'_weight:0'
weight_data
=
op
.
data
[
0
]
# OIHW
input_shape
=
op
.
data
[
1
].
shape
if
input_shape
[
2
]
>
16
and
input_shape
[
3
]
>
16
:
G
=
np
.
array
([
[
1.0
,
0.0
,
0.0
],
[
-
2.0
/
9
,
-
2.0
/
9
,
-
2.0
/
9
],
[
-
2.0
/
9
,
2.0
/
9
,
-
2.0
/
9
],
[
1.0
/
90
,
1.0
/
45
,
2.0
/
45
],
[
1.0
/
90
,
-
1.0
/
45
,
2.0
/
45
],
[
1.0
/
45
,
1.0
/
90
,
1.0
/
180
],
[
1.0
/
45
,
-
1.0
/
90
,
1.0
/
180
],
[
0.0
,
0.0
,
1.0
]
],
dtype
=
np
.
float32
)
new_shape
=
[
64
,
weight_data
.
shape
[
0
],
weight_data
.
shape
[
1
]]
# TOC
else
:
G
=
np
.
array
([
[
1.0
,
0.0
,
0.0
],
[
0.5
,
0.5
,
0.5
],
[
0.5
,
-
0.5
,
0.5
],
[
0.0
,
0.0
,
1.0
],
],
dtype
=
np
.
float32
)
new_shape
=
[
16
,
weight_data
.
shape
[
0
],
weight_data
.
shape
[
1
]]
# TOC
new_weight_value
=
G
.
dot
(
weight_data
).
dot
(
G
.
T
)
# [8, O, I, 8]
new_weight_value
=
new_weight_value
.
transpose
(
0
,
3
,
1
,
2
)
new_weight_value
=
new_weight_value
.
reshape
(
new_shape
)
self
.
add_tensor
(
weight_tensor_name
,
new_weight_value
)
op_def
.
input
.
extend
([
weight_tensor_name
])
winograd_transformed_arg
=
op_def
.
arg
.
add
()
winograd_transformed_arg
.
name
=
'is_filter_transformed'
winograd_transformed_arg
.
i
=
1
def
convert_winograd_conv_gpu
(
self
,
op
):
# Add filter
weight_tensor_name
=
op
.
name
+
'_weight:0'
self
.
add_tensor
(
weight_tensor_name
,
op
.
data
[
0
])
...
...
@@ -504,10 +552,8 @@ class CaffeConverter(object):
paddings
,
strides
,
_
=
self
.
add_stride_pad_kernel_arg
(
param
,
None
)
filter_shape
=
np
.
asarray
(
op
.
data
[
0
].
shape
)
if
self
.
device
!=
'neon'
:
filter_shape
=
filter_shape
[[
2
,
3
,
0
,
1
]]
# OIHW -> HWOI
input_format
=
'N
CHW'
if
self
.
device
==
'neon'
else
'N
HWC'
input_format
=
'NHWC'
output_shape
=
Shapes
.
conv_pool_shape
(
op
.
get_single_parent
().
output_shape_map
[
op
.
layer
.
bottom
[
0
]],
filter_shape
,
paddings
,
strides
,
[
1
,
1
],
math
.
floor
,
input_format
)
...
...
@@ -526,16 +572,10 @@ class CaffeConverter(object):
wt_output_name
=
wt_op
.
name
+
":0"
wt_op
.
output
.
extend
([
wt_output_name
])
wt_output_shape
=
mace_pb2
.
OutputShape
()
if
self
.
device
!=
'neon'
:
wt_output_width
=
output_shape
[
0
]
*
((
output_shape
[
1
]
+
1
)
/
2
)
*
((
output_shape
[
2
]
+
1
)
/
2
)
wt_output_shape
.
dims
.
extend
(
[
16
,
filter_shape
[
3
],
wt_output_width
,
1
])
else
:
wt_output_width
=
output_shape
[
0
]
*
((
output_shape
[
2
]
+
1
)
/
2
)
*
((
output_shape
[
3
]
+
1
)
/
2
)
wt_output_shape
.
dims
.
extend
(
[
16
,
filter_shape
[
1
],
wt_output_width
,
1
])
wt_output_width
=
output_shape
[
0
]
*
((
output_shape
[
1
]
+
1
)
/
2
)
*
((
output_shape
[
2
]
+
1
)
/
2
)
wt_output_shape
.
dims
.
extend
(
[
16
,
filter_shape
[
3
],
wt_output_width
,
1
])
wt_op
.
output_shape
.
extend
([
wt_output_shape
])
# MatMul
...
...
@@ -549,12 +589,8 @@ class CaffeConverter(object):
matmul_output_name
=
matmul_op
.
name
+
":0"
matmul_op
.
output
.
extend
([
matmul_output_name
])
matmul_output_shape
=
mace_pb2
.
OutputShape
()
if
self
.
device
!=
'neon'
:
matmul_output_shape
.
dims
.
extend
(
[
16
,
filter_shape
[
2
],
wt_output_width
,
1
])
else
:
matmul_output_shape
.
dims
.
extend
(
[
16
,
filter_shape
[
0
],
wt_output_width
,
1
])
matmul_output_shape
.
dims
.
extend
(
[
16
,
filter_shape
[
2
],
wt_output_width
,
1
])
matmul_op
.
output_shape
.
extend
([
matmul_output_shape
])
# Inverse transform
...
...
@@ -567,12 +603,10 @@ class CaffeConverter(object):
batch_arg
.
i
=
output_shape
[
0
]
height_arg
=
iwt_op
.
arg
.
add
()
height_arg
.
name
=
'height'
height_arg
.
i
=
output_shape
[
1
]
if
self
.
device
!=
'neon'
else
output_shape
[
2
]
height_arg
.
i
=
output_shape
[
1
]
width_arg
=
iwt_op
.
arg
.
add
()
width_arg
.
name
=
'width'
width_arg
.
i
=
output_shape
[
2
]
if
self
.
device
!=
'neon'
else
output_shape
[
3
]
width_arg
.
i
=
output_shape
[
2
]
iwt_op
.
name
=
op
.
name
+
'_inverse_transform'
iwt_op
.
type
=
'WinogradInverseTransform'
iwt_op
.
input
.
extend
([
matmul_output_name
])
...
...
@@ -591,7 +625,7 @@ class CaffeConverter(object):
self
.
resolved_ops
.
add
(
op
.
name
)
if
len
(
self
.
ops_map
[
final_op
.
name
].
children
)
==
1
and
\
self
.
ops_map
[
final_op
.
name
].
children
[
0
].
type
\
self
.
ops_map
[
final_op
.
name
].
children
[
0
].
type
\
in
activation_name_map
:
activation_op
=
self
.
ops_map
[
final_op
.
name
].
children
[
0
]
fused_act_arg
=
iwt_op
.
arg
.
add
()
...
...
@@ -645,8 +679,8 @@ class CaffeConverter(object):
output_shape
=
op
.
get_single_parent
().
output_shape_map
[
op
.
layer
.
bottom
[
0
]]
if
len
(
self
.
ops_map
[
final_op
.
name
].
children
)
==
1
\
and
self
.
ops_map
[
final_op
.
name
].
children
[
0
].
type
\
if
len
(
self
.
ops_map
[
final_op
.
name
].
children
)
==
1
and
\
self
.
ops_map
[
final_op
.
name
].
children
[
0
].
type
\
in
activation_name_map
:
activation_op
=
self
.
ops_map
[
final_op
.
name
].
children
[
0
]
fused_act_arg
=
op_def
.
arg
.
add
()
...
...
@@ -727,13 +761,15 @@ class CaffeConverter(object):
op_def
.
input
.
extend
([
bias_tensor_name
])
self
.
resolved_ops
.
add
(
op
.
name
)
input_format
=
'NCHW'
if
self
.
device
==
'neon'
else
'NHWC'
output_shape
=
Shapes
.
fully_connected_shape
(
input_shape
,
weight_data
.
shape
)
weight_data
.
shape
,
input_format
)
op
.
output_shape_map
[
op
.
layer
.
top
[
0
]]
=
output_shape
final_op
=
op
if
len
(
self
.
ops_map
[
final_op
.
name
].
children
)
==
1
\
and
self
.
ops_map
[
final_op
.
name
].
children
[
0
].
type
\
and
self
.
ops_map
[
final_op
.
name
].
children
[
0
].
type
\
in
activation_name_map
:
activation_op
=
self
.
ops_map
[
final_op
.
name
].
children
[
0
]
fused_act_arg
=
op_def
.
arg
.
add
()
...
...
@@ -764,7 +800,7 @@ class CaffeConverter(object):
input_shape
=
op
.
get_single_parent
().
output_shape_map
[
op
.
layer
.
bottom
[
0
]]
if
param
.
HasField
(
'global_pooling'
)
and
param
.
global_pooling
:
kernels
=
[
input_shape
[
1
],
input_shape
[
2
]]
kernels
=
[
input_shape
[
2
],
input_shape
[
3
]]
kernel_arg
=
op_def
.
arg
.
add
()
kernel_arg
.
name
=
'kernels'
...
...
@@ -1054,8 +1090,8 @@ class CaffeConverter(object):
if
op
.
type
==
'Input'
:
self
.
resolved_ops
.
add
(
op
.
name
)
elif
op
.
type
==
'Convolution'
:
if
self
.
check_winograd_conv
(
op
):
self
.
convert_winograd_conv
(
op
)
if
self
.
device
==
'gpu'
and
self
.
check_winograd_conv
(
op
):
self
.
convert_winograd_conv
_gpu
(
op
)
else
:
self
.
convert_conv2d
(
op
)
elif
op
.
type
==
'BatchNorm'
:
...
...
mace/python/tools/tf_converter_lib.py
浏览文件 @
8a480d62
...
...
@@ -257,15 +257,19 @@ class TFConverter(object):
return
False
width
=
output_shape
[
0
]
*
((
output_shape
[
1
]
+
1
)
/
2
)
*
((
output_shape
[
2
]
+
1
)
/
2
)
return
self
.
winograd
and
op
.
type
!=
'DepthwiseConv2dNative'
and
\
self
.
device
==
'gpu'
and
filter_shape
[
0
]
==
3
and
\
(
filter_shape
[
0
]
==
filter_shape
[
1
])
and
\
(
strides
[
0
]
==
1
)
and
(
strides
[
0
]
==
strides
[
1
])
and
\
(
16
*
filter_shape
[
2
]
<
OPENCL_IMAGE_MAX_SIZE
)
and
\
(
16
*
filter_shape
[
3
]
<
OPENCL_IMAGE_MAX_SIZE
)
and
\
(
width
<
OPENCL_IMAGE_MAX_SIZE
)
def
convert_winograd_conv
(
self
,
op
):
if
self
.
winograd
and
op
.
type
!=
'DepthwiseConv2dNative'
and
\
filter_shape
[
0
]
==
3
and
\
(
filter_shape
[
0
]
==
filter_shape
[
1
])
and
\
(
strides
[
0
]
==
1
)
and
(
strides
[
0
]
==
strides
[
1
]):
if
self
.
device
==
'gpu'
:
return
(
16
*
filter_shape
[
2
]
<
OPENCL_IMAGE_MAX_SIZE
)
and
\
(
16
*
filter_shape
[
3
]
<
OPENCL_IMAGE_MAX_SIZE
)
and
\
(
width
<
OPENCL_IMAGE_MAX_SIZE
)
elif
self
.
device
==
'neon'
:
return
filter_shape
[
2
]
>=
8
and
filter_shape
[
3
]
>=
8
return
False
def
convert_winograd_conv_gpu
(
self
,
op
):
filter_tensor
=
get_input_tensor
(
op
,
1
)
filter_shape
=
filter_tensor
.
shape
.
as_list
()
output_shape
=
op
.
outputs
[
0
].
shape
.
as_list
()
...
...
@@ -355,7 +359,55 @@ class TFConverter(object):
self
.
add_output_shape
(
final_op
.
outputs
,
iwt_op
)
self
.
net_def
.
op
.
extend
([
wt_op
,
matmul_op
,
iwt_op
])
def
convert_conv_winograd_filter_neon
(
self
,
op
,
op_def
):
weight_tensor
=
get_input_tensor
(
op
,
1
)
weight_tensor_value
=
weight_tensor
.
eval
().
astype
(
np
.
float32
)
input_shape
=
get_input_tensor
(
op
,
0
).
shape
.
as_list
()
output_channels
=
weight_tensor_value
.
shape
[
3
]
input_channels
=
weight_tensor_value
.
shape
[
2
]
# HWIO -> OIHW
weight_tensor_value
=
weight_tensor_value
.
transpose
(
3
,
2
,
0
,
1
)
if
input_shape
[
2
]
>
16
and
input_shape
[
3
]
>
16
:
G
=
np
.
array
([
[
1.0
,
0.0
,
0.0
],
[
-
2.0
/
9
,
-
2.0
/
9
,
-
2.0
/
9
],
[
-
2.0
/
9
,
2.0
/
9
,
-
2.0
/
9
],
[
1.0
/
90
,
1.0
/
45
,
2.0
/
45
],
[
1.0
/
90
,
-
1.0
/
45
,
2.0
/
45
],
[
1.0
/
45
,
1.0
/
90
,
1.0
/
180
],
[
1.0
/
45
,
-
1.0
/
90
,
1.0
/
180
],
[
0.0
,
0.0
,
1.0
]
],
dtype
=
np
.
float32
)
new_shape
=
[
64
,
output_channels
,
input_channels
]
# TOC
else
:
G
=
np
.
array
([
[
1.0
,
0.0
,
0.0
],
[
0.5
,
0.5
,
0.5
],
[
0.5
,
-
0.5
,
0.5
],
[
0.0
,
0.0
,
1.0
],
],
dtype
=
np
.
float32
)
new_shape
=
[
16
,
output_channels
,
input_channels
]
# TOC
new_weight_value
=
G
.
dot
(
weight_tensor_value
).
dot
(
G
.
T
)
# [t, O, I, t]
new_weight_value
=
new_weight_value
.
transpose
(
0
,
3
,
1
,
2
)
new_weight_value
=
new_weight_value
.
reshape
(
new_shape
)
new_tensor_name
=
weight_tensor
.
name
[:
-
2
]
+
'/winograd_transformed:0'
self
.
add_tensor
(
new_tensor_name
,
new_shape
,
tf
.
float32
,
new_weight_value
)
winograd_transformed_arg
=
op_def
.
arg
.
add
()
winograd_transformed_arg
.
name
=
'is_filter_transformed'
winograd_transformed_arg
.
i
=
1
self
.
unused_tensor
.
add
(
weight_tensor
.
name
)
op_def
.
input
.
extend
([
op
.
inputs
[
0
].
name
])
op_def
.
input
.
extend
([
new_tensor_name
])
def
convert_conv2d
(
self
,
op
):
use_winograd
=
False
if
self
.
device
==
'neon'
:
use_winograd
=
self
.
check_winograd_conv
(
op
)
op_def
=
mace_pb2
.
OperatorDef
()
arg
=
op_def
.
arg
.
add
()
arg
.
name
=
'T'
...
...
@@ -366,7 +418,7 @@ class TFConverter(object):
else
:
op_def
.
type
=
op
.
type
if
self
.
device
==
'neon'
:
if
self
.
device
==
'neon'
and
not
use_winograd
:
self
.
transpose_filter_tensor
[
get_input_tensor
(
op
,
1
).
name
]
=
(
3
,
2
,
0
,
1
)
elif
op
.
type
==
'Conv2D'
:
...
...
@@ -381,6 +433,8 @@ class TFConverter(object):
output_name
=
self
.
add_buffer_to_image
(
get_input_tensor
(
op
,
1
).
name
,
buffer_type
)
op_def
.
input
.
extend
([
output_name
])
elif
self
.
device
==
'neon'
and
use_winograd
:
self
.
convert_conv_winograd_filter_neon
(
op
,
op_def
)
else
:
op_def
.
input
.
extend
(
[
get_input_tensor
(
op
,
i
).
name
for
i
in
range
(
len
(
op
.
inputs
))])
...
...
@@ -1057,8 +1111,8 @@ class TFConverter(object):
elif
self
.
check_conv_to_fc
(
op
):
self
.
convert_global_conv_to_fc
(
op
)
elif
op
.
type
==
'Conv2D'
or
op
.
type
==
'DepthwiseConv2dNative'
:
if
self
.
check_winograd_conv
(
op
):
self
.
convert_winograd_conv
(
op
)
if
self
.
device
==
'gpu'
and
self
.
check_winograd_conv
(
op
):
self
.
convert_winograd_conv
_gpu
(
op
)
else
:
self
.
convert_conv2d
(
op
)
elif
op
.
type
==
'FusedBatchNorm'
:
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录