MegEngine / MegEngine · Commit 63a9bd30
Authored Mar 31, 2021 by Megvii Engine Team

feat(mgb/gopt): add an opt pass for padding channels to enable fast int8/int4 support on GPU

GitOrigin-RevId: 94c719bb5c5410925f57d626de088b86afed4750
Parent: 56e863b7
3 changed files with 657 additions and 0 deletions:

    src/gopt/impl/tensor_reformat.cpp           +340  -0
    src/gopt/include/megbrain/gopt/inference.h   +10  -0
    src/gopt/test/inference.cpp                 +307  -0
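In outline, the new pass makes a single rewriting sweep over the graph: it widens quantized conv channels to hardware-friendly multiples (4 for dp4a, 32 for tensor cores) by concatenating zero channels, propagates the widening through format-aware and elemwise operators, and slices endpoints back to their original width. A worked shape trace distilled from the tests below (editorial illustration, not part of the diff):

    x (16, 3, 14, 14)  -- conv, w (20, 3, 3, 3)  -->  (16, 20, 14, 14)   // in 3 -> 4; out stays 20, since 20 % 4 == 0
                       -- conv, w (24, 20, 3, 3) -->  (16, 32, 14, 14)   // in 20 -> 32; out 24 -> 32
                       -- conv, w (20, 24, 3, 3) -->  (16, 32, 14, 14)   // out 20 -> 32; sliced back to 20 at the endpoint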
src/gopt/impl/tensor_reformat.cpp

@@ -3624,4 +3624,344 @@ void FoldingConvBiasDimshufflePass::apply(OptState& opt) const {
    MIDOUT_E
}

/* ==================== PaddingChannelPass ================= */
const char* PaddingChannelPass::name() const {
    return mgb_cstr_log("padding output channel to multiple of 4/32");
}
void PaddingChannelPass::apply(OptState& opt) const {
    MIDOUT_B("PaddingChannelPassPass::apply");
    // do not check shape
    opt.set_var_replace_check_flag(VarReplaceCheckFlag::CHECK_ALL ^
                                   VarReplaceCheckFlag::CHECK_SHAPE);
    ThinHashSet<OperatorNodeBase*> padding_oprs;
    ThinHashMap<Typeinfo*,
                thin_function<OperatorNodeBase*(OperatorNodeBase*,
                                                const VarNodeArray&)>>
            opr_replace_funcs;
    auto rewriter = opt.graph().make_rewriter();

    auto pad_in_channels = [](VarNode* inp, size_t pad_channels) -> VarNode* {
        mgb_assert(inp->shape().ndim == 4);
        mgb_assert(inp->dtype().enumv() == DTypeEnum::QuantizedS8 ||
                   inp->dtype().enumv() == DTypeEnum::QuantizedS32);
        TensorShape shape{inp->shape()[0], pad_channels, inp->shape()[2],
                          inp->shape()[3]};
        std::shared_ptr<HostTensorND> host_val = std::make_shared<HostTensorND>(
                inp->comp_node(), shape, inp->dtype());
        auto ptr = host_val->raw_ptr();
        std::memset(ptr, 0, shape.total_nr_elems() * inp->dtype().size());
        auto padding =
                opr::ImmutableTensor::make(*inp->owner_graph(), *host_val);
        auto out = opr::Concat::make({inp, padding}, 1);
        return out.node();
    };

    auto pad_out_channels = [](VarNode* inp, size_t pad_channels) -> VarNode* {
        mgb_assert(inp->shape().ndim == 4);
        mgb_assert(inp->dtype().enumv() == DTypeEnum::QuantizedS8 ||
                   inp->dtype().enumv() == DTypeEnum::QuantizedS32);
        TensorShape shape{pad_channels, inp->shape()[1], inp->shape()[2],
                          inp->shape()[3]};
        std::shared_ptr<HostTensorND> host_val = std::make_shared<HostTensorND>(
                inp->comp_node(), shape, inp->dtype());
        auto ptr = host_val->raw_ptr();
        std::memset(ptr, 0, shape.total_nr_elems() * inp->dtype().size());
        auto padding =
                opr::ImmutableTensor::make(*inp->owner_graph(), *host_val);
        auto out = opr::Concat::make({inp, padding}, 0);
        return out.node();
    };
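    // Editorial note, not part of the diff: both helpers above build an
    // all-zero ImmutableTensor of the missing size and concatenate it onto
    // the var, so pad_in_channels(v, 12) turns an (N, 20, H, W) var into
    // (N, 32, H, W); pad_out_channels pads along axis 0 instead, which is
    // the output-channel axis of a conv weight laid out as OIHW.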
    auto extract_subtensor = [](VarNode* inp,
                                size_t orig_channels) -> VarNode* {
        mgb_assert(inp->shape().ndim == 4);
        auto x = SymbolVar(inp);
        auto cv = [&x](int v) { return x.make_scalar(v); };
        using AIdx = opr::Subtensor::AxisIndexer;
        auto sub = opr::Subtensor::make(
                x, {AIdx::make_interval(0, None, None, cv(1)),
                    AIdx::make_interval(1, None, cv(orig_channels), None),
                    AIdx::make_interval(2, None, None, cv(1)),
                    AIdx::make_interval(3, None, None, cv(1))});
        return sub.node();
    };
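    // Editorial note, not part of the diff: extract_subtensor(v, c) keeps
    // only the first c channels, i.e. it is the graph-level equivalent of
    // v[:, 0:c, :, :] with step 1 on every axis; it is how the pass undoes
    // zero padding wherever the original channel count must be observed.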
    // padding policy for conv bias with data type qint8
    auto padding_policy_qint8 = [&padding_oprs, &pad_in_channels,
                                 &pad_out_channels](
                                        OperatorNodeBase* opr,
                                        const VarNodeArray& new_inp) {
        mgb_assert(opr->input().size() == new_inp.size());
        mgb_assert(new_inp.size() == 3);
        mgb_assert(opr->input(1)->shape().eq_shape(new_inp[1]->shape()));
        auto inps = new_inp;
        size_t out_channels = opr->input(1)->shape()[0];
        size_t in_channels = opr->input(1)->shape()[1];
        size_t new_in_channels = new_inp[0]->shape()[1];
        // pad input channels
        if (padding_oprs.count(opr->input(0)->owner_opr())) {
            size_t pad_channels = new_in_channels - in_channels;
            inps[1] = pad_in_channels(new_inp[1], pad_channels);
        } else {
            size_t pad_channels = 0;
            mgb_assert(new_in_channels == in_channels);
            if (in_channels <= 16) {
                if (in_channels % 4)
                    pad_channels = 4 - (in_channels % 4);  // pad to use dp4a
            } else {
                if (in_channels % 32)
                    pad_channels =
                            32 - (in_channels % 32);  // pad to use tensorcore
            }
            if (pad_channels > 0) {
                inps[0] = pad_in_channels(new_inp[0], pad_channels);
                inps[1] = pad_in_channels(new_inp[1], pad_channels);
            }
        }
        out_channels = inps[1]->shape()[0];
        in_channels = inps[1]->shape()[1];
        size_t pad_channels = 0;
        if (in_channels <= 16) {
            if (out_channels % 4)
                pad_channels = 4 - (out_channels % 4);
        } else {
            if (out_channels <= 16) {
                if (out_channels % 4)
                    pad_channels = 4 - (out_channels % 4);
            } else {
                if (out_channels % 32)
                    pad_channels = 32 - (out_channels % 32);
            }
        }
        if (pad_channels > 0) {
            inps[1] = pad_out_channels(inps[1], pad_channels);
            inps[2] = pad_in_channels(inps[2], pad_channels);
            padding_oprs.insert(opr);
        }
        return serialization::copy_opr_shallow(*opr, inps, opr->config());
    };
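    // Editorial note, not part of the diff: the alignment rule above is
    //     pad = ch % align ? align - (ch % align) : 0
    // with align = 4 when the relevant channel count is at most 16 (so dp4a
    // kernels apply) and align = 32 otherwise (so tensor-core kernels
    // apply). For example, a conv with 3 input / 20 output channels pads
    // its input to 4 and keeps 20 (20 % 4 == 0); a following conv with
    // 20 inputs / 24 outputs is padded to 32 inputs / 32 outputs.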
    // padding policy for conv bias with data type qint4 and quint4
    auto padding_policy_int4 = [&padding_oprs, &pad_in_channels,
                                &pad_out_channels](
                                       OperatorNodeBase* opr,
                                       const VarNodeArray& new_inp) {
        mgb_assert(opr->input().size() == new_inp.size());
        mgb_assert(new_inp.size() == 3);
        mgb_assert(opr->input(1)->shape().eq_shape(new_inp[1]->shape()));
        auto inps = new_inp;
        return serialization::copy_opr_shallow(*opr, inps, opr->config());
    };

    opr_replace_funcs[opr::ConvBiasForward::typeinfo()] =
            [&padding_oprs, &padding_policy_qint8, &padding_policy_int4](
                    OperatorNodeBase* opr, const VarNodeArray& new_inp) {
                if (opr->input(0)->dtype().enumv() == DTypeEnum::QuantizedS8) {
                    return padding_policy_qint8(opr, new_inp);
                } else if (opr->input(0)->dtype().enumv() ==
                                   DTypeEnum::QuantizedS4 ||
                           opr->input(0)->dtype().enumv() ==
                                   DTypeEnum::Quantized4Asymm) {
                    return padding_policy_int4(opr, new_inp);
                } else {
                    mgb_assert(
                            padding_oprs.count(opr->input(0)->owner_opr()) == 0,
                            "conv bias operator for data type(%s) cannot be "
                            "padded channel. "
                            "consumer(%s), producer(%s)",
                            opr->input(0)->dtype().name(), opr->cname(),
                            opr->input(0)->owner_opr()->cname());
                    return serialization::copy_opr_shallow(*opr, new_inp,
                                                           opr->config());
                }
            };
    opr_replace_funcs[opr::ConvolutionBackwardData::typeinfo()] =
            [&padding_oprs, &pad_in_channels, &pad_out_channels](
                    OperatorNodeBase* opr, const VarNodeArray& new_inp) {
                if (opr->input(1)->dtype().enumv() != DTypeEnum::QuantizedS8) {
                    mgb_assert(
                            padding_oprs.count(opr->input(0)->owner_opr()) == 0,
                            "conv bwd data operator for data type(%s) cannot "
                            "be padded channel. "
                            "consumer(%s), producer(%s)",
                            opr->input(0)->dtype().name(), opr->cname(),
                            opr->input(0)->owner_opr()->cname());
                    return serialization::copy_opr_shallow(*opr, new_inp,
                                                           opr->config());
                }
                mgb_assert(opr->input().size() == new_inp.size());
                mgb_assert(new_inp.size() == 2,
                           "deconv (conv bwd data) operator for inference can "
                           "only have 2 input vars(got:%zu)",
                           new_inp.size());
                mgb_assert(
                        opr->input(0)->shape().eq_shape(new_inp[0]->shape()));
                auto inps = new_inp;
                size_t out_channels = opr->input(0)->shape()[0];
                size_t in_channels = opr->input(0)->shape()[1];
                size_t new_out_channels = new_inp[1]->shape()[1];
                // pad output channels
                if (padding_oprs.count(opr->input(1)->owner_opr())) {
                    size_t pad_channels = new_out_channels - out_channels;
                    inps[0] = pad_out_channels(new_inp[0], pad_channels);
                } else {
                    size_t pad_channels = 0;
                    if (out_channels % 4)
                        pad_channels = 4 - (out_channels % 4);
                    if (pad_channels > 0) {
                        inps[0] = pad_out_channels(new_inp[0], pad_channels);
                        inps[1] = pad_in_channels(new_inp[1], pad_channels);
                    }
                }
                out_channels = inps[0]->shape()[0];
                in_channels = inps[0]->shape()[1];
                // pad input channels
                size_t pad_channels = 0;
                if (in_channels % 4)
                    pad_channels = 4 - (in_channels % 4);
                if (pad_channels > 0) {
                    inps[0] = pad_in_channels(inps[0], pad_channels);
                    padding_oprs.insert(opr);
                }
                return serialization::copy_opr_shallow(*opr, inps,
                                                       opr->config());
            };
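    // Editorial note, not part of the diff: for conv bwd data (deconv),
    // axis 0 of the filter matches the channels of the incoming data, so it
    // is padded to follow an already-padded producer, while axis 1
    // determines the deconv output and is padded to a multiple of 4; the
    // opr is then recorded in padding_oprs so consumers know its output was
    // widened. Only QuantizedS8 is handled here.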
    auto replace_format_aware_opr = [&padding_oprs](
                                            OperatorNodeBase* opr,
                                            const VarNodeArray& new_inp) {
        if (opr->input(0)->dtype().enumv() != DTypeEnum::QuantizedS8 &&
            opr->input(0)->dtype().enumv() != DTypeEnum::QuantizedS4 &&
            opr->input(0)->dtype().enumv() != DTypeEnum::Quantized4Asymm) {
            mgb_assert(padding_oprs.count(opr->input(0)->owner_opr()) == 0,
                       "operator(type:%s,name:%s) for data type(%s) cannot be "
                       "padded channel. extra info:"
                       "consumer(%s), producer(%s)",
                       opr->dyn_typeinfo()->name, opr->cname(),
                       opr->input(0)->dtype().name(), opr->cname(),
                       opr->input(0)->owner_opr()->cname());
            return serialization::copy_opr_shallow(*opr, new_inp,
                                                   opr->config());
        }
        mgb_assert(opr->input().size() == new_inp.size());
        if (padding_oprs.count(opr->input(0)->owner_opr())) {
            padding_oprs.insert(opr);
        }
        return serialization::copy_opr_shallow(*opr, new_inp, opr->config());
    };
    opr_replace_funcs[opr::PoolingForward::typeinfo()] =
            replace_format_aware_opr;
    opr_replace_funcs[opr::WarpPerspectiveForward::typeinfo()] =
            replace_format_aware_opr;
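    // Editorial note, not part of the diff: pooling and warp perspective do
    // not mix channels, so they can simply run on the widened tensor; the
    // handler merely records them in padding_oprs when their input producer
    // was padded, keeping the bookkeeping intact for downstream consumers.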
    auto replace_elemwise_like_opr = [&padding_oprs, &extract_subtensor](
                                             OperatorNodeBase* opr,
                                             const VarNodeArray& new_inp) {
        mgb_assert(opr->input().size() == new_inp.size());
        bool have_padding_inp = false;
        bool padding_all_inps = true;
        bool same_padding = true;
        size_t channels_after_padding = 0;
        for (auto&& cur_inp : opr->input()) {
            bool padding_cur_inp = padding_oprs.count(cur_inp->owner_opr()) > 0;
            if (padding_cur_inp) {
                if (!have_padding_inp)
                    have_padding_inp = true;
                if (channels_after_padding == 0) {
                    channels_after_padding = cur_inp->shape()[1];
                } else {
                    same_padding =
                            channels_after_padding == cur_inp->shape()[1];
                }
            }
            if (padding_all_inps && (!padding_cur_inp || !same_padding))
                padding_all_inps = false;
        }
        if (have_padding_inp && !padding_all_inps) {
            auto inps = new_inp;
            for (size_t i = 0; i < new_inp.size(); ++i) {
                auto cur_inp = opr->input(i);
                bool padding_cur_inp =
                        padding_oprs.count(cur_inp->owner_opr()) > 0;
                if (padding_cur_inp) {
                    size_t orig_channels = cur_inp->shape()[1];
                    inps[i] = extract_subtensor(inps[i], orig_channels);
                }
            }
            return serialization::copy_opr_shallow(*opr, inps, opr->config());
        }
        if (padding_all_inps) {
            padding_oprs.insert(opr);
        }
        return serialization::copy_opr_shallow(*opr, new_inp, opr->config());
    };
    opr_replace_funcs[opr::ElemwiseMultiType::typeinfo()] =
            replace_elemwise_like_opr;
    opr_replace_funcs[opr::Elemwise::typeinfo()] = replace_elemwise_like_opr;
    opr_replace_funcs[opr::TypeCvt::typeinfo()] = replace_elemwise_like_opr;
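    // Editorial note, not part of the diff: elemwise-like oprs stay padded
    // only when every input is padded to the same channel count; otherwise
    // each padded input is sliced back via extract_subtensor. In the
    // PaddingChannels test below, QFUSE_ADD_RELU sees a 20-channel and a
    // 32-channel operand, so the padded one is cut back to 20 channels.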
    auto replace_nonpadding_oprs = [&padding_oprs, &extract_subtensor](
                                           OperatorNodeBase* opr,
                                           const VarNodeArray& new_inp) {
        mgb_assert(opr->input().size() == new_inp.size());
        bool have_padding_inp = false;
        auto inps = new_inp;
        for (size_t i = 0; i < new_inp.size(); ++i) {
            auto cur_inp = opr->input(i);
            bool padding_cur_inp = padding_oprs.count(cur_inp->owner_opr()) > 0;
            if (padding_cur_inp) {
                if (!have_padding_inp)
                    have_padding_inp = true;
                size_t orig_channels = cur_inp->shape()[1];
                inps[i] = extract_subtensor(inps[i], orig_channels);
            }
        }
        return serialization::copy_opr_shallow(*opr, inps, opr->config());
    };
    opr_replace_funcs[opr::Reshape::typeinfo()] = replace_nonpadding_oprs;
    opr_replace_funcs[opr::GetVarShape::typeinfo()] = replace_nonpadding_oprs;
    opr_replace_funcs[opr::Concat::typeinfo()] = replace_nonpadding_oprs;
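    // Editorial note, not part of the diff: Reshape, GetVarShape and Concat
    // are shape-sensitive, so padded inputs are always sliced back to their
    // original channel count first; padding never propagates through them.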
    auto on_opr = [&opt, &rewriter, &opr_replace_funcs,
                   &extract_subtensor](OperatorNodeBase* opr) {
        auto it = opr_replace_funcs.find(opr->dyn_typeinfo());
        if (it != opr_replace_funcs.end()) {
            VarNodeArray new_inp;
            new_inp.reserve(opr->input().size());
            for (auto&& inp : opr->input()) {
                new_inp.push_back(rewriter.get_var(inp));
            }
            auto new_opr = (it->second)(opr, new_inp);
            auto &&out0 = opr->output(), &&out1 = new_opr->output();
            mgb_assert(out0.size() == out1.size(),
                       "bad opr replace: src=%s{%s} dst=%s{%s}, "
                       "src.size=%zu "
                       "dst.size=%zu",
                       opr->cname(), opr->dyn_typeinfo()->name,
                       new_opr->cname(), new_opr->dyn_typeinfo()->name,
                       out0.size(), out1.size());
            for (size_t i = 0; i < out0.size(); ++i) {
                if (!out0[i]->contain_flag(VarNode::Flag::VOLATILE_CONTENT)) {
                    mgb_assert(!out1[i]->contain_flag(
                            VarNode::Flag::VOLATILE_CONTENT));
                    auto src = out0[i];
                    auto dst = out1[i];
                    if (opt.graph().endpoint_contain(src) &&
                        !src->shape().eq_shape(dst->shape())) {
                        size_t orig_channels = src->shape()[1];
                        dst = extract_subtensor(dst, orig_channels);
                    }
                    rewriter.replace_var(src, dst, nullptr);
                }
            }
        } else {
            rewriter.auto_replace_outputs(opr);
        }
    };
    opt.graph().iter(on_opr);
    rewriter.apply_inplace();
    MIDOUT_E
}

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

src/gopt/include/megbrain/gopt/inference.h

@@ -409,6 +409,16 @@ namespace gopt {
        void apply(OptState& opt) const override;
    };

    /*!
     * \brief pad channels to enable fast int8/int4 support;
     * assumes the input network is built in NCHW tensor format
     */
    class PaddingChannelPass final : public Pass {
    public:
        const char* name() const override;
        void apply(OptState& opt) const override;
    };
}  // namespace gopt
}  // namespace mgb
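How the new pass is driven, as a minimal sketch distilled from the tests below (editorial illustration, not part of the diff; endpoint stands in for a real network output var):

    SymbolVar endpoint_pad;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::PaddingChannelPass>()
                          .apply({{endpoint}})
                          .endpoint_vars(),
                  endpoint_pad);
    // endpoint_pad computes the same values as endpoint: channels are only
    // widened on internal vars, and endpoints are sliced back to shape.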

src/gopt/test/inference.cpp

@@ -4178,6 +4178,313 @@ TEST(TestGoptInference, FoldingConvDimshuffleNCHW32NCHW4) {
    MGB_ASSERT_TENSOR_EQ(host_y_fuse, host_y_non_fuse);
}
#endif

TEST(TestGoptInference, PaddingChannels) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < 61) {
        printf("This testcase is ignored due to insufficient cuda cap(got: "
               "%d, expected: %d)\n",
               sm_ver, 61);
        return;
    }
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };
    auto x = mkvar("x", {16, 3, 14, 14}, dtype::QuantizedS8(2.5f)),
         w = mkcvar("w", {20, 3, 3, 3}, dtype::QuantizedS8(2.5f)),
         b = mkcvar("b", {1, 20, 1, 1}, dtype::QuantizedS32(6.25f));
    opr::ConvBias::Param param;
    param.format = opr::ConvBias::Param::Format::NCHW;
    param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    param.stride_h = param.stride_w = 1;
    param.pad_h = param.pad_w = 1;
    auto y = opr::ConvBias::make(x, w, b, param, {},
                                 OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    auto w1 = mkcvar("w1", {24, 20, 3, 3}, dtype::QuantizedS8(2.5f)),
         b1 = mkcvar("b1", {1, 24, 1, 1}, dtype::QuantizedS32(6.25f));
    auto y1 = opr::ConvBias::make(y, w1, b1, param, {},
                                  OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    auto w2 = mkcvar("w2", {20, 24, 3, 3}, dtype::QuantizedS8(2.5f)),
         b2 = mkcvar("b2", {1, 20, 1, 1}, dtype::QuantizedS32(6.25f));
    auto y2 = opr::ConvBias::make(y1, w2, b2, param, {},
                                  OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    using ElemMultiMode = opr::ElemwiseMultiType::Param::Mode;
    auto y3 = opr::ElemwiseMultiType::make(
            {y, y2}, {ElemMultiMode::QFUSE_ADD_RELU},
            OperatorNodeConfig{dtype::QuantizedS8{1.2f}});
    y3 = opr::TypeCvt::make(y3, dtype::Float32());
    SymbolVar y3_pad;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::PaddingChannelPass>()
                          .apply({{y3}})
                          .endpoint_vars(),
                  y3_pad);
    ASSERT_EQ(y3_pad.node()->shape()[1], y3.node()->shape()[1]);
    SmallVector<cg::OperatorNodeBase*> oprs;
    auto cb = [&oprs](cg::OperatorNodeBase* opr) {
        if (opr->same_type<opr::ConvBias>()) {
            oprs.push_back(opr);
        }
    };
    cg::DepOprIter{cb}.add(y3_pad.node()->owner_opr());
    ASSERT_EQ(oprs.size(), 3);
    ASSERT_EQ(oprs[0]->output(0)->shape()[1], 20);
    ASSERT_EQ(oprs[1]->output(0)->shape()[1], 32);
    ASSERT_EQ(oprs[2]->output(0)->shape()[1], 32);
    HostTensorND t1, t2;
    auto func1 = graph->compile({make_callback_copy(y3, t1)});
    func1->execute();
    auto func2 = graph->compile({make_callback_copy(y3_pad, t2)});
    func2->execute();
    MGB_ASSERT_TENSOR_EQ(t1, t2);
}
TEST(TestGoptInference, ConcatAfterPaddingChannels) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < 61) {
        printf("This testcase is ignored due to insufficient cuda cap(got: "
               "%d, expected: %d)\n",
               sm_ver, 61);
        return;
    }
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };
    auto x = mkvar("x", {16, 3, 14, 14}, dtype::QuantizedS8(2.5f)),
         w = mkcvar("w", {18, 3, 3, 3}, dtype::QuantizedS8(2.5f)),
         b = mkcvar("b", {1, 18, 1, 1}, dtype::QuantizedS32(6.25f));
    opr::ConvBias::Param param;
    param.format = opr::ConvBias::Param::Format::NCHW;
    param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    param.stride_h = param.stride_w = 1;
    param.pad_h = param.pad_w = 1;
    auto y = opr::ConvBias::make(x, w, b, param, {},
                                 OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    auto w1 = mkcvar("w1", {18, 18, 3, 3}, dtype::QuantizedS8(2.5f)),
         b1 = mkcvar("b1", {1, 18, 1, 1}, dtype::QuantizedS32(6.25f));
    auto y1 = opr::ConvBias::make(y, w1, b1, param, {},
                                  OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    // concat at batch dim
    auto y2 = opr::Concat::make({y, y1}, 0);
    y2 = opr::TypeCvt::make(y2, dtype::Float32());
    SymbolVar y2_pad;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::PaddingChannelPass>()
                          .apply({{y2}})
                          .endpoint_vars(),
                  y2_pad);
    ASSERT_EQ(y2_pad.node()->shape()[1], y2.node()->shape()[1]);
    SmallVector<cg::OperatorNodeBase*> oprs;
    auto cb = [&oprs](cg::OperatorNodeBase* opr) {
        if (opr->same_type<opr::ConvBias>()) {
            oprs.push_back(opr);
        }
    };
    cg::DepOprIter{cb}.add(y2_pad.node()->owner_opr());
    ASSERT_EQ(oprs.size(), 2);
    ASSERT_EQ(oprs[0]->output(0)->shape()[1], 20);
    ASSERT_EQ(oprs[1]->output(0)->shape()[1], 32);
    HostTensorND t1, t2;
    auto func1 = graph->compile({make_callback_copy(y2, t1)});
    func1->execute();
    auto func2 = graph->compile({make_callback_copy(y2_pad, t2)});
    func2->execute();
    MGB_ASSERT_TENSOR_EQ(t1, t2);
}
// FIXME replace cpu with gpu to enable gpu validation
TEST(TestGoptInference, PaddingChannelsWithPooling) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("cpu0");
    // cn.activate();
    // auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    // auto sm_ver = prop.major * 10 + prop.minor;
    // if (sm_ver < 61) {
    //     printf("This testcase is ignored due to insufficient cuda cap(got: "
    //            "%d, expected: %d)\n",
    //            sm_ver, 61);
    //     return;
    // }
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };
    auto x = mkvar("x", {16, 3, 14, 14}, dtype::QuantizedS8(2.5f)),
         w = mkcvar("w", {20, 3, 3, 3}, dtype::QuantizedS8(2.5f)),
         b = mkcvar("b", {1, 20, 1, 1}, dtype::QuantizedS32(6.25f));
    opr::ConvBias::Param param;
    param.format = opr::ConvBias::Param::Format::NCHW;
    param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    param.stride_h = param.stride_w = 1;
    param.pad_h = param.pad_w = 1;
    auto y = opr::ConvBias::make(x, w, b, param, {},
                                 OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    auto w1 = mkcvar("w1", {24, 20, 3, 3}, dtype::QuantizedS8(2.5f)),
         b1 = mkcvar("b1", {1, 24, 1, 1}, dtype::QuantizedS32(6.25f));
    auto y1 = opr::ConvBias::make(y, w1, b1, param, {},
                                  OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    opr::Pooling::Param pool_param;
    pool_param.format = opr::Pooling::Param::Format::NCHW;
    y1 = opr::Pooling::make(y1, pool_param);
    y1 = opr::TypeCvt::make(y1, dtype::Float32());
    SymbolVar y1_pad;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::PaddingChannelPass>()
                          .apply({{y1}})
                          .endpoint_vars(),
                  y1_pad);
    ASSERT_EQ(y1_pad.node()->shape()[1], y1.node()->shape()[1]);
    SmallVector<cg::OperatorNodeBase*> oprs;
    auto cb = [&oprs](cg::OperatorNodeBase* opr) {
        if (opr->same_type<opr::Pooling>()) {
            oprs.push_back(opr);
        }
    };
    cg::DepOprIter{cb}.add(y1_pad.node()->owner_opr());
    ASSERT_EQ(oprs[0]->output(0)->shape()[1], 32);
    HostTensorND t1, t2;
    auto func1 = graph->compile({make_callback_copy(y1, t1)});
    func1->execute();
    auto func2 = graph->compile({make_callback_copy(y1_pad, t2)});
    func2->execute();
    MGB_ASSERT_TENSOR_EQ(t1, t2);
}
// FIXME replace cpu with gpu to enable gpu validation
TEST(TestGoptInference, PaddingChannelsWithWarpPerspective) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("cpu0");
    // cn.activate();
    // auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    // auto sm_ver = prop.major * 10 + prop.minor;
    // if (sm_ver < 61) {
    //     printf("This testcase is ignored due to insufficient cuda cap(got: "
    //            "%d, expected: %d)\n",
    //            sm_ver, 61);
    //     return;
    // }
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };
    std::shared_ptr<HostTensorND> mat = std::make_shared<HostTensorND>(
            cn, TensorShape{16, 3, 3}, dtype::Float32());
    warp_perspective_mat_gen(*mat, 16, 14, 14);
    auto mat_var = opr::Host2DeviceCopy::make(*graph, mat).rename("mat");
    auto x = mkvar("x", {16, 3, 14, 14}, dtype::QuantizedS8(2.5f)),
         w = mkcvar("w", {20, 3, 3, 3}, dtype::QuantizedS8(2.5f)),
         b = mkcvar("b", {1, 20, 1, 1}, dtype::QuantizedS32(6.25f));
    opr::ConvBias::Param param;
    param.format = opr::ConvBias::Param::Format::NCHW;
    param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    param.stride_h = param.stride_w = 1;
    param.pad_h = param.pad_w = 1;
    auto y = opr::ConvBias::make(x, w, b, param, {},
                                 OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    auto w1 = mkcvar("w1", {24, 20, 3, 3}, dtype::QuantizedS8(2.5f)),
         b1 = mkcvar("b1", {1, 24, 1, 1}, dtype::QuantizedS32(6.25f));
    auto y1 = opr::ConvBias::make(y, w1, b1, param, {},
                                  OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    opr::WarpPerspective::Param warp_param;
    warp_param.format = opr::WarpPerspective::Param::Format::NCHW;
    y1 = opr::WarpPerspective::make(y1, mat_var, TensorShape{14, 14},
                                    warp_param);
    y1 = opr::TypeCvt::make(y1, dtype::Float32());
    SymbolVar y1_pad;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::PaddingChannelPass>()
                          .apply({{y1}})
                          .endpoint_vars(),
                  y1_pad);
    ASSERT_EQ(y1_pad.node()->shape()[1], y1.node()->shape()[1]);
    SmallVector<cg::OperatorNodeBase*> oprs;
    auto cb = [&oprs](cg::OperatorNodeBase* opr) {
        if (opr->same_type<opr::WarpPerspective>()) {
            oprs.push_back(opr);
        }
    };
    cg::DepOprIter{cb}.add(y1_pad.node()->owner_opr());
    ASSERT_EQ(oprs[0]->output(0)->shape()[1], 32);
    HostTensorND t1, t2;
    auto func1 = graph->compile({make_callback_copy(y1, t1)});
    func1->execute();
    auto func2 = graph->compile({make_callback_copy(y1_pad, t2)});
    func2->execute();
    MGB_ASSERT_TENSOR_EQ(t1, t2);
}
#endif

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}