Commit e715423f
Authored on Dec 27, 2021 by Megvii Engine Team
feat(src/gopt): add optpass on arm for fusing typecvt and elemwise to elemwise multi type
GitOrigin-RevId: e6bcbbf91bd24460b2ba2bf7dff3cd3ba13ca7e5
Parent: f6d99094
Showing 6 changed files with 297 additions and 0 deletions (+297 -0)
src/gopt/impl/framework.cpp (+6 -0)
src/gopt/impl/inference.cpp (+140 -0)
src/gopt/include/megbrain/gopt/inference.h (+9 -0)
src/gopt/test/inference.cpp (+132 -0)
src/opr/impl/nn_int.cpp (+8 -0)
src/opr/include/megbrain/opr/nn_int.h (+2 -0)
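In short, the new pass rewrites a TypeCvt (Int16/Uint8 to Float32) whose result feeds a float32 Elemwise (FUSE_MUL_ADD3 or MUL) into a single ElemwiseMultiType opr, so the arm_common kernel reads the integer tensor directly instead of materializing an intermediate float32 tensor. A minimal before/after sketch using the oprs from this diff; graph-construction boilerplate is omitted and the variables x_i16, s, b are illustrative only:

    // before: two oprs, plus an intermediate f32 tensor the size of x_i16
    auto x_f32 = opr::TypeCvt::make(x_i16, dtype::Float32());
    auto y = opr::Elemwise::make(
            {x_f32, s, b}, opr::Elemwise::Mode::FUSE_MUL_ADD3);  // x_f32 * s + b

    // after FuseTypecvtElemwisePass: one fused opr; the int16 -> f32
    // conversion happens inside the arm_common kernel
    auto y_fused = opr::ElemwiseMultiType::make(
            {x_i16, s, b},
            {opr::ElemwiseMultiType::Mode::FUSE_MUL_ADD3_INT16xF32xF32xF32},
            OperatorNodeConfig{dtype::Float32()});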
src/gopt/impl/framework.cpp

@@ -644,6 +644,11 @@ GraphOptimizer& GraphOptimizer::add_preset_passes(
    add_pass<RemoveRedundantTypeCvtPass>();
    add_pass<RemoveRedundantCopyPass>();
    //! only arm_common implements the fused TypeCvt and Elemwise optimized kernel
#if (MEGDNN_AARCH64 || MEGDNN_ARMV7) && !MGB_OPENCL && !MGB_CUDA
    add_pass<FuseTypecvtElemwisePass>();
#endif
#if MGB_JIT
    using JITConfig = cg::ComputingGraph::Options::GraphOpt::JITConfig;
    int jit_opt_level = 0;
@@ -691,6 +696,7 @@ GraphOptimizer& GraphOptimizer::add_preset_passes(
        // remove shape hint after inference optimization
        add_pass<RemoveShapeHintPass>();
    }
    return *this;
}
src/gopt/impl/inference.cpp

@@ -2187,4 +2187,144 @@ void ParamMergePass::apply(OptState& opt_state) const {
    MIDOUT_E
}

/* ==================== FuseTypecvtElemwisePass ================= */
const char* FuseTypecvtElemwisePass::name() const {
    return mgb_cstr_log("Fuse typecvt elemwise pass");
}

void FuseTypecvtElemwisePass::apply(OptState& opt) const {
    MIDOUT_B("FuseTypecvtElemwisePass::apply")
    opt.set_var_replace_check_flag(
            VarReplaceCheckFlag::CHECK_DTYPE | VarReplaceCheckFlag::CHECK_SHAPE);
    auto rewriter = opt.graph().make_rewriter();
    auto uniq_reader_check = UniqReaderCheck{opt.graph()};

    auto try_typecvt_elemwise_fma_i16xf32xf32xf32 = [&rewriter, &uniq_reader_check](
                                                            OperatorNodeBase* opr) {
        // check elemwise
        auto elemwise = try_cast_as_op<opr::Elemwise>(opr);
        if (elemwise == nullptr)
            return false;
        if (elemwise->param().mode != opr::Elemwise::Mode::FUSE_MUL_ADD3)
            return false;
        bool is_elem_src_f32 =
                elemwise->input(0)->dtype().enumv() == DTypeEnum::Float32;
        bool is_elem_dst_f32 =
                elemwise->output(0)->dtype().enumv() == DTypeEnum::Float32;
        if (!(is_elem_src_f32 && is_elem_dst_f32))
            return false;
        if (!uniq_reader_check(elemwise->input(0)))
            return false;

        // check typecvt
        auto typecvt = try_cast_as_op<opr::TypeCvt>(elemwise->input(0)->owner_opr());
        if (typecvt == nullptr)
            return false;
        bool is_typecvt_src_i16 =
                typecvt->input(0)->dtype().enumv() == DTypeEnum::Int16;
        bool is_typecvt_src_u8 =
                typecvt->input(0)->dtype().enumv() == DTypeEnum::Uint8;
        bool is_typecvt_dst_f32 =
                typecvt->output(0)->dtype().enumv() == DTypeEnum::Float32;
        if (!((is_typecvt_src_i16 || is_typecvt_src_u8) && is_typecvt_dst_f32))
            return false;

        SymbolVar new_elem;
        auto src0 = rewriter.get_var(typecvt->input(0)),
             src1 = rewriter.get_var(elemwise->input(1)),
             src2 = rewriter.get_var(elemwise->input(2));
        if (is_typecvt_src_i16) {
            new_elem = opr::ElemwiseMultiType::make(
                    {src0, src1, src2},
                    {opr::ElemwiseMultiType::Mode::FUSE_MUL_ADD3_INT16xF32xF32xF32},
                    OperatorNodeConfig{dtype::Float32()});
        } else {
            new_elem = opr::ElemwiseMultiType::make(
                    {src0, src1, src2},
                    {opr::ElemwiseMultiType::Mode::FUSE_MUL_ADD3_UINT8xF32xF32xF32},
                    OperatorNodeConfig{dtype::Float32()});
        }
        rewriter.replace_var(
                opr->output(0), new_elem.node(),
                mgb_cstr_log("replace typecvt + elemwise(FUSE_MUL_ADD3) "
                             "to ElemwiseMultiType(FUSE_MUL_ADD3_INTXxF32xF32xF32)"));
        return true;
    };

    auto try_typecvt_elemwise_mul_i16xf32xf32 = [&rewriter, &uniq_reader_check](
                                                        OperatorNodeBase* opr) {
        // check elemwise
        auto elemwise = try_cast_as_op<opr::Elemwise>(opr);
        if (elemwise == nullptr)
            return false;
        if (elemwise->param().mode != opr::Elemwise::Mode::MUL)
            return false;
        bool is_elem_src_f32 =
                elemwise->input(0)->dtype().enumv() == DTypeEnum::Float32;
        bool is_elem_dst_f32 =
                elemwise->output(0)->dtype().enumv() == DTypeEnum::Float32;
        if (!(is_elem_src_f32 && is_elem_dst_f32))
            return false;
        // maybe src0 or src1
        if (!(try_cast_as_op<opr::TypeCvt>(elemwise->input(0)->owner_opr()) ||
              try_cast_as_op<opr::TypeCvt>(elemwise->input(1)->owner_opr())))
            return false;
        int typecvt_src_idx =
                (try_cast_as_op<opr::TypeCvt>(elemwise->input(0)->owner_opr()) !=
                 nullptr)
                        ? 0
                        : 1;
        int other_src_idx = (typecvt_src_idx == 0) ? 1 : 0;
        if (!uniq_reader_check(elemwise->input(typecvt_src_idx)))
            return false;

        // check typecvt
        auto typecvt = try_cast_as_op<opr::TypeCvt>(
                elemwise->input(typecvt_src_idx)->owner_opr());
        bool is_typecvt_src_i16 =
                typecvt->input(0)->dtype().enumv() == DTypeEnum::Int16;
        bool is_typecvt_dst_f32 =
                typecvt->output(0)->dtype().enumv() == DTypeEnum::Float32;
        if (!(is_typecvt_src_i16 && is_typecvt_dst_f32))
            return false;

        SymbolVar new_elem;
        auto src0 = rewriter.get_var(typecvt->input(0)),
             src1 = rewriter.get_var(elemwise->input(other_src_idx));
        new_elem = opr::ElemwiseMultiType::make(
                {src0, src1}, {opr::ElemwiseMultiType::Mode::MUL_INT16xF32xF32},
                OperatorNodeConfig{dtype::Float32()});
        rewriter.replace_var(
                opr->output(0), new_elem.node(),
                mgb_cstr_log("replace typecvt + elemwise(MUL) "
                             "to ElemwiseMultiType(MUL_INT16xF32xF32)"));
        return true;
    };

    auto on_opr = [&try_typecvt_elemwise_fma_i16xf32xf32xf32,
                   &try_typecvt_elemwise_mul_i16xf32xf32,
                   &rewriter](OperatorNodeBase* opr) {
        if (!try_typecvt_elemwise_fma_i16xf32xf32xf32(opr) &&
            !try_typecvt_elemwise_mul_i16xf32xf32(opr)) {
            rewriter.auto_replace_outputs(opr);
        }
    };
    opt.graph().iter(on_opr);
    rewriter.apply_inplace();
    MIDOUT_E
}

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
src/gopt/include/megbrain/gopt/inference.h

@@ -200,6 +200,15 @@ public:
    void apply(OptState& opt_state) const override;
};

/*!
 * \brief Fuse typecvt elemwise
 */
class FuseTypecvtElemwisePass final : public Pass {
public:
    const char* name() const override;
    void apply(OptState& opt) const override;
};

/*!
 * \brief tensor format converter to accelerate inference speed on Nvidia
 * platform
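Because the pass is wired into add_preset_passes (see the framework.cpp hunk above), it runs automatically when a model goes through gopt::optimize_for_inference on an ARM build, which is the path the tests below exercise. To run the pass in isolation, something along the following lines should work; this is a sketch assuming the usual gopt::GraphOptimizer workflow (add_pass plus apply_inplace), not code taken from this commit:

    // illustrative sketch: apply only FuseTypecvtElemwisePass to endpoint `y`
    VarNodeArray vars{y.node()};
    gopt::GraphOptimizer optimizer;
    optimizer.add_pass<gopt::FuseTypecvtElemwisePass>();
    optimizer.apply_inplace(vars);  // assumed API; rewrites the graph in place
    SymbolVar y_fused{vars[0]};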
src/gopt/test/inference.cpp

@@ -1780,6 +1780,138 @@ TEST(TestGoptInference, ConvBiasNonlinearityFusePass_FullBias) {
    }
}

#if (MEGDNN_AARCH64 || MEGDNN_ARMV7) && !MGB_OPENCL && !MGB_CUDA
TEST(TestGoptInference, FuseTypeCvtAndElemwiseCase0) {
    HostTensorGenerator<dtype::Int16, RandomDistribution::UNIFORM> gen(0, 255);
    auto cn = CompNode::load("cpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    size_t n = 1;
    size_t c = 128;
    size_t h = 16;
    size_t w = 16;
    auto host_x1 = gen({n, h, w, c}, cn);

    auto x = opr::Host2DeviceCopy::make(*graph, host_x1);
    auto x_nchw = opr::Dimshuffle::make(x, {0, 3, 1, 2}, 4, cn);
    auto x_f32 = opr::TypeCvt::make(x_nchw, dtype::Float32(), cn);

    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name);
    };
    auto s = mkcvar("s", {1, c, 1, 1});
    auto b = mkcvar("b", {1, c, 1, 1});
    auto result = opr::Elemwise::make(
            {x_f32, s, b}, opr::Elemwise::Param::Mode::FUSE_MUL_ADD3);

    auto y = result;
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_TRUE(y_opt.node()->owner_opr()->same_type<opr::ElemwiseMultiType>());
    ASSERT_EQ(
            opr::ElemwiseMultiType::Param::Mode::FUSE_MUL_ADD3_INT16xF32xF32xF32,
            find_opr<opr::ElemwiseMultiType>(y_opt).param().mode);

    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y)});
    func->execute();
    graph->options().graph_opt_level = 2;
    auto func_opt = graph->compile({make_callback_copy(y, host_y_opt)});
    func_opt->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-5);
}

TEST(TestGoptInference, FuseTypeCvtAndElemwiseCase1) {
    HostTensorGenerator<dtype::Int16, RandomDistribution::UNIFORM> gen(0, 255);
    auto cn = CompNode::load("cpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    size_t n = 1;
    size_t c = 128;
    size_t h = 16;
    size_t w = 16;
    auto host_x1 = gen({n, h, w, c}, cn);

    auto x = opr::Host2DeviceCopy::make(*graph, host_x1);
    auto x_nchw = opr::Dimshuffle::make(x, {0, 3, 1, 2}, 4, cn);
    auto x_f32 = opr::TypeCvt::make(x_nchw, dtype::Float32(), cn);

    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name);
    };
    auto s = mkcvar("s", {1, c, 1, 1});
    auto result = opr::Elemwise::make({x_f32, s}, opr::Elemwise::Param::Mode::MUL);

    auto y = result;
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_TRUE(y_opt.node()->owner_opr()->same_type<opr::ElemwiseMultiType>());
    ASSERT_EQ(
            opr::ElemwiseMultiType::Param::Mode::MUL_INT16xF32xF32,
            find_opr<opr::ElemwiseMultiType>(y_opt).param().mode);

    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y)});
    func->execute();
    graph->options().graph_opt_level = 2;
    auto func_opt = graph->compile({make_callback_copy(y, host_y_opt)});
    func_opt->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-5);
}

TEST(TestGoptInference, FuseTypeCvtAndElemwiseCase2) {
    HostTensorGenerator<dtype::Uint8, RandomDistribution::UNIFORM> gen(0, 255);
    auto cn = CompNode::load("cpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    size_t n = 1;
    size_t c = 128;
    size_t h = 16;
    size_t w = 16;
    auto host_x1 = gen({n, h, w, c}, cn);

    auto x = opr::Host2DeviceCopy::make(*graph, host_x1);
    auto x_nchw = opr::Dimshuffle::make(x, {0, 3, 1, 2}, 4, cn);
    auto x_f32 = opr::TypeCvt::make(x_nchw, dtype::Float32(), cn);

    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name);
    };
    auto s = mkcvar("s", {1, c, 1, 1});
    auto b = mkcvar("b", {1, c, 1, 1});
    auto result = opr::Elemwise::make(
            {x_f32, s, b}, opr::Elemwise::Param::Mode::FUSE_MUL_ADD3);

    auto y = result;
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_TRUE(y_opt.node()->owner_opr()->same_type<opr::ElemwiseMultiType>());
    ASSERT_EQ(
            opr::ElemwiseMultiType::Param::Mode::FUSE_MUL_ADD3_UINT8xF32xF32xF32,
            find_opr<opr::ElemwiseMultiType>(y_opt).param().mode);

    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y)});
    func->execute();
    graph->options().graph_opt_level = 2;
    auto func_opt = graph->compile({make_callback_copy(y, host_y_opt)});
    func_opt->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-5);
}
#endif

TEST(TestGoptInference, ParamMerge) {
    auto cns = load_multiple_xpus(2);
    HostTensorGenerator<> gen;
src/opr/impl/nn_int.cpp

@@ -77,4 +77,12 @@ void ElemwiseMultiType::record_execute_deps(ExecDependencyArray& deps) {
    record_megdnn_opr(deps);
}

void ElemwiseMultiType::add_input_layout_constraint() {
#if (MEGDNN_AARCH64 || MEGDNN_ARMV7) && !MGB_OPENCL && !MGB_CUDA
    for (auto i : input()) {
        i->add_layout_constraint_contiguous();
    }
#endif
}

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
src/opr/include/megbrain/opr/nn_int.h

@@ -49,6 +49,8 @@ private:
    void init_output_dtype() override;
    void record_execute_deps(ExecDependencyArray& deps) override;
    void add_input_layout_constraint() override;
};

//! deprecated; TODO: remove in megbrain 8