Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
MegEngine 天元
MegEngine
提交
45e2beea
MegEngine
项目概览
MegEngine 天元
/
MegEngine
1 年多 前同步成功
通知
404
Star
4705
Fork
582
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
MegEngine
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
提交
45e2beea
编写于
5月 18, 2020
作者:
M
Megvii Engine Team
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
feat(mgb/gopt): add nchw4 optpass
GitOrigin-RevId: 551b6b828d33916b8e0a8bec73e6d3c6abd65536
上级
f2e1bb41
变更
10
展开全部
隐藏空白更改
内联
并排
Showing
10 changed file
with
554 addition
and
50 deletion
+554
-50
python_module/megengine/_internal/__init__.py
python_module/megengine/_internal/__init__.py
+3
-0
python_module/megengine/jit/__init__.py
python_module/megengine/jit/__init__.py
+1
-0
python_module/src/swig/misc.i
python_module/src/swig/misc.i
+1
-0
sdk/load-and-run/dump_with_testcase_mge.py
sdk/load-and-run/dump_with_testcase_mge.py
+7
-0
sdk/load-and-run/src/mgblar.cpp
sdk/load-and-run/src/mgblar.cpp
+1
-0
src/core/include/megbrain/graph/cg.h
src/core/include/megbrain/graph/cg.h
+2
-0
src/gopt/impl/framework.cpp
src/gopt/impl/framework.cpp
+7
-0
src/gopt/impl/tensor_reformat.cpp
src/gopt/impl/tensor_reformat.cpp
+393
-50
src/gopt/include/megbrain/gopt/inference.h
src/gopt/include/megbrain/gopt/inference.h
+13
-0
src/gopt/test/inference.cpp
src/gopt/test/inference.cpp
+126
-0
未找到文件。
python_module/megengine/_internal/__init__.py
浏览文件 @
45e2beea
...
...
@@ -541,6 +541,7 @@ def optimize_for_inference(
fuse_conv_bias_nonlinearity
=
False
,
use_nchw32
=
False
,
fuse_conv_bias_with_z
=
False
,
use_nchw4
=
False
,
use_nchw88
=
False
,
use_nchw44
=
False
,
use_chwn4
=
False
...
...
@@ -561,6 +562,7 @@ def optimize_for_inference(
OpenCL devices
:param fuse_conv_bias_nonlinearity: whether to fuse conv+bias+nonlinearity
into one opr. This is supported only in NHWCD4 format.
:param use_nchw4: whether to use NCHW4 tensor format.
:param use_nchw88: whether to use NCHW88 tensor format. This may be faster
sometimes.
:param use_nchw44: whether to use NCHW44 tensor format. This maybe faster some
...
...
@@ -588,6 +590,7 @@ def optimize_for_inference(
layout_tranform
=
None
for
k
,
v
in
{
"use_nchw4"
:
"nchw4"
,
"use_nhwcd4"
:
"nhwcd4"
,
"use_nchw32"
:
"nchw32"
,
"use_nchw88"
:
"nchw88"
,
...
...
python_module/megengine/jit/__init__.py
浏览文件 @
45e2beea
...
...
@@ -463,6 +463,7 @@ class trace:
"enable_io16xc32"
:
"f16_io_f32_comp"
,
"enable_ioc16"
:
"f16_io_comp"
,
"enable_hwcd4"
:
"use_nhwcd4"
,
"enable_nchw4"
:
"use_nchw4"
,
"enable_nchw88"
:
"use_nchw88"
,
"enable_nchw32"
:
"use_nchw32"
,
"enable_nchw44"
:
"use_nchw44"
,
...
...
python_module/src/swig/misc.i
浏览文件 @
45e2beea
...
...
@@ -80,6 +80,7 @@ struct _OptimizeForInferenceOptions {
#
define
SET
(
_trans
,
_trans_capital
)
\
void
enable_
##
_trans
()
;
\
SET
(
nchw4
,
NCHW4
)
;
SET
(
nhwcd4
,
NHWCD4
)
;
SET
(
nchw88
,
NCHW88
)
;
SET
(
nchw44
,
NCHW44
)
;
...
...
sdk/load-and-run/dump_with_testcase_mge.py
浏览文件 @
45e2beea
...
...
@@ -252,6 +252,7 @@ def optimize_for_inference(args, outputs):
'enable_io16xc32'
:
'f16_io_f32_comp'
,
'enable_ioc16'
:
'f16_io_comp'
,
'enable_hwcd4'
:
'use_nhwcd4'
,
'enable_nchw4'
:
'use_nchw4'
,
'enable_nchw88'
:
'use_nchw88'
,
'enable_nchw44'
:
'use_nchw44'
,
'enable_nchw32'
:
'use_nchw32'
,
...
...
@@ -381,6 +382,12 @@ def main():
'for inference; you may need to disable CUDA and set '
'MGB_USE_MEGDNN_DBG=2'
)
parser
.
add_argument
(
'--enable-nchw4'
,
action
=
'store_true'
,
help
=
'transform the model format from NCHW to NCHW4 '
'for inference'
)
parser
.
add_argument
(
'--enable-nchw88'
,
action
=
'store_true'
,
...
...
sdk/load-and-run/src/mgblar.cpp
浏览文件 @
45e2beea
...
...
@@ -980,6 +980,7 @@ Args Args::from_argv(int argc, char **argv) {
continue; \
}
cb
(
nchw4
);
cb
(
chwn4
);
cb
(
nchw44
);
cb
(
nchw88
);
...
...
src/core/include/megbrain/graph/cg.h
浏览文件 @
45e2beea
...
...
@@ -97,6 +97,7 @@ struct GraphCommonOptimizeOptions {
bool
fuse_conv_bias_with_z
=
false
;
enum
LayoutTransform
:
uint32_t
{
DEFAULT
,
NCHW4
,
///< compute using NCHW4 tensor format
NHWCD4
,
///< compute using NHWCD4 tensor format
NCHW88
,
///< compute using NCHW88 tensor format
NCHW44
,
///< compute using NCHW44 tensor format
...
...
@@ -137,6 +138,7 @@ struct GraphCommonOptimizeOptions {
return layout_transform == LayoutTransform::_trans_capital; \
}
SET
(
nchw4
,
NCHW4
);
SET
(
nhwcd4
,
NHWCD4
);
SET
(
nchw88
,
NCHW88
);
SET
(
nchw44
,
NCHW44
);
...
...
src/gopt/impl/framework.cpp
浏览文件 @
45e2beea
...
...
@@ -725,6 +725,13 @@ const GraphOptimizer& GraphOptimizer::add_passes_for_optimize_options(
cb
(
f16_io_comp
,
{
add_pass
(
ConvertF32ToF16Pass
::
make
(
false
));
});
cb
(
f16_io_f32_comp
,
{
add_pass
(
ConvertF32ToF16Pass
::
make
(
true
));
});
cb
(
nchw4
,
{
add_pass
<
FuseConvBiasNonlinPass
>
();
add_pass
<
FuseConvBiasZPass
>
();
add_pass
(
EnableNCHW4Pass
::
make_nchw4_converter
());
add_pass
<
ShuffleShuffleRemovePass
>
();
add_pass
<
RemoveRedundantTypeCvtPass
>
();
});
cb
(
nhwcd4
,
{
add_pass
<
FuseConvBiasNonlinPass
>
();
add_pass
(
ConvertFormatPass
::
make_nhwcd4_converter
());
...
...
src/gopt/impl/tensor_reformat.cpp
浏览文件 @
45e2beea
此差异已折叠。
点击以展开。
src/gopt/include/megbrain/gopt/inference.h
浏览文件 @
45e2beea
...
...
@@ -229,6 +229,19 @@ namespace gopt {
static
std
::
unique_ptr
<
EnableCHWN4Pass
>
make_chwn4_converter
();
};
/*!
 * \brief graph optimization pass that converts the tensor format to nchw4
 *      in order to speed up inference on CUDA
 */
class EnableNCHW4Pass final : public TensorReformatPass {
public:
    //! identifier of this pass, wrapped by mgb_cstr_log
    const char* name() const override {
        return mgb_cstr_log("tensor_format_nchw4");
    }

    //! create the opt pass that converts nchw -> nchw4
    static std::unique_ptr<EnableNCHW4Pass> make_nchw4_converter();

private:
    //! override of the base-class endpoint-var hook; defined out of line
    VarNode* on_graph_endpoint_var(VarNode* new_var,
                                   VarNode* orig_var) const override;
};
/*!
* \brief convert tensor format to nchwxx to speed up inference on certain
* devices
...
...
src/gopt/test/inference.cpp
浏览文件 @
45e2beea
...
...
@@ -2327,8 +2327,134 @@ TEST(TestGoptInference, EnableCHWN4ShuffleRemove) {
MGB_ASSERT_TENSOR_EQ
(
host_y
,
host_y_opt
);
}
/*!
 * Checks the nchw4 inference optimization on a quantized int8 graph running
 * on GPU: a dense ConvBias followed by a grouped ConvBias, converted to
 * Float32 at the end. The optimized graph must contain a ConvBias in NCHW4
 * format and must produce exactly the same output as the original graph.
 *
 * The test returns early (without failing) when the device compute
 * capability, encoded as major * 10 + minor, is below 61.
 */
TEST(TestGoptInference, ConvertFormatNCHW4GPU) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < 61) {
        // not a failure: the device simply can not run this case
        printf("This testcase ignored due to insufficient cuda cap(got: %d, "
               "expected: %d)\n",
               sm_ver, 61);
        return;
    }

    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    // keep the reference graph as written so it can be compared against the
    // explicitly optimized one
    graph->options().graph_opt_level = 0;
    //! input var: host-to-device copy of generated data, then cast to dtype
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    //! constant var: shared device tensor of generated data, then cast
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };

    auto x = mkvar("x", {2, 4, 16, 16}, dtype::QuantizedS8(2.5f));
    opr::ConvBias::Param param_conv_bias;
    param_conv_bias.format = opr::ConvBias::Param::Format::NCHW;
    param_conv_bias.stride_h = param_conv_bias.stride_w = 1;
    param_conv_bias.pad_h = param_conv_bias.pad_w = 1;
    param_conv_bias.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    // dense
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::DENSE;
    auto w1 = mkcvar("w1", {8, 4, 3, 3}, dtype::QuantizedS8(2.5f)),
         b1 = mkcvar("b1", {1, 8, 1, 1}, dtype::QuantizedS32(6.25f));
    auto conv1 = opr::ConvBiasForward::make(
            x, w1, b1, param_conv_bias, {},
            OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    // group
    // icpg != 1 && ocpg != 1
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::GROUP;
    auto w2 = mkcvar("w2", {2, 4, 4, 3, 3}, dtype::QuantizedS8(2.5f)),
         b2 = mkcvar("b2", {1, 8, 1, 1}, dtype::QuantizedS32(6.25f));
    auto conv2 = opr::ConvBiasForward::make(
            conv1, w2, b2, param_conv_bias, {},
            OperatorNodeConfig{dtype::QuantizedS8{2.5f}});

    auto y = opr::TypeCvt::make(conv2, dtype::Float32());

    SymbolVar y_opt;
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_nchw4();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    }

    ASSERT_EQ(opr::ConvBias::Param::Format::NCHW4,
              find_opr<opr::ConvBias>(y_opt).param().format);

    // dump the optimized graph as json for inspection
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(output_file(
                    "TestGoptInference.ConvertFormatNCHW4GPU.json"));

    // run the original and the optimized outputs in one function and compare
    HostTensorND host_y, host_y_opt;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
}
#endif
/*!
 * Runs the nchw4 inference optimization on a float graph placed on cpu0:
 * dense ConvBias -> grouped ConvBias -> dense Convolution. The optimized
 * graph must contain a ConvBias in NCHW4 format and its output must match
 * the original output within 1e-3.
 */
TEST(TestGoptInference, ConvertFormatNCHW4) {
    HostTensorGenerator<> generator;
    auto comp_node = CompNode::load("cpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;

    //! named input var fed through a host-to-device copy
    auto make_input = [&](const char* var_name, const TensorShape& shape) {
        auto host_value = generator(shape, comp_node);
        return opr::Host2DeviceCopy::make(*graph, host_value)
                .rename(var_name);
    };
    //! named constant var backed by a shared device tensor
    auto make_const = [&](const char* var_name, const TensorShape& shape) {
        auto host_value = generator(shape, comp_node);
        return opr::SharedDeviceTensor::make(*graph, *host_value)
                .rename(var_name);
    };

    auto x = make_input("x", {2, 4, 16, 16});

    // first ConvBias: dense
    opr::ConvBias::Param conv_bias_param;
    conv_bias_param.pad_w = 1;
    conv_bias_param.pad_h = 1;
    conv_bias_param.sparse = opr::ConvBias::Param::Sparse::DENSE;
    auto w1 = make_const("w1", {8, 4, 3, 3});
    auto b1 = make_const("b1", {1, 8, 1, 1});
    auto conv1 = opr::ConvBias::make(x, w1, b1, conv_bias_param);

    // second ConvBias: same param, switched to group mode
    conv_bias_param.sparse = opr::ConvBias::Param::Sparse::GROUP;
    auto w2 = make_const("w2", {2, 4, 4, 3, 3});
    auto b2 = make_const("b2", {1, 8, 1, 1});
    auto conv2 = opr::ConvBias::make(conv1, w2, b2, conv_bias_param);

    // trailing plain Convolution: dense
    opr::Convolution::Param conv_param;
    conv_param.pad_w = 1;
    conv_param.pad_h = 1;
    conv_param.sparse = opr::Convolution::Param::Sparse::DENSE;
    auto w3 = make_const("w3", {8, 8, 3, 3});
    auto y = opr::Convolution::make(conv2, w3, conv_param);

    SymbolVar y_opt;
    {
        gopt::OptimizeForInferenceOptions options{};
        options.enable_nchw4();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    }

    ASSERT_EQ(opr::ConvBias::Param::Format::NCHW4,
              find_opr<opr::ConvBias>(y_opt).param().format);

    // write the optimized graph out as json
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(
                    output_file("TestGoptInference.ConvertFormatNCHW4.json"));

    // evaluate both outputs in a single compiled function and compare them
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}
TEST
(
TestGoptInference
,
ConvertFormatNCHW88
)
{
HostTensorGenerator
<>
gen
;
auto
cn
=
CompNode
::
load
(
"cpu0"
);
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录