Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
MegEngine 天元
MegEngine
提交
fad5bc74
MegEngine
项目概览
MegEngine 天元
/
MegEngine
1 年多 前同步成功
通知
403
Star
4705
Fork
582
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
MegEngine
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
提交
fad5bc74
编写于
3月 04, 2021
作者:
M
Megvii Engine Team
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
feat(mge/tools): add load_network_and_run for megengine
GitOrigin-RevId: 7d8ed5af79619b50c14e062f50da29cbee13e875
上级
3f2e15f5
变更
4
隐藏空白更改
内联
并排
Showing
4 changed file
with
425 addition
and
6 deletion
+425
-6
imperative/python/megengine/serialization.py
imperative/python/megengine/serialization.py
+5
-5
imperative/python/megengine/tools/load_network_and_run.py
imperative/python/megengine/tools/load_network_and_run.py
+418
-0
imperative/python/megengine/utils/profile_analyzer.py
imperative/python/megengine/utils/profile_analyzer.py
+1
-1
imperative/python/src/graph_rt.cpp
imperative/python/src/graph_rt.cpp
+1
-0
未找到文件。
imperative/python/megengine/serialization.py
浏览文件 @
fad5bc74
...
...
@@ -107,14 +107,14 @@ def load(f, map_location=None, pickle_module=pickle):
.. testcode:
import megengine as mge
# Load tensors to the same device as defined in model.
mge
mge.load('model.
mge
')
# Load tensors to the same device as defined in model.
pkl
mge.load('model.
pkl
')
# Load all tensors to gpu0.
mge.load('model.
mge
', map_location='gpu0')
mge.load('model.
pkl
', map_location='gpu0')
# Load all tensors originally on gpu0 to cpu0
mge.load('model.
mge
', map_location={'gpu0':'cpu0'})
mge.load('model.
pkl
', map_location={'gpu0':'cpu0'})
# Load all tensors to cpu0
mge.load('model.
mge
', map_location=lambda dev: 'cpu0')
mge.load('model.
pkl
', map_location=lambda dev: 'cpu0')
"""
if
isinstance
(
f
,
str
):
...
...
imperative/python/megengine/tools/load_network_and_run.py
0 → 100755
浏览文件 @
fad5bc74
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import
argparse
import
logging
import
time
from
collections
import
OrderedDict
import
numpy
as
np
import
megengine
as
mge
from
megengine.core.tensor
import
megbrain_graph
as
G
from
megengine.device
import
get_device_count
,
set_default_device
from
megengine.functional.debug_param
import
set_execution_strategy
from
megengine.logger
import
enable_debug_log
,
get_logger
,
set_log_file
from
megengine.utils
import
comp_graph_tools
as
tools
# Module-level logger, obtained from megengine.logger.get_logger and used by
# the functions in this script for all progress and timing messages.
logger = get_logger(__name__)
def make_data_given_desc(args, inputs, shape0_multiply=1):
    """Build the input arrays for a run, loaded from a file or randomly generated.

    If ``args.load_input_data`` is set, the data is read with ``mge.load``.
    It must be either a single ``numpy.ndarray`` (accepted only when the graph
    has exactly one input) or a dict of ndarrays. When ``args.batchsize`` is
    also set, every array is repeated along axis 0 so that its first
    dimension equals the batch size.

    Otherwise one array per input is generated from
    ``numpy.random.RandomState(args.seed)``. Shapes default to the shapes of
    ``inputs`` and can be overridden by ``args.input_desc``; the first
    dimension is replaced by ``args.batchsize`` when given and then multiplied
    by ``shape0_multiply``. ``args.rng`` may supply a per-input expression that
    is evaluated as a method call on the random state; inputs without one use
    ``uniform``. Each array is converted to the dtype of its input.

    :param args: parsed options; reads ``load_input_data``, ``batchsize``,
        ``seed``, ``input_desc`` and ``rng``.
    :param inputs: iterable of input vars exposing ``name``, ``shape`` and
        ``dtype``.
    :param shape0_multiply: factor applied to the first dimension of every
        generated shape; not used when data is loaded from a file.
    :return: the loaded dict, or an ``OrderedDict`` mapping input name to the
        generated ndarray.
    :raises AssertionError: if loaded data has an unexpected type, if the
        batch size is not a multiple of a loaded array's first dimension, or
        if a generated array does not have the requested shape.
    :raises KeyError: if ``args.input_desc`` names an input that is not in
        ``inputs`` (its dtype cannot be looked up).
    """
    if args.load_input_data:
        logger.info("load data from {}".format(args.load_input_data))
        data = mge.load(args.load_input_data)
        data_names = [inp.name for inp in inputs]

        if isinstance(data, np.ndarray):
            assert len(data_names) == 1, (
                "data is given as a single numpy array, so there should be "
                "exactly one input in the graph; got: {}".format(data_names)
            )
            data = {data_names[0]: data}

        assert isinstance(data, dict)
        for v in data.values():
            assert isinstance(
                v, np.ndarray
            ), "data should provide ndarray; got {} instead".format(v)

        if args.batchsize:
            # Snapshot the items because the dict is updated inside the loop.
            for k, v in list(data.items()):
                assert (
                    args.batchsize % v.shape[0] == 0
                ), "current batch size must divide given batch size: {} {}".format(
                    args.batchsize, v.shape[0]
                )
                data[k] = np.repeat(v, args.batchsize // v.shape[0], axis=0)
        return data

    def iter_inpdesc(desc):
        """Yield ``(name, value)`` pairs from a ``name:value;name:value`` string."""
        if not desc:
            return
        for pair in desc.split(";"):
            # Split on the first colon only, so the value part (e.g. an RNG
            # expression) may itself contain ':' without breaking unpacking.
            name, value = pair.split(":", 1)
            if name not in data_shapes:
                logger.warning("rng name {} not in data provider".format(name))
            yield name, value

    rng = np.random.RandomState(args.seed)

    data_shapes = OrderedDict((inp.name, list(inp.shape)) for inp in inputs)
    data_dtypes = OrderedDict((inp.name, inp.dtype) for inp in inputs)

    for name, shape in iter_inpdesc(args.input_desc):
        data_shapes[name] = list(map(int, shape.split(",")))

    if args.batchsize:
        for shape in data_shapes.values():
            shape[0] = args.batchsize

    data_rngs = dict(iter_inpdesc(args.rng))

    result = OrderedDict()
    for name, shape in data_shapes.items():
        shape[0] *= shape0_multiply
        rng_expr = data_rngs.get(name)
        if rng_expr:
            # SECURITY NOTE: rng_expr is taken from ``args.rng`` and executed
            # with eval(); passing a custom globals dict is not a sandbox.
            # Only use this with trusted input.
            value = eval("rng.{}".format(rng_expr).format(shape), {"rng": rng})
        else:
            value = rng.uniform(size=shape)

        value = np.ascontiguousarray(value, dtype=data_dtypes[name])
        assert value.shape == tuple(shape)
        result[name] = value

    return result
def get_execution_strategy(args):
    """Return the algorithm-selection strategy name implied by ``args``.

    :param args: parsed options; reads ``fast_run`` and ``reproducible``.
    :return: ``"PROFILE"`` when ``args.fast_run`` is set, ``"HEURISTIC"``
        otherwise, with ``"_REPRODUCIBLE"`` appended when
        ``args.reproducible`` is set. A warning describing the chosen mode is
        logged in both cases.
    """
    if args.fast_run:
        logger.warning("--fast-run enabled; compile may be slow")
        base = "PROFILE"
    else:
        logger.warning("--fast-run not enabled; execution may be slow")
        base = "HEURISTIC"

    suffix = "_REPRODUCIBLE" if args.reproducible else ""
    return base + suffix
def get_opt_kwargs(args):
    """Collect the enabled inference-optimization switches from ``args``.

    :param args: parsed options exposing one attribute per ``enable_*`` name
        listed below.
    :return: dict mapping every option whose attribute is truthy to ``True``,
        in the order listed below; disabled options are left out.
    """
    option_names = (
        "enable_io16xc32",
        "enable_ioc16",
        "enable_hwcd4",
        "enable_nchw4",
        "enable_nchw88",
        "enable_nchw44",
        "enable_nchw44_dot",
        "enable_nchw32",
        "enable_chwn4",
        "enable_fuse_conv_bias_nonlinearity",
        "enable_fuse_conv_bias_with_z",
    )
    return {name: True for name in option_names if getattr(args, name)}
def run_model(args, graph, inputs, outputs, data):
    """Compile the graph, execute it ``args.iter`` times and log timing.

    Steps, in order: force ``graph_opt_level`` to 0, apply the execution
    strategy from ``get_execution_strategy``, optionally run
    ``G.optimize_for_inference``, bind the inputs (embedded or converted),
    optionally dump the graph to ``args.dump_cpp_model``, compile, optionally
    warm up, run the timed iterations, optionally run one extra iteration
    between ``pycuda`` profiler start/stop calls, and optionally write the
    graph profiler output to ``args.profile``.

    :param args: parsed options; reads ``optimize_for_inference``,
        ``embed_input``, ``dump_cpp_model``, ``profile``, ``warm_up``,
        ``iter``, ``calc_output_rms`` and ``focused_nvprof`` (plus whatever
        ``get_execution_strategy`` and ``get_opt_kwargs`` read).
    :param graph: the loaded computing graph.
    :param inputs: input vars of the graph.
    :param outputs: output vars of the graph.
    :param data: mapping of input name to ndarray.
    :return: average speed measured after the last iteration, in samples/s
        when ``data`` has a ``"data"`` entry (batches/s multiplied by its
        first dimension) and in batches/s otherwise; ``0.0`` when
        ``args.iter`` is 0 so that no iteration was timed.
    """
    # must use level0 to avoid unintended opr modification
    graph.options.graph_opt_level = 0

    logger.info("input tensors: ")
    for name, value in data.items():
        logger.info(" {}: {}".format(name, value.shape))

    G.modify_opr_algo_strategy_inplace(outputs, get_execution_strategy(args))

    if args.optimize_for_inference:
        opt_kwargs = get_opt_kwargs(args)
        outputs = G.optimize_for_inference(outputs, **opt_kwargs)

    # embed inputs must be on the last, to avoid const fold
    if args.embed_input:
        outputs, inp_dict = tools.embed_inputs(outputs, data.values(), inputs=inputs)
    else:
        outputs, inp_dict = tools.convert_inputs(outputs, inputs=inputs)

    if args.dump_cpp_model:
        dump_content, _ = G.dump_graph(outputs, keep_var_name=2)
        with open(args.dump_cpp_model, "wb") as fmodel:
            fmodel.write(dump_content)
        logger.info("C++ model written to {}".format(args.dump_cpp_model))

    outputs, output_dict = tools.convert_outputs(outputs)

    # The profiler is created before compilation, as in the original flow.
    profiler = tools.GraphProfiler(graph) if args.profile else None

    func = graph.compile(outputs)

    def run():
        """Feed the inputs (unless embedded), execute once, return outputs."""
        if not args.embed_input:
            for key in inp_dict:
                inp_dict[key].set_value(mge.Tensor(data[key])._dev_tensor())
        func.execute()
        func.wait()
        return [oup_node.get_value().numpy() for oup_node in output_dict.values()]

    if args.warm_up:
        logger.info("warming up")
        run()

    total_time = 0
    # Defined up front so the function still returns a value when
    # ``args.iter`` is 0 and the loop below never runs.
    avg_speed = 0.0

    for i in range(args.iter):
        logger.info("iter {}".format(i))
        # perf_counter is monotonic, so a wall-clock adjustment cannot yield
        # a negative duration.
        start_time = time.perf_counter()
        retval = run()
        cur_time = time.perf_counter() - start_time
        total_time += cur_time

        # Guard against a zero elapsed time on clocks with coarse resolution.
        avg_speed = (i + 1) / total_time if total_time > 0 else float("inf")
        if "data" in data:
            avg_speed *= data["data"].shape[0]
            avg_speed_txt = "{:.3f}sample/s".format(avg_speed)
        else:
            avg_speed_txt = "{:.3f}batch/s".format(avg_speed)

        msg = (
            "iter {}: duration={:.4f}({:.4f})s average={:.4f}s "
            "avg_speed={} time={:.4f}s"
        ).format(
            i,
            cur_time,
            func.get_prev_exec_time(),
            total_time / (i + 1),
            avg_speed_txt,
            total_time,
        )
        if args.calc_output_rms:
            rms = ["{:.3g}".format(float(((v ** 2).mean()) ** 0.5)) for v in retval]
            msg += " output_rms=[{}]".format(", ".join(rms))
        # Fall back to print() when the logger level would hide INFO messages.
        if logger.level > logging.INFO:
            print(msg)
        else:
            logger.info(msg)

    if args.focused_nvprof:
        if get_device_count("gpu") < 1:
            logger.warning(
                "No cuda device detected. ``focused_nvprof`` will be ignored."
            )
        else:
            try:
                import pycuda.driver as D
            except ImportError:
                logger.error("`focused_nvprof need pycuda`", exc_info=True)
            else:
                D.start_profiler()
                try:
                    func.execute()
                    func.wait()
                finally:
                    # Always stop the profiler, even if execution fails.
                    D.stop_profiler()

    if args.profile:
        with open(args.profile, "w") as fout:
            fout.write(profiler.get())

    return avg_speed
def _build_arg_parser():
    """Create the command-line parser for this script."""
    parser = argparse.ArgumentParser(
        description="load a network and run inference on random data",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("net")
    parser.add_argument(
        "--device", "-d", help="set default device, like 'gpux' or 'cpux'"
    )
    parser.add_argument(
        "--calc-output-rms",
        action="store_true",
        help="compute RMS of outputs; useful for comparing computing results",
    )
    parser.add_argument(
        "--output-name",
        nargs="*",
        help="Specify output name. This option can be"
        " specified multiple times. We will look for opr/var"
        " in the graph",
    )
    parser.add_argument(
        "--load-input-data",
        help="load input data from pickle file; it should be"
        " a numpy array or a dict of numpy array",
    )
    parser.add_argument("--profile", help="profiler output file")
    parser.add_argument(
        "--fast-run",
        action="store_true",
        help="enable fast running by profiling conv algorithms during compiling.",
    )
    parser.add_argument(
        "--reproducible", action="store_true", help="use reproducible kernels"
    )
    # The example uses ';' between inputs, matching the format described in
    # the same help text (the previous example used ',' and could not be
    # parsed).
    parser.add_argument(
        "--input-desc",
        help="specify input names and shapes manually in"
        " format: <name>:<shape>[;<name>:<shape>, ...], where"
        " name is a string and shape is a comma separated"
        ' string. e.g., "data:128,1,28,28;label:128".'
        " different input tensor are separated by semicolon.",
    )
    parser.add_argument(
        "--batchsize",
        type=int,
        help="change batchsize; the first dimension of each"
        " input is assumed to be batch size",
    )
    parser.add_argument(
        "--warm-up",
        action="store_true",
        help="warm up model before do timing " " for better estimation",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="verbose output, logging in debug mode",
    )
    parser.add_argument(
        "--iter", type=int, default=1, help="number of iters to run the model"
    )
    parser.add_argument("--log", help="give a file path to duplicate log to")
    parser.add_argument(
        "--seed",
        type=int,
        default=0,
        help="seed for random number generator for input data",
    )
    parser.add_argument(
        "--rng",
        help="special RNG options to generate input data in"
        " format: <name>:func[;<name>:func, ...] where name is"
        " a string and func is a python expression containing"
        ' "{}" for the size param, e.g. '
        ' "label:randint(low=0,high=1000,size={})"',
    )
    parser.add_argument(
        "--focused-nvprof",
        action="store_true",
        help="only profile last iter for `nvprof --profile-from-start off`",
    )
    parser.add_argument(
        "--optimize-for-inference",
        action="store_true",
        help="optimize model for inference",
    )

    # Boolean switches whose dest names are the ones read by get_opt_kwargs.
    optimization_flags = (
        ("--enable-io16xc32", "transform the mode to float16 io float32 compute"),
        (
            "--enable-ioc16",
            "transform the dtype of the model to float16 io and compute",
        ),
        (
            "--enable-hwcd4",
            "transform the model format from NCHW to NHWCD4 for inference",
        ),
        (
            "--enable-nchw4",
            "transform the model format from NCHW to NCHW4 for inference",
        ),
        (
            "--enable-nchw88",
            "transform the model format from NCHW to NCHW88 for inference",
        ),
        (
            "--enable-nchw44",
            "transform the model format from NCHW to NCHW44 for inference",
        ),
        (
            "--enable-nchw44-dot",
            "transform the model format from NCHW to NCHW44_DOT "
            "for optimizing armv8.2 dot in inference",
        ),
        (
            "--enable-chwn4",
            "transform the model format to CHWN4 "
            "for inference, mainly used for nvidia tensorcore",
        ),
        (
            "--enable-nchw32",
            "transform the model format from NCHW4 to NCHW32 "
            "for inference on nvidia TensorCore",
        ),
        (
            "--enable-fuse-conv-bias-nonlinearity",
            "fuse convolution bias and nonlinearity opr to a "
            "conv_bias opr and compute",
        ),
        (
            "--enable-fuse-conv-bias-with-z",
            "fuse conv_bias with z input for inference on "
            "nvidia GPU (this optimization pass will result in mismatch "
            "of the precision of output of training and inference)",
        ),
    )
    for flag, help_text in optimization_flags:
        parser.add_argument(flag, action="store_true", help=help_text)

    parser.add_argument(
        "--dump-cpp-model",
        help="write a C++ model that can be loaded by "
        "megbrain/sdk/load-and-run; "
        "this implies --embed-input",
    )
    parser.add_argument(
        "--embed-input",
        action="store_true",
        help="embed input data as SharedDeviceTensor in model, "
        "to remove memory copy for inputs",
    )
    return parser


def main(argv=None):
    """Parse the command line, load the network and run it.

    :param argv: optional list of argument strings; when ``None`` (the
        default) ``argparse`` reads ``sys.argv[1:]``.
    """
    args = _build_arg_parser().parse_args(argv)

    if args.verbose:
        enable_debug_log()
    if args.log:
        set_log_file(args.log)

    if args.device:
        set_default_device(args.device)

    # Dumping a C++ model requires the inputs to be embedded in the graph.
    if args.dump_cpp_model:
        args.embed_input = True

    logger.info("loading model ...")
    graph, _, output_vars = G.load_graph(args.net)
    # Inputs are collected from the originally loaded outputs, before any
    # --output-name selection narrows the output list.
    input_vars = tools.get_dep_vars(output_vars, "Host2DeviceCopy")

    if args.output_name is not None:
        output_vars = tools.find_vars_by_name(output_vars, args.output_name)

    data = make_data_given_desc(args, input_vars)

    run_model(args, graph, input_vars, output_vars, data)
# Run the tool only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
imperative/python/megengine/utils/profile_analyzer.py
浏览文件 @
fad5bc74
...
...
@@ -181,7 +181,7 @@ class ProfileAnalyzer:
:param opr_filter: function that filter oprs.
"""
self
.
_opr_set
=
dict
()
# type: dict
assert
isinstance
(
obj
,
dict
)
assert
isinstance
(
obj
,
dict
)
,
type
(
obj
)
varz
=
obj
[
"graph_exec"
][
"var"
]
for
opr_id
,
entry
in
obj
[
"graph_exec"
][
"operator"
].
items
():
inp
=
[
varz
[
i
]
for
i
in
entry
[
"input"
]]
...
...
imperative/python/src/graph_rt.cpp
浏览文件 @
fad5bc74
...
...
@@ -195,6 +195,7 @@ void init_graph_rt(py::module m) {
py
::
class_
<
cg
::
AsyncExecutable
>
(
m
,
"AsyncExecutable"
)
.
def
(
"execute"
,
&
cg
::
AsyncExecutable
::
execute
,
py
::
call_guard
<
py
::
gil_scoped_release
>
())
.
def
(
"wait"
,
&
cg
::
AsyncExecutable
::
wait
,
py
::
call_guard
<
py
::
gil_scoped_release
>
())
.
def
(
"get_prev_exec_time"
,
&
cg
::
AsyncExecutable
::
get_prev_exec_time
,
py
::
call_guard
<
py
::
gil_scoped_release
>
())
// only used for exception handle
.
def_property_readonly
(
"_all_rendezvous"
,
[](
cg
::
AsyncExecutable
*
exec
)
{
auto
ud
=
exec
->
owner_graph
()
->
options
().
user_data
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录