Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
MegEngine 天元
MegEngine
提交
478f2c51
MegEngine
项目概览
MegEngine 天元
/
MegEngine
1 年多 前同步成功
通知
404
Star
4705
Fork
582
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
MegEngine
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
提交
478f2c51
编写于
9月 06, 2020
作者:
M
Megvii Engine Team
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
feat(mge/jit): add trace/dump options
GitOrigin-RevId: 0f43c14599e7ca45aee12c4402bf8d178218a620
上级
44d0b5da
变更
5
隐藏空白更改
内联
并排
Showing
5 changed file
with
290 addition
and
31 deletion
+290
-31
imperative/python/megengine/core/tensor/megbrain_graph.py
imperative/python/megengine/core/tensor/megbrain_graph.py
+79
-0
imperative/python/megengine/jit/tracing.py
imperative/python/megengine/jit/tracing.py
+80
-2
imperative/python/src/graph_rt.cpp
imperative/python/src/graph_rt.cpp
+36
-0
imperative/python/test/unit/test_jit.py
imperative/python/test/unit/test_jit.py
+0
-29
imperative/python/test/unit/test_tracing.py
imperative/python/test/unit/test_tracing.py
+95
-0
未找到文件。
imperative/python/megengine/core/tensor/megbrain_graph.py
浏览文件 @
478f2c51
...
...
@@ -14,6 +14,7 @@ from concurrent.futures import Future, ThreadPoolExecutor
import
numpy
as
np
from
..
import
_imperative_rt
from
.._imperative_rt
import
GraphOptimizeOptions
from
.._imperative_rt.ops
import
BackwardGraph
from
.._wrap
import
device
as
as_device
from
..ops.builtin
import
OpDef
...
...
@@ -83,6 +84,84 @@ class Graph(_imperative_rt.ComputingGraph):
return
self
.
_wrap
(
_imperative_rt
.
make_h2d
(
self
,
device
,
dtype
,
shape
,
name
))
def optimize_for_inference(dest_vars, **kwargs):
    r"""Applies optimize_for_inference pass for computing graph.

    :param dest_vars: list of output vars in the computing graph; the
        ``_node`` attribute of each element is handed to the pass.
    :return: list of :class:`VarNode`, one per var returned by the pass.
    :raises ValueError: if ``kwargs`` contains a key that is not one of the
        options listed below.

    :Keyword Arguments:

        * enable_io16xc32 --
            whether to use float16 for I/O between oprs and use
            float32 as internal computation precision. Note the output var would be
            changed to float16.
        * enable_ioc16 --
            whether to use float16 for both I/O and computation
            precision.

        * enable_hwcd4 --
            whether to use NHWCD4 data layout. This is faster on some
            OpenCL backend.
        * enable_nchw88 --
            whether to use NCHW88 data layout, currently
            used in X86 AVX backend.
        * enable_nchw44 --
            whether to use NCHW44 data layout, currently
            used in arm backend.
        * enable_nchw44_dot --
            whether to use NCHW44_dot data layout, currently
            used in armv8.2+dotprod backend.
        * enable_nchw4 --
            whether to use NCHW4 data layout, currently
            used in nvidia backend(based on cudnn).
        * enable_nchw32 --
            whether to use NCHW32 data layout, currently
            used in nvidia backend with tensorcore(based on cudnn).
        * enable_chwn4 --
            whether to use CHWN4 data layout, currently
            used in nvidia backend with tensorcore.

        * enable_fuse_conv_bias_nonlinearity: whether to fuse conv+bias+nonlinearity
            into one opr.
        * enable_fuse_conv_bias_with_z: whether to fuse conv_bias with z
            input for inference on nvidia backend(this optimization pass will
            result in mismatch of the precision of output of training and
            inference)
    """
    inference_options = GraphOptimizeOptions()

    # NOTE: the previous version wrapped the option handling in
    # ``if optimize_for_inference:``, which tested this function object itself
    # and was therefore always true; the guard has been dropped.
    inference_optimize_layout_transform_map = {
        "enable_hwcd4": GraphOptimizeOptions.LayoutTransform.NHWCD4,
        "enable_nchw4": GraphOptimizeOptions.LayoutTransform.NCHW4,
        "enable_nchw88": GraphOptimizeOptions.LayoutTransform.NCHW88,
        "enable_nchw32": GraphOptimizeOptions.LayoutTransform.NCHW32,
        "enable_nchw44": GraphOptimizeOptions.LayoutTransform.NCHW44,
        "enable_nchw44_dot": GraphOptimizeOptions.LayoutTransform.NCHW44_DOT,
        "enable_chwn4": GraphOptimizeOptions.LayoutTransform.CHWN4,
    }

    # Every layout flag is popped so it never counts as "unknown" below.
    # If several flags are truthy, the one latest in the map wins.
    for k, v in inference_optimize_layout_transform_map.items():
        if kwargs.pop(k, False):
            inference_options.layout_transform = v

    if kwargs.pop("enable_io16xc32", False):
        inference_options.f16_io_f32_comp = True
    if kwargs.pop("enable_ioc16", False):
        inference_options.f16_io_comp = True
    if kwargs.pop("enable_fuse_conv_bias_nonlinearity", False):
        inference_options.fuse_conv_bias_nonlinearity = True
    if kwargs.pop("enable_fuse_conv_bias_with_z", False):
        inference_options.fuse_conv_bias_with_z = True

    # Anything still left in kwargs was not recognised above.
    if kwargs:
        raise ValueError("unknown options: %s" % list(kwargs))

    res_vars = _imperative_rt.optimize_for_inference(
        [i._node for i in dest_vars], inference_options
    )
    return [VarNode(i) for i in res_vars]
def dump(*args):
    """Hand the ``_node`` of every argument to ``_imperative_rt.dump_graph``.

    :param args: objects exposing a ``_node`` attribute.
    :return: whatever ``_imperative_rt.dump_graph`` returns.
        NOTE(review): presumably the serialized graph as bytes, since
        ``trace.dump`` writes the result to a binary file -- confirm.
    """
    nodes = []
    for var in args:
        nodes.append(var._node)
    return _imperative_rt.dump_graph(nodes)
...
...
imperative/python/megengine/jit/tracing.py
浏览文件 @
478f2c51
...
...
@@ -11,6 +11,7 @@ import numpy as np
from
..core._imperative_rt
import
GraphProfiler
from
..core._imperative_rt.ops
import
OprAttr
from
..core._trace_option
import
set_tensor_shape
from
..core.ops.special
import
Const
from
..core.tensor
import
megbrain_graph
as
G
from
..core.tensor.core
import
OpBase
,
TensorBase
,
TensorWrapperBase
,
apply
...
...
@@ -76,6 +77,22 @@ class TensorInfo:
class
trace
:
"""
Wraps a callable and provides:
* tracing via :meth:`.trace` and :meth:`.dump`
* accelerated evaluation via :meth:`.__call__`
:param function: the function will be traced.
:param symbolic: whether to apply symbolic execution for tracing. Default: False
:param capture_as_const: capture global vars or closures as const value. Default: False
:param sublinear_memory_config: configuration for sublinear memory optimization.
If not None, it enables sublinear memory optimization with given setting.
:param profiling: whether to profile compiled trace. Default: False
:param opt_level: optimization level for compiling trace.
:param tensor_shape: whether to use symbolic shape for tracing. Default: True
"""
def
__new__
(
cls
,
*
args
,
**
kwargs
):
if
not
args
:
return
functools
.
partial
(
cls
,
**
kwargs
)
...
...
@@ -88,6 +105,8 @@ class trace:
capture_as_const
=
False
,
sublinear_memory_config
:
SublinearMemoryConfig
=
None
,
profiling
:
bool
=
False
,
opt_level
:
int
=
None
,
tensor_shape
:
bool
=
True
,
):
self
.
__wrapped__
=
function
self
.
_symbolic
=
symbolic
...
...
@@ -95,6 +114,8 @@ class trace:
self
.
_sublinear_memory_config
=
sublinear_memory_config
self
.
_profiling
=
profiling
self
.
_profiler
=
None
self
.
_graph_opt_level
=
opt_level
self
.
_tensor_shape
=
tensor_shape
self
.
_untraced
=
True
self
.
_tinfo
=
[]
# handle -> TensorInfo
...
...
@@ -112,6 +133,8 @@ class trace:
self
.
_output_bindings
=
None
self
.
_output_names
=
None
set_tensor_shape
(
self
.
_tensor_shape
)
def
_new_handle
(
self
):
handle
=
len
(
self
.
_tinfo
)
info
=
TensorInfo
()
...
...
@@ -307,6 +330,9 @@ class trace:
def
_apply_graph_options
(
self
,
graph
):
graph
.
options
.
seq_opt
.
enable_seq_comp_node_opt
=
False
# graph opt level
if
self
.
_graph_opt_level
is
not
None
:
graph
.
options
.
graph_opt_level
=
self
.
_graph_opt_level
# sublinear
if
self
.
_sublinear_memory_config
is
not
None
:
graph
.
options
.
enable_sublinear_memory_opt
=
True
...
...
@@ -320,6 +346,7 @@ class trace:
)
sublinear_config
.
thresh_nr_try
=
self
.
_sublinear_memory_config
.
thresh_nr_try
sublinear_config
.
num_worker
=
self
.
_sublinear_memory_config
.
num_worker
# profile
if
self
.
_profiling
:
self
.
_profiler
=
GraphProfiler
(
graph
)
...
...
@@ -416,7 +443,55 @@ class trace:
self
.
_process_outputs
(
outputs
)
return
outputs
def
dump
(
self
,
file
,
*
,
arg_names
=
None
,
output_names
=
None
):
def
dump
(
self
,
file
,
*
,
arg_names
=
None
,
output_names
=
None
,
append
=
False
,
**
kwargs
):
r
"""Serializes trace to file system.
:param file: output file, could be file object or filename.
:param arg_names: names of the input tensors in the traced function.
:param output_names: names of the output tensors in the traced function,
use the default name if not specified.
:param append: whether output is appended to ``file``.
Only works when ``file`` is str.
:Keyword Arguments:
* enable_io16xc32 --
whether to use float16 for I/O between oprs and use
float32 as internal computation precision. Note the output var would be
changed to float16.
* enable_ioc16 --
whether to use float16 for both I/O and computation
precision.
* enable_hwcd4 --
whether to use NHWCD4 data layout. This is faster on some
OpenCL backend.
* enable_nchw88 --
whether to use NCHW88 data layout, currently
used in X86 AVX backend.
* enable_nchw44 --
whether to use NCHW44 data layout, currently
used in arm backend.
* enable_nchw44_dot --
whether to use NCHW44_dot data layout, currently
used in armv8.2+dotprod backend.
* enable_nchw4 --
whether to use NCHW4 data layout, currently
used in nvidia backend(based on cudnn).
* enable_nchw32 --
whether to use NCHW32 data layout, currently
used in nvidia backend with tensorcore(based on cudnn).
* enable_chwn4 --
whether to use CHWN4 data layout, currently
used in nvidia backend with tensorcore.
* enable_fuse_conv_bias_nonlinearity: whether to fuse conv+bias+nonlinearity
into one opr.
* enable_fuse_conv_bias_with_z: whether to fuse conv_bias with z
input for inference on nvidia backend(this optimization pass will
result in mismatch of the precision of output of training and
inference)
"""
if
not
self
.
_capture_as_const
:
raise
ValueError
(
"you must specify capture_as_const=True at __init__ to use dump"
...
...
@@ -482,8 +557,11 @@ class trace:
v
.
name
=
output_names
[
i
]
dest_vars
.
append
(
v
)
dest_vars
=
G
.
optimize_for_inference
(
dest_vars
,
**
kwargs
)
if
isinstance
(
file
,
str
):
file
=
open
(
file
,
"wb"
)
permission
=
"wb"
if
append
==
False
else
"ab"
file
=
open
(
file
,
permission
)
file
.
write
(
G
.
dump
(
*
dest_vars
))
def
_process_inputs
(
self
,
*
args
,
**
kwargs
):
...
...
imperative/python/src/graph_rt.cpp
浏览文件 @
478f2c51
...
...
@@ -20,12 +20,17 @@
#include "./helper.h"
#include "megbrain/plugin/profiler.h"
#include "./common.h"
#include "megbrain/gopt/inference.h"
namespace
py
=
pybind11
;
using
namespace
mgb
;
using
namespace
imperative
;
using
_OptimizeForInferenceOptions
=
mgb
::
gopt
::
OptimizeForInferenceOptions
;
using
_LayoutTransform
=
_OptimizeForInferenceOptions
::
LayoutTransform
;
namespace
{
class
_CompGraphProfilerImpl
{
std
::
shared_ptr
<
ComputingGraph
>
m_comp_graph
;
...
...
@@ -138,6 +143,37 @@ void init_graph_rt(py::module m) {
return
py
::
bytes
(
reinterpret_cast
<
const
char
*>
(
&
buf
[
0
]),
buf
.
size
());
});
auto
GraphOptimizeOptions
=
py
::
class_
<
_OptimizeForInferenceOptions
>
(
m
,
"GraphOptimizeOptions"
)
.
def
(
py
::
init
())
.
def_readwrite
(
"f16_io_f32_comp"
,
&
_OptimizeForInferenceOptions
::
f16_io_f32_comp
)
.
def_readwrite
(
"f16_io_comp"
,
&
_OptimizeForInferenceOptions
::
f16_io_comp
)
.
def_readwrite
(
"fuse_conv_bias_nonlinearity"
,
&
_OptimizeForInferenceOptions
::
fuse_conv_bias_nonlinearity
)
.
def_readwrite
(
"fuse_conv_bias_with_z"
,
&
_OptimizeForInferenceOptions
::
fuse_conv_bias_with_z
)
.
def_readwrite
(
"layout_transform"
,
&
_OptimizeForInferenceOptions
::
layout_transform
)
;
py
::
enum_
<
_LayoutTransform
>
(
GraphOptimizeOptions
,
"LayoutTransform"
)
.
value
(
"DEFAULT"
,
_LayoutTransform
::
DEFAULT
)
.
value
(
"NCHW4"
,
_LayoutTransform
::
NCHW4
)
.
value
(
"NHWCD4"
,
_LayoutTransform
::
NHWCD4
)
.
value
(
"NCHW88"
,
_LayoutTransform
::
NCHW88
)
.
value
(
"NCHW44"
,
_LayoutTransform
::
NCHW44
)
.
value
(
"NCHW44_DOT"
,
_LayoutTransform
::
NCHW44_DOT
)
.
value
(
"NCHW32"
,
_LayoutTransform
::
NCHW32
)
.
value
(
"CHWN4"
,
_LayoutTransform
::
CHWN4
)
.
export_values
()
;
m
.
def
(
"optimize_for_inference"
,
[](
const
VarNodeArray
&
dest_vars
,
const
_OptimizeForInferenceOptions
&
opt
)
{
SymbolVarArray
symvars
(
dest_vars
.
begin
(),
dest_vars
.
end
());
auto
res_symvars
=
mgb
::
gopt
::
optimize_for_inference
(
symvars
,
opt
);
VarNodeArray
vars
;
for
(
auto
&
si
:
res_symvars
)
vars
.
push_back
(
si
.
node
());
return
vars
;
});
#define CURRENT_CLASS cg::ComputingGraph::Options
auto
PyComputingGraphOptions
=
py
::
class_
<
cg
::
ComputingGraph
::
Options
>
(
PyComputingGraph
,
"Options"
)
...
...
imperative/python/test/unit/test_jit.py
已删除
100644 → 0
浏览文件 @
44d0b5da
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import
pytest
from
megengine.core
import
Tensor
# from megengine.core.interpreter.hints import function
@pytest.mark.skip(reason="under rewrite")
def test_1():
    """Call a ``@function``-decorated callable repeatedly on both branches.

    The wrapped function adds 1, then either squares the value or doubles it
    depending on the flag. Each branch is checked five times.
    """

    @function
    def branchy(value, flag):
        value = value + 1
        if not flag:
            return value * 2
        return value * value

    zero = Tensor(0)

    for _ in range(5):
        # (0 + 1) * 2 == 2 on the falsy branch, (0 + 1) ** 2 == 1 on the truthy one.
        assert branchy(zero, 0).numpy() == 2
        assert branchy(zero, 1).numpy() == 1
imperative/python/test/unit/test_tracing.py
浏览文件 @
478f2c51
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import
io
from
tempfile
import
mkstemp
import
numpy
as
np
import
pytest
from
megengine
import
tensor
from
megengine.core.ops
import
builtin
as
ops
from
megengine.core.tensor
import
megbrain_graph
as
G
from
megengine.core.tensor.core
import
apply
from
megengine.core.tensor.raw_tensor
import
as_raw_tensor
from
megengine.functional
import
exp
,
log
from
megengine.jit
import
exclude_from_trace
,
trace
...
...
@@ -101,3 +114,85 @@ def test_trace_profiler():
out
=
f
.
get_profile
()
assert
out
.
get
(
"profiler"
)
@pytest.mark.skip(reason="eq_to_unit failed in inplace.cpp")
def test_goptions_div_zero():
    """Compare ``x / x`` at ``x == 0`` under graph opt levels 0 and 1.

    Per the error messages below, opt level 0 is expected to yield nan and
    opt level 1 is expected to yield 1.
    """

    @trace(symbolic=True, opt_level=0)
    def unoptimized(x):
        return x / x

    @trace(symbolic=True, opt_level=1)
    def optimized(x):
        return x / x

    # nan is the only value that compares unequal to itself.
    result = unoptimized(tensor(0.0))
    if result == result:
        raise ValueError("actual result should be nan")

    result = optimized(tensor(0.0))
    if result != result:
        raise ValueError("actual result should be 1")
@pytest.mark.skip(reason="cast to Elemwise failed in inplace.cpp")
def test_goptions_log_exp():
    """Dump ``log(exp(x))`` at opt levels 0 and 1 and compare operator counts.

    The graph dumped at opt level 1 is expected to contain two operators
    fewer than the one dumped at opt level 0.
    """
    import os  # local import: only needed to manage the temp file below

    @trace(symbolic=True, opt_level=0, capture_as_const=True)
    def f(x):
        return log(exp(x))

    @trace(symbolic=True, opt_level=1, capture_as_const=True)
    def g(x):
        return log(exp(x))

    # mkstemp() returns an already-open descriptor plus the path. dump() opens
    # the path itself, so close the descriptor right away instead of leaking it.
    fd, out = mkstemp()
    os.close(fd)
    try:
        # NOTE(review): ``cgtools`` is not among the imports visible in this
        # file -- confirm it is imported before un-skipping this test.
        f(tensor(1.0))
        f.dump(out)
        *_, outputs = G.load_comp_graph_from_file(out)
        oprs_1 = cgtools.get_oprs_seq(outputs)

        g(tensor(1.0))
        g.dump(out)
        *_, outputs = G.load_comp_graph_from_file(out)
        oprs_2 = cgtools.get_oprs_seq(outputs)

        assert len(oprs_1) - len(oprs_2) == 2
    finally:
        # The caller of mkstemp() is responsible for deleting the file.
        os.remove(out)
@pytest.mark.skip(reason="need cgtools to check final oprs")
def test_goptions_log_sum_exp():
    """Dump ``log(exp(x) + exp(y))`` at opt levels 0 and 1 and compare operator counts.

    The graph dumped at opt level 1 is expected to contain two operators
    fewer than the one dumped at opt level 0.
    """
    import os  # local import: only needed to manage the temp file below

    @trace(symbolic=True, opt_level=0, capture_as_const=True)
    def f(x, y):
        return log(exp(x) + exp(y))

    @trace(symbolic=True, opt_level=1, capture_as_const=True)
    def g(x, y):
        return log(exp(x) + exp(y))

    # mkstemp() returns an already-open descriptor plus the path. dump() opens
    # the path itself, so close the descriptor right away instead of leaking it.
    fd, out = mkstemp()
    os.close(fd)
    try:
        f(tensor(1.0), tensor(2.0))
        f.dump(out)
        *_, outputs = G.load_comp_graph_from_file(out)
        oprs_1 = cgtools.get_oprs_seq(outputs)

        g(tensor(1.0), tensor(2.0))
        g.dump(out)
        *_, outputs = G.load_comp_graph_from_file(out)
        oprs_2 = cgtools.get_oprs_seq(outputs)

        assert len(oprs_1) - len(oprs_2) == 2
    finally:
        # The caller of mkstemp() is responsible for deleting the file.
        os.remove(out)
@pytest.mark.skip(reason="need cgtools to check computing input dtype")
def test_optimize_for_inference():
    """Dump ``exp(x)`` with ``enable_io16xc32`` and check the operator input dtype.

    The first input of the operator that owns the first output var is expected
    to be float16 after the pass.
    """
    import os  # local import: only needed to manage the temp file below

    @trace(symbolic=True, capture_as_const=True)
    def f(x):
        return exp(x)

    # mkstemp() returns an already-open descriptor plus the path. dump() opens
    # the path itself, so close the descriptor right away instead of leaking it.
    fd, out = mkstemp()
    os.close(fd)
    try:
        f(tensor(5.0))
        # trace.dump forwards its extra keyword arguments to
        # G.optimize_for_inference, which raises ValueError on unknown keys.
        # The option must therefore be passed directly, not wrapped in
        # ``optimize_for_inference=`` / ``optimize_options=``.
        f.dump(out, enable_io16xc32=True)

        res = G.load_comp_graph_from_file(out)
        computing_input = res.output_vars_list[0].owner.inputs[0]
        assert computing_input.dtype == np.float16
    finally:
        # The caller of mkstemp() is responsible for deleting the file.
        os.remove(out)
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录