Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
MegEngine 天元
MegEngine
提交
d7c546c9
MegEngine
项目概览
MegEngine 天元
/
MegEngine
1 年多 前同步成功
通知
403
Star
4705
Fork
582
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
MegEngine
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
提交
d7c546c9
编写于
7月 28, 2022
作者:
M
Megvii Engine Team
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix(mge/interpreter): regenerates tensor when its dev value is needed
GitOrigin-RevId: ed26d52ee4382d3e4d7c02e1d4a612b62393cb5f
上级
1f7bf1ad
变更
3
隐藏空白更改
内联
并排
Showing
3 changed files
with
98 additions
and
5 deletions
+98
-5
imperative/python/test/integration/test_dtr.py
imperative/python/test/integration/test_dtr.py
+21
-0
imperative/src/impl/interpreter/commands.h
imperative/src/impl/interpreter/commands.h
+23
-1
imperative/src/impl/interpreter/interpreter_impl.cpp
imperative/src/impl/interpreter/interpreter_impl.cpp
+54
-4
未找到文件。
imperative/python/test/integration/test_dtr.py
浏览文件 @
d7c546c9
...
...
@@ -90,6 +90,18 @@ class ResNet(M.Module):
return
out
def
run_dtr_drop_copy_dev_tensor
():
mge
.
dtr
.
evictee_minimum_size
=
128
mge
.
dtr
.
enable
()
x
=
F
.
ones
((
10
,
100
))
x
.
_drop
()
x
[...]
=
mge
.
tensor
(
x
,
no_cache
=
True
)
x
.
numpy
()
mge
.
dtr
.
evictee_minimum_size
=
1024
**
2
mge
.
dtr
.
disable
()
mge
.
_exit
(
0
)
def
run_dtr_resnet1202
():
batch_size
=
6
resnet1202
=
ResNet
(
BasicBlock
,
[
200
,
200
,
200
])
...
...
@@ -135,3 +147,12 @@ def test_dtr_resnet1202():
p
.
start
()
p
.
join
()
assert
p
.
exitcode
==
0
@
pytest
.
mark
.
require_ngpu
(
1
)
@
pytest
.
mark
.
isolated_distributed
def
test_dtr_drop_copy_dev_tensor
():
p
=
mp
.
Process
(
target
=
run_dtr_drop_copy_dev_tensor
)
p
.
start
()
p
.
join
()
assert
p
.
exitcode
==
0
imperative/src/impl/interpreter/commands.h
浏览文件 @
d7c546c9
...
...
@@ -136,9 +136,31 @@ struct PopScope {
const
char
*
get_name
()
const
{
return
"PopScope"
;
}
};
// Command payload naming the tensor whose regeneration window ends.
// In the visible ChannelImpl::process_one_task branch, handling this command
// calls dest->unpin().
struct StopRegen {
    TensorInfo* dest;  // target tensor; non-owning as far as this struct shows

    // Constant label identifying this command type.
    const char* get_name() const { return "StopRegen"; }

    // Reports each property to `visit` as a (name, value) pair.
    template <typename TVisitor>
    void get_props(TVisitor&& visit) const {
        visit("dest", dest);
    }
};
using
CommandData
=
std
::
variant
<
Put
,
ApplyOp
,
Del
,
GetValue
,
Drop
,
SetOption
,
StartProfile
,
StopProfile
,
PushScope
,
PopScope
>
;
PushScope
,
PopScope
,
StartRegen
,
StopRegen
>
;
struct
Command
{
uint64_t
id
;
...
...
imperative/src/impl/interpreter/interpreter_impl.cpp
浏览文件 @
d7c546c9
...
...
@@ -1002,8 +1002,11 @@ TensorPtr ChannelImpl::wait_tensor(TensorInfo* info, TensorProp prop) {
m_waitee_id
=
Profiler
::
next_id
();
MGB_RECORD_EVENT
(
TensorWaitPropEvent
,
info
->
id
,
m_waitee_id
,
prop
);
bool
require_host
=
prop
==
TensorProp
::
HostValue
;
bool
require_dev
=
prop
==
TensorProp
::
DevValue
;
auto
host_available
=
[
&
]
{
return
info
->
ptr
&&
info
->
ptr
->
value_fetched
();
};
auto
dev_available
=
[
&
]
{
return
info
->
ptr
;
};
bool
wait_host
=
false
;
bool
wait_regen
=
false
;
if
(
require_host
&&
!
host_available
())
{
// avoid dead lock
lock
.
unlock
();
...
...
@@ -1020,16 +1023,52 @@ TensorPtr ChannelImpl::wait_tensor(TensorInfo* info, TensorProp prop) {
lock
.
lock
();
wait_host
=
true
;
}
m_cv
.
wait
(
lock
,
[
&
]()
{
check_worker_exc_unsafe
();
return
require_host
?
host_available
()
:
static_cast
<
bool
>
(
info
->
ptr
);
});
if
(
require_dev
&&
!
dev_available
())
{
lock
.
unlock
();
if
(
Profiler
::
is_profiling
())
{
m_worker
.
add_task
(
{
Profiler
::
next_id
(),
StartRegen
{
info
},
get_channel_state
().
stack_manager
.
dump
()});
}
else
{
m_worker
.
add_task
({
Profiler
::
next_id
(),
StartRegen
{
info
},
});
}
lock
.
lock
();
wait_regen
=
true
;
}
if
(
require_dev
)
{
m_cv
.
wait
(
lock
,
[
&
]()
{
check_worker_exc_unsafe
();
return
dev_available
();
});
}
else
{
m_cv
.
wait
(
lock
,
[
&
]()
{
check_worker_exc_unsafe
();
return
require_host
?
host_available
()
:
static_cast
<
bool
>
(
info
->
ptr
);
});
}
MGB_RECORD_EVENT
(
TensorWaitPropFinishEvent
,
info
->
id
,
m_waitee_id
,
prop
);
m_waitee
=
nullptr
;
if
(
wait_host
)
{
auto
err
=
info
->
ptr
->
comp_node
().
check_async_error
();
mgb_assert
(
!
err
,
"%s"
,
err
->
what
());
}
if
(
wait_regen
)
{
lock
.
unlock
();
if
(
Profiler
::
is_profiling
())
{
m_worker
.
add_task
(
{
Profiler
::
next_id
(),
StopRegen
{
info
},
get_channel_state
().
stack_manager
.
dump
()});
}
else
{
m_worker
.
add_task
({
Profiler
::
next_id
(),
StopRegen
{
info
},
});
}
lock
.
lock
();
}
return
info
->
ptr
;
}
...
...
@@ -1254,6 +1293,17 @@ void ChannelImpl::process_one_task(Command& icmd) {
MGB_RECORD_EVENT
(
ScopeEvent
,
cmd
.
scope_name
);
}
else
if
constexpr
(
std
::
is_same_v
<
T
,
PopScope
>
)
{
MGB_RECORD_EVENT
(
ScopeFinishEvent
,
cmd
.
scope_name
);
}
else
if
constexpr
(
std
::
is_same_v
<
T
,
StartRegen
>
)
{
if
(
cmd
.
dest
->
invalid
)
return
;
cmd
.
dest
->
pin
();
if
(
!
cmd
.
dest
->
ptr
&&
cmd
.
dest
->
evict_type
!=
EvictType
::
NONE
)
{
regenerate
(
cmd
.
dest
);
}
MGB_LOCK_GUARD
(
m_mutex
);
notify_tensor_unsafe
(
cmd
.
dest
);
}
else
if
constexpr
(
std
::
is_same_v
<
T
,
StopRegen
>
)
{
cmd
.
dest
->
unpin
();
}
else
{
static_assert
(
!
std
::
is_same_v
<
T
,
T
>
);
}
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录