MegEngine / MegEngine
Commit b8c7557b
Authored Aug 10, 2022 by Megvii Engine Team
fix(mm): fix mm error when use sync
GitOrigin-RevId: 63387bda049e51b0a7c65b9a9a0ebd746a713446
Parent 73ad06ba
Showing 7 changed files with 102 additions and 31 deletions (+102, -31)
imperative/python/megengine/distributed/helper.py          +6   -0
imperative/python/megengine/distributed/launcher.py        +2   -1
imperative/src/impl/interpreter/interpreter_impl.cpp       +67  -22
imperative/src/impl/interpreter/interpreter_impl.h         +11  -8
imperative/src/include/megbrain/imperative/interpreter.h   +3   -0
src/core/include/megbrain/utils/metahelper.h               +11  -0
src/core/include/megbrain/utils/thread_impl_1.h            +2   -0
imperative/python/megengine/distributed/helper.py
@@ -162,6 +162,12 @@ def _check_device_initialized(device_type: str, rank: int):
         raise RuntimeError(errmsg)


+def _check_interpreter_status():
+    from ..core._imperative_rt.core2 import get_option
+
+    _ = get_option("async_level")
+
+
 get_device_count_by_fork = deprecated_func(
     "1.5", "megengine.device", "get_device_count", False
 )
imperative/python/megengine/distributed/launcher.py
@@ -9,7 +9,7 @@ from ..core._imperative_rt.core2 import full_sync
 from ..device import get_device_count
 from ..logger import get_logger
 from .group import _set_machine_ranks, group_barrier, init_process_group
-from .helper import _check_device_initialized
+from .helper import _check_device_initialized, _check_interpreter_status
 from .server import Client, Server

 WARN_SUBPROCESS_EXIT_WITHOUT_RETURN = (
@@ -33,6 +33,7 @@ def _run_wrapped(
     machine_ranks: list,
 ):
     r"""Init distributed process group and run wrapped function."""
+    _check_interpreter_status()
     _check_device_initialized(device_type, dev)
     init_process_group(
         master_ip=master_ip,
imperative/src/impl/interpreter/interpreter_impl.cpp
@@ -115,7 +115,18 @@ void ChannelImpl::WorkQueue::on_async_queue_worker_thread_start() {
 #define m_worker_state

 std::unique_ptr<Interpreter::Channel> InterpreterImpl::create_channel() {
-    return std::make_unique<ChannelImpl>();
+    auto ret = std::make_unique<ChannelImpl>();
+#if !(defined(_WIN32) || defined(_WIN64))
+    auto disable_channels = [](void) -> void {
+        for (ChannelImpl* channel : ChannelImpl::m_all_active_channels) {
+            if (channel->worker_started()) {
+                channel->update_status_to_forked();
+            }
+        }
+    };
+    pthread_atfork(nullptr, nullptr, static_cast<void (*)(void)>(disable_channels));
+#endif
+    return ret;
 }

 Interpreter& Interpreter::inst() {
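The hunk above registers a fork handler so that, in a child process created by fork(), every channel whose worker thread was already running is flagged as forked and later rejects use. The standalone sketch below is not part of the commit (the flag and handler names are illustrative); it only shows the bare pthread_atfork mechanism the hunk relies on: the third callback runs once in the child immediately after fork().

// Minimal pthread_atfork sketch; g_forked / mark_forked are illustrative names.
#include <pthread.h>
#include <sys/wait.h>
#include <unistd.h>

#include <cstdio>

namespace {
bool g_forked = false;  // the commit uses a per-channel status enum instead
void mark_forked() {
    g_forked = true;  // runs in the child right after fork()
}
}  // namespace

int main() {
    // Only the "child" callback is needed; prepare/parent stay null, mirroring
    // pthread_atfork(nullptr, nullptr, disable_channels) in the hunk above.
    pthread_atfork(/*prepare=*/nullptr, /*parent=*/nullptr, /*child=*/mark_forked);

    pid_t pid = fork();
    if (pid == 0) {
        std::printf("child:  g_forked = %d\n", g_forked);  // prints 1
        _exit(0);
    }
    std::printf("parent: g_forked = %d\n", g_forked);  // prints 0
    waitpid(pid, nullptr, 0);
    return 0;
}

Flagging state in the child and failing fast later keeps the handler small, which matters because only a limited set of operations is guaranteed safe in the child of a multi-threaded fork() before exec.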
@@ -125,7 +136,7 @@ Interpreter& Interpreter::inst() {

 Handle ChannelImpl::put(const HostTensorND& value, bool no_cache) {
     MGB_LOCK_GUARD(m_spin);
-    mgb_assert(check_available(), "Channel already closed");
+    assert_available();
     std::optional<StackManager::Guard> guard;
     if (Profiler::is_profiling()) {
         auto& state = get_channel_state();
@@ -158,7 +169,8 @@ TensorInfo* ChannelImpl::put_impl(const HostTensorND& value, bool no_cache) {
             Put{info, value, no_cache},
     });
     }
-    if (m_async_level == 0) {
+    if (get_channel_state().options.async_level == 0) {
         sync_impl();
         info->desc.comp_node.sync();
         auto err = info->desc.comp_node.check_async_error();
@@ -169,7 +181,7 @@ TensorInfo* ChannelImpl::put_impl(const HostTensorND& value, bool no_cache) {
 Handle ChannelImpl::put(const DeviceTensorND& data, const HostTensorND& hvalue) {
     MGB_LOCK_GUARD(m_spin);
-    mgb_assert(check_available(), "Channel already closed");
+    assert_available();
     return reinterpret_cast<Handle>(put_impl(data, hvalue));
 }

 TensorInfo* ChannelImpl::put_impl(
@@ -221,7 +233,7 @@ void ChannelImpl::del_impl(Handle handle) {

 void ChannelImpl::drop(Handle handle) {
     MGB_LOCK_GUARD(m_spin);
-    mgb_assert(check_available(), "Channel already closed");
+    assert_available();
     auto& state = get_channel_state();
     if (state.options.enable_drop) {
         mgb_assert(
@@ -404,7 +416,7 @@ void ChannelImpl::dispatch_kernel(
 SmallVector<Handle> ChannelImpl::apply_op(
         std::shared_ptr<OpDef> op, const SmallVector<Handle>& inputs) {
     MGB_LOCK_GUARD(m_spin);
-    mgb_assert(check_available(), "Channel already closed");
+    assert_available();
     auto* input = reinterpret_cast<TensorInfo*>(inputs[0]);
     if (op->same_type<GetVarShape>() && input->shape_valid()) {
         size_t ndim = input->desc.layout.ndim;
@@ -460,7 +472,7 @@ SmallVector<Handle> ChannelImpl::apply_op_impl(
 HostTensorND ChannelImpl::get_value(Handle handle) {
     MGB_LOCK_GUARD(m_spin);
-    mgb_assert(check_available(), "Channel already closed");
+    assert_available();
     mgb_assert(
             m_valid_handle.find(handle) != m_valid_handle.end(), "invalid handle: %p",
             handle);
@@ -472,7 +484,7 @@ HostTensorND ChannelImpl::get_value(Handle handle) {
 TensorShape ChannelImpl::get_shape(Handle handle) {
     MGB_LOCK_GUARD(m_spin);
-    mgb_assert(check_available(), "Channel already closed");
+    assert_available();
     mgb_assert(
             m_valid_handle.find(handle) != m_valid_handle.end(), "invalid handle: %p",
             handle);
@@ -487,7 +499,7 @@ TensorShape ChannelImpl::get_shape(Handle handle) {
 DType ChannelImpl::get_dtype(Handle handle) {
     MGB_LOCK_GUARD(m_spin);
-    mgb_assert(check_available(), "Channel already closed");
+    assert_available();
     mgb_assert(
             m_valid_handle.find(handle) != m_valid_handle.end(), "invalid handle: %p",
             handle);
@@ -500,7 +512,7 @@ DType ChannelImpl::get_dtype(Handle handle) {
 CompNode ChannelImpl::get_device(Handle handle) {
     MGB_LOCK_GUARD(m_spin);
-    mgb_assert(check_available(), "Channel already closed");
+    assert_available();
     mgb_assert(
             m_valid_handle.find(handle) != m_valid_handle.end(), "invalid handle: %p",
             handle);
@@ -513,7 +525,7 @@ CompNode ChannelImpl::get_device(Handle handle) {
 DeviceTensorND ChannelImpl::get_dev_tensor(Handle handle) {
     MGB_LOCK_GUARD(m_spin);
-    mgb_assert(check_available(), "Channel already closed");
+    assert_available();
     mgb_assert(
             m_valid_handle.find(handle) != m_valid_handle.end(), "invalid handle: %p",
             handle);
@@ -523,7 +535,7 @@ DeviceTensorND ChannelImpl::get_dev_tensor(Handle handle) {

 void ChannelImpl::sync() {
     MGB_LOCK_GUARD(m_spin);
-    mgb_assert(check_available(), "Channel already closed");
+    assert_available();
     sync_impl();
 }
@@ -545,19 +557,19 @@ void ChannelImpl::close() {
     mgb_assert(m_valid_handle.empty());
     mgb_log_debug("%ld tensor exists before channel close", (long)valid_handles.size());
     sync_impl();
-    m_closed = true;
+    m_status = ChannelRunningStatus::CLOSED;
 }

 size_t ChannelImpl::get_option(std::string name) {
     MGB_LOCK_GUARD(m_spin);
-    mgb_assert(check_available(), "Channel already closed");
+    assert_available();
     auto& state = get_channel_state();
     return state.options.get_option(name);
 }

 void ChannelImpl::set_option(std::string name, size_t value) {
     MGB_LOCK_GUARD(m_spin);
-    mgb_assert(check_available(), "Channel already closed");
+    assert_available();
     auto& state = get_channel_state();
     state.options.set_option(name, value);
     // FIXME
@@ -583,7 +595,7 @@ void ChannelImpl::set_option(std::string name, size_t value) {

 void ChannelImpl::clear_candidates() {
     MGB_LOCK_GUARD(m_spin);
-    mgb_assert(check_available(), "Channel already closed");
+    assert_available();
     m_dtr.candidates.clear();
 }
@@ -681,10 +693,18 @@ void ChannelImpl::real_free(TensorInfo* ptr) {
     m_pool.free(ptr);
 }

-ChannelImpl::ChannelImpl() : m_worker(this) {}
+std::unordered_set<ChannelImpl*> ChannelImpl::m_all_active_channels{};
+MGB_MUTEX ChannelImpl::m_all_active_channels_mutex{};
+
+ChannelImpl::ChannelImpl() : m_worker(this) {
+    MGB_LOCK_GUARD(m_all_active_channels_mutex);
+    m_all_active_channels.emplace(this);
+}

 ChannelImpl::~ChannelImpl() {
     close();
+    MGB_LOCK_GUARD(m_all_active_channels_mutex);
+    m_all_active_channels.erase(this);
 }

 void ChannelImpl::produce_tensor(TensorInfo* dest, TensorPtr ptr) {
@@ -992,7 +1012,7 @@ void ChannelImpl::detach_users(TensorInfo* dest) {
 }

 bool ChannelImpl::check_available() {
-    return !m_closed;
+    return m_status == ChannelRunningStatus::RUNING;
 }

 TensorPtr ChannelImpl::wait_tensor(TensorInfo* info, TensorProp prop) {
@@ -1352,7 +1372,7 @@ void ChannelImpl::check_worker_exc_unsafe() {

 void ChannelImpl::start_profile() {
     MGB_LOCK_GUARD(m_spin);
-    mgb_assert(check_available(), "Channel already closed");
+    assert_available();
     auto capture_tensors = collect_valid_tensors();
     if (capture_tensors.size() > 0) {
         if (Profiler::is_profiling()) {
@@ -1370,7 +1390,7 @@ void ChannelImpl::start_profile() {

 void ChannelImpl::stop_profile() {
     MGB_LOCK_GUARD(m_spin);
-    mgb_assert(check_available(), "Channel already closed");
+    assert_available();
     auto escape_tensors = collect_valid_tensors();
     if (escape_tensors.size() > 0) {
         if (Profiler::is_profiling()) {
@@ -1388,7 +1408,7 @@ void ChannelImpl::stop_profile() {

 void ChannelImpl::push_scope(std::string name) {
     MGB_LOCK_GUARD(m_spin);
-    mgb_assert(check_available(), "Channel already closed");
+    assert_available();
     auto& state = get_channel_state();
     state.stack_manager.enter(name);
     MGB_RECORD_EVENT(ScopeEvent, name);
@@ -1406,7 +1426,7 @@ void ChannelImpl::push_scope(std::string name) {

 void ChannelImpl::pop_scope(std::string name) {
     MGB_LOCK_GUARD(m_spin);
-    mgb_assert(check_available(), "Channel already closed");
+    assert_available();
     auto& state = get_channel_state();
     state.stack_manager.exit(name);
     MGB_RECORD_EVENT(ScopeFinishEvent, name);
@@ -1422,6 +1442,31 @@ void ChannelImpl::pop_scope(std::string name) {
     }
 }

+bool ChannelImpl::worker_started() const {
+    return m_worker.worker_started();
+}
+
+void ChannelImpl::update_status_to_forked(void) {
+    MGB_LOCK_GUARD(m_spin);
+    m_status = ChannelRunningStatus::FORKED;
+}
+
+void ChannelImpl::assert_available() const {
+    if (m_status == ChannelRunningStatus::RUNING) {
+        return;
+    } else if (m_status == ChannelRunningStatus::CLOSED) {
+        mgb_assert(false, "Channel already closed");
+    } else if (m_status == ChannelRunningStatus::FORKED) {
+        mgb_assert(
+                false,
+                "your program is forked and megengine is be disabled in subprocess, if "
+                "you want to use megengine in subprocess, please DO NOT setup and use "
+                "megengine before fork");
+    } else {
+        mgb_assert(false, "impossible, Channel status is undefined");
+    }
+}
+
 void ChannelImpl::assert_in_channel() {
     mgb_assert(
             get_worker_tid() != std::this_thread::get_id(),
imperative/src/impl/interpreter/interpreter_impl.h
@@ -27,7 +27,7 @@ struct InterpreterImpl : Interpreter {
     std::unique_ptr<Channel> create_channel() override;
 };

-struct ChannelImpl : Interpreter::Channel {
+struct ChannelImpl : Interpreter::Channel, NonCopyableObj, NonMoveableObj {
     ChannelImpl();
     ~ChannelImpl() override;
@@ -61,6 +61,13 @@ struct ChannelImpl : Interpreter::Channel {
     void push_scope(std::string) override;
     void pop_scope(std::string) override;

+    bool worker_started() const;
+    void update_status_to_forked(void);
+    void assert_available() const;
+
+    static std::unordered_set<ChannelImpl*> m_all_active_channels;
+    static MGB_MUTEX m_all_active_channels_mutex;
+
 private:
     struct WorkQueue;
     struct State;
@@ -130,7 +137,9 @@ private:
     // TODO: use explicit struct
     std::stack<std::tuple<ApplyOp, size_t, TensorInfo*, std::string>> m_apply_stack;
     bool m_applying = false;
-    bool m_closed = false;
+
+    enum class ChannelRunningStatus { RUNING, CLOSED, FORKED };
+    ChannelRunningStatus m_status = ChannelRunningStatus::RUNING;

     struct WorkQueue : AsyncQueueSC<Command, WorkQueue> {
         // set max_spin=0 to prevent Queue fetch task in busy wait manner.
@@ -159,12 +168,6 @@ private:
         ChannelImpl* m_owner;
     } m_worker;

-    //! config whether raise error exactly when invoking op.
-    //! level 2: both device and user side errors are async;
-    //! level 1: user side errors are sync;
-    //! level 0: both sync.
-    int m_async_level = 2;
-
     struct State {
         std::thread::id tid;
         OptionManager options;
imperative/src/include/megbrain/imperative/interpreter.h
@@ -60,6 +60,9 @@ struct Interpreter {
     virtual std::unique_ptr<Channel> create_channel() = 0;

     static Interpreter& inst();
+
+protected:
+    Interpreter() = default;
 };

 }  // namespace mgb::imperative::interpreter
src/core/include/megbrain/utils/metahelper.h
@@ -151,6 +151,17 @@ public:
     NonCopyableObj() = default;
 };

+/*!
+ * \brief base class for non-moveable objects
+ */
+class NonMoveableObj {
+    NonMoveableObj(NonMoveableObj&&) = delete;
+    NonMoveableObj& operator=(NonMoveableObj&&) = delete;
+
+public:
+    NonMoveableObj() = default;
+};
+
 template <typename T>
 class ReverseAdaptor {
     T& m_t;
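NonCopyableObj already exists in this header; the hunk adds a matching NonMoveableObj, and interpreter_impl.h above derives ChannelImpl from both so that a channel registered by address in m_all_active_channels can never be copied or moved to a different address. The sketch below is not part of the commit (the simplified class definitions are stand-ins for the megbrain ones); it only shows how the deleted special members propagate to a derived type.

// Illustrative stand-ins for megbrain's NonCopyableObj / NonMoveableObj.
#include <type_traits>
#include <utility>

class NonCopyable {
    NonCopyable(const NonCopyable&) = delete;
    NonCopyable& operator=(const NonCopyable&) = delete;

public:
    NonCopyable() = default;
};

class NonMoveable {
    NonMoveable(NonMoveable&&) = delete;
    NonMoveable& operator=(NonMoveable&&) = delete;

public:
    NonMoveable() = default;
};

// A type that publishes `this` to a global registry must keep a stable address,
// so both copying and moving are rejected at compile time.
struct Channel : NonCopyable, NonMoveable {};

static_assert(!std::is_copy_constructible<Channel>::value, "copy is rejected");
static_assert(!std::is_move_constructible<Channel>::value, "move is rejected");

int main() {
    Channel c;  // plain construction is still allowed
    // Channel c2 = c;             // would not compile: copy constructor is deleted
    // Channel c3 = std::move(c);  // would not compile: move falls back to deleted copy
    (void)c;
    return 0;
}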
src/core/include/megbrain/utils/thread_impl_1.h
@@ -253,6 +253,8 @@ public:
         }
     }

+    inline bool worker_started() const { return m_synchronizer.worker_started(); }
+
 protected:
     ~AsyncQueueSC() noexcept = default;