MegEngine 天元 / MegEngine — commit 877bda41

Authored on Feb 14, 2022 by Megvii Engine Team

perf(mge): improve cross stream memory borrowing

GitOrigin-RevId: c68977c5dce157bfc624f887f085d7a7aa6c54ac
Parent: ed7fa104
Showing 21 changed files with 756 additions and 183 deletions (+756 −183).
imperative/python/megengine/_multistream.py                     +21   −0
imperative/python/megengine/device.py                            +3   −3
imperative/python/megengine/tensor.py                            +6   −5
imperative/python/src/module_trace.h                             +1   −0
imperative/python/src/tensor.cpp                                 +3   −0
imperative/python/test/unit/core/test_stream.py                 +74   −0
imperative/src/impl/async_releaser.h                             +0  −84
imperative/src/impl/blob_manager_impl.cpp                       +10  −10
imperative/src/impl/blob_manager_impl.h                          +9   −9
imperative/src/impl/interpreter/interpreter_impl.cpp             +8   −3
imperative/src/impl/interpreter/interpreter_impl.h               +1   −1
imperative/src/impl/ops/elemwise.cpp                             +1   −1
imperative/src/impl/ops/tensor_manip.cpp                         +1   −3
imperative/src/impl/ops/utility.cpp                             +98   −0
imperative/src/impl/physical_tensor.cpp                        +407  −38
imperative/src/impl/proxy_graph/mini_graph.h                     +9   −3
imperative/src/impl/transformations/lazy.cpp                     +2   −3
imperative/src/include/megbrain/imperative/blob_manager.h        +4   −4
imperative/src/include/megbrain/imperative/physical_tensor.h    +84  −15
src/core/impl/comp_node/impl_helper.cpp                          +1   −1
src/core/include/megbrain/ir/ops.td                             +13   −0
imperative/python/megengine/_multistream.py — new file (+21 −0)

from .core._imperative_rt.core2 import apply
from .core.ops.builtin import Barrier
from .tensor import Tensor

_dummy_tensors = {}


def _get_dummy_tensor(device):
    if device not in _dummy_tensors:
        _dummy_tensors[device] = Tensor([], device=device)
    return _dummy_tensors[device]


def record_event(device):
    x = _get_dummy_tensor(device)
    (x,) = apply(Barrier(device, 1), x)
    return x


def wait_event(device, event):
    apply(Barrier(device, 0), event)
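These two helpers are the Python-facing handles for cross-stream synchronization: record_event returns a zero-sized token tensor produced by a Barrier op with one output, and wait_event applies a Barrier with zero outputs so that the waiting stream cannot run past the token. A minimal usage sketch (assumes a device exposing at least two streams, written "xpux:0" / "xpux:1" as in the tests added by this commit):

    import megengine as mge
    from megengine._multistream import record_event, wait_event

    x = mge.Tensor([1, 2, 3], device="xpux:0")
    y = (x + 1).to("xpux:1", _borrow=True)  # hand the result to stream 1 without a copy
    token = record_event("xpux:1")          # token marking stream 1's current position
    wait_event("xpux:0", token)             # ops issued on stream 0 after this line run
                                            # only once stream 1 has reached the token
    print(y.numpy())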
imperative/python/megengine/device.py (+3 −3)

@@ -51,7 +51,7 @@ _sh = _stream_helper()
 def _valid_device(inp):
-    if isinstance(inp, str) and re.match(
-        "^([cxg]pu|rocm|multithread)(\d+|\d+:\d+|x)$", inp
-    ):
+    if isinstance(inp, str) and re.match(
+        "^([cxg]pu|rocm|multithread)(x|\d+)(:\d+)?$", inp
+    ):
         return True
     return False
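The reworked pattern keeps everything the old one accepted ("gpu0", "gpu0:1", "xpux", ...) and additionally allows a stream index after the `x` wildcard, which the multi-stream tests below use as "xpux:1". A standalone check against both patterns (plain `re`, independent of MegEngine):

    import re

    OLD = r"^([cxg]pu|rocm|multithread)(\d+|\d+:\d+|x)$"
    NEW = r"^([cxg]pu|rocm|multithread)(x|\d+)(:\d+)?$"

    for dev in ["gpu0", "gpu0:1", "xpux", "xpux:1"]:
        print(dev, bool(re.match(OLD, dev)), bool(re.match(NEW, dev)))
    # gpu0   True  True
    # gpu0:1 True  True
    # xpux   True  True
    # xpux:1 False True   <- the newly accepted stream-on-wildcard form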
imperative/python/megengine/tensor.py (+6 −5)

@@ -15,7 +15,7 @@ from .core._imperative_rt.core2 import Tensor as _Tensor
 from .core._imperative_rt.core2 import apply, set_py_tensor_type
 from .core._trace_option import use_symbolic_shape
 from .core._wrap import as_device
-from .core.ops.builtin import Copy, GetVarShape
+from .core.ops.builtin import Borrow, Copy, GetVarShape
 from .core.tensor.array_method import ArrayMethodMixin
 from .device import _valid_device, get_default_device
 from .logger import get_logger

@@ -205,7 +205,7 @@ class Tensor(_Tensor, ArrayMethodMixin):
     def reset_zero(self):
         self *= 0

-    def to(self, device):
+    def to(self, device, *, _borrow=False):
         r"""Copy self :class:`~.Tensor` to specified device. See :func:`~.copy`"""
         if isinstance(device, str) and not _valid_device(device):
             raise ValueError(

@@ -214,7 +214,8 @@ class Tensor(_Tensor, ArrayMethodMixin):
             )
         )
         cn = as_device(device).to_c()
-        return apply(Copy(comp_node=cn), self)[0]
+        op = Borrow(comp_node=cn) if _borrow else Copy(comp_node=cn)
+        return apply(op, self)[0]

     @property
     def requires_grad(self):

@@ -232,11 +233,11 @@ class Tensor(_Tensor, ArrayMethodMixin):
         return id(self)

     def __getnewargs__(self):
-        r"""
-        __getnewargs__ will be called for pickle serialization or deep copy"""
+        r"""__getnewargs__ will be called for pickle serialization or deep copy"""
         return (self.numpy(), self.dtype, self.device.logical_name)

     def __getstate__(self):
-        r"""
-        __getstate__ will be called for pickle serialization or deep copy"""
+        r"""__getstate__ will be called for pickle serialization or deep copy"""
         state = {}
         if self._qparams is not None:
             state["qparams"] = self._qparams
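With the new private `_borrow` keyword, `Tensor.to` can hand a tensor to another stream of the same memory node without allocating and copying: a Borrow op rewraps the owning blob on the target comp node, and the event machinery added below keeps that storage alive and ordered. A small usage sketch (the `_borrow` flag is internal API introduced by this commit and assumes a multi-stream device such as "xpux:0"/"xpux:1"):

    import numpy as np
    import megengine as mge

    x = mge.Tensor(np.arange(4, dtype="int32"), device="xpux:0")

    y_copy = x.to("xpux:1")                  # plain Copy: new storage on the target stream
    y_borrow = x.to("xpux:1", _borrow=True)  # Borrow: same storage, ordered via events

    np.testing.assert_equal(y_copy.numpy(), y_borrow.numpy())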
imperative/python/src/module_trace.h (+1 −0)

@@ -11,6 +11,7 @@
 #pragma once

 #include <list>

 #include "megbrain/imperative/transformations/trace.h"
+#include "megbrain/imperative/utils/map.h"
 #include "megbrain/imperative/utils/stats.h"
imperative/python/src/tensor.cpp (+3 −0)

@@ -998,6 +998,9 @@ void init_tensor(py::module m) {
         module_trace_hook.inc_ref();
     });

+    auto atexit = py::module::import("atexit");
+    atexit.attr("register")(
+            py::cpp_function([]() { module_trace_hook = {}; }));
+
     m.def("begin_record_values", [] { Value::begin_record_values(); });

     m.def("end_record_values", [] {
imperative/python/test/unit/core/test_stream.py — new file (+74 −0)

import gc

import numpy as np
import pytest

import megengine as mge
import megengine.functional as F
from megengine._multistream import record_event, wait_event


class MemStat:
    def __init__(self, *args):
        for d in args:
            mge.Tensor([], device=d)
        gc.collect()
        mge._full_sync()
        self.baseline = {d: mge.device.get_allocated_memory(d) for d in args}
        for d in args:
            mge.device.reset_max_memory_stats(d)

    def get_max(self, device):
        return mge.device.get_max_allocated_memory(device) - self.baseline[device]


@pytest.mark.require_ngpu(1)
def test_mem_stats():
    memstat = MemStat("xpux:0", "xpux:1")

    F.arange(1024, device="xpux:0")

    mge._full_sync()
    assert 4096 <= memstat.get_max("xpux:0") == memstat.get_max("xpux:1") <= 4096 + 128


@pytest.mark.require_ngpu(1)
def test_borrow():
    memstat = MemStat("xpux:0", "xpux:1")

    x_np = np.random.randint(2 ** 30, size=(1 * 1024 * 1024,), dtype="int32")
    unit = x_np.size * 4
    x0 = mge.Tensor(x_np, device="xpux:0")
    x1 = x0.to("xpux:1", _borrow=True)
    y = -x1
    np.testing.assert_equal(-x_np, y.numpy())
    mge._full_sync()

    assert memstat.get_max("xpux:0") / unit < 2.1


@pytest.mark.require_ngpu(1)
def test_stream_mem():
    memstat = MemStat("xpux:0", "xpux:1")

    x_np = np.random.randint(2 ** 10, size=(1 * 1024 * 1024,), dtype="int32")
    unit = x_np.size * 4
    x0 = mge.Tensor(x_np, device="xpux:0")

    results = []
    events = []
    for i in range(100):
        if len(events) >= 2:
            wait_event("xpux:0", events[-2])
        x0 = x0 + 1
        results.append(x0.to("xpux:1", _borrow=True).sum())
        events.append(record_event("xpux:1"))
        del events[:-2]

    y_np = x_np.sum()
    for i, y in enumerate(results):
        y_np += x_np.size
        assert y_np == y.numpy()

    mge._full_sync()
    assert memstat.get_max("xpux:0") / unit < 2.1
imperative/src/impl/async_releaser.h — deleted file (100644 → 0, −84)

/**
 * \file imperative/src/impl/async_releaser.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */

#pragma once

#include "megbrain/comp_node.h"
#include "megbrain/imperative/blob_manager.h"
#include "megbrain/imperative/resource_manager.h"
#include "megbrain/system.h"

#include "./event_pool.h"

namespace mgb {
namespace imperative {

class AsyncReleaser : public CompNodeDepedentObject {
    struct WaiterParam {
        CompNode cn;
        CompNode::Event* event;
        BlobPtr blob;
        HostTensorStorage::RawStorage storage;
    };
    class Waiter final : public AsyncQueueSC<WaiterParam, Waiter> {
        AsyncReleaser* m_par_releaser;

    public:
        // disable busy wait by set max_spin=0 to save CPU cycle
        Waiter(AsyncReleaser* releaser)
                : AsyncQueueSC<WaiterParam, Waiter>(0), m_par_releaser(releaser) {}

        void process_one_task(WaiterParam& param) {
            if (param.event->finished()) {
                param.blob.reset();
                param.storage.reset();
                EventPool::without_timer().free(param.event);
                return;
            }

            using namespace std::literals;
            std::this_thread::sleep_for(1us);
            add_task(std::move(param));
        }
        void on_async_queue_worker_thread_start() override {
            sys::set_thread_name("releaser");
        }
    };

    Waiter m_waiter{this};

protected:
    std::shared_ptr<void> on_comp_node_finalize() override {
        m_waiter.wait_task_queue_empty();
        return {};
    }

public:
    static AsyncReleaser* inst() {
        static auto* releaser = ResourceManager::create_global<AsyncReleaser>();
        return releaser;
    }

    ~AsyncReleaser() { m_waiter.wait_task_queue_empty(); }

    void add(BlobPtr blob, CompNode cn) { add(cn, std::move(blob), {}); }

    void add(const HostTensorND& hv) {
        add(hv.comp_node(), {}, hv.storage().raw_storage());
    }

    void add(CompNode cn, BlobPtr blob, HostTensorStorage::RawStorage storage = {}) {
        auto event = EventPool::without_timer().alloc(cn);
        event->record();
        m_waiter.add_task({cn, event, std::move(blob), std::move(storage)});
    }
};
}  // namespace imperative
}  // namespace mgb
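The deleted AsyncReleaser solved the same lifetime problem that the new CompNodeSyncManager in physical_tensor.cpp below now takes over: keep a blob or host storage alive until a recorded device event finishes, then drop the reference from a background thread. A toy Python model of the old mechanism, using only the standard library and stubbing device events as zero-argument callables (not MegEngine API):

    import queue
    import threading
    import time


    class AsyncReleaserModel:
        # Toy model of the deleted AsyncReleaser: a background worker keeps a payload
        # (blob / host storage) alive until its device event reports completion, then
        # drops the reference.
        def __init__(self):
            self._tasks = queue.Queue()
            threading.Thread(target=self._worker, name="releaser", daemon=True).start()

        def add(self, event_finished, payload):
            self._tasks.put((event_finished, payload))

        def _worker(self):
            while True:
                event_finished, payload = self._tasks.get()
                if event_finished():
                    del payload              # last reference released here
                else:
                    time.sleep(1e-6)         # mirrors the 1us back-off in the C++ code
                    self._tasks.put((event_finished, payload))

The replacement batches this work: instead of one queue entry and one real device event per released object, a single polling thread tracks per-comp-node virtual events and drains whole release queues at once.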
imperative/src/impl/blob_manager_impl.cpp (+10 −10)

@@ -16,7 +16,7 @@
 namespace mgb {
 namespace imperative {

-BlobManagerImpl::BlobData::BlobData(Blob* in_blob) {
+BlobManagerImpl::BlobData::BlobData(OwnedBlob* in_blob) {
     blob = in_blob;
     DeviceTensorStorage d_storage;
     d_storage.reset(blob->m_comp_node, blob->m_size, blob->m_storage);

@@ -28,19 +28,19 @@ BlobManagerImpl::BlobData::BlobData(Blob* in_blob) {
     h_storage.copy_from(const_cast<DeviceTensorStorage&>(d_storage), blob->m_size);
 }

-void BlobManagerImpl::register_blob(Blob* blob) {
+void BlobManagerImpl::register_blob(OwnedBlob* blob) {
     // add blob into the comp2blobs map
     MGB_LOCK_GUARD(m_mtx);
     mgb_assert(m_comp2blobs_map[blob->m_comp_node].insert(blob));
 }

-void BlobManagerImpl::unregister_blob(Blob* blob) {
+void BlobManagerImpl::unregister_blob(OwnedBlob* blob) {
     // erase blob into the comp2blobs map
     MGB_LOCK_GUARD(m_mtx);
     mgb_assert(1 == m_comp2blobs_map[blob->m_comp_node].erase(blob));
 }

-void BlobManagerImpl::alloc_with_defrag(Blob* blob, size_t size) {
+void BlobManagerImpl::alloc_with_defrag(OwnedBlob* blob, size_t size) {
     if (custom_allocator) {
         blob->m_storage = custom_allocator(blob->m_comp_node, size);
         return;

@@ -55,7 +55,7 @@ void BlobManagerImpl::alloc_with_defrag(Blob* blob, size_t size) {
     });
 }

-void BlobManagerImpl::alloc_direct(Blob* blob, size_t size) {
+void BlobManagerImpl::alloc_direct(OwnedBlob* blob, size_t size) {
     DeviceTensorStorage storage(blob->m_comp_node);
     mgb_assert(blob->m_comp_node.valid());
     storage.ensure_size(size);

@@ -143,7 +143,7 @@ void BlobManagerImpl::defrag(const CompNode& cn) {
     // sort blobs by created time, may be helpful for reduce memory fragment
     std::sort(
             blob_data_arrary.begin(), blob_data_arrary.end(),
-            [](auto& lhs, auto& rhs) { return lhs.blob->id() < rhs.blob->id(); });
+            [](auto& lhs, auto& rhs) { return lhs.blob->m_id < rhs.blob->m_id; });

     // allocate for each storage
     for (auto i : blob_data_arrary) {

@@ -158,19 +158,19 @@ void BlobManagerImpl::defrag(const CompNode& cn) {
 }

 struct BlobManagerStub : BlobManager {
-    void alloc_direct(Blob* blob, size_t size) {
+    void alloc_direct(OwnedBlob* blob, size_t size) {
         mgb_assert(0, "prohibited after global variable destruction");
     };
-    void alloc_with_defrag(Blob* blob, size_t size) {
+    void alloc_with_defrag(OwnedBlob* blob, size_t size) {
         mgb_assert(0, "prohibited after global variable destruction");
     };
     DeviceTensorND alloc_workspace_with_defrag(CompNode cn, TensorLayout& layout) {
         mgb_assert(0, "prohibited after global variable destruction");
     };
-    void register_blob(Blob* blob) {
+    void register_blob(OwnedBlob* blob) {
         mgb_assert(0, "prohibited after global variable destruction");
     };
-    void unregister_blob(Blob* blob){};
+    void unregister_blob(OwnedBlob* blob){};
     void defrag(const CompNode& cn) {
         mgb_assert(0, "prohibited after global variable destruction");
     };
imperative/src/impl/blob_manager_impl.h (+9 −9)

@@ -19,21 +19,21 @@ namespace imperative {
 class BlobManagerImpl final : public BlobManager {
     struct BlobSetWithMux {
         std::mutex mtx;
-        ThinHashSet<Blob*> blobs_set;
-        bool insert(Blob* blob) {
+        ThinHashSet<OwnedBlob*> blobs_set;
+        bool insert(OwnedBlob* blob) {
             MGB_LOCK_GUARD(mtx);
             return blobs_set.insert(blob).second;
         }
-        size_t erase(Blob* blob) {
+        size_t erase(OwnedBlob* blob) {
             MGB_LOCK_GUARD(mtx);
             return blobs_set.erase(blob);
         }
     };

     struct BlobData {
-        Blob* blob;
+        OwnedBlob* blob;
         HostTensorStorage h_storage;
-        BlobData(Blob* in_blob);
+        BlobData(OwnedBlob* in_blob);
     };

     std::mutex m_mtx;

@@ -41,7 +41,7 @@ class BlobManagerImpl final : public BlobManager {
     void defrag(const CompNode& cn) override;

-    void alloc_direct(Blob* blob, size_t size) override;
+    void alloc_direct(OwnedBlob* blob, size_t size) override;

     DeviceTensorND alloc_workspace(CompNode cn, TensorLayout layout);

@@ -50,14 +50,14 @@ class BlobManagerImpl final : public BlobManager {
 public:
     static BlobManager* inst();

-    void alloc_with_defrag(Blob* blob, size_t size) override;
+    void alloc_with_defrag(OwnedBlob* blob, size_t size) override;

     DeviceTensorND alloc_workspace_with_defrag(CompNode cn, TensorLayout& layout) override;

-    void register_blob(Blob* blob) override;
+    void register_blob(OwnedBlob* blob) override;

-    void unregister_blob(Blob* blob) override;
+    void unregister_blob(OwnedBlob* blob) override;

     void set_allocator(allocator_t allocator) override;
 };
imperative/src/impl/interpreter/interpreter_impl.cpp (+8 −3)

@@ -704,7 +704,7 @@ void ChannelImpl::produce_tensor(TensorInfo* dest, TensorPtr ptr) {
     m_dtr.update_used_time(dest);
     MGB_RECORD_EVENT(
             TensorProduceEvent, dest->id, ptr->layout(), ptr->comp_node(),
-            ptr->dev_tensor(false).raw_ptr());
+            ptr->raw_ptr_not_for_readwrite());
     // update tensor desc for static infer
     if (dest->desc.layout.ndim) {
         mgb_assert(

@@ -805,8 +805,13 @@ void ChannelImpl::do_apply_op(const ApplyOp& cmd, std::string reason) {
                 inputs[idx]->to_contiguous_inplace(layout_checker);
             }
         }
-        return OpDef::apply_on_physical_tensor(
+        auto outputs = OpDef::apply_on_physical_tensor(
                 def, std::move(inputs), output_descs, validated);
+        for (auto& o : outputs) {
+            o->set_ready_event(
+                    record_event(o->comp_node(), def.same_type<imperative::Barrier>()));
+        }
+        return outputs;
     };
     MGB_RECORD_EVENT(OpExecuteEvent, apply_id, {}, reason);
     SmallVector<std::pair<CompNode, uint64_t>> kernels;

@@ -1059,7 +1064,7 @@ std::unordered_set<TensorInfo*> ChannelImpl::collect_valid_tensors() {
     return valid_tensors;
 }

-void ChannelImpl::alloc_tensor_with_evict(Blob* x) {
+void ChannelImpl::alloc_tensor_with_evict(OwnedBlob* x) {
     bool in_worker = (get_worker_tid() == std::this_thread::get_id());
     auto reserve_size = [&](size_t size) {
         if (!m_dtr.comp_node.valid()) {
imperative/src/impl/interpreter/interpreter_impl.h (+1 −1)

@@ -304,7 +304,7 @@ private:
     //! automatically evict an optimal tensor
     bool auto_evict(size_t);

-    void alloc_tensor_with_evict(Blob*);
+    void alloc_tensor_with_evict(OwnedBlob*);

     // assert thread id when call get_xxx_state to avoid misuse
     ChannelState& get_channel_state();
imperative/src/impl/ops/elemwise.cpp (+1 −1)

@@ -249,7 +249,7 @@ SmallVector<TensorPtr> apply_inplace_add_on_physical_tensor(
         const OpDef& def, const SmallVector<TensorPtr>& inputs,
         SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) {
     auto dest = inputs[0], delta = inputs[1], alpha = inputs[2], beta = inputs[3];
-    if (!(inputs[0]->blob().unique() && inputs[0]->blob()->storage().unique())) {
+    if (!inputs[0]->storage_is_unique()) {
         mgb_log_warn(
                 "This inplace modification may change the elements of other tensors. "
                 "Fallback to non-inplace update.");
imperative/src/impl/ops/tensor_manip.cpp (+1 −3)

@@ -13,7 +13,6 @@
 #include "megbrain/imperative/ops/autogen.h"
 #include "megbrain/imperative/ops/opr_attr.h"

-#include "../async_releaser.h"
 #include "../dnn_op_helper.h"
 #include "../op_trait.h"

@@ -267,8 +266,7 @@ SmallVector<TensorPtr> param_pack_concat_apply_on_physical_tensor(
             {srcs_raw_ptr, srcs_layout}, inputs.back()->dev_tensor().as_megdnn(),
             output->dev_tensor().as_megdnn(),
             caller.create_workspace({{ws_size}, dtype::Byte()}));
-    AsyncReleaser::inst()->add(
-            HostTensorND{comp_node, srcs_layout}.storage(srcs_storage));
+    async_release(HostTensorND{comp_node, srcs_layout}.storage(srcs_storage));
     return {output};
 }
imperative/src/impl/ops/utility.cpp (+98 −0)

@@ -23,6 +23,8 @@
 #include "megbrain/opr/tensor_gen.h"
 #include "megbrain/opr/tensor_manip.h"
 #include "megbrain/opr/utility.h"
+#include "megbrain/tensor.h"
+#include "megdnn/dtype.h"

 #if MGB_JIT
 #include "megbrain/jit/executor_opr.h"

@@ -36,6 +38,102 @@ namespace mgb::imperative {
 MGB_DYN_TYPE_OBJ_FINAL_IMPL(GenericPyOp);
 OP_TRAIT_REG(GenericPyOp, GenericPyOp).fallback();

+namespace {
+namespace borrow {
+
+SmallVector<TensorPtr> apply_on_physical_tensor(
+        const OpDef& def, const SmallVector<TensorPtr>& inputs,
+        SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) {
+    auto& op = def.cast_final_safe<Borrow>();
+    SmallVector<TensorPtr> outputs;
+    outputs.reserve(inputs.size());
+    for (auto& i : inputs) {
+        outputs.push_back(Tensor::make(
+                i->blob()->borrow_to(op.comp_node), i->layout(), i->offset()));
+    }
+    return outputs;
+}
+
+SmallVector<VarNode::LayoutConstraintCallback> get_input_layout_constraint(
+        const OpDef& def, const SmallVector<TensorPtr>& inputs) {
+    return SmallVector<VarNode::LayoutConstraintCallback>(inputs.size());
+}
+
+auto infer_output_attrs_fallible(
+        const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) {
+    auto& op = def.cast_final_safe<Borrow>();
+    std::tuple<SmallVector<LogicalTensorDesc>, bool> ret(inputs, true);
+    for (auto& i : std::get<0>(ret)) {
+        mgb_assert(
+                i.comp_node.mem_node() == op.comp_node.mem_node(),
+                "cannot borrow memory from %s to %s", i.comp_node.to_string().c_str(),
+                op.comp_node.to_string().c_str());
+        i.comp_node = op.comp_node;
+    }
+    return ret;
+}
+
+OP_TRAIT_REG(Borrow, Borrow)
+        .apply_on_physical_tensor(apply_on_physical_tensor)
+        .infer_output_attrs_fallible(infer_output_attrs_fallible)
+        .get_input_layout_constraint(get_input_layout_constraint)
+        .fallback();
+
+}  // namespace borrow
+}  // namespace
+
+namespace {
+namespace barrier {
+
+SmallVector<TensorPtr> apply_on_physical_tensor(
+        const OpDef& def, const SmallVector<TensorPtr>& inputs,
+        SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) {
+    auto& op = def.cast_final_safe<imperative::Barrier>();
+    SmallVector<TensorPtr> outputs;
+    for (auto& i : inputs) {
+        if (i->comp_node() != op.comp_node) {
+            device_wait_event(op.comp_node, i->comp_node(), i->get_ready_event());
+        }
+    }
+    if (op.nr_outputs) {
+        outputs.resize(op.nr_outputs);
+        TensorLayout layout(TensorShape({0}), dtype::Int32{});
+        for (auto& i : outputs) {
+            i = Tensor::make(layout, op.comp_node);
+        }
+    }
+    return outputs;
+}
+
+SmallVector<VarNode::LayoutConstraintCallback> get_input_layout_constraint(
+        const OpDef& def, const SmallVector<TensorPtr>& inputs) {
+    return SmallVector<VarNode::LayoutConstraintCallback>(inputs.size());
+}
+
+auto infer_output_attrs_fallible(
+        const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) {
+    auto& op = def.cast_final_safe<imperative::Barrier>();
+    std::tuple<SmallVector<LogicalTensorDesc>, bool> ret;
+    auto& [descs, checked] = ret;
+    descs.resize(op.nr_outputs);
+    checked = true;
+    for (auto& desc : descs) {
+        desc.comp_node = op.comp_node;
+        desc.layout.dtype = dtype::Int32{};
+        desc.layout.ndim = 1;
+        desc.layout.shape[0] = 0;
+        desc.layout.stride[0] = 1;
+    }
+    return ret;
+}
+
+OP_TRAIT_REG(Barrier, imperative::Barrier)
+        .apply_on_physical_tensor(apply_on_physical_tensor)
+        .infer_output_attrs_fallible(infer_output_attrs_fallible)
+        .get_input_layout_constraint(get_input_layout_constraint)
+        .fallback();
+
+}  // namespace barrier
+}  // namespace
+
 namespace {
 namespace fastpathcopy {
 auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) {
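The Barrier op added here is the kernel-level building block behind record_event/wait_event: inputs living on a different comp node contribute their ready event as a dependency of the barrier's comp node, and the op emits nr_outputs empty int32 tokens that carry that dependency forward (record_event uses nr_outputs=1, wait_event uses 0). A toy Python model of that contract, with comp nodes as strings and events as integers (not MegEngine API):

    from dataclasses import dataclass
    from typing import Callable, List


    @dataclass
    class Token:
        comp_node: str
        ready_event: int = 0  # virtual event id; 0 means "not recorded yet"


    def barrier_model(
        comp_node: str,
        nr_outputs: int,
        inputs: List[Token],
        device_wait_event: Callable[[str, str, int], None],
    ) -> List[Token]:
        # Cross-comp-node inputs become ordering dependencies of `comp_node`.
        for t in inputs:
            if t.comp_node != comp_node:
                device_wait_event(comp_node, t.comp_node, t.ready_event)
        # Zero-sized tokens on `comp_node` propagate the dependency downstream.
        return [Token(comp_node) for _ in range(nr_outputs)]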
imperative/src/impl/physical_tensor.cpp (+407 −38)

@@ -9,57 +9,384 @@
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  */

 #include "megbrain/imperative/physical_tensor.h"

 #include "megbrain/common.h"
 #include "megbrain/comp_node.h"
 #include "megbrain/imperative.h"
 #include "megbrain/imperative/blob_manager.h"
 #include "megbrain/imperative/profiler.h"
 #include "megbrain/imperative/resource_manager.h"
-#include "./async_releaser.h"
 #include "./event_pool.h"
 #include "./profiler/events.h"

+#include <condition_variable>
+#include <cstdint>
+#include <deque>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <type_traits>
+#include <unordered_map>
+#include <utility>
+#include <variant>
+#include <vector>
+
+#ifndef WIN32
+#include <pthread.h>
+#endif
+
+#include "range/v3/all.hpp"
+
+namespace views = ranges::views;

 namespace mgb {
 namespace imperative {
 namespace {

-class CompNodeSyncManager : public CompNodeDepedentObject {
-    ThinHashMap<Blob*, std::unique_ptr<CompNode::Event>> m_blob2event;
-    std::mutex m_mtx;
-
-public:
-    std::shared_ptr<void> on_comp_node_finalize() override {
-        MGB_LOCK_GUARD(m_mtx);
-        m_blob2event.clear();
-        return {};
-    }
-
-    static CompNodeSyncManager& inst() {
-        static auto* sl_inst = ResourceManager::create_global<CompNodeSyncManager>();
-        return *sl_inst;
-    }
-
-    CompNode::Event* get_or_create_event(Blob* blob) {
-        mgb_assert(!is_finalized());
-        MGB_LOCK_GUARD(m_mtx);
-        auto&& e = m_blob2event[blob];
-        if (!e) {
-            e = blob->comp_node().create_event();
-        }
-        return e.get();
-    }
-
-    void remove(Blob* blob) {
-        MGB_LOCK_GUARD(m_mtx);
-        m_blob2event.erase(blob);
-    }
-};
+struct CompNodeHash {
+    auto operator()(CompNode cn) const { return mgb::hash(cn); }
+};
+
+template <typename T>
+struct NoThrowMovable : T {
+    using T::T;
+    NoThrowMovable(NoThrowMovable&&) noexcept = default;
+};
+
+template <typename... Ts>
+using Map = NoThrowMovable<std::map<Ts...>>;
+
+class CompNodeSyncManager {
+    struct CompNodeData {
+        template <typename T>
+        class ReleaseQueue {
+            Map<uint64_t, T> map;
+
+        public:
+            template <typename A>
+            void emplace(uint64_t t, A&& a) {
+                map.emplace_hint(map.end(), t, std::forward<A>(a));
+            }
+            void release(uint64_t t) {
+                auto it = map.upper_bound(t);
+                map.erase(map.begin(), it);
+            }
+        };
+
+        //! next virtual event
+        uint64_t next = 1;
+        //! last completed virtual event
+        uint64_t completed = 0;
+        //! virtual event to real event
+        Map<uint64_t, EventPtr> events;
+        //! ordering information at some virtual events:
+        //! what virtual events on other comp nodes is _sequenced before_ this virtual
+        //! event
+        Map<uint64_t, std::vector<uint64_t>> ordering;
+        //! release queue for dev storage, keyed by releaser. this comp node is the
+        //! **receiver**
+        std::vector<ReleaseQueue<BlobPtr>> release_queues;
+        //! release queue for host storage. this comp node is the **releaser**
+        ReleaseQueue<HostTensorStorage::RawStorage> host_release_queue;
+    };
+
+    std::mutex m_mtx;
+    std::condition_variable m_cv;
+    bool m_should_stop = false;
+    std::thread m_polling_thread;
+    std::unordered_map<CompNode, size_t, CompNodeHash> m_cn2id;
+    std::vector<CompNodeData> m_cndata;
+
+    auto do_record(CompNode cn, size_t cnid, std::unique_lock<std::mutex>& lock) {
+        // CAUSION: don't keep reference across locking boundary
+        lock.unlock();
+        auto e = EventPool::without_timer().alloc(cn);
+        e->record();
+        lock.lock();
+        auto& cndata = m_cndata[cnid];
+        return cndata.events.emplace_hint(cndata.events.end(), cndata.next++, e);
+    }
+
+    std::pair<uint64_t, CompNode::Event*> get_event(
+            CompNode cn, size_t cnid, uint64_t t, std::unique_lock<std::mutex>& lock) {
+        auto& cndata = m_cndata[cnid];
+        auto it = cndata.events.lower_bound(t);
+        if (it == cndata.events.end()) {
+            it = do_record(cn, cnid, lock);
+        }
+        return {it->first, it->second.get()};
+    }
+
+    size_t get_cnid_unsafe(CompNode cn) {
+        auto [it, unseen] = m_cn2id.try_emplace(cn, m_cndata.size());
+        if (unseen) {
+            m_cndata.emplace_back();
+        }
+        return it->second;
+    }
+
+    void monitor_events() {
+#if defined(__APPLE__)
+        pthread_setname_np("CompNodeSync");
+#elif defined(__unix__)
+        pthread_setname_np(pthread_self(), "CompNodeSync");
+#endif
+
+        // poll events in rounds. sleep for a fixed duration between rounds.
+        // number of events to query is decided by the number of successful queries in
+        // last round, independently for each comp node:
+        // a. all -> double
+        // b. 0 -> 1
+        // c. otherwise -> #successful
+        struct Item {
+            size_t cnid;
+            decltype(CompNodeData::events)::iterator it;
+        };
+
+        struct Stat {
+            size_t num_success = 0;
+            size_t num_attempts = 0;
+            // iterator to the last finished event
+            decltype(CompNodeData::events)::iterator it;
+        };
+
+        std::vector<Stat> stats;
+        std::vector<Item> todos;
+        std::unique_lock lock(m_mtx);
+        for (;;) {
+            // copy events to a temporary storage so that we may unlock while polling
+            stats.resize(m_cndata.size());
+            for (size_t cnid = 0; cnid < m_cndata.size(); ++cnid) {
+                // decide max number of events to query
+                // rule c: #successful
+                size_t n = stats[cnid].num_success;
+                if (n == stats[cnid].num_attempts) {
+                    // rule a: double
+                    n *= 2;
+                }
+                if (n == 0) {
+                    // rule b: 1
+                    n = 1;
+                }
+                // now copy upto n events
+                auto& events = m_cndata[cnid].events;
+                size_t i = 0;
+                for (auto it = events.begin(); i < n && it != events.end(); ++i, ++it) {
+                    todos.push_back({cnid, it});
+                }
+                // reset stats for this round
+                stats[cnid].num_success = 0;
+                stats[cnid].num_attempts = n;
+            }
+            lock.unlock();
+
+            bool last_result = false;
+            size_t last_cnid = -1;
+            for (auto item : todos) {
+                if (item.cnid == last_cnid && !last_result) {
+                    // previous failed, this one almost certainly should fail
+                    continue;
+                }
+                last_cnid = item.cnid;
+                last_result = item.it->second->finished();
+                if (last_result) {
+                    stats[item.cnid].num_success++;
+                    stats[item.cnid].it = item.it;
+                }
+            }
+            todos.clear();
+
+            lock.lock();
+            // release dev storage
+            for (size_t receiver_cnid = 0; receiver_cnid < m_cndata.size();
+                 ++receiver_cnid) {
+                for (size_t releaser_cnid = 0;
+                     releaser_cnid < m_cndata[receiver_cnid].release_queues.size();
+                     ++releaser_cnid) {
+                    if (releaser_cnid >= stats.size() ||
+                        stats[releaser_cnid].num_success == 0) {
+                        continue;
+                    }
+                    auto& q = m_cndata[receiver_cnid].release_queues[releaser_cnid];
+                    q.release(stats[releaser_cnid].it->first);
+                }
+            }
+            for (size_t cnid = 0; cnid < stats.size(); ++cnid) {
+                if (stats[cnid].num_success == 0) {
+                    continue;
+                }
+                auto& cndata = m_cndata[cnid];
+                auto it = stats[cnid].it;
+                auto t = it->first;
+                // update completed
+                cndata.completed = t;
+                // release host storage
+                cndata.host_release_queue.release(t);
+                // remove completed events
+                auto& events = cndata.events;
+                events.erase(events.begin(), std::next(it));
+            }
+            using namespace std::literals;
+            if (m_cv.wait_for(lock, 10us, [&] { return m_should_stop; })) {
+                return;
+            }
+        }
+    }
+
+    CompNodeSyncManager() {
+        m_polling_thread = std::thread([this] { monitor_events(); });
+    }
+
+public:
+    ~CompNodeSyncManager() {
+        {
+            MGB_LOCK_GUARD(m_mtx);
+            m_should_stop = true;
+            m_cv.notify_all();
+        }
+        m_polling_thread.join();
+    }
+
+    static CompNodeSyncManager& inst();
+
+    uint64_t record(CompNode cn, bool doitnow = false) {
+        std::unique_lock lock(m_mtx);
+        auto cnid = get_cnid_unsafe(cn);
+        if (doitnow) {
+            return do_record(cn, cnid, lock)->first;
+        }
+        return m_cndata[cnid].next++;
+    }
+
+    void async_release(CompNode cn, uint64_t t, BlobPtr blob) {
+        MGB_LOCK_GUARD(m_mtx);
+        auto releaser_cnid = get_cnid_unsafe(cn);
+        if (t <= m_cndata[releaser_cnid].completed) {
+            return;
+        }
+        auto receiver_cnid = get_cnid_unsafe(blob->comp_node());
+        auto& qs = m_cndata[receiver_cnid].release_queues;
+        if (releaser_cnid >= qs.size()) {
+            qs.resize(releaser_cnid + 1);
+        }
+        auto& q = qs[releaser_cnid];
+        q.emplace(t, std::move(blob));
+    }
+
+    void async_release(CompNode cn, uint64_t t, HostTensorStorage::RawStorage storage) {
+        MGB_LOCK_GUARD(m_mtx);
+        auto releaser_cnid = get_cnid_unsafe(cn);
+        if (t <= m_cndata[releaser_cnid].completed) {
+            return;
+        }
+        auto& q = m_cndata[releaser_cnid].host_release_queue;
+        q.emplace(t, std::move(storage));
+    }
+
+    void device_wait(CompNode waiter, CompNode waitee, uint64_t t) {
+        std::unique_lock lock(m_mtx);
+
+        auto waiter_id = get_cnid_unsafe(waiter);
+        auto waitee_id = get_cnid_unsafe(waitee);
+        auto& waiter_data = m_cndata.at(waiter_id);
+        auto& waitee_data = m_cndata.at(waitee_id);
+
+        auto [t_waitee, e] = get_event(waitee, waitee_id, t, lock);
+
+        // DO NOT unlock around this line! Event* could be invalidated!
+        e->device_wait_by(waiter);
+
+        auto t_waiter = waiter_data.next++;
+        std::vector<uint64_t> ordering(m_cndata.size(), 0);
+        if (!waiter_data.ordering.empty()) {
+            auto& o = waiter_data.ordering.rbegin()->second;
+            std::copy(o.begin(), o.end(), ordering.begin());
+        }
+        ordering[waitee_id] = t_waitee;
+        ordering[waiter_id] = t_waiter;
+
+        {
+            auto it = waitee_data.ordering.lower_bound(t_waitee);
+            if (it != waitee_data.ordering.begin()) {
+                for (auto [a, b] : views::zip(ordering, std::prev(it)->second)) {
+                    static_assert(std::is_lvalue_reference_v<decltype(a)>);
+                    a = std::max(a, b);
+                }
+            }
+        }
+
+        waiter_data.ordering.emplace_hint(
+                waiter_data.ordering.end(), t_waiter, ordering);
+
+        for (auto [t, q] : views::zip(ordering, waiter_data.release_queues)) {
+            q.release(t);
+        }
+    }
+};
+
+CompNodeSyncManager& CompNodeSyncManager::inst() {
+    static std::mutex mtx;
+    static std::unique_ptr<CompNodeSyncManager> inst;
+
+    struct Guard final : CompNodeDepedentObject {
+        std::shared_ptr<void> on_comp_node_finalize() override {
+            MGB_LOCK_GUARD(mtx);
+            inst.reset();
+            return {};
+        }
+    };
+    static std::optional<Guard> guard;
+
+#ifndef WIN32
+    static bool broken = false;
+    static struct ForkGuard {
+        ForkGuard() {
+            mgb_assert(0 == pthread_atfork(NULL, NULL, [] {
+                           if (inst) {
+                               inst.release();  // deliberate leak, unfixable
+                               broken = true;
+                           }
+                       }));
+        }
+    } fork_guard;
+#endif
+
+    MGB_LOCK_GUARD(mtx);
+    if (!inst) {
+#ifndef WIN32
+        mgb_assert(!broken);
+#endif
+        EventPool::without_timer();
+        inst.reset(new CompNodeSyncManager);
+        guard.emplace();
+    }
+    return *inst;
+}

 }  // namespace

+uint64_t record_event(CompNode cn, bool doitnow) {
+    return CompNodeSyncManager::inst().record(cn, doitnow);
+}
+
+void device_wait_event(CompNode waiter, CompNode waitee, uint64_t event) {
+    CompNodeSyncManager::inst().device_wait(waiter, waitee, event);
+}
+
+void async_release(CompNode cn, uint64_t event, BlobPtr blob) {
+    CompNodeSyncManager::inst().async_release(cn, event, std::move(blob));
+}
+
+void async_release(CompNode cn, uint64_t event, HostTensorStorage::RawStorage storage) {
+    CompNodeSyncManager::inst().async_release(cn, event, std::move(storage));
+}
+
 void EventDeleter::operator()(CompNode::Event* event) {
     EventPool::without_timer().free(event);
 }

@@ -68,31 +395,74 @@ namespace {
 std::atomic_uint64_t next_blob_id = 0;
 }

-Blob::Blob(const DeviceTensorStorage& s)
-        : m_comp_node{s.comp_node()},
-          m_storage{s.raw_storage()},
-          m_size{s.size() + s.offset()} {
-    m_id = next_blob_id++;
+OwnedBlob::OwnedBlob(const DeviceTensorStorage& s)
+        : Blob(s.comp_node(), s.size() + s.offset()),
+          m_storage{s.raw_storage()},
+          m_id{next_blob_id++} {
     BlobManager::inst()->register_blob(this);
 }

-Blob::Blob(CompNode cn, size_t sz) : m_comp_node{cn}, m_storage{}, m_size{sz} {
-    m_id = next_blob_id++;
+OwnedBlob::OwnedBlob(CompNode cn, size_t sz)
+        : Blob(cn, sz), m_storage{}, m_id{next_blob_id++} {
     BlobManager::inst()->register_blob(this);
 }

-Blob::~Blob() {
+OwnedBlob::~OwnedBlob() {
     BlobManager::inst()->unregister_blob(this);
-    CompNodeSyncManager::inst().remove(this);
 }

-const Blob::RawStorage& Blob::storage() {
+const Blob::RawStorage& OwnedBlob::storage() {
     if (!m_storage && m_size) {
         BlobManager::inst()->alloc_with_defrag(this, m_size);
     }
     return m_storage;
 }

+BlobPtr OwnedBlob::borrow_to(CompNode cn) {
+    return std::make_shared<BorrowedBlob>(
+            cn, std::static_pointer_cast<OwnedBlob>(shared_from_this()));
+}
+
+bool OwnedBlob::storage_is_unique() {
+    return m_storage.unique();
+}
+
+void* OwnedBlob::raw_ptr_not_for_readwrite() {
+    return m_storage.get();
+}
+
+BorrowedBlob::BorrowedBlob(CompNode cn, std::shared_ptr<OwnedBlob> owner)
+        : Blob(cn, owner->size()),
+          m_owner(std::move(owner)),
+          m_event(record_event(m_owner->comp_node(), true)) {}
+
+BorrowedBlob::~BorrowedBlob() {
+    async_release(m_comp_node, record_event(m_comp_node, true), std::move(m_owner));
+}
+
+const Blob::RawStorage& BorrowedBlob::storage() {
+    {
+        MGB_LOCK_GUARD(m_mtx);
+        if (!m_initialized) {
+            device_wait_event(m_comp_node, m_owner->comp_node(), m_event);
+            m_initialized = true;
+        }
+    }
+    return m_owner->storage();
+}
+
+BlobPtr BorrowedBlob::borrow_to(CompNode cn) {
+    return std::make_shared<BorrowedBlob>(cn, m_owner);
+}
+
+bool BorrowedBlob::storage_is_unique() {
+    return m_owner.unique() && m_owner->storage_is_unique();
+}
+
+void* BorrowedBlob::raw_ptr_not_for_readwrite() {
+    return m_owner->raw_ptr_not_for_readwrite();
+}
+
 Tensor::Tensor(
         BlobPtr blob, const TensorLayout& layout, size_t offset, const HostTensorND& hv)
         : m_cn(blob->comp_node()),

@@ -119,7 +489,7 @@ Tensor::Tensor(const HostTensorND& hv) : Tensor(hv.layout(), hv.comp_node()) {
         MGB_RECORD_EVENT(
                 profiler::HostToDeviceFinishEvent, hv.layout(), hv.comp_node(),
                 hv.raw_ptr(), dev_tensor().raw_ptr());
-        AsyncReleaser::inst()->add(hv);
+        async_release(hv);
     }
 }

@@ -174,7 +544,7 @@ void Tensor::to_contiguous_inplace(VarNode::LayoutConstraintCallback& layout_che
         DeviceTensorND dv_contig;
         dv_contig.copy_from(dv);
         m_layout = dv_contig.layout();
-        std::atomic_store(&m_blob, Blob::make(dv_contig.storage()));
+        std::atomic_store(&m_blob, BlobPtr(Blob::make(dv_contig.storage())));
         mgb_assert(m_layout.is_contiguous());
         m_offset = 0;
     }

@@ -188,7 +558,7 @@ void Tensor::to_contiguous_inplace() {
 void Tensor::assign_from_dev_tensor(DeviceTensorND dv) {
     MGB_LOCK_GUARD(m_blob_mtx);
-    std::atomic_store(&m_blob, Blob::make(dv.storage()));
+    std::atomic_store(&m_blob, BlobPtr(Blob::make(dv.storage())));
     m_offset = dv.storage().offset();
     m_layout = dv.layout();
 }

@@ -254,21 +624,20 @@ TensorPtr Tensor::sub(size_t offset, TensorShape shape) {
     return Tensor::make(m_blob, offset + m_offset, layout);
 }

-void Tensor::add_release_callback(CompNode cn) {
-    AsyncReleaser::inst()->add(m_blob, cn);
+uint64_t Tensor::get_ready_event() {
+    if (m_produced_at == 0) {
+        m_produced_at = record_event(comp_node());
+    }
+    return m_produced_at;
 }

-CompNode::Event* Tensor::get_or_create_event() {
-    auto e = CompNodeSyncManager::inst().get_or_create_event(m_blob.get());
-    e->record();
-    return e;
+bool Tensor::storage_is_unique() {
+    return m_blob.unique() && m_blob->storage_is_unique();
 }

 void Tensor::static_initialize() {
     EventPool::with_timer();
     EventPool::without_timer();
-    AsyncReleaser::inst();
     CompNodeSyncManager::inst();
     MultiCNConstTensorCache::inst();
 }
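The rewritten CompNodeSyncManager is the heart of the change: every comp node gets a monotonically increasing stream of "virtual events", only some of which are backed by real device events; a single polling thread batches finished() queries (doubling or shrinking the batch size per comp node), and per (receiver, releaser) release queues drop borrowed blobs or host storages once the releaser's completed counter passes their recorded event. The following is a deliberately simplified Python model of that bookkeeping only (no real devices, no polling thread; completion is signalled explicitly):

    from collections import defaultdict


    class SyncModel:
        # Simplified model of the virtual-event bookkeeping in CompNodeSyncManager.
        # A real device event is replaced by an explicit complete(releaser, t) call.
        def __init__(self):
            self.next_event = defaultdict(lambda: 1)  # next virtual event per comp node
            self.completed = defaultdict(int)         # last completed virtual event
            self.queues = defaultdict(list)           # (receiver, releaser) -> [(t, payload)]

        def record(self, cn):
            t = self.next_event[cn]
            self.next_event[cn] += 1
            return t

        def async_release(self, releaser, t, receiver, payload):
            if t <= self.completed[releaser]:
                return  # the event already passed; nothing needs to be held
            self.queues[(receiver, releaser)].append((t, payload))

        def complete(self, releaser, t):
            # Stands in for the polling thread observing a finished real event.
            self.completed[releaser] = max(self.completed[releaser], t)
            for (recv, rel), q in self.queues.items():
                if rel == releaser:
                    q[:] = [(tt, p) for tt, p in q if tt > t]  # payloads dropped here


    # e.g. a BorrowedBlob on "xpux:1" whose storage is owned by "xpux:0":
    sync = SyncModel()
    t = sync.record("xpux:1")
    sync.async_release("xpux:1", t, "xpux:0", payload="owner blob")
    sync.complete("xpux:1", t)  # only now may the owning storage be freed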
imperative/src/impl/proxy_graph/mini_graph.h (+9 −3)

@@ -836,8 +836,8 @@ public:
             if (used_cns.insert(cn).second) {
                 for (auto&& in : inputs) {
                     if (in->comp_node() != cn) {
-                        auto&& e = in->get_or_create_event();
-                        e->device_wait_by(cn);
+                        auto e = in->get_ready_event();
+                        device_wait_event(cn, in->comp_node(), e);
                     }
                 }
             }

@@ -847,11 +847,17 @@ public:
         // so we need create inference session here
         minigraph.execute(raw_inputs, raw_outputs, m_env);
         for (auto&& cn : used_cns) {
+            bool should_record = false;
             for (auto&& in : inputs) {
                 if (in->comp_node() != cn) {
-                    in->add_release_callback(cn);
+                    should_record = true;
+                    auto e = record_event(cn);
+                    async_release(cn, e, *in);
                 }
             }
+            if (should_record) {
+                record_event(cn, true);
+            }
         }

         return outputs;
imperative/src/impl/transformations/lazy.cpp (+2 −3)

@@ -15,7 +15,6 @@
 #include "megbrain/opr/utility.h"

-#include "../async_releaser.h"
 #include "../mgb_cg_impl.h"

 namespace mgb {

@@ -73,7 +72,7 @@ ValueRefList LazyEvalTransformation::apply_transformation(
                 args.device.emplace();
                 args.device->copy_from(*args.host);
                 // every h2d in imperative runtime should notify AsyncReleaser
-                AsyncReleaser::inst()->add(*args.host);
+                async_release(*args.host);
             }
             return *args.device;
         };

@@ -155,7 +154,7 @@ ValueRefList LazyEvalTransformation::apply_transformation(
                 host_value.copy_from(inferred_value);
                 DeviceTensorND dev_value;
                 dev_value.copy_from(host_value);
-                AsyncReleaser::inst()->add(host_value);
+                async_release(host_value);
                 return {DeviceValue::make(dev_value)};
             }
             default:
imperative/src/include/megbrain/imperative/blob_manager.h (+4 −4)

@@ -24,18 +24,18 @@ public:
     static BlobManager* inst();

-    virtual void alloc_direct(Blob* blob, size_t size) = 0;
+    virtual void alloc_direct(OwnedBlob* blob, size_t size) = 0;

-    virtual void alloc_with_defrag(Blob* blob, size_t size) = 0;
+    virtual void alloc_with_defrag(OwnedBlob* blob, size_t size) = 0;

     virtual void set_allocator(allocator_t allocator) = 0;

     virtual DeviceTensorND alloc_workspace_with_defrag(
             CompNode cn, TensorLayout& layout) = 0;

-    virtual void register_blob(Blob* blob) = 0;
+    virtual void register_blob(OwnedBlob* blob) = 0;

-    virtual void unregister_blob(Blob* blob) = 0;
+    virtual void unregister_blob(OwnedBlob* blob) = 0;

     virtual void defrag(const CompNode& cn) = 0;
 };
imperative/src/include/megbrain/imperative/physical_tensor.h (+84 −15)

@@ -11,12 +11,16 @@
 #pragma once

+#include <cstdint>
 #include <memory>
 #include <mutex>
+#include <type_traits>
+#include <variant>

 #include "megbrain/graph.h"
+#include "megbrain/imperative/resource_manager.h"
 #include "megbrain/tensor.h"
 #include "megbrain/utils/metahelper.h"

 namespace mgb {
 namespace imperative {

@@ -26,35 +30,67 @@ class Blob;
 using BlobPtr = std::shared_ptr<Blob>;

 class BlobManagerImpl;
+class OwnedBlob;

-class Blob : public NonCopyableObj {
+class Blob : public NonCopyableObj, public std::enable_shared_from_this<Blob> {
+protected:
+    CompNode m_comp_node;
+    size_t m_size;
+
+    Blob(CompNode cn, size_t size) : m_comp_node(cn), m_size(size) {}
+
 public:
-    Blob(const DeviceTensorStorage& s);
-    Blob(CompNode cn, size_t sz);
-    ~Blob();
+    virtual ~Blob() = default;

     template <typename... Args>
-    static BlobPtr make(Args&&... args) {
-        return std::make_shared<Blob>(std::forward<Args>(args)...);
+    static std::shared_ptr<OwnedBlob> make(Args&&... args) {
+        return std::make_shared<OwnedBlob>(std::forward<Args>(args)...);
     }

     const CompNode& comp_node() const { return m_comp_node; }
     size_t size() const { return m_size; }
     using RawStorage = DeviceTensorStorage::RawStorage;
-    const RawStorage& storage();
+    virtual const RawStorage& storage() = 0;
+    virtual BlobPtr borrow_to(CompNode) = 0;
+    virtual bool storage_is_unique() = 0;
+    virtual void* raw_ptr_not_for_readwrite() = 0;
+};

-    size_t id() const { return m_id; }
+class OwnedBlob final : public Blob {
+    friend class Blob;
+
+public:
+    OwnedBlob(const DeviceTensorStorage& s);
+    OwnedBlob(CompNode cn, size_t sz);
+    ~OwnedBlob() override;
+
+    size_t id() const { return m_id; }
+    const RawStorage& storage() override;
+    BlobPtr borrow_to(CompNode) override;
+    bool storage_is_unique() override;
+    void* raw_ptr_not_for_readwrite() override;

 private:
     friend class BlobManagerImpl;
-    CompNode m_comp_node;
-    mutable RawStorage m_storage;
-    size_t m_size = 0;
+    RawStorage m_storage;
     size_t m_id;
 };

+class BorrowedBlob final : public Blob {
+    std::mutex m_mtx;
+    std::shared_ptr<OwnedBlob> m_owner;
+    uint64_t m_event;
+    bool m_initialized = false;
+
+public:
+    BorrowedBlob(CompNode, std::shared_ptr<OwnedBlob>);
+    ~BorrowedBlob() override;
+
+    const RawStorage& storage() override;
+    BlobPtr borrow_to(CompNode) override;
+    bool storage_is_unique() override;
+    void* raw_ptr_not_for_readwrite() override;
+};
+
 struct EventDeleter {
     void operator()(CompNode::Event*);
 };

@@ -121,6 +157,8 @@ public:
     BlobPtr& blob() { return m_blob; }

+    void* raw_ptr_not_for_readwrite() { return m_blob->raw_ptr_not_for_readwrite(); }
+
     void fetch_value();
     bool value_fetched();
     TensorPtr sub(size_t offset, TensorShape shape);

@@ -131,8 +169,10 @@ public:
     // return a pointer instead of a reference to ensure thread safety
     const HostTensorND* try_get_value();

-    void add_release_callback(CompNode cn);
-    CompNode::Event* get_or_create_event();
+    void set_ready_event(uint64_t event) { m_produced_at = event; }
+    uint64_t get_ready_event();
+    bool storage_is_unique();

     // Make sure all static objects required to destruct a tensor has completed
     // construction. All static storage duration object that holds tensors must

@@ -152,8 +192,37 @@ private:
     std::mutex m_value_mtx;
     HostTensorND m_value;
     EventPtr m_value_ready = nullptr;
+    uint64_t m_produced_at = 0;
 };

+/*!
+ * \brief record a virtual event
+ * \param doitnow also record a real event
+ */
+uint64_t record_event(CompNode cn, bool doitnow = false);
+
+//! make a device wait on a virtual event
+void device_wait_event(CompNode waiter, CompNode waitee, uint64_t event);
+
+//! hold a blob until a virtual event on a device is completed
+void async_release(CompNode cn, uint64_t event, BlobPtr blob);
+
+//! hold a host tensor until a virtual event on a device is completed
+void async_release(CompNode cn, uint64_t event, HostTensorStorage::RawStorage storage);
+
+inline void async_release(CompNode cn, uint64_t event, Tensor& tensor) {
+    async_release(cn, event, tensor.blob());
+}
+
+inline void async_release(CompNode cn, uint64_t event, const HostTensorND& hnd) {
+    async_release(cn, event, hnd.storage().raw_storage());
+}
+
+inline void async_release(const HostTensorND& hnd) {
+    auto cn = hnd.comp_node();
+    async_release(cn, record_event(cn, true), hnd);
+}
+
 // Cache for small blobs
 // 1. A blob has to be seen twice (within a window) to be eligible for cache
 // 2. Cache eviction occurs when cache size reaches a threshold, in least frequently
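The header now splits Blob into an abstract base with two concrete flavours: OwnedBlob actually holds device storage, while BorrowedBlob merely keeps the owner alive on another comp node and defers the device wait to its first storage() call. A compact Python model of the ownership relations (class names mirror the C++ declarations; real allocation and event handling are omitted):

    class Blob:
        def __init__(self, comp_node, size):
            self.comp_node = comp_node
            self.size = size


    class OwnedBlob(Blob):
        def __init__(self, comp_node, size):
            super().__init__(comp_node, size)
            self._storage = bytearray(size)      # stand-in for the device allocation

        def storage(self):
            return self._storage

        def borrow_to(self, comp_node):
            return BorrowedBlob(comp_node, self)


    class BorrowedBlob(Blob):
        def __init__(self, comp_node, owner):
            super().__init__(comp_node, owner.size)
            self._owner = owner                  # no copy: just keep the owner alive
            self._synced = False

        def storage(self):
            if not self._synced:
                # the real class issues device_wait_event(self.comp_node,
                # owner.comp_node, recorded_event) exactly once, here
                self._synced = True
            return self._owner.storage()

        def borrow_to(self, comp_node):
            return BorrowedBlob(comp_node, self._owner)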
src/core/impl/comp_node/impl_helper.cpp (+1 −1)

@@ -32,7 +32,7 @@ bool CompNodeImplHelper::EventImplHelper::finished() {
     mgb_assert(m_recorded);
     if (do_finished()) {
         m_finished = true;
-        m_recorded = false;
+        // m_recorded = false;
         return true;
     }
     return false;
src/core/include/megbrain/ir/ops.td (+13 −0)

@@ -120,6 +120,19 @@ def Copy: MgbHashableOp<"Copy"> {
   );
 }

+def Borrow: MgbHashableOp<"Borrow"> {
+  let extraArguments = (ins
+    MgbCompNodeAttr:$comp_node
+  );
+}
+
+def Barrier: MgbHashableOp<"Barrier"> {
+  let extraArguments = (ins
+    MgbCompNodeAttr:$comp_node,
+    MgbUI32Attr:$nr_outputs
+  );
+}
+
 def Argsort: MgbHashableOp<"Argsort", [ArgsortParam]>;

 def Argmax : MgbHashableOp<"Argmax", [AxisParam]>;