MegEngine / MegEngine, commit 877bda41
Authored Feb 14, 2022 by Megvii Engine Team
Parent: ed7fa104

perf(mge): improve cross stream memory borrowing

GitOrigin-RevId: c68977c5dce157bfc624f887f085d7a7aa6c54ac
Showing 21 changed files with 756 additions and 183 deletions (+756, -183)
imperative/python/megengine/_multistream.py                    +21   -0
imperative/python/megengine/device.py                          +3    -3
imperative/python/megengine/tensor.py                          +6    -5
imperative/python/src/module_trace.h                           +1    -0
imperative/python/src/tensor.cpp                               +3    -0
imperative/python/test/unit/core/test_stream.py                +74   -0
imperative/src/impl/async_releaser.h                           +0    -84
imperative/src/impl/blob_manager_impl.cpp                      +10   -10
imperative/src/impl/blob_manager_impl.h                        +9    -9
imperative/src/impl/interpreter/interpreter_impl.cpp           +8    -3
imperative/src/impl/interpreter/interpreter_impl.h             +1    -1
imperative/src/impl/ops/elemwise.cpp                           +1    -1
imperative/src/impl/ops/tensor_manip.cpp                       +1    -3
imperative/src/impl/ops/utility.cpp                            +98   -0
imperative/src/impl/physical_tensor.cpp                        +407  -38
imperative/src/impl/proxy_graph/mini_graph.h                   +9    -3
imperative/src/impl/transformations/lazy.cpp                   +2    -3
imperative/src/include/megbrain/imperative/blob_manager.h      +4    -4
imperative/src/include/megbrain/imperative/physical_tensor.h   +84   -15
src/core/impl/comp_node/impl_helper.cpp                        +1    -1
src/core/include/megbrain/ir/ops.td                            +13   -0
imperative/python/megengine/_multistream.py (new file, mode 100644)

from .core._imperative_rt.core2 import apply
from .core.ops.builtin import Barrier
from .tensor import Tensor

_dummy_tensors = {}


def _get_dummy_tensor(device):
    if device not in _dummy_tensors:
        _dummy_tensors[device] = Tensor([], device=device)
    return _dummy_tensors[device]


def record_event(device):
    x = _get_dummy_tensor(device)
    (x,) = apply(Barrier(device, 1), x)
    return x


def wait_event(device, event):
    apply(Barrier(device, 0), event)
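The helpers above are the Python entry points for cross-stream synchronization. A minimal usage sketch, assuming a device with two streams (the "xpux:0"/"xpux:1" names mirror the new unit test below; `_borrow` is a private keyword added to `Tensor.to` in this commit):

    import megengine as mge
    from megengine._multistream import record_event, wait_event

    x = mge.Tensor([1.0, 2.0, 3.0], device="xpux:0")
    y = x.to("xpux:1", _borrow=True)   # reuse x's storage on the other stream
    ev = record_event("xpux:1")        # virtual event on the borrowing stream
    wait_event("xpux:0", ev)           # stream 0 waits before reusing that memory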
imperative/python/megengine/device.py

@@ -51,7 +51,7 @@ _sh = _stream_helper()

 def _valid_device(inp):
     if isinstance(inp, str) and re.match(
-            "^([cxg]pu|rocm|multithread)(\d+|\d+:\d+|x)$", inp
+            "^([cxg]pu|rocm|multithread)(x|\d+)(:\d+)?$", inp
     ):
         return True
     return False

@@ -255,8 +255,8 @@ def what_is_xpu():

 def coalesce_free_memory():
     r"""This function will try it best to free all consecutive free chunks back to operating system,
     small pieces may not be returned.

     because of the async processing of megengine, the effect of this func may not be reflected
     immediately. if you want to see the effect immediately, you can call megengine._full_sync after
     this func was called
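The widened pattern mainly admits a stream suffix after the "x" wildcard, which the new tests rely on ("xpux:1"). A quick check derived from the two regexes themselves:

    import re

    old = r"^([cxg]pu|rocm|multithread)(\d+|\d+:\d+|x)$"
    new = r"^([cxg]pu|rocm|multithread)(x|\d+)(:\d+)?$"

    assert re.match(new, "xpux:1") and not re.match(old, "xpux:1")   # newly accepted
    assert re.match(old, "gpu0:1") and re.match(new, "gpu0:1")       # still accepted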
imperative/python/megengine/tensor.py

@@ -15,7 +15,7 @@ from .core._imperative_rt.core2 import Tensor as _Tensor
 from .core._imperative_rt.core2 import apply, set_py_tensor_type
 from .core._trace_option import use_symbolic_shape
 from .core._wrap import as_device
-from .core.ops.builtin import Copy, GetVarShape
+from .core.ops.builtin import Borrow, Copy, GetVarShape
 from .core.tensor.array_method import ArrayMethodMixin
 from .device import _valid_device, get_default_device
 from .logger import get_logger

@@ -205,7 +205,7 @@ class Tensor(_Tensor, ArrayMethodMixin):
     def reset_zero(self):
         self *= 0

-    def to(self, device):
+    def to(self, device, *, _borrow=False):
         r"""Copy self :class:`~.Tensor` to specified device. See :func:`~.copy`"""
         if isinstance(device, str) and not _valid_device(device):
             raise ValueError(

@@ -214,7 +214,8 @@ class Tensor(_Tensor, ArrayMethodMixin):
             )
         )
         cn = as_device(device).to_c()
-        return apply(Copy(comp_node=cn), self)[0]
+        op = Borrow(comp_node=cn) if _borrow else Copy(comp_node=cn)
+        return apply(op, self)[0]

     @property
     def requires_grad(self):

@@ -232,11 +233,11 @@ class Tensor(_Tensor, ArrayMethodMixin):
         return id(self)

     def __getnewargs__(self):
-        r"""
-        __getnewargs__ will be called for pickle serialization or deep copy"""
+        r"""__getnewargs__ will be called for pickle serialization or deep copy"""
         return (self.numpy(), self.dtype, self.device.logical_name)

     def __getstate__(self):
-        r"""
-        __getstate__ will be called for pickle serialization or deep copy"""
+        r"""__getstate__ will be called for pickle serialization or deep copy"""
         state = {}
         if self._qparams is not None:
             state["qparams"] = self._qparams
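With this change `Tensor.to` can either copy or borrow. A short sketch of the difference, assuming two streams of one device (the leading underscore marks the parameter as internal API; assumes `import numpy as np` and `import megengine as mge`):

    x0 = mge.Tensor(np.arange(4), device="xpux:0")
    x_copy = x0.to("xpux:1")                 # Copy op: new storage on the target stream
    x_view = x0.to("xpux:1", _borrow=True)   # Borrow op: wraps x0's storage, no extra allocation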
imperative/python/src/module_trace.h

@@ -11,6 +11,7 @@
 #pragma once

+#include <list>
 #include "megbrain/imperative/transformations/trace.h"
 #include "megbrain/imperative/utils/map.h"
 #include "megbrain/imperative/utils/stats.h"
imperative/python/src/tensor.cpp

@@ -998,6 +998,9 @@ void init_tensor(py::module m) {
         module_trace_hook.inc_ref();
     });

+    auto atexit = py::module::import("atexit");
+    atexit.attr("register")(py::cpp_function([]() { module_trace_hook = {}; }));
+
     m.def("begin_record_values", [] { Value::begin_record_values(); });
     m.def("end_record_values", [] {
imperative/python/test/unit/core/test_stream.py (new file, mode 100644)

import gc

import numpy as np
import pytest

import megengine as mge
import megengine.functional as F
from megengine._multistream import record_event, wait_event


class MemStat:
    def __init__(self, *args):
        for d in args:
            mge.Tensor([], device=d)
        gc.collect()
        mge._full_sync()
        self.baseline = {d: mge.device.get_allocated_memory(d) for d in args}
        for d in args:
            mge.device.reset_max_memory_stats(d)

    def get_max(self, device):
        return mge.device.get_max_allocated_memory(device) - self.baseline[device]


@pytest.mark.require_ngpu(1)
def test_mem_stats():
    memstat = MemStat("xpux:0", "xpux:1")

    F.arange(1024, device="xpux:0")

    mge._full_sync()
    assert 4096 <= memstat.get_max("xpux:0") == memstat.get_max("xpux:1") <= 4096 + 128


@pytest.mark.require_ngpu(1)
def test_borrow():
    memstat = MemStat("xpux:0", "xpux:1")

    x_np = np.random.randint(2 ** 30, size=(1 * 1024 * 1024,), dtype="int32")
    unit = x_np.size * 4
    x0 = mge.Tensor(x_np, device="xpux:0")
    x1 = x0.to("xpux:1", _borrow=True)
    y = -x1
    np.testing.assert_equal(-x_np, y.numpy())
    mge._full_sync()

    assert memstat.get_max("xpux:0") / unit < 2.1


@pytest.mark.require_ngpu(1)
def test_stream_mem():
    memstat = MemStat("xpux:0", "xpux:1")

    x_np = np.random.randint(2 ** 10, size=(1 * 1024 * 1024,), dtype="int32")
    unit = x_np.size * 4
    x0 = mge.Tensor(x_np, device="xpux:0")

    results = []
    events = []
    for i in range(100):
        if len(events) >= 2:
            wait_event("xpux:0", events[-2])
        x0 = x0 + 1
        results.append(x0.to("xpux:1", _borrow=True).sum())
        events.append(record_event("xpux:1"))
        del events[:-2]

    y_np = x_np.sum()
    for i, y in enumerate(results):
        y_np += x_np.size
        assert y_np == y.numpy()

    mge._full_sync()
    assert memstat.get_max("xpux:0") / unit < 2.1
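These tests require one GPU exposing two usable streams; a typical local invocation would look something like:

    pytest -v imperative/python/test/unit/core/test_stream.py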
imperative/src/impl/async_releaser.h (deleted in this commit, mode 100644 -> 0)

/**
 * \file imperative/src/impl/async_releaser.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */

#pragma once

#include "megbrain/comp_node.h"
#include "megbrain/imperative/blob_manager.h"
#include "megbrain/imperative/resource_manager.h"
#include "megbrain/system.h"

#include "./event_pool.h"

namespace mgb {
namespace imperative {

class AsyncReleaser : public CompNodeDepedentObject {
    struct WaiterParam {
        CompNode cn;
        CompNode::Event* event;
        BlobPtr blob;
        HostTensorStorage::RawStorage storage;
    };
    class Waiter final : public AsyncQueueSC<WaiterParam, Waiter> {
        AsyncReleaser* m_par_releaser;

    public:
        // disable busy wait by set max_spin=0 to save CPU cycle
        Waiter(AsyncReleaser* releaser)
                : AsyncQueueSC<WaiterParam, Waiter>(0), m_par_releaser(releaser) {}

        void process_one_task(WaiterParam& param) {
            if (param.event->finished()) {
                param.blob.reset();
                param.storage.reset();
                EventPool::without_timer().free(param.event);
                return;
            }
            using namespace std::literals;
            std::this_thread::sleep_for(1us);
            add_task(std::move(param));
        }
        void on_async_queue_worker_thread_start() override {
            sys::set_thread_name("releaser");
        }
    };
    Waiter m_waiter{this};

protected:
    std::shared_ptr<void> on_comp_node_finalize() override {
        m_waiter.wait_task_queue_empty();
        return {};
    }

public:
    static AsyncReleaser* inst() {
        static auto* releaser = ResourceManager::create_global<AsyncReleaser>();
        return releaser;
    }

    ~AsyncReleaser() { m_waiter.wait_task_queue_empty(); }

    void add(BlobPtr blob, CompNode cn) { add(cn, std::move(blob), {}); }

    void add(const HostTensorND& hv) {
        add(hv.comp_node(), {}, hv.storage().raw_storage());
    }

    void add(CompNode cn, BlobPtr blob, HostTensorStorage::RawStorage storage = {}) {
        auto event = EventPool::without_timer().alloc(cn);
        event->record();
        m_waiter.add_task({cn, event, std::move(blob), std::move(storage)});
    }
};

}  // namespace imperative
}  // namespace mgb
imperative/src/impl/blob_manager_impl.cpp

@@ -16,7 +16,7 @@
 namespace mgb {
 namespace imperative {

-BlobManagerImpl::BlobData::BlobData(Blob* in_blob) {
+BlobManagerImpl::BlobData::BlobData(OwnedBlob* in_blob) {
     blob = in_blob;
     DeviceTensorStorage d_storage;
     d_storage.reset(blob->m_comp_node, blob->m_size, blob->m_storage);

@@ -28,19 +28,19 @@ BlobManagerImpl::BlobData::BlobData(Blob* in_blob) {
     h_storage.copy_from(const_cast<DeviceTensorStorage&>(d_storage), blob->m_size);
 }

-void BlobManagerImpl::register_blob(Blob* blob) {
+void BlobManagerImpl::register_blob(OwnedBlob* blob) {
     // add blob into the comp2blobs map
     MGB_LOCK_GUARD(m_mtx);
     mgb_assert(m_comp2blobs_map[blob->m_comp_node].insert(blob));
 }

-void BlobManagerImpl::unregister_blob(Blob* blob) {
+void BlobManagerImpl::unregister_blob(OwnedBlob* blob) {
     // erase blob into the comp2blobs map
     MGB_LOCK_GUARD(m_mtx);
     mgb_assert(1 == m_comp2blobs_map[blob->m_comp_node].erase(blob));
 }

-void BlobManagerImpl::alloc_with_defrag(Blob* blob, size_t size) {
+void BlobManagerImpl::alloc_with_defrag(OwnedBlob* blob, size_t size) {
     if (custom_allocator) {
         blob->m_storage = custom_allocator(blob->m_comp_node, size);
         return;

@@ -55,7 +55,7 @@ void BlobManagerImpl::alloc_with_defrag(Blob* blob, size_t size) {
     });
 }

-void BlobManagerImpl::alloc_direct(Blob* blob, size_t size) {
+void BlobManagerImpl::alloc_direct(OwnedBlob* blob, size_t size) {
     DeviceTensorStorage storage(blob->m_comp_node);
     mgb_assert(blob->m_comp_node.valid());
     storage.ensure_size(size);

@@ -143,7 +143,7 @@ void BlobManagerImpl::defrag(const CompNode& cn) {
     // sort blobs by created time, may be helpful for reduce memory fragment
     std::sort(
             blob_data_arrary.begin(), blob_data_arrary.end(),
-            [](auto& lhs, auto& rhs) { return lhs.blob->id() < rhs.blob->id(); });
+            [](auto& lhs, auto& rhs) { return lhs.blob->m_id < rhs.blob->m_id; });

     // allocate for each storage
     for (auto i : blob_data_arrary) {

@@ -158,19 +158,19 @@ void BlobManagerImpl::defrag(const CompNode& cn) {
 }

 struct BlobManagerStub : BlobManager {
-    void alloc_direct(Blob* blob, size_t size) {
+    void alloc_direct(OwnedBlob* blob, size_t size) {
         mgb_assert(0, "prohibited after global variable destruction");
     };
-    void alloc_with_defrag(Blob* blob, size_t size) {
+    void alloc_with_defrag(OwnedBlob* blob, size_t size) {
         mgb_assert(0, "prohibited after global variable destruction");
     };
     DeviceTensorND alloc_workspace_with_defrag(CompNode cn, TensorLayout& layout) {
         mgb_assert(0, "prohibited after global variable destruction");
     };
-    void register_blob(Blob* blob) {
+    void register_blob(OwnedBlob* blob) {
         mgb_assert(0, "prohibited after global variable destruction");
     };
-    void unregister_blob(Blob* blob){};
+    void unregister_blob(OwnedBlob* blob){};
     void defrag(const CompNode& cn) {
         mgb_assert(0, "prohibited after global variable destruction");
     };
imperative/src/impl/blob_manager_impl.h

@@ -19,21 +19,21 @@ namespace imperative {
 class BlobManagerImpl final : public BlobManager {
     struct BlobSetWithMux {
         std::mutex mtx;
-        ThinHashSet<Blob*> blobs_set;
-        bool insert(Blob* blob) {
+        ThinHashSet<OwnedBlob*> blobs_set;
+        bool insert(OwnedBlob* blob) {
             MGB_LOCK_GUARD(mtx);
             return blobs_set.insert(blob).second;
         }
-        size_t erase(Blob* blob) {
+        size_t erase(OwnedBlob* blob) {
             MGB_LOCK_GUARD(mtx);
             return blobs_set.erase(blob);
         }
     };

     struct BlobData {
-        Blob* blob;
+        OwnedBlob* blob;
         HostTensorStorage h_storage;
-        BlobData(Blob* in_blob);
+        BlobData(OwnedBlob* in_blob);
     };

     std::mutex m_mtx;

@@ -41,7 +41,7 @@ class BlobManagerImpl final : public BlobManager {
     void defrag(const CompNode& cn) override;

-    void alloc_direct(Blob* blob, size_t size) override;
+    void alloc_direct(OwnedBlob* blob, size_t size) override;

     DeviceTensorND alloc_workspace(CompNode cn, TensorLayout layout);

@@ -50,14 +50,14 @@ class BlobManagerImpl final : public BlobManager {
 public:
     static BlobManager* inst();

-    void alloc_with_defrag(Blob* blob, size_t size) override;
+    void alloc_with_defrag(OwnedBlob* blob, size_t size) override;

     DeviceTensorND alloc_workspace_with_defrag(
             CompNode cn, TensorLayout& layout) override;

-    void register_blob(Blob* blob) override;
+    void register_blob(OwnedBlob* blob) override;

-    void unregister_blob(Blob* blob) override;
+    void unregister_blob(OwnedBlob* blob) override;

     void set_allocator(allocator_t allocator) override;
 };
imperative/src/impl/interpreter/interpreter_impl.cpp

@@ -704,7 +704,7 @@ void ChannelImpl::produce_tensor(TensorInfo* dest, TensorPtr ptr) {
     m_dtr.update_used_time(dest);
     MGB_RECORD_EVENT(
             TensorProduceEvent, dest->id, ptr->layout(), ptr->comp_node(),
-            ptr->dev_tensor(false).raw_ptr());
+            ptr->raw_ptr_not_for_readwrite());
     // update tensor desc for static infer
     if (dest->desc.layout.ndim) {
         mgb_assert(

@@ -805,8 +805,13 @@ void ChannelImpl::do_apply_op(const ApplyOp& cmd, std::string reason) {
             inputs[idx]->to_contiguous_inplace(layout_checker);
         }
     }
-    return OpDef::apply_on_physical_tensor(
+    auto outputs = OpDef::apply_on_physical_tensor(
             def, std::move(inputs), output_descs, validated);
+    for (auto& o : outputs) {
+        o->set_ready_event(
+                record_event(o->comp_node(), def.same_type<imperative::Barrier>()));
+    }
+    return outputs;
 };
 MGB_RECORD_EVENT(OpExecuteEvent, apply_id, {}, reason);
 SmallVector<std::pair<CompNode, uint64_t>> kernels;

@@ -1059,7 +1064,7 @@ std::unordered_set<TensorInfo*> ChannelImpl::collect_valid_tensors() {
     return valid_tensors;
 }

-void ChannelImpl::alloc_tensor_with_evict(Blob* x) {
+void ChannelImpl::alloc_tensor_with_evict(OwnedBlob* x) {
     bool in_worker = (get_worker_tid() == std::this_thread::get_id());
     auto reserve_size = [&](size_t size) {
         if (!m_dtr.comp_node.valid()) {
imperative/src/impl/interpreter/interpreter_impl.h

@@ -304,7 +304,7 @@ private:
     //! automatically evict an optimal tensor
     bool auto_evict(size_t);

-    void alloc_tensor_with_evict(Blob*);
+    void alloc_tensor_with_evict(OwnedBlob*);

     // assert thread id when call get_xxx_state to avoid misuse
     ChannelState& get_channel_state();
imperative/src/impl/ops/elemwise.cpp

@@ -249,7 +249,7 @@ SmallVector<TensorPtr> apply_inplace_add_on_physical_tensor(
         const OpDef& def, const SmallVector<TensorPtr>& inputs,
         SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) {
     auto dest = inputs[0], delta = inputs[1], alpha = inputs[2], beta = inputs[3];
-    if (!(inputs[0]->blob().unique() && inputs[0]->blob()->storage().unique())) {
+    if (!inputs[0]->storage_is_unique()) {
         mgb_log_warn(
                 "This inplace modification may change the elements of other tensors. "
                 "Fallback to non-inplace update.");
imperative/src/impl/ops/tensor_manip.cpp

@@ -13,7 +13,6 @@
 #include "megbrain/imperative/ops/autogen.h"
 #include "megbrain/imperative/ops/opr_attr.h"

-#include "../async_releaser.h"
 #include "../dnn_op_helper.h"
 #include "../op_trait.h"

@@ -267,8 +266,7 @@ SmallVector<TensorPtr> param_pack_concat_apply_on_physical_tensor(
             {srcs_raw_ptr, srcs_layout}, inputs.back()->dev_tensor().as_megdnn(),
             output->dev_tensor().as_megdnn(),
             caller.create_workspace({{ws_size}, dtype::Byte()}));
-    AsyncReleaser::inst()->add(
-            HostTensorND{comp_node, srcs_layout}.storage(srcs_storage));
+    async_release(HostTensorND{comp_node, srcs_layout}.storage(srcs_storage));
     return {output};
 }
imperative/src/impl/ops/utility.cpp

@@ -23,6 +23,8 @@
 #include "megbrain/opr/tensor_gen.h"
 #include "megbrain/opr/tensor_manip.h"
 #include "megbrain/opr/utility.h"
+#include "megbrain/tensor.h"
+#include "megdnn/dtype.h"

 #if MGB_JIT
 #include "megbrain/jit/executor_opr.h"

@@ -36,6 +38,102 @@ namespace mgb::imperative {
 MGB_DYN_TYPE_OBJ_FINAL_IMPL(GenericPyOp);
 OP_TRAIT_REG(GenericPyOp, GenericPyOp).fallback();

Added after the GenericPyOp registration:

namespace {
namespace borrow {

SmallVector<TensorPtr> apply_on_physical_tensor(
        const OpDef& def, const SmallVector<TensorPtr>& inputs,
        SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) {
    auto& op = def.cast_final_safe<Borrow>();
    SmallVector<TensorPtr> outputs;
    outputs.reserve(inputs.size());
    for (auto& i : inputs) {
        outputs.push_back(Tensor::make(
                i->blob()->borrow_to(op.comp_node), i->layout(), i->offset()));
    }
    return outputs;
}

SmallVector<VarNode::LayoutConstraintCallback> get_input_layout_constraint(
        const OpDef& def, const SmallVector<TensorPtr>& inputs) {
    return SmallVector<VarNode::LayoutConstraintCallback>(inputs.size());
}

auto infer_output_attrs_fallible(
        const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) {
    auto& op = def.cast_final_safe<Borrow>();
    std::tuple<SmallVector<LogicalTensorDesc>, bool> ret(inputs, true);
    for (auto& i : std::get<0>(ret)) {
        mgb_assert(
                i.comp_node.mem_node() == op.comp_node.mem_node(),
                "cannot borrow memory from %s to %s",
                i.comp_node.to_string().c_str(), op.comp_node.to_string().c_str());
        i.comp_node = op.comp_node;
    }
    return ret;
}

OP_TRAIT_REG(Borrow, Borrow)
        .apply_on_physical_tensor(apply_on_physical_tensor)
        .infer_output_attrs_fallible(infer_output_attrs_fallible)
        .get_input_layout_constraint(get_input_layout_constraint)
        .fallback();

}  // namespace borrow
}  // namespace

namespace {
namespace barrier {

SmallVector<TensorPtr> apply_on_physical_tensor(
        const OpDef& def, const SmallVector<TensorPtr>& inputs,
        SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) {
    auto& op = def.cast_final_safe<imperative::Barrier>();
    SmallVector<TensorPtr> outputs;
    for (auto& i : inputs) {
        if (i->comp_node() != op.comp_node) {
            device_wait_event(op.comp_node, i->comp_node(), i->get_ready_event());
        }
    }
    if (op.nr_outputs) {
        outputs.resize(op.nr_outputs);
        TensorLayout layout(TensorShape({0}), dtype::Int32{});
        for (auto& i : outputs) {
            i = Tensor::make(layout, op.comp_node);
        }
    }
    return outputs;
}

SmallVector<VarNode::LayoutConstraintCallback> get_input_layout_constraint(
        const OpDef& def, const SmallVector<TensorPtr>& inputs) {
    return SmallVector<VarNode::LayoutConstraintCallback>(inputs.size());
}

auto infer_output_attrs_fallible(
        const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) {
    auto& op = def.cast_final_safe<imperative::Barrier>();
    std::tuple<SmallVector<LogicalTensorDesc>, bool> ret;
    auto& [descs, checked] = ret;
    descs.resize(op.nr_outputs);
    checked = true;
    for (auto& desc : descs) {
        desc.comp_node = op.comp_node;
        desc.layout.dtype = dtype::Int32{};
        desc.layout.ndim = 1;
        desc.layout.shape[0] = 0;
        desc.layout.stride[0] = 1;
    }
    return ret;
}

OP_TRAIT_REG(Barrier, imperative::Barrier)
        .apply_on_physical_tensor(apply_on_physical_tensor)
        .infer_output_attrs_fallible(infer_output_attrs_fallible)
        .get_input_layout_constraint(get_input_layout_constraint)
        .fallback();

}  // namespace barrier
}  // namespace

Unchanged context:

 namespace {
 namespace fastpathcopy {
 auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) {
imperative/src/impl/physical_tensor.cpp

@@ -9,57 +9,384 @@

Removed: the "./async_releaser.h" include and the old per-blob event table.

-#include "./async_releaser.h"
-
-class CompNodeSyncManager : public CompNodeDepedentObject {
-    ThinHashMap<Blob*, std::unique_ptr<CompNode::Event>> m_blob2event;
-    std::mutex m_mtx;
-
-public:
-    std::shared_ptr<void> on_comp_node_finalize() override {
-        MGB_LOCK_GUARD(m_mtx);
-        m_blob2event.clear();
-        return {};
-    }
-
-    static CompNodeSyncManager& inst() {
-        static auto* sl_inst = ResourceManager::create_global<CompNodeSyncManager>();
-        return *sl_inst;
-    }
-
-    CompNode::Event* get_or_create_event(Blob* blob) {
-        mgb_assert(!is_finalized());
-        MGB_LOCK_GUARD(m_mtx);
-        auto&& e = m_blob2event[blob];
-        if (!e) {
-            e = blob->comp_node().create_event();
-        }
-        return e.get();
-    }
-
-    void remove(Blob* blob) {
-        MGB_LOCK_GUARD(m_mtx);
-        m_blob2event.erase(blob);
-    }
-};

Added: new includes and a virtual-event based CompNodeSyncManager inside the same anonymous namespace, plus free helper functions.

+#include "megbrain/imperative/physical_tensor.h"
+#include "megbrain/common.h"
+#include "megbrain/comp_node.h"
+#include <condition_variable>
+#include <cstdint>
+#include <deque>
+#include <map>
+#include <memory>
+#include <type_traits>
+#include <unordered_map>
+#include <utility>
+#include <variant>
+#include <vector>
+#ifndef WIN32
+#include <pthread.h>
+#endif
+#include "range/v3/all.hpp"
+
+namespace views = ranges::views;

struct CompNodeHash {
    auto operator()(CompNode cn) const { return mgb::hash(cn); }
};

template <typename T>
struct NoThrowMovable : T {
    using T::T;
    NoThrowMovable(NoThrowMovable&&) noexcept = default;
};

template <typename... Ts>
using Map = NoThrowMovable<std::map<Ts...>>;

class CompNodeSyncManager {
    struct CompNodeData {
        template <typename T>
        class ReleaseQueue {
            Map<uint64_t, T> map;

        public:
            template <typename A>
            void emplace(uint64_t t, A&& a) {
                map.emplace_hint(map.end(), t, std::forward<A>(a));
            }
            void release(uint64_t t) {
                auto it = map.upper_bound(t);
                map.erase(map.begin(), it);
            }
        };

        //! next virtual event
        uint64_t next = 1;
        //! last completed virtual event
        uint64_t completed = 0;
        //! virtual event to real event
        Map<uint64_t, EventPtr> events;
        //! ordering information at some virtual events:
        //! what virtual events on other comp nodes is _sequenced before_ this virtual
        //! event
        Map<uint64_t, std::vector<uint64_t>> ordering;
        //! release queue for dev storage, keyed by releaser. this comp node is the
        //! **receiver**
        std::vector<ReleaseQueue<BlobPtr>> release_queues;
        //! release queue for host storage. this comp node is the **releaser**
        ReleaseQueue<HostTensorStorage::RawStorage> host_release_queue;
    };

    std::mutex m_mtx;
    std::condition_variable m_cv;
    bool m_should_stop = false;
    std::thread m_polling_thread;
    std::unordered_map<CompNode, size_t, CompNodeHash> m_cn2id;
    std::vector<CompNodeData> m_cndata;

    auto do_record(CompNode cn, size_t cnid, std::unique_lock<std::mutex>& lock) {
        // CAUSION: don't keep reference across locking boundary
        lock.unlock();
        auto e = EventPool::without_timer().alloc(cn);
        e->record();
        lock.lock();
        auto& cndata = m_cndata[cnid];
        return cndata.events.emplace_hint(cndata.events.end(), cndata.next++, e);
    }

    std::pair<uint64_t, CompNode::Event*> get_event(
            CompNode cn, size_t cnid, uint64_t t, std::unique_lock<std::mutex>& lock) {
        auto& cndata = m_cndata[cnid];
        auto it = cndata.events.lower_bound(t);
        if (it == cndata.events.end()) {
            it = do_record(cn, cnid, lock);
        }
        return {it->first, it->second.get()};
    }

    size_t get_cnid_unsafe(CompNode cn) {
        auto [it, unseen] = m_cn2id.try_emplace(cn, m_cndata.size());
        if (unseen) {
            m_cndata.emplace_back();
        }
        return it->second;
    }

    void monitor_events() {
#if defined(__APPLE__)
        pthread_setname_np("CompNodeSync");
#elif defined(__unix__)
        pthread_setname_np(pthread_self(), "CompNodeSync");
#endif
        // poll events in rounds. sleep for a fixed duration between rounds.
        // number of events to query is decided by the number of successful queries in
        // last round, independently for each comp node:
        // a. all -> double
        // b. 0 -> 1
        // c. otherwise -> #successful
        struct Item {
            size_t cnid;
            decltype(CompNodeData::events)::iterator it;
        };
        struct Stat {
            size_t num_success = 0;
            size_t num_attempts = 0;
            // iterator to the last finished event
            decltype(CompNodeData::events)::iterator it;
        };
        std::vector<Stat> stats;
        std::vector<Item> todos;
        std::unique_lock lock(m_mtx);
        for (;;) {
            // copy events to a temporary storage so that we may unlock while polling
            stats.resize(m_cndata.size());
            for (size_t cnid = 0; cnid < m_cndata.size(); ++cnid) {
                // decide max number of events to query
                // rule c: #successful
                size_t n = stats[cnid].num_success;
                if (n == stats[cnid].num_attempts) {
                    // rule a: double
                    n *= 2;
                }
                if (n == 0) {
                    // rule b: 1
                    n = 1;
                }
                // now copy upto n events
                auto& events = m_cndata[cnid].events;
                size_t i = 0;
                for (auto it = events.begin(); i < n && it != events.end(); ++i, ++it) {
                    todos.push_back({cnid, it});
                }
                // reset stats for this round
                stats[cnid].num_success = 0;
                stats[cnid].num_attempts = n;
            }
            lock.unlock();
            bool last_result = false;
            size_t last_cnid = -1;
            for (auto item : todos) {
                if (item.cnid == last_cnid && !last_result) {
                    // previous failed, this one almost certainly should fail
                    continue;
                }
                last_cnid = item.cnid;
                last_result = item.it->second->finished();
                if (last_result) {
                    stats[item.cnid].num_success++;
                    stats[item.cnid].it = item.it;
                }
            }
            todos.clear();
            lock.lock();
            // release dev storage
            for (size_t receiver_cnid = 0; receiver_cnid < m_cndata.size();
                 ++receiver_cnid) {
                for (size_t releaser_cnid = 0;
                     releaser_cnid < m_cndata[receiver_cnid].release_queues.size();
                     ++releaser_cnid) {
                    if (releaser_cnid >= stats.size() ||
                        stats[releaser_cnid].num_success == 0) {
                        continue;
                    }
                    auto& q = m_cndata[receiver_cnid].release_queues[releaser_cnid];
                    q.release(stats[releaser_cnid].it->first);
                }
            }
            for (size_t cnid = 0; cnid < stats.size(); ++cnid) {
                if (stats[cnid].num_success == 0) {
                    continue;
                }
                auto& cndata = m_cndata[cnid];
                auto it = stats[cnid].it;
                auto t = it->first;
                // update completed
                cndata.completed = t;
                // release host storage
                cndata.host_release_queue.release(t);
                // remove completed events
                auto& events = cndata.events;
                events.erase(events.begin(), std::next(it));
            }
            using namespace std::literals;
            if (m_cv.wait_for(lock, 10us, [&] { return m_should_stop; })) {
                return;
            }
        }
    }

    CompNodeSyncManager() {
        m_polling_thread = std::thread([this] { monitor_events(); });
    }

public:
    ~CompNodeSyncManager() {
        {
            MGB_LOCK_GUARD(m_mtx);
            m_should_stop = true;
            m_cv.notify_all();
        }
        m_polling_thread.join();
    }

    static CompNodeSyncManager& inst();

    uint64_t record(CompNode cn, bool doitnow = false) {
        std::unique_lock lock(m_mtx);
        auto cnid = get_cnid_unsafe(cn);
        if (doitnow) {
            return do_record(cn, cnid, lock)->first;
        }
        return m_cndata[cnid].next++;
    }

    void async_release(CompNode cn, uint64_t t, BlobPtr blob) {
        MGB_LOCK_GUARD(m_mtx);
        auto releaser_cnid = get_cnid_unsafe(cn);
        if (t <= m_cndata[releaser_cnid].completed) {
            return;
        }
        auto receiver_cnid = get_cnid_unsafe(blob->comp_node());
        auto& qs = m_cndata[receiver_cnid].release_queues;
        if (releaser_cnid >= qs.size()) {
            qs.resize(releaser_cnid + 1);
        }
        auto& q = qs[releaser_cnid];
        q.emplace(t, std::move(blob));
    }

    void async_release(CompNode cn, uint64_t t, HostTensorStorage::RawStorage storage) {
        MGB_LOCK_GUARD(m_mtx);
        auto releaser_cnid = get_cnid_unsafe(cn);
        if (t <= m_cndata[releaser_cnid].completed) {
            return;
        }
        auto& q = m_cndata[releaser_cnid].host_release_queue;
        q.emplace(t, std::move(storage));
    }

    void device_wait(CompNode waiter, CompNode waitee, uint64_t t) {
        std::unique_lock lock(m_mtx);
        auto waiter_id = get_cnid_unsafe(waiter);
        auto waitee_id = get_cnid_unsafe(waitee);
        auto& waiter_data = m_cndata.at(waiter_id);
        auto& waitee_data = m_cndata.at(waitee_id);
        auto [t_waitee, e] = get_event(waitee, waitee_id, t, lock);
        // DO NOT unlock around this line! Event* could be invalidated!
        e->device_wait_by(waiter);
        auto t_waiter = waiter_data.next++;
        std::vector<uint64_t> ordering(m_cndata.size(), 0);
        if (!waiter_data.ordering.empty()) {
            auto& o = waiter_data.ordering.rbegin()->second;
            std::copy(o.begin(), o.end(), ordering.begin());
        }
        ordering[waitee_id] = t_waitee;
        ordering[waiter_id] = t_waiter;
        {
            auto it = waitee_data.ordering.lower_bound(t_waitee);
            if (it != waitee_data.ordering.begin()) {
                for (auto [a, b] : views::zip(ordering, std::prev(it)->second)) {
                    static_assert(std::is_lvalue_reference_v<decltype(a)>);
                    a = std::max(a, b);
                }
            }
        }
        waiter_data.ordering.emplace_hint(
                waiter_data.ordering.end(), t_waiter, ordering);
        for (auto [t, q] : views::zip(ordering, waiter_data.release_queues)) {
            q.release(t);
        }
    }
};

CompNodeSyncManager& CompNodeSyncManager::inst() {
    static std::mutex mtx;
    static std::unique_ptr<CompNodeSyncManager> inst;
    struct Guard final : CompNodeDepedentObject {
        std::shared_ptr<void> on_comp_node_finalize() override {
            MGB_LOCK_GUARD(mtx);
            inst.reset();
            return {};
        }
    };
    static std::optional<Guard> guard;
#ifndef WIN32
    static bool broken = false;
    static struct ForkGuard {
        ForkGuard() {
            mgb_assert(0 == pthread_atfork(NULL, NULL, [] {
                           if (inst) {
                               inst.release();  // deliberate leak, unfixable
                               broken = true;
                           }
                       }));
        }
    } fork_guard;
#endif
    MGB_LOCK_GUARD(mtx);
    if (!inst) {
#ifndef WIN32
        mgb_assert(!broken);
#endif
        EventPool::without_timer();
        inst.reset(new CompNodeSyncManager);
        guard.emplace();
    }
    return *inst;
}

}  // namespace

uint64_t record_event(CompNode cn, bool doitnow) {
    return CompNodeSyncManager::inst().record(cn, doitnow);
}

void device_wait_event(CompNode waiter, CompNode waitee, uint64_t event) {
    CompNodeSyncManager::inst().device_wait(waiter, waitee, event);
}

void async_release(CompNode cn, uint64_t event, BlobPtr blob) {
    CompNodeSyncManager::inst().async_release(cn, event, std::move(blob));
}

void async_release(CompNode cn, uint64_t event, HostTensorStorage::RawStorage storage) {
    CompNodeSyncManager::inst().async_release(cn, event, std::move(storage));
}

Unchanged context:

 void EventDeleter::operator()(CompNode::Event* event) {
     EventPool::without_timer().free(event);
 }

@@ -68,31 +395,74 @@ namespace {
 std::atomic_uint64_t next_blob_id = 0;
 }

-Blob::Blob(const DeviceTensorStorage& s)
-        : m_comp_node{s.comp_node()},
-          m_storage{s.raw_storage()},
-          m_size{s.size() + s.offset()} {
-    m_id = next_blob_id++;
+OwnedBlob::OwnedBlob(const DeviceTensorStorage& s)
+        : Blob(s.comp_node(), s.size() + s.offset()),
+          m_storage{s.raw_storage()},
+          m_id{next_blob_id++} {
     BlobManager::inst()->register_blob(this);
 }

-Blob::Blob(CompNode cn, size_t sz) : m_comp_node{cn}, m_storage{}, m_size{sz} {
-    m_id = next_blob_id++;
+OwnedBlob::OwnedBlob(CompNode cn, size_t sz)
+        : Blob(cn, sz), m_storage{}, m_id{next_blob_id++} {
     BlobManager::inst()->register_blob(this);
 }

-Blob::~Blob() {
+OwnedBlob::~OwnedBlob() {
     BlobManager::inst()->unregister_blob(this);
-    CompNodeSyncManager::inst().remove(this);
 }

-const Blob::RawStorage& Blob::storage() {
+const Blob::RawStorage& OwnedBlob::storage() {
     if (!m_storage && m_size) {
         BlobManager::inst()->alloc_with_defrag(this, m_size);
     }
     return m_storage;
 }

+BlobPtr OwnedBlob::borrow_to(CompNode cn) {
+    return std::make_shared<BorrowedBlob>(
+            cn, std::static_pointer_cast<OwnedBlob>(shared_from_this()));
+}
+
+bool OwnedBlob::storage_is_unique() {
+    return m_storage.unique();
+}
+
+void* OwnedBlob::raw_ptr_not_for_readwrite() {
+    return m_storage.get();
+}
+
+BorrowedBlob::BorrowedBlob(CompNode cn, std::shared_ptr<OwnedBlob> owner)
+        : Blob(cn, owner->size()),
+          m_owner(std::move(owner)),
+          m_event(record_event(m_owner->comp_node(), true)) {}
+
+BorrowedBlob::~BorrowedBlob() {
+    async_release(m_comp_node, record_event(m_comp_node, true), std::move(m_owner));
+}
+
+const Blob::RawStorage& BorrowedBlob::storage() {
+    {
+        MGB_LOCK_GUARD(m_mtx);
+        if (!m_initialized) {
+            device_wait_event(m_comp_node, m_owner->comp_node(), m_event);
+            m_initialized = true;
+        }
+    }
+    return m_owner->storage();
+}
+
+BlobPtr BorrowedBlob::borrow_to(CompNode cn) {
+    return std::make_shared<BorrowedBlob>(cn, m_owner);
+}
+
+bool BorrowedBlob::storage_is_unique() {
+    return m_owner.unique() && m_owner->storage_is_unique();
+}
+
+void* BorrowedBlob::raw_ptr_not_for_readwrite() {
+    return m_owner->raw_ptr_not_for_readwrite();
+}
+
 Tensor::Tensor(
         BlobPtr blob, const TensorLayout& layout, size_t offset, const HostTensorND& hv)
         : m_cn(blob->comp_node()),

@@ -119,7 +489,7 @@ Tensor::Tensor(const HostTensorND& hv) : Tensor(hv.layout(), hv.comp_node()) {
         MGB_RECORD_EVENT(
                 profiler::HostToDeviceFinishEvent, hv.layout(), hv.comp_node(),
                 hv.raw_ptr(), dev_tensor().raw_ptr());
-        AsyncReleaser::inst()->add(hv);
+        async_release(hv);
     }
 }

@@ -174,7 +544,7 @@ void Tensor::to_contiguous_inplace(VarNode::LayoutConstraintCallback& layout_che
     DeviceTensorND dv_contig;
     dv_contig.copy_from(dv);
     m_layout = dv_contig.layout();
-    std::atomic_store(&m_blob, Blob::make(dv_contig.storage()));
+    std::atomic_store(&m_blob, BlobPtr(Blob::make(dv_contig.storage())));
     mgb_assert(m_layout.is_contiguous());
     m_offset = 0;

@@ -188,7 +558,7 @@ void Tensor::to_contiguous_inplace() {
 void Tensor::assign_from_dev_tensor(DeviceTensorND dv) {
     MGB_LOCK_GUARD(m_blob_mtx);
-    std::atomic_store(&m_blob, Blob::make(dv.storage()));
+    std::atomic_store(&m_blob, BlobPtr(Blob::make(dv.storage())));
     m_offset = dv.storage().offset();
     m_layout = dv.layout();
 }

@@ -254,21 +624,20 @@ TensorPtr Tensor::sub(size_t offset, TensorShape shape) {
     return Tensor::make(m_blob, offset + m_offset, layout);
 }

-void Tensor::add_release_callback(CompNode cn) {
-    AsyncReleaser::inst()->add(m_blob, cn);
+uint64_t Tensor::get_ready_event() {
+    if (m_produced_at == 0) {
+        m_produced_at = record_event(comp_node());
+    }
+    return m_produced_at;
 }

-CompNode::Event* Tensor::get_or_create_event() {
-    auto e = CompNodeSyncManager::inst().get_or_create_event(m_blob.get());
-    e->record();
-    return e;
+bool Tensor::storage_is_unique() {
+    return m_blob.unique() && m_blob->storage_is_unique();
 }

 void Tensor::static_initialize() {
     EventPool::with_timer();
     EventPool::without_timer();
-    AsyncReleaser::inst();
+    CompNodeSyncManager::inst();
     MultiCNConstTensorCache::inst();
 }
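The release logic above is easier to follow with the locking and real GPU events stripped away. A rough, illustrative Python model of one comp node's bookkeeping (simplified names, not the actual API): storage to be freed is filed under a virtual event id, and everything filed at or below the last completed id is dropped by the poller.

    # Simplified model of the virtual-event release queue (illustrative only).
    class StreamState:
        def __init__(self):
            self.next_event = 1     # next virtual event id to hand out
            self.completed = 0      # last virtual event known to have finished
            self.release_queue = {} # virtual event id -> storages kept alive until then

        def record(self):
            t = self.next_event
            self.next_event += 1
            return t

        def async_release(self, t, storage):
            if t > self.completed:  # otherwise it is already safe to drop
                self.release_queue.setdefault(t, []).append(storage)

        def on_event_finished(self, t):
            self.completed = t
            for k in [k for k in self.release_queue if k <= t]:
                del self.release_queue[k]  # storages become unreferenced here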
imperative/src/impl/proxy_graph/mini_graph.h

@@ -836,8 +836,8 @@ public:
         if (used_cns.insert(cn).second) {
             for (auto&& in : inputs) {
                 if (in->comp_node() != cn) {
-                    auto&& e = in->get_or_create_event();
-                    e->device_wait_by(cn);
+                    auto e = in->get_ready_event();
+                    device_wait_event(cn, in->comp_node(), e);
                 }
             }
         }

@@ -847,11 +847,17 @@ public:
     // so we need create inference session here
     minigraph.execute(raw_inputs, raw_outputs, m_env);
     for (auto&& cn : used_cns) {
+        bool should_record = false;
         for (auto&& in : inputs) {
             if (in->comp_node() != cn) {
-                in->add_release_callback(cn);
+                should_record = true;
+                auto e = record_event(cn);
+                async_release(cn, e, *in);
             }
         }
+        if (should_record) {
+            record_event(cn, true);
+        }
     }
     return outputs;
imperative/src/impl/transformations/lazy.cpp

@@ -15,7 +15,6 @@
 #include "megbrain/opr/utility.h"

-#include "../async_releaser.h"
 #include "../mgb_cg_impl.h"

 namespace mgb {

@@ -73,7 +72,7 @@ ValueRefList LazyEvalTransformation::apply_transformation(
             args.device.emplace();
             args.device->copy_from(*args.host);
             // every h2d in imperative runtime should notify AsyncReleaser
-            AsyncReleaser::inst()->add(*args.host);
+            async_release(*args.host);
         }
         return *args.device;
     };

@@ -155,7 +154,7 @@ ValueRefList LazyEvalTransformation::apply_transformation(
                 host_value.copy_from(inferred_value);
                 DeviceTensorND dev_value;
                 dev_value.copy_from(host_value);
-                AsyncReleaser::inst()->add(host_value);
+                async_release(host_value);
                 return {DeviceValue::make(dev_value)};
             }
             default:
imperative/src/include/megbrain/imperative/blob_manager.h

@@ -24,18 +24,18 @@ public:
     static BlobManager* inst();

-    virtual void alloc_direct(Blob* blob, size_t size) = 0;
+    virtual void alloc_direct(OwnedBlob* blob, size_t size) = 0;

-    virtual void alloc_with_defrag(Blob* blob, size_t size) = 0;
+    virtual void alloc_with_defrag(OwnedBlob* blob, size_t size) = 0;

     virtual void set_allocator(allocator_t allocator) = 0;

     virtual DeviceTensorND alloc_workspace_with_defrag(
             CompNode cn, TensorLayout& layout) = 0;

-    virtual void register_blob(Blob* blob) = 0;
+    virtual void register_blob(OwnedBlob* blob) = 0;

-    virtual void unregister_blob(Blob* blob) = 0;
+    virtual void unregister_blob(OwnedBlob* blob) = 0;

     virtual void defrag(const CompNode& cn) = 0;
 };
imperative/src/include/megbrain/imperative/physical_tensor.h

@@ -11,12 +11,16 @@
 #pragma once

+#include <cstdint>
 #include <memory>
 #include <mutex>
+#include <type_traits>
+#include <variant>

 #include "megbrain/graph.h"
 #include "megbrain/imperative/resource_manager.h"
 #include "megbrain/tensor.h"
+#include "megbrain/utils/metahelper.h"

 namespace mgb {
 namespace imperative {

@@ -26,35 +30,67 @@ class Blob;
 using BlobPtr = std::shared_ptr<Blob>;

 class BlobManagerImpl;

Before (single concrete Blob class):

class Blob : public NonCopyableObj {
public:
    Blob(const DeviceTensorStorage& s);
    Blob(CompNode cn, size_t sz);
    ~Blob();

    template <typename... Args>
    static BlobPtr make(Args&&... args) {
        return std::make_shared<Blob>(std::forward<Args>(args)...);
    }

    const CompNode& comp_node() const { return m_comp_node; }
    size_t size() const { return m_size; }
    size_t id() const { return m_id; }

    using RawStorage = DeviceTensorStorage::RawStorage;
    const RawStorage& storage();

private:
    friend class BlobManagerImpl;
    CompNode m_comp_node;
    mutable RawStorage m_storage;
    size_t m_size = 0;
    size_t m_id;
};

After (abstract Blob base with OwnedBlob and BorrowedBlob implementations):

class OwnedBlob;

class Blob : public NonCopyableObj, public std::enable_shared_from_this<Blob> {
protected:
    CompNode m_comp_node;
    size_t m_size;

    Blob(CompNode cn, size_t size) : m_comp_node(cn), m_size(size) {}

public:
    virtual ~Blob() = default;

    template <typename... Args>
    static std::shared_ptr<OwnedBlob> make(Args&&... args) {
        return std::make_shared<OwnedBlob>(std::forward<Args>(args)...);
    }

    using RawStorage = DeviceTensorStorage::RawStorage;
    virtual const RawStorage& storage() = 0;
    virtual BlobPtr borrow_to(CompNode) = 0;
    virtual bool storage_is_unique() = 0;
    virtual void* raw_ptr_not_for_readwrite() = 0;

    const CompNode& comp_node() const { return m_comp_node; }
    size_t size() const { return m_size; }
};

class OwnedBlob final : public Blob {
    friend class Blob;

public:
    OwnedBlob(const DeviceTensorStorage& s);
    OwnedBlob(CompNode cn, size_t sz);
    ~OwnedBlob() override;

    const RawStorage& storage() override;
    BlobPtr borrow_to(CompNode) override;
    bool storage_is_unique() override;
    void* raw_ptr_not_for_readwrite() override;

private:
    friend class BlobManagerImpl;
    RawStorage m_storage;
    size_t m_id;
};

class BorrowedBlob final : public Blob {
    std::mutex m_mtx;
    std::shared_ptr<OwnedBlob> m_owner;
    uint64_t m_event;
    bool m_initialized = false;

public:
    BorrowedBlob(CompNode, std::shared_ptr<OwnedBlob>);
    ~BorrowedBlob() override;

    const RawStorage& storage() override;
    BlobPtr borrow_to(CompNode) override;
    bool storage_is_unique() override;
    void* raw_ptr_not_for_readwrite() override;
};

Unchanged context:

 struct EventDeleter {
     void operator()(CompNode::Event*);
 };

@@ -121,6 +157,8 @@ public:
     BlobPtr& blob() { return m_blob; }

+    void* raw_ptr_not_for_readwrite() { return m_blob->raw_ptr_not_for_readwrite(); }
+
     void fetch_value();
     bool value_fetched();
     TensorPtr sub(size_t offset, TensorShape shape);

@@ -131,8 +169,10 @@ public:
     // return a pointer instead of a reference to ensure thread safety
     const HostTensorND* try_get_value();

-    void add_release_callback(CompNode cn);
-    CompNode::Event* get_or_create_event();
+    void set_ready_event(uint64_t event) { m_produced_at = event; }
+    uint64_t get_ready_event();
+    bool storage_is_unique();

     // Make sure all static objects required to destruct a tensor has completed
     // construction. All static storage duration object that holds tensors must

@@ -152,8 +192,37 @@ private:
     std::mutex m_value_mtx;
     HostTensorND m_value;
     EventPtr m_value_ready = nullptr;
+    uint64_t m_produced_at = 0;
 };

+/*!
+ * \brief record a virtual event
+ * \param doitnow also record a real event
+ */
+uint64_t record_event(CompNode cn, bool doitnow = false);
+
+//! make a device wait on a virtual event
+void device_wait_event(CompNode waiter, CompNode waitee, uint64_t event);
+
+//! hold a blob until a virtual event on a device is completed
+void async_release(CompNode cn, uint64_t event, BlobPtr blob);
+
+//! hold a host tensor until a virtual event on a device is completed
+void async_release(CompNode cn, uint64_t event, HostTensorStorage::RawStorage storage);
+
+inline void async_release(CompNode cn, uint64_t event, Tensor& tensor) {
+    async_release(cn, event, tensor.blob());
+}
+
+inline void async_release(CompNode cn, uint64_t event, const HostTensorND& hnd) {
+    async_release(cn, event, hnd.storage().raw_storage());
+}
+
+inline void async_release(const HostTensorND& hnd) {
+    auto cn = hnd.comp_node();
+    async_release(cn, record_event(cn, true), hnd);
+}
+
 // Cache for small blobs
 // 1. A blob has to be seen twice (within a window) to be eligible for cache
 // 2. Cache eviction occurs when cache size reaches a threshold, in least frequently
src/core/impl/comp_node/impl_helper.cpp

@@ -32,7 +32,7 @@ bool CompNodeImplHelper::EventImplHelper::finished() {
     mgb_assert(m_recorded);
     if (do_finished()) {
         m_finished = true;
-        m_recorded = false;
+        // m_recorded = false;
         return true;
     }
     return false;
src/core/include/megbrain/ir/ops.td

@@ -120,6 +120,19 @@ def Copy: MgbHashableOp<"Copy"> {
   );
 }

+def Borrow: MgbHashableOp<"Borrow"> {
+  let extraArguments = (ins
+    MgbCompNodeAttr:$comp_node
+  );
+}
+
+def Barrier: MgbHashableOp<"Barrier"> {
+  let extraArguments = (ins
+    MgbCompNodeAttr:$comp_node,
+    MgbUI32Attr:$nr_outputs
+  );
+}
+
 def Argsort: MgbHashableOp<"Argsort", [ArgsortParam]>;
 def Argmax : MgbHashableOp<"Argmax", [AxisParam]>;
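These TableGen entries are what expose Borrow and Barrier as builtin OpDefs on the Python side, which the changes in _multistream.py and tensor.py above already rely on. A hedged sketch of constructing them after a rebuild (the device string is a placeholder):

    from megengine.core.ops.builtin import Barrier, Borrow

    barrier = Barrier("xpux:1", 1)   # comp_node and nr_outputs, as in record_event()
    # Borrow takes only a comp_node, as used by Tensor.to(..., _borrow=True):
    # borrow = Borrow(comp_node=as_device("xpux:1").to_c())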