Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
magicwindyyd
mindspore
提交
3ace7550
M
mindspore
项目概览
magicwindyyd
/
mindspore
与 Fork 源项目一致
Fork自
MindSpore / mindspore
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
M
mindspore
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
3ace7550
编写于
7月 20, 2020
作者:
L
lizhenyu
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
refine gpu memory swap performance
上级
bbfcbbe2
变更
11
展开全部
隐藏空白更改
内联
并排
Showing
11 changed file
with
595 addition
and
123 deletion
+595
-123
mindspore/ccsrc/backend/optimizer/mem_reuse/mem_copy_manager.h
...pore/ccsrc/backend/optimizer/mem_reuse/mem_copy_manager.h
+45
-14
mindspore/ccsrc/backend/optimizer/mem_reuse/mem_swap_manager.cc
...ore/ccsrc/backend/optimizer/mem_reuse/mem_swap_manager.cc
+320
-52
mindspore/ccsrc/backend/optimizer/mem_reuse/mem_swap_manager.h
...pore/ccsrc/backend/optimizer/mem_reuse/mem_swap_manager.h
+48
-11
mindspore/ccsrc/backend/session/anf_runtime_algorithm.cc
mindspore/ccsrc/backend/session/anf_runtime_algorithm.cc
+12
-0
mindspore/ccsrc/backend/session/anf_runtime_algorithm.h
mindspore/ccsrc/backend/session/anf_runtime_algorithm.h
+2
-0
mindspore/ccsrc/runtime/device/gpu/cuda_driver.cc
mindspore/ccsrc/runtime/device/gpu/cuda_driver.cc
+10
-0
mindspore/ccsrc/runtime/device/gpu/cuda_driver.h
mindspore/ccsrc/runtime/device/gpu/cuda_driver.h
+1
-0
mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc
mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc
+138
-43
mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.h
mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.h
+10
-3
mindspore/ccsrc/runtime/device/kernel_info.cc
mindspore/ccsrc/runtime/device/kernel_info.cc
+8
-0
mindspore/ccsrc/runtime/device/kernel_info.h
mindspore/ccsrc/runtime/device/kernel_info.h
+1
-0
未找到文件。
mindspore/ccsrc/backend/optimizer/mem_reuse/mem_copy_manager.h
浏览文件 @
3ace7550
...
...
@@ -19,6 +19,7 @@
#include <vector>
#include <map>
#include <set>
#include <queue>
#include <memory>
#include <utility>
...
...
@@ -40,29 +41,58 @@ struct TensorInfo {
struct
KernelExecutionInfo
{
size_t
topo_order_
{
0
};
float
execution_perform_
{
0.0
};
bool
trigger_swap_
{
false
};
bool
need_swap_
{
false
};
// output index to topo orders of node users
bool
trigger_swap_out_
{
false
};
bool
trigger_swap_in_
{
false
};
size_t
swap_in_task_num_
{
0
};
// Key: output index, value: topo orders of node users
std
::
map
<
size_t
,
std
::
vector
<
size_t
>>
node_users_map_
;
//
kernel output idx to host addr
std
::
map
<
size_t
,
HostAddress
>
host_addrs_
;
//
Key: output idx, value: (host addr, dirty or not)
std
::
map
<
size_t
,
std
::
pair
<
HostAddress
,
bool
>
>
host_addrs_
;
KernelExecutionInfo
()
:
KernelExecutionInfo
(
0
,
0.0
,
false
,
false
)
{}
explicit
KernelExecutionInfo
(
size_t
topo_order
)
:
topo_order_
(
topo_order
),
execution_perform_
(
0.0
),
trigger_swap_
(
false
),
need_swap_
(
false
)
{}
KernelExecutionInfo
(
size_t
topo_order
,
float
execution_perform
,
bool
trigger_swap
,
bool
need_swap
)
KernelExecutionInfo
()
{}
explicit
KernelExecutionInfo
(
size_t
topo_order
)
:
KernelExecutionInfo
(
topo_order
,
0.0
,
false
,
false
,
0
)
{}
KernelExecutionInfo
(
size_t
topo_order
,
float
execution_perform
,
bool
trigger_swap_out
,
bool
trigger_swap_in
,
size_t
swap_in_task_num
)
:
topo_order_
(
topo_order
),
execution_perform_
(
execution_perform
),
trigger_swap_
(
trigger_swap
),
need_swap_
(
need_swap
)
{}
trigger_swap_out_
(
trigger_swap_out
),
trigger_swap_in_
(
trigger_swap_in
),
swap_in_task_num_
(
swap_in_task_num
)
{}
};
// trigger swap
struct
MemSwapInfo
{
SwapKind
swap_kind_
;
//
kernel need to
be swapped
AnfNodePtr
kernel_
{
nullptr
}
;
//
Topo order of kernel need
be swapped
size_t
topo_order_
;
size_t
output_idx_
{
0
};
// Record the swapping out position of swapping in tensor
size_t
swap_out_pos_
;
};
// Ordering predicate for MemSwapInfo; it is the comparator of MemSwapInfoSet.
// Two entries are compared field by field, the first differing field decides:
//   1. swap_kind_ (compared through its integer value),
//   2. swap_out_pos_,
//   3. topo_order_,
//   4. output_idx_.
struct SwapInfoComp {
  // Returns true when `a` is ordered before `b`.
  // The call operator is const because ordered containers invoke their comparator
  // through a const object; some standard library implementations reject a
  // non-const call operator at compile time.
  bool operator()(const MemSwapInfo &a, const MemSwapInfo &b) const {
    const int swap_kind_a = static_cast<int>(a.swap_kind_);
    const int swap_kind_b = static_cast<int>(b.swap_kind_);
    if (swap_kind_a != swap_kind_b) {
      return swap_kind_a < swap_kind_b;
    }
    if (a.swap_out_pos_ != b.swap_out_pos_) {
      return a.swap_out_pos_ < b.swap_out_pos_;
    }
    if (a.topo_order_ != b.topo_order_) {
      return a.topo_order_ < b.topo_order_;
    }
    // All higher-priority fields are equal: the output index is the final tie-breaker.
    return a.output_idx_ < b.output_idx_;
  }
};
class
MemCopyManager
{
...
...
@@ -90,6 +120,7 @@ class MemCopyManager {
virtual
void
ClearSwapQueue
()
{}
};
using
MemCopyManagerPtr
=
std
::
shared_ptr
<
MemCopyManager
>
;
using
MemSwapInfoSet
=
std
::
set
<
MemSwapInfo
,
SwapInfoComp
>
;
}
// namespace memswap
}
// namespace device
}
// namespace mindspore
...
...
mindspore/ccsrc/backend/optimizer/mem_reuse/mem_swap_manager.cc
浏览文件 @
3ace7550
此差异已折叠。
点击以展开。
mindspore/ccsrc/backend/optimizer/mem_reuse/mem_swap_manager.h
浏览文件 @
3ace7550
...
...
@@ -32,7 +32,11 @@ namespace memswap {
class
MemSwapManager
{
public:
explicit
MemSwapManager
(
const
MemCopyManagerPtr
&
mem_copy_manager
)
:
tensor_size_threshold_
(
0
),
tensor_size_threshold_idx_
(
0
),
tensor_size_num_
(
1
),
distance_threshold_
(
1
)
{
:
tensor_size_threshold_
(
0
),
tensor_size_threshold_idx_
(
0
),
tensor_size_num_
(
1
),
distance_threshold_
(
1
),
distance_decay_step_
(
1
)
{
mem_copy_manager_
=
mem_copy_manager
;
}
...
...
@@ -42,7 +46,7 @@ class MemSwapManager {
~
MemSwapManager
()
=
default
;
void
Init
(
const
mindspore
::
session
::
KernelGraph
*
kernel_graph
);
bool
Init
(
const
mindspore
::
session
::
KernelGraph
*
kernel_graph
,
size_t
swap_mem_size
=
0
);
void
AddMemSwapTask
(
SwapKind
swap_kind
,
const
DeviceAddressPtr
&
device_address
,
const
HostAddress
&
host_address
)
const
;
...
...
@@ -51,9 +55,10 @@ class MemSwapManager {
DeviceAddressPtr
UpdateSwapQueue
(
SwapKind
swap_kind
)
const
;
// retreat to find a workable swap scheme
bool
RetreatSwapInfo
();
void
AdjustSwapInPos
(
const
AnfNodePtr
&
kernel
,
size_t
index
);
bool
trigger_swap
()
const
{
return
trigger_swap_
;
}
bool
mem_swap_init
()
const
{
return
mem_swap_initialized_
;
}
...
...
@@ -70,16 +75,28 @@ class MemSwapManager {
bool
QueryKernelTriggerSwap
(
const
AnfNodePtr
&
kernel
)
const
;
bool
QueryKernelNeedSwap
(
const
AnfNodePtr
&
kernel
)
const
;
bool
QueryKernelTriggerSwapIn
(
const
AnfNodePtr
&
kernel
)
const
;
size_t
QueryKernelTriggerSwapInTaskNum
(
const
AnfNodePtr
&
kernel
)
const
;
const
AnfNodePtr
QueryKerneByTopoOrder
(
size_t
index
)
const
;
const
MemSwapInfoSet
&
QueryKernelMemSwapInfo
(
const
AnfNodePtr
&
kernel
)
const
;
void
AssignHostMemory
();
const
std
::
vector
<
MemSwapInfo
>
&
QueryKernelMemSwapInfo
(
const
AnfNodePtr
&
kernel
)
const
;
const
HostAddress
&
QueryKernelHostAddr
(
const
AnfNodePtr
&
kernel
,
size_t
output_idx
)
const
;
void
AddKernelHostAddrIsDirty
(
const
AnfNodePtr
&
kernel
,
size_t
output_idx
,
bool
dirty
);
bool
QueryKernelHostAddrIsDirty
(
const
AnfNodePtr
&
kernel
,
size_t
output_idx
)
const
;
void
ResetHostAddrIsDirty
();
void
InsertSwapInBlackList
(
const
void
*
device_ptr
);
bool
FindInSwapInBlackList
(
const
void
*
device_ptr
)
const
;
const
HostAddress
&
kernel_host_addr
(
const
AnfNodePtr
&
kernel
,
size_t
output_idx
)
const
;
bool
AllocHostPinnedMem
(
size_t
size
,
void
**
addr
)
const
;
void
ReleaseHostPinnedMem
();
...
...
@@ -93,27 +110,47 @@ class MemSwapManager {
void
SaveUserKernelTopoOrder
();
void
AddKernelTriggerSwap
(
const
AnfNodePtr
&
kernel
,
bool
trigger_swap
);
bool
InitSwapThreshold
(
size_t
swap_mem_size
);
void
AddKernelNeedSwap
(
const
AnfNodePtr
&
kernel
,
bool
need_swap
);
void
RetreatSwapThreshold
();
void
CacheCurSwapInfoSet
(
const
AnfNodePtr
&
kernel
);
void
AddFirstTimeMovePos
(
const
AnfNodePtr
&
kernel
,
size_t
index
,
bool
first_time
);
bool
QueryFirstTimeMovePos
(
const
AnfNodePtr
&
kernel
,
size_t
index
)
const
;
size_t
BestSwapInPerformPos
(
const
AnfNodePtr
&
trigger_kernel
,
const
MemSwapInfo
&
mem_swap_info
)
const
;
void
MoveSwapInfoPos
(
size_t
des_pos
,
size_t
src_pos
,
const
MemSwapInfo
&
mem_swap_info
);
void
AddKernelMemSwapInfo
(
const
AnfNodePtr
&
kernel
,
const
MemSwapInfo
&
mem_swap_info
);
void
RemoveKernelMemSwapInfo
(
const
AnfNodePtr
&
kernel
,
const
MemSwapInfo
&
mem_swap_info
);
bool
CheckDistanceBetweenKernels
(
const
TensorInfo
&
tensor_info
)
const
;
bool
IsCommunicationRelevantOp
(
const
AnfNodePtr
&
kernel
)
const
;
std
::
vector
<
CNodePtr
>
execution_order_
;
std
::
vector
<
TensorInfo
>
ordered_tensors_
;
std
::
unordered_map
<
void
*
,
KernelExecutionInfo
>
kernel_execution_info_
;
std
::
unordered_map
<
void
*
,
std
::
map
<
size_t
,
PerformPair
>>
kernel_swap_perform_
;
//
trigger swap kernel key : MemSwapInfo
of kernel need to be swapped
std
::
unordered_map
<
void
*
,
std
::
vector
<
MemSwapInfo
>>
mem_swap_info
_
;
//
Key: trigger swap kernel, value: MemSwapInfoSet
of kernel need to be swapped
std
::
unordered_map
<
void
*
,
MemSwapInfoSet
>
mem_swap_info_map
_
;
std
::
vector
<
HostAddress
>
host_addrs_list_
;
std
::
unordered_set
<
const
void
*>
swap_in_blacklist_
;
// Key: cache kernel address, value: lists of first time move pos or not
std
::
map
<
void
*
,
std
::
vector
<
bool
>>
kernel_first_move_cache_map_
;
std
::
vector
<
MemSwapInfo
>
mem_swap_info_cache_list_
;
std
::
pair
<
size_t
,
size_t
>
best_and_cur_pos_cache_
;
size_t
tensor_size_threshold_
;
size_t
tensor_size_threshold_idx_
;
size_t
tensor_size_num_
;
size_t
distance_threshold_
;
size_t
distance_decay_step_
;
MemCopyManagerPtr
mem_copy_manager_
{
nullptr
};
FuncGraphManagerPtr
graph_manager_
{
nullptr
};
...
...
mindspore/ccsrc/backend/session/anf_runtime_algorithm.cc
浏览文件 @
3ace7550
...
...
@@ -707,6 +707,18 @@ DeviceAddress *AnfRuntimeAlgorithm::GetWorkspaceAddr(const AnfNodePtr &node, siz
return
addr
;
}
// Return the workspace DeviceAddressPtr stored at `index` in the kernel info of `node`.
// `node` and its kernel info are checked with MS_EXCEPTION_IF_NULL; a null address
// coming back from the kernel info is reported through MS_LOG(EXCEPTION).
DeviceAddressPtr AnfRuntimeAlgorithm::GetMutableWorkspaceAddr(const AnfNodePtr &node, size_t index) {
  MS_EXCEPTION_IF_NULL(node);
  auto *info = dynamic_cast<device::KernelInfo *>(node->kernel_info());
  MS_EXCEPTION_IF_NULL(info);
  auto workspace_addr = info->GetMutableWorkspaceAddr(index);
  if (workspace_addr == nullptr) {
    MS_LOG(EXCEPTION) << "Index " << index << " of node " << node->DebugString() << "] workspace addr is not exist";
  }
  return workspace_addr;
}
// set infer shapes and types of anf node
void
AnfRuntimeAlgorithm
::
SetOutputInferTypeAndShape
(
const
std
::
vector
<
TypeId
>
&
types
,
const
std
::
vector
<
std
::
vector
<
size_t
>>
&
shapes
,
AnfNode
*
node
)
{
...
...
mindspore/ccsrc/backend/session/anf_runtime_algorithm.h
浏览文件 @
3ace7550
...
...
@@ -149,6 +149,8 @@ class AnfRuntimeAlgorithm {
static
void
SetWorkspaceAddr
(
const
DeviceAddressPtr
&
addr
,
size_t
output_idx
,
AnfNode
*
node
);
// get workspace device addr of anf_node
static
DeviceAddress
*
GetWorkspaceAddr
(
const
AnfNodePtr
&
node
,
size_t
output_idx
);
// get workspace device mutable addr of anf_node
static
DeviceAddressPtr
GetMutableWorkspaceAddr
(
const
AnfNodePtr
&
node
,
size_t
index
);
// set infer shapes and types of anf node
static
void
SetOutputInferTypeAndShape
(
const
std
::
vector
<
TypeId
>
&
types
,
const
std
::
vector
<
std
::
vector
<
size_t
>>
&
shapes
,
AnfNode
*
node
);
...
...
mindspore/ccsrc/runtime/device/gpu/cuda_driver.cc
浏览文件 @
3ace7550
...
...
@@ -209,6 +209,16 @@ bool CudaDriver::QueryEvent(const DeviceEvent &event) {
}
}
bool
CudaDriver
::
ElapsedTime
(
float
*
cost_time
,
const
DeviceEvent
&
start
,
const
DeviceEvent
&
end
)
{
auto
ret
=
cudaEventElapsedTime
(
cost_time
,
(
cudaEvent_t
)
start
,
(
cudaEvent_t
)
end
);
if
(
ret
==
cudaSuccess
)
{
return
true
;
}
else
{
MS_LOG
(
ERROR
)
<<
"cudaEventElapsedTime failed, ret["
<<
static_cast
<
int
>
(
ret
)
<<
"], "
<<
cudaGetErrorString
(
ret
);
return
false
;
}
}
int
CudaDriver
::
device_count
()
{
int
dev_count
;
auto
ret
=
cudaGetDeviceCount
(
&
dev_count
);
...
...
mindspore/ccsrc/runtime/device/gpu/cuda_driver.h
浏览文件 @
3ace7550
...
...
@@ -57,6 +57,7 @@ class CudaDriver {
static
bool
RecordEvent
(
DeviceEvent
event
,
DeviceStream
stream
=
0
);
static
bool
SyncEvent
(
const
DeviceEvent
&
event
);
static
bool
QueryEvent
(
const
DeviceEvent
&
event
);
static
bool
ElapsedTime
(
float
*
cost_time
,
const
DeviceEvent
&
start
,
const
DeviceEvent
&
end
);
// Encapsulate the cuda APIs associated with device management.
static
int
device_count
();
...
...
mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc
浏览文件 @
3ace7550
...
...
@@ -33,6 +33,7 @@
namespace
mindspore
{
namespace
device
{
namespace
gpu
{
using
mindspore
::
device
::
memswap
::
MemSwapInfoSet
;
using
mindspore
::
device
::
memswap
::
MemSwapManager
;
using
mindspore
::
device
::
memswap
::
SwapKind
;
// Synchronize this runtime's stream (stream_) through the GPUDeviceManager singleton
// and forward its boolean result to the caller.
bool
GPUKernelRuntime
::
SyncStream
()
{
return
GPUDeviceManager
::
GetInstance
().
SyncStream
(
stream_
);
}
...
...
@@ -139,6 +140,7 @@ void GPUKernelRuntime::AssignMemory(session::KernelGraph *graph) {
InitKernelRefCount
(
graph
);
InitMemorySwapInfo
(
graph
);
InitKernelOutputAddress
(
graph
);
InitKernelWorkspaceAddress
(
graph
);
}
else
{
AssignDynamicMemory
(
graph
);
}
...
...
@@ -183,6 +185,56 @@ bool GPUKernelRuntime::Run(session::KernelGraph *graph) {
return
ret
;
}
// Search for a memory swap scheme under which `graph` can be launched.
//
// Steps:
//   1. Release the outputs/workspaces left over from a previous attempt and make sure
//      the swap manager is initialised for this graph.
//   2. Repeatedly retreat the swap info and do a mock launch (mock = true,
//      profiling = false) until one launch succeeds.
//   3. Assign host memory, run one profiling launch (mock = false, profiling = true)
//      and hand over to RefineMemSwapScheme.
//
// Returns false when the swap manager cannot be initialised, when RetreatSwapInfo()
// reports failure, or when the profiling launch fails; otherwise returns the result
// of RefineMemSwapScheme().
bool GPUKernelRuntime::SearchMemSwapScheme(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  // Guard the member before dereferencing it, as the sibling swap functions do.
  MS_EXCEPTION_IF_NULL(mem_swap_manager_);
  ClearKernelOldOutputAndWorkspace(graph);
  if (!mem_swap_manager_->mem_swap_init() && !mem_swap_manager_->Init(graph)) {
    return false;
  }

  bool ret = false;
  while (!ret) {
    if (!mem_swap_manager_->RetreatSwapInfo()) {
      return false;
    }
    ret = LaunchKernelDynamic(graph, true, false);
    if (!ret) {
      // The failed attempt may have left device memory allocated; drop it before retrying.
      ClearKernelOldOutputAndWorkspace(graph);
    }
  }
  mem_swap_manager_->AssignHostMemory();

  // Time profiling
  ret = LaunchKernelDynamic(graph, false, true);
  if (!ret) {
    return ret;
  }
  return RefineMemSwapScheme(graph);
}
// Refine the swap scheme of `graph` by adjusting the position of every swap-in task.
//
// For each kernel in execution order that triggers swap-in, and for each of its
// swap-in tasks, AdjustSwapInPos() is applied and a mock launch (mock = true,
// profiling = false) is attempted; this repeats until the mock launch succeeds.
// Always returns true once every task has been processed.
//
// NOTE(review): the retry loop has no exit other than a successful mock launch —
// confirm that AdjustSwapInPos() guarantees eventual success.
bool GPUKernelRuntime::RefineMemSwapScheme(const session::KernelGraph *graph) {
  // Both pointers are dereferenced below; guard them like the sibling functions do.
  MS_EXCEPTION_IF_NULL(graph);
  MS_EXCEPTION_IF_NULL(mem_swap_manager_);
  const auto &kernels = graph->execution_order();
  for (const auto &kernel : kernels) {
    if (!mem_swap_manager_->QueryKernelTriggerSwapIn(kernel)) {
      continue;
    }

    const size_t swap_in_task_num = mem_swap_manager_->QueryKernelTriggerSwapInTaskNum(kernel);
    for (size_t swap_in_task_idx = 0; swap_in_task_idx < swap_in_task_num; swap_in_task_idx++) {
      bool ret = false;
      while (!ret) {
        mem_swap_manager_->AdjustSwapInPos(kernel, swap_in_task_idx);
        ret = LaunchKernelDynamic(graph, true, false);
        if (!ret) {
          // Release whatever the failed attempt allocated before trying the next position.
          ClearKernelOldOutputAndWorkspace(graph);
        }
      }
    }
  }
  return true;
}
void
GPUKernelRuntime
::
InitKernelRefCount
(
const
session
::
KernelGraph
*
graph
)
{
MS_EXCEPTION_IF_NULL
(
graph
);
MemReuseUtilPtr
mem_reuse_util_ptr
=
std
::
make_shared
<
memreuse
::
MemReuseUtil
>
();
...
...
@@ -209,6 +261,7 @@ void GPUKernelRuntime::InitMemorySwapInfo(const session::KernelGraph *graph) {
MS_EXCEPTION_IF_NULL
(
mem_swap_manager
);
auto
graph_id
=
graph
->
graph_id
();
mem_swap_map_
[
graph_id
]
=
mem_swap_manager
;
is_first_step_map_
[
graph_id
]
=
true
;
}
void
GPUKernelRuntime
::
InitKernelOutputAddress
(
const
session
::
KernelGraph
*
graph
)
{
...
...
@@ -230,6 +283,25 @@ void GPUKernelRuntime::InitKernelOutputAddress(const session::KernelGraph *graph
}
}
// Create one device address object per workspace of every kernel in `graph`.
// Each address is created with a null pointer and the workspace size reported by the
// kernel mod, then registered on the kernel through AnfAlgo::SetWorkspaceAddr.
void GPUKernelRuntime::InitKernelWorkspaceAddress(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  for (const auto &node : graph->execution_order()) {
    auto mod = AnfAlgo::GetKernelMod(node);
    MS_EXCEPTION_IF_NULL(mod);
    auto sizes = mod->GetWorkspaceSizeList();
    const size_t workspace_num = sizes.size();
    for (size_t idx = 0; idx < workspace_num; ++idx) {
      auto workspace_addr = CreateDeviceAddress(nullptr, sizes[idx], "", kTypeUnknown);
      AnfAlgo::SetWorkspaceAddr(workspace_addr, idx, node.get());
    }
  }
}
// Clear the kernel output addresses and then the kernel workspace addresses of `graph`
// by delegating to ClearKernelOutputAddress and ClearKernelWorkspaceAddress.
void
GPUKernelRuntime
::
ClearKernelOldOutputAndWorkspace
(
const
session
::
KernelGraph
*
graph
)
{
// Outputs first, workspaces second.
ClearKernelOutputAddress
(
graph
);
ClearKernelWorkspaceAddress
(
graph
);
}
void
GPUKernelRuntime
::
ClearKernelOutputAddress
(
const
session
::
KernelGraph
*
graph
)
{
MS_EXCEPTION_IF_NULL
(
graph
);
auto
&
kernels
=
graph
->
execution_order
();
...
...
@@ -242,6 +314,7 @@ void GPUKernelRuntime::ClearKernelOutputAddress(const session::KernelGraph *grap
continue
;
}
auto
device_address
=
AnfAlgo
::
GetMutableOutputAddr
(
kernel
,
i
,
false
);
MS_EXCEPTION_IF_NULL
(
device_address
);
if
(
device_address
->
ptr_
)
{
mem_manager_
->
FreeMemFromMemPool
(
device_address
);
}
...
...
@@ -250,7 +323,24 @@ void GPUKernelRuntime::ClearKernelOutputAddress(const session::KernelGraph *grap
}
}
bool
GPUKernelRuntime
::
LaunchKernelDynamic
(
const
session
::
KernelGraph
*
graph
)
{
// Walk every kernel of `graph` in execution order and, for each of its workspace
// addresses whose ptr_ is set, hand the address back to the memory pool.
void GPUKernelRuntime::ClearKernelWorkspaceAddress(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  for (const auto &node : graph->execution_order()) {
    auto mod = AnfAlgo::GetKernelMod(node);
    MS_EXCEPTION_IF_NULL(mod);
    auto sizes = mod->GetWorkspaceSizeList();
    const size_t workspace_num = sizes.size();
    for (size_t idx = 0; idx < workspace_num; ++idx) {
      auto workspace_addr = AnfAlgo::GetMutableWorkspaceAddr(node, idx);
      MS_EXCEPTION_IF_NULL(workspace_addr);
      if (!workspace_addr->ptr_) {
        // Nothing was allocated for this workspace slot.
        continue;
      }
      mem_manager_->FreeMemFromMemPool(workspace_addr);
    }
  }
}
bool
GPUKernelRuntime
::
LaunchKernelDynamic
(
const
session
::
KernelGraph
*
graph
,
bool
mock
,
bool
profiling
)
{
MS_EXCEPTION_IF_NULL
(
graph
);
MS_EXCEPTION_IF_NULL
(
mem_reuse_util_
);
// Reset the reference count.
...
...
@@ -271,7 +361,7 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph) {
if
(
!
kernel_mod
->
Launch
(
kernel_inputs
,
kernel_workspaces
,
kernel_outputs
,
stream_
))
{
MS_LOG
(
EXCEPTION
)
<<
"Launch kernel failed."
;
}
FreeKernelDynamicRes
(
kernel
,
kernel_workspaces
);
FreeKernelDynamicRes
(
kernel
);
UpdateMemorySwapTask
(
kernel
);
}
CHECK_OP_RET_WITH_EXCEPT
(
SyncStream
(),
"SyncStream failed."
);
...
...
@@ -279,13 +369,39 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph) {
return
true
;
}
void
GPUKernelRuntime
::
LaunchKernelWithTimeProfiling
(
const
AnfNodePtr
&
kernel
,
const
AddressPtrList
&
inputs
,
const
AddressPtrList
&
workspace
,
const
AddressPtrList
&
outputs
)
{
auto
kernel_mod
=
AnfAlgo
::
GetKernelMod
(
kernel
);
MS_EXCEPTION_IF_NULL
(
kernel_mod
);
float
cost_time
=
0
;
DeviceEvent
start
=
nullptr
;
DeviceEvent
end
=
nullptr
;
CHECK_OP_RET_WITH_EXCEPT
(
CudaDriver
::
CreateEvent
(
&
start
),
"Failed to create event."
);
CHECK_OP_RET_WITH_EXCEPT
(
CudaDriver
::
CreateEvent
(
&
end
),
"Failed to create event."
);
CHECK_OP_RET_WITH_EXCEPT
(
CudaDriver
::
RecordEvent
(
start
,
stream_
),
"Failed to record event to stream."
);
CHECK_OP_RET_WITH_EXCEPT
(
kernel_mod
->
Launch
(
inputs
,
workspace
,
outputs
,
stream_
),
"Launch kernel failed."
);
CHECK_OP_RET_WITH_EXCEPT
(
CudaDriver
::
RecordEvent
(
end
,
stream_
),
"Failed to record event to stream."
);
CHECK_OP_RET_WITH_EXCEPT
(
CudaDriver
::
SyncEvent
(
start
),
"Failed to sync event."
);
CHECK_OP_RET_WITH_EXCEPT
(
CudaDriver
::
SyncEvent
(
end
),
"Failed to sync event."
);
CHECK_OP_RET_WITH_EXCEPT
(
CudaDriver
::
ElapsedTime
(
&
cost_time
,
start
,
end
),
"Failed to record elapsed time."
);
mem_swap_manager_
->
AddKernelExecutionPerform
(
kernel
,
cost_time
);
CHECK_OP_RET_WITH_EXCEPT
(
CudaDriver
::
DestroyEvent
(
start
),
"Failed to destroy event."
);
CHECK_OP_RET_WITH_EXCEPT
(
CudaDriver
::
DestroyEvent
(
end
),
"Failed to destroy event."
);
}
bool
GPUKernelRuntime
::
AddMemorySwapTask
(
const
AnfNodePtr
&
kernel
)
{
MS_EXCEPTION_IF_NULL
(
mem_swap_manager_
);
auto
&
mem_swap_info_list
=
mem_swap_manager_
->
QueryKernelMemSwapInfo
(
kernel
);
for
(
auto
&
mem_swap_info
:
mem_swap_info_list
)
{
auto
&
kernel_exec_info
=
mem_swap_manager_
->
SearchKernelExecutionInfo
(
mem_swap_info
.
kernel_
);
const
HostAddress
&
host_address
=
kernel_exec_info
.
host_addrs_
[
mem_swap_info
.
output_idx_
];
auto
device_address
=
AnfAlgo
::
GetMutableOutputAddr
(
mem_swap_info
.
kernel_
,
mem_swap_info
.
output_idx_
,
false
);
const
MemSwapInfoSet
&
mem_swap_info_set
=
mem_swap_manager_
->
QueryKernelMemSwapInfo
(
kernel
);
for
(
auto
&
mem_swap_info
:
mem_swap_info_set
)
{
auto
need_swap_kernel
=
mem_swap_manager_
->
QueryKerneByTopoOrder
(
mem_swap_info
.
topo_order_
);
MS_EXCEPTION_IF_NULL
(
need_swap_kernel
);
const
HostAddress
&
host_address
=
mem_swap_manager_
->
QueryKernelHostAddr
(
need_swap_kernel
,
mem_swap_info
.
output_idx_
);
auto
device_address
=
AnfAlgo
::
GetMutableOutputAddr
(
need_swap_kernel
,
mem_swap_info
.
output_idx_
,
false
);
if
(
mem_swap_info
.
swap_kind_
==
SwapKind
::
kDeviceToHost
)
{
mem_swap_manager_
->
AddMemSwapTask
(
SwapKind
::
kDeviceToHost
,
device_address
,
host_address
);
...
...
@@ -309,9 +425,11 @@ bool GPUKernelRuntime::AddMemorySwapTask(const AnfNodePtr &kernel) {
bool
GPUKernelRuntime
::
UpdateMemorySwapInfo
(
const
session
::
KernelGraph
*
graph
)
{
MS_EXCEPTION_IF_NULL
(
mem_swap_manager_
);
ClearKernelO
utputAddress
(
graph
);
ClearKernelO
ldOutputAndWorkspace
(
graph
);
if
(
!
mem_swap_manager_
->
mem_swap_init
())
{
mem_swap_manager_
->
Init
(
graph
);
if
(
!
mem_swap_manager_
->
Init
(
graph
))
{
return
false
;
}
}
return
mem_swap_manager_
->
RetreatSwapInfo
();
}
...
...
@@ -408,29 +526,6 @@ bool GPUKernelRuntime::AttemptMallocMem(const DeviceAddressPtr &device_address,
return
true
;
}
// Try to obtain `size` bytes from the memory pool, falling back to memory swap.
//
// When the first allocation fails and swap is triggered, the device-to-host copy
// stream is synchronised, every address drained from the device-to-host swap queue
// that is not in the swap-in blacklist and still holds a pointer is marked kInHost
// and returned to the pool, and the allocation is attempted once more.
// Returns the allocated pointer, or nullptr when no memory could be obtained.
void *GPUKernelRuntime::AttemptMallocMem(size_t size) {
  MS_EXCEPTION_IF_NULL(mem_manager_);
  MS_EXCEPTION_IF_NULL(mem_swap_manager_);

  auto first_try = mem_manager_->MallocMemFromMemPool(size);
  if (first_try) {
    return first_try;
  }
  if (!mem_swap_manager_->trigger_swap()) {
    return nullptr;
  }

  mem_swap_manager_->SyncMemCopyStream(SwapKind::kDeviceToHost);
  while (auto swapped_out = mem_swap_manager_->UpdateSwapQueue(SwapKind::kDeviceToHost)) {
    const bool releasable = !mem_swap_manager_->FindInSwapInBlackList(swapped_out->ptr_) && swapped_out->ptr_;
    if (releasable) {
      swapped_out->set_status(DeviceAddressStatus::kInHost);
      mem_manager_->FreeMemFromMemPool(swapped_out);
    }
  }

  auto second_try = mem_manager_->MallocMemFromMemPool(size);
  if (!second_try) {
    return nullptr;
  }
  return second_try;
}
bool
GPUKernelRuntime
::
AllocKernelDynamicRes
(
const
mindspore
::
kernel
::
KernelMod
&
kernel_mod
,
const
mindspore
::
AnfNodePtr
&
kernel
,
AddressPtrList
*
kernel_inputs
,
AddressPtrList
*
kernel_workspaces
,
AddressPtrList
*
kernel_outputs
)
{
...
...
@@ -504,13 +599,13 @@ bool GPUKernelRuntime::AllocKernelWorkspaceDynamicRes(const mindspore::kernel::K
kernel_workspaces
->
emplace_back
(
nullptr
);
continue
;
}
auto
device_
ptr
=
AttemptMallocMem
(
workspace_sizes
[
i
]
);
if
(
!
device_ptr
)
{
auto
device_
address
=
AnfAlgo
::
GetMutableWorkspaceAddr
(
kernel
,
i
);
if
(
device_address
->
ptr_
==
nullptr
&&
!
AttemptMallocMem
(
device_address
,
workspace_sizes
[
i
])
)
{
return
false
;
}
kernel
::
AddressPtr
workspace
=
std
::
make_shared
<
kernel
::
Address
>
();
MS_EXCEPTION_IF_NULL
(
workspace
);
workspace
->
addr
=
device_
ptr
;
workspace
->
addr
=
device_
address
->
ptr_
;
workspace
->
size
=
workspace_sizes
[
i
];
kernel_workspaces
->
emplace_back
(
workspace
);
}
...
...
@@ -606,8 +701,7 @@ void GPUKernelRuntime::AllocCommunicationOpMemory(bool is_need_alloc_memory, boo
}
}
void
GPUKernelRuntime
::
FreeKernelDynamicRes
(
const
mindspore
::
AnfNodePtr
&
kernel
,
const
AddressPtrList
&
kernel_workspaces
)
{
void
GPUKernelRuntime
::
FreeKernelDynamicRes
(
const
mindspore
::
AnfNodePtr
&
kernel
)
{
MS_EXCEPTION_IF_NULL
(
kernel
);
MS_EXCEPTION_IF_NULL
(
mem_manager_
);
MS_EXCEPTION_IF_NULL
(
mem_reuse_util_
);
...
...
@@ -652,12 +746,13 @@ void GPUKernelRuntime::FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel,
}
}
// Free the workspace of kernel.
for
(
size_t
i
=
0
;
i
<
kernel_workspaces
.
size
();
++
i
)
{
auto
workspace
=
kernel_workspaces
[
i
];
if
(
workspace
!=
nullptr
)
{
MS_EXCEPTION_IF_NULL
(
workspace
->
addr
);
mem_manager_
->
FreeMemFromMemPool
(
workspace
->
addr
);
workspace
->
addr
=
nullptr
;
auto
kernel_mod
=
AnfAlgo
::
GetKernelMod
(
kernel
);
MS_EXCEPTION_IF_NULL
(
kernel_mod
);
for
(
size_t
i
=
0
;
i
<
kernel_mod
->
GetWorkspaceSizeList
().
size
();
++
i
)
{
auto
device_address
=
AnfAlgo
::
GetMutableWorkspaceAddr
(
kernel
,
i
);
MS_EXCEPTION_IF_NULL
(
device_address
);
if
(
device_address
->
ptr_
)
{
mem_manager_
->
FreeMemFromMemPool
(
device_address
);
}
}
}
...
...
mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.h
浏览文件 @
3ace7550
...
...
@@ -53,11 +53,17 @@ class GPUKernelRuntime : public KernelRuntime {
// The related functions and members for using dynamic memory pool.
void
InitKernelRefCount
(
const
session
::
KernelGraph
*
graph
);
void
InitKernelOutputAddress
(
const
session
::
KernelGraph
*
graph
);
void
InitKernelWorkspaceAddress
(
const
session
::
KernelGraph
*
graph
);
void
InitMemorySwapInfo
(
const
session
::
KernelGraph
*
graph
);
void
ClearKernelOutputAddress
(
const
session
::
KernelGraph
*
graph
);
bool
LaunchKernelDynamic
(
const
session
::
KernelGraph
*
graph
);
void
ClearKernelWorkspaceAddress
(
const
session
::
KernelGraph
*
graph
);
void
ClearKernelOldOutputAndWorkspace
(
const
session
::
KernelGraph
*
graph
);
bool
SearchMemSwapScheme
(
const
session
::
KernelGraph
*
graph
);
bool
RefineMemSwapScheme
(
const
session
::
KernelGraph
*
graph
);
bool
LaunchKernelDynamic
(
const
session
::
KernelGraph
*
graph
,
bool
mock
=
false
,
bool
profiling
=
false
);
void
LaunchKernelWithTimeProfiling
(
const
AnfNodePtr
&
kernel
,
const
AddressPtrList
&
inputs
,
const
AddressPtrList
&
workspace
,
const
AddressPtrList
&
outputs
);
bool
AttemptMallocMem
(
const
DeviceAddressPtr
&
device_address
,
size_t
size
);
void
*
AttemptMallocMem
(
size_t
size
);
bool
AllocKernelDynamicRes
(
const
mindspore
::
kernel
::
KernelMod
&
kernel_mod
,
const
mindspore
::
AnfNodePtr
&
kernel
,
AddressPtrList
*
kernel_inputs
,
AddressPtrList
*
kernel_workspaces
,
AddressPtrList
*
kernel_outputs
);
...
...
@@ -72,7 +78,7 @@ class GPUKernelRuntime : public KernelRuntime {
void
AllocCommunicationOpMemory
(
bool
is_need_alloc_memory
,
bool
is_need_free_memory
,
const
DeviceAddressPtrList
addr_list
,
size_t
total_size
,
std
::
vector
<
size_t
>
size_list
);
void
FreeKernelDynamicRes
(
const
mindspore
::
AnfNodePtr
&
kernel
,
const
AddressPtrList
&
kernel_workspaces
);
void
FreeKernelDynamicRes
(
const
mindspore
::
AnfNodePtr
&
kernel
);
bool
AddMemorySwapTask
(
const
AnfNodePtr
&
kernel
);
bool
UpdateMemorySwapInfo
(
const
session
::
KernelGraph
*
graph
);
bool
UpdateMemorySwapTask
(
const
AnfNodePtr
&
kernel
);
...
...
@@ -81,6 +87,7 @@ class GPUKernelRuntime : public KernelRuntime {
void
ClearSwapQueue
();
std
::
unordered_map
<
uint32_t
,
MemReuseUtilPtr
>
mem_reuse_util_map_
;
std
::
unordered_map
<
uint32_t
,
MemSwapManagerPtr
>
mem_swap_map_
;
std
::
unordered_map
<
uint32_t
,
bool
>
is_first_step_map_
;
MemReuseUtilPtr
mem_reuse_util_
{
nullptr
};
MemSwapManagerPtr
mem_swap_manager_
{
nullptr
};
};
...
...
mindspore/ccsrc/runtime/device/kernel_info.cc
浏览文件 @
3ace7550
...
...
@@ -73,6 +73,14 @@ DeviceAddress *KernelInfo::GetWorkspaceAddr(size_t index) const {
return
workspace_address_list_
[
index
].
get
();
}
// Return the workspace address stored at `index` in workspace_address_list_.
// An index at or beyond the list size is logged as an error and yields nullptr.
DeviceAddressPtr KernelInfo::GetMutableWorkspaceAddr(size_t index) const {
  if (index < workspace_address_list_.size()) {
    return workspace_address_list_[index];
  }
  MS_LOG(ERROR) << "Index [" << index << "] out of range";
  return nullptr;
}
bool
KernelInfo
::
SetWorkspaceAddr
(
const
DeviceAddressPtr
&
output_address
,
size_t
index
)
{
if
(
workspace_address_list_
.
empty
())
{
// parameter and valuenode
...
...
mindspore/ccsrc/runtime/device/kernel_info.h
浏览文件 @
3ace7550
...
...
@@ -54,6 +54,7 @@ class KernelInfo : public KernelInfoDevice {
bool
OutputAddrExist
(
size_t
index
)
const
;
bool
SetOutputAddr
(
const
DeviceAddressPtr
&
output_address
,
size_t
index
);
DeviceAddress
*
GetWorkspaceAddr
(
size_t
index
)
const
;
DeviceAddressPtr
GetMutableWorkspaceAddr
(
size_t
index
)
const
;
bool
SetWorkspaceAddr
(
const
DeviceAddressPtr
&
output_address
,
size_t
index
);
void
set_kernel_mod
(
const
kernel
::
KernelModPtr
&
kernel_mod
);
kernel
::
KernelMod
*
MutableKernelMod
()
const
;
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录