magicwindyyd / mindspore (forked from MindSpore / mindspore)
Commit bbafa9db
Authored Sep 07, 2020 by mindspore-ci-bot; committed by Gitee on Sep 07, 2020
!5832 Use the unified Execute function to run Graph or Single Graph.
Merge pull request !5832 from 张清华/master
Parents: 4535faa2, c0070d3d
Showing 12 changed files with 1291 additions and 1298 deletions (+1291, -1298).
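As a hedged model of the refactor this commit makes (distilled from the ascend_session.cc hunks below, not a compilable excerpt of the real classes): whole-graph execution and single-op execution now share one Execute entry point, and the task-sink decision is made inside it.

// Hypothetical, minimal model of the unified Execute path (not the real
// AscendSession): is_task is true for whole-graph RunGraph and false for
// single-op RunOp; MS_CTX_ENABLE_TASK_SINK is modeled as a plain bool here.
#include <iostream>

struct KernelGraph {};

// is_task_sink selects between sinking the whole graph as a device task and
// launching kernels one by one.
bool RunOnDevice(KernelGraph *, bool is_task_sink) {
  std::cout << (is_task_sink ? "task-sink run\n" : "kernel-by-kernel run\n");
  return true;
}

bool Execute(KernelGraph *graph, bool is_task, bool ctx_enable_task_sink) {
  // Task sink is only ever considered for whole-graph execution.
  bool is_task_sink = is_task && ctx_enable_task_sink;
  return RunOnDevice(graph, is_task_sink);
}

int main() {
  KernelGraph g;
  Execute(&g, /*is_task=*/true, /*ctx_enable_task_sink=*/true);   // graph mode
  Execute(&g, /*is_task=*/false, /*ctx_enable_task_sink=*/true);  // single-op mode
}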
mindspore/ccsrc/backend/session/ascend_session.cc               +14    -16
mindspore/ccsrc/backend/session/ascend_session.h                +3     -2
mindspore/ccsrc/backend/session/cpu_session.cc                  +1     -1
mindspore/ccsrc/backend/session/gpu_session.cc                  +2     -2
mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc  +2     -8
mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h   +2     -2
mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.cc        +1     -1
mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.h         +1     -1
mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc        +1150  -1150
mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.h         +112   -112
mindspore/ccsrc/runtime/device/kernel_runtime.cc                +1     -1
mindspore/ccsrc/runtime/device/kernel_runtime.h                 +2     -2
mindspore/ccsrc/backend/session/ascend_session.cc

@@ -318,7 +318,7 @@ void AscendSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::
 #endif
   {
     // run task on device
-    Execute(kernel_graph);
+    Execute(kernel_graph, true);
   }
   // summary
   Summary(kernel_graph.get());
@@ -348,17 +348,6 @@ void AscendSession::RunOpHardwareOptimize(const std::shared_ptr<session::KernelG
   MS_LOG(INFO) << "Finish";
 }
-void AscendSession::RunOpExecTask(const std::shared_ptr<KernelGraph> &kernel_graph) const {
-  MS_LOG(INFO) << "Start!";
-  auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
-  MS_EXCEPTION_IF_NULL(runtime_instance);
-  bool ret_ok = runtime_instance->LaunchKernel(kernel_graph.get());
-  if (!ret_ok) {
-    MS_LOG(EXCEPTION) << "Run task error!";
-  }
-  MS_LOG(INFO) << "Finish!";
-}
 bool AscendSession::GraphCacheExist(const GraphInfo &graph_info) const {
   return run_op_graphs_.find(graph_info) != run_op_graphs_.end();
 }
@@ -398,7 +387,7 @@ void AscendSession::RunOp(const OpRunInfo &op_run_info, const GraphInfo &graph_i
   // load input data to device
   LoadInputData(graph, input_tensors);
   // run op
-  RunOpExecTask(graph);
+  Execute(graph, false);
   // get output
   if (op_run_info.value != nullptr) {
     std::vector<tensor::TensorPtr> pre_output_tensors;
@@ -552,21 +541,30 @@ void AscendSession::RunOpMemoryClear(const KernelGraph *kernel_graph) const {
 void AscendSession::Load(const std::shared_ptr<KernelGraph> &kernel_graph) const {
   MS_LOG(INFO) << "Start!";
+  auto context_ptr = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(context_ptr);
+  bool is_task_sink = context_ptr->get_param<bool>(MS_CTX_ENABLE_TASK_SINK);
   (void)device::KernelAdjust::GetInstance().StepLoadCtrlInputs(kernel_graph);
   auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
   MS_EXCEPTION_IF_NULL(runtime_instance);
-  bool ret_ok = runtime_instance->Load(kernel_graph.get());
+  bool ret_ok = runtime_instance->Load(kernel_graph.get(), is_task_sink);
   if (!ret_ok) {
     MS_LOG(EXCEPTION) << "Load task error!";
   }
   MS_LOG(INFO) << "Finish!";
 }
-void AscendSession::Execute(const std::shared_ptr<KernelGraph> &kernel_graph) const {
+void AscendSession::Execute(const std::shared_ptr<KernelGraph> &kernel_graph, bool is_task) const {
   MS_LOG(INFO) << "Start!";
+  bool is_task_sink = false;
+  if (is_task) {
+    auto context_ptr = MsContext::GetInstance();
+    MS_EXCEPTION_IF_NULL(context_ptr);
+    is_task_sink = context_ptr->get_param<bool>(MS_CTX_ENABLE_TASK_SINK);
+  }
   auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
   MS_EXCEPTION_IF_NULL(runtime_instance);
-  bool ret_ok = runtime_instance->Run(kernel_graph.get());
+  bool ret_ok = runtime_instance->Run(kernel_graph.get(), is_task_sink);
   if (!ret_ok) {
     MS_LOG(EXCEPTION) << "run task error!";
   }
mindspore/ccsrc/backend/session/ascend_session.h

@@ -13,8 +13,10 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 #ifndef MINDSPORE_CCSRC_BACKEND_SESSION_ASCEND_SESSION_H
 #define MINDSPORE_CCSRC_BACKEND_SESSION_ASCEND_SESSION_H
 #include <unordered_map>
 #include <string>
 #include <memory>
@@ -82,13 +84,12 @@ class AscendSession : public SessionBasic {
                          KernelGraph *kernel_graph) const;
   void RunOpMemoryClear(const KernelGraph *kernel_graph) const;
   void Load(const std::shared_ptr<KernelGraph> &kernel_graph) const;
-  void Execute(const std::shared_ptr<KernelGraph> &kernel_graph) const;
+  void Execute(const std::shared_ptr<KernelGraph> &kernel_graph, bool is_task) const;
   void Dump(const std::shared_ptr<KernelGraph> &kernel_graph) const;
   void DumpAllGraphs(const std::vector<KernelGraphPtr> &all_graphs);
   void LoadTensor(const std::shared_ptr<KernelGraph> &kernel_graph) const;
   // below functions are used for run op
   void RunOpHardwareOptimize(const std::shared_ptr<session::KernelGraph> &kernel_graph) const;
-  void RunOpExecTask(const std::shared_ptr<KernelGraph> &kernel_graph) const;
   static void BackendOptimization(const std::vector<KernelGraphPtr> &all_graphs);
   static void LinkChildGraphs(NotNull<KernelGraphPtr> graph);
mindspore/ccsrc/backend/session/cpu_session.cc

@@ -118,7 +118,7 @@ void CPUSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::Ten
     debugger_->PreExecute(kernel_graph);
   }
 #endif
-  bool ret = runtime_.Run(kernel_graph.get());
+  bool ret = runtime_.Run(kernel_graph.get(), false);
   if (!ret) {
     MS_LOG(EXCEPTION) << "Run graph failed";
   }
mindspore/ccsrc/backend/session/gpu_session.cc

@@ -191,9 +191,9 @@ void GPUSession::Execute(const std::shared_ptr<KernelGraph> &kernel_graph) const
   auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
   MS_EXCEPTION_IF_NULL(runtime_instance);
 #ifdef ENABLE_DEBUGGER
-  if (!runtime_instance->Run(kernel_graph.get(), debugger_.get())) {
+  if (!runtime_instance->Run(kernel_graph.get(), false, debugger_.get())) {
 #else
-  if (!runtime_instance->Run(kernel_graph.get())) {
+  if (!runtime_instance->Run(kernel_graph.get(), false)) {
 #endif
     MS_LOG(EXCEPTION) << "GPU execute graph failed!";
   }
mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc

@@ -454,10 +454,7 @@ DeviceAddressPtr AscendKernelRuntime::CreateDeviceAddress(void *device_ptr, size
   return std::make_shared<AscendDeviceAddress>(device_ptr, device_size, format, type_id);
 }
-bool AscendKernelRuntime::Load(session::KernelGraph *graph) {
-  auto context_ptr = MsContext::GetInstance();
-  MS_EXCEPTION_IF_NULL(context_ptr);
-  bool is_task_sink = context_ptr->get_param<bool>(MS_CTX_ENABLE_TASK_SINK);
+bool AscendKernelRuntime::Load(session::KernelGraph *graph, bool is_task_sink) {
   if (!is_task_sink) {
     return true;
   }
@@ -609,17 +606,14 @@ void AscendKernelRuntime::DebugTaskIdName(GraphId graph_id) {
   }
 }
-bool AscendKernelRuntime::Run(session::KernelGraph *graph, Debugger *debugger) {
+bool AscendKernelRuntime::Run(session::KernelGraph *graph, bool is_task_sink, Debugger *debugger) {
   bool ret = false;
-  auto context_ptr = MsContext::GetInstance();
-  MS_EXCEPTION_IF_NULL(context_ptr);
 #if defined(_WIN32) || defined(_WIN64)
   auto start_time = std::chrono::steady_clock::now();
 #else
   struct timeval start_time, end_time;
   (void)gettimeofday(&start_time, nullptr);
 #endif
-  bool is_task_sink = context_ptr->get_param<bool>(MS_CTX_ENABLE_TASK_SINK);
   if (is_task_sink) {
     ret = RunTask(graph);
   } else {
mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h

@@ -44,8 +44,8 @@ class AscendKernelRuntime : public KernelRuntime {
   bool GenTask(const session::KernelGraph *graph);
   bool LoadTask(const session::KernelGraph *graph);
   bool RunTask(const session::KernelGraph *graph);
-  bool Load(session::KernelGraph *graph) override;
-  bool Run(session::KernelGraph *graph, Debugger *debugger = nullptr) override;
+  bool Load(session::KernelGraph *graph, bool is_task_sink) override;
+  bool Run(session::KernelGraph *graph, bool is_task_sink, Debugger *debugger = nullptr) override;
   void ClearGraphRuntimeResource(uint32_t graph_id, const std::vector<AnfNodePtr> &inputs,
                                  const std::unordered_set<ValueNodePtr> &value_nodes,
                                  const std::vector<CNodePtr> &execution_order) override;
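The matching +2/-2 hunk in runtime/device/kernel_runtime.h is not expanded on this page, but the base-class change can be inferred from the overrides above. A minimal sketch, assuming the rest of the class is unchanged and that Load keeps a no-op default (CPU does not override it):

// Hypothetical sketch of the kernel_runtime.h change; signatures inferred from
// the Ascend/CPU overrides shown in this commit, not copied from the real header.
namespace session { class KernelGraph; }
class Debugger;

class KernelRuntime {
 public:
  virtual ~KernelRuntime() = default;
  // is_task_sink is now supplied by the session layer instead of being read
  // from MsContext inside each device runtime.
  virtual bool Load(session::KernelGraph *graph, bool is_task_sink) { return true; }
  virtual bool Run(session::KernelGraph *graph, bool is_task_sink, Debugger *debugger = nullptr) = 0;
};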
mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.cc

@@ -287,7 +287,7 @@ void CPUKernelRuntime::DecreaseSummaryRefCount(const session::NamedSummaryOutput
   resource_manager_.DecreaseSummaryRefCount(summary_outputs);
 }
-bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph, Debugger *debugger) {
+bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph, bool is_task_sink, Debugger *debugger) {
   MS_EXCEPTION_IF_NULL(kernel_graph);
   resource_manager_.IncreaseAddressRefCount(kernel_graph);
mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.h

@@ -36,7 +36,7 @@ class CPUKernelRuntime : public KernelRuntime {
   ~CPUKernelRuntime() override = default;
   bool Init() override { return true; }
-  bool Run(session::KernelGraph *graph, Debugger *debugger = nullptr) override;
+  bool Run(session::KernelGraph *graph, bool is_task_sink, Debugger *debugger = nullptr) override;
   void AssignKernelAddress(session::KernelGraph *kernel_graph);
   void BindInputOutput(session::KernelGraph *kernel_graph, const std::vector<tensor::TensorPtr> &inputs,
                        VectorRef *outputs);
mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc

The diff viewer renders this file in full (+1150, -1150); the content as displayed:

/**
 * Copyright 2019 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "runtime/device/gpu/gpu_kernel_runtime.h"
#include <algorithm>
#include "runtime/device/gpu/gpu_device_address.h"
#include "runtime/device/gpu/cuda_driver.h"
#include "runtime/device/gpu/gpu_buffer_mgr.h"
#include "runtime/device/gpu/gpu_device_manager.h"
#include "runtime/device/gpu/gpu_memory_allocator.h"
#include "runtime/device/gpu/distribution/collective_init.h"
#include "utils/convert_utils.h"
#include "utils/ms_context.h"
#include "runtime/device/kernel_runtime_manager.h"
#include "runtime/device/gpu/gpu_common.h"
#include "utils/ms_utils.h"
#include "runtime/device/gpu/gpu_memory_manager.h"
#include "backend/kernel_compiler/common_utils.h"
#include "runtime/device/gpu/gpu_memory_copy_manager.h"
#include "common/trans.h"
#include "ir/dtype.h"
#include "profiler/device/gpu/gpu_profiling.h"
#include "utils/shape_utils.h"
#ifdef ENABLE_DEBUGGER
#include "debug/debug_services.h"
#endif

namespace mindspore {
namespace device {
namespace gpu {
using mindspore::device::memswap::MemSwapInfoSet;
using mindspore::device::memswap::MemSwapManager;
using mindspore::device::memswap::SwapKind;
static const size_t PARAMETER_OUTPUT_INDEX = 0;

bool GPUKernelRuntime::SyncStream() { return GPUDeviceManager::GetInstance().SyncStream(stream_); }

bool GPUKernelRuntime::Init() {
  if (device_init_ == true) {
    GPUMemoryAllocator::GetInstance().CheckMaxDeviceMemory();
    return true;
  }
  bool ret = false;
#ifdef ENABLE_DUMP_E2E
  ret = SetDumpConf();
  if (!ret) {
    MS_LOG(INFO) << "No dump conf to set!";
  }
#endif
  ret = InitDevice();
  if (!ret) {
    MS_LOG(ERROR) << "InitDevice error.";
    return ret;
  }
  mem_manager_ = std::make_shared<GPUMemoryManager>();
  MS_EXCEPTION_IF_NULL(mem_manager_);
  mem_manager_->MallocDeviceMemory();
  const void *collective_handle_ = CollectiveInitializer::instance().collective_handle();
  bool collective_inited = CollectiveInitializer::instance().collective_inited();
  if (collective_inited && collective_handle_ != nullptr) {
    auto init_nccl_comm_funcptr =
      reinterpret_cast<InitNCCLComm>(dlsym(const_cast<void *>(collective_handle_), "InitNCCLComm"));
    MS_EXCEPTION_IF_NULL(init_nccl_comm_funcptr);
    (*init_nccl_comm_funcptr)();
  }
  device_init_ = true;
  return ret;
}

#ifdef ENABLE_DUMP_E2E
namespace {
void DumpOutput(mindspore::session::KernelGraph *graph, const string &dump_path, DumpConfPtr dump_conf,
                Debugger *debugger) {
  MS_EXCEPTION_IF_NULL(graph);
  MS_EXCEPTION_IF_NULL(dump_conf);
  bool trans_flag = dump_conf->trans_flag();
  const auto &apply_kernels = graph->execution_order();
  for (const auto &node : apply_kernels) {
    MS_EXCEPTION_IF_NULL(node);
    auto node_name = AnfAlgo::GetCNodeName(node);
    std::string kernel_name = node->fullname_with_scope();
    if (!dump_conf->IsKernelNeedDump(kernel_name)) {
      continue;
    }
    const std::string strsrc = "/";
    const std::string strdst = "--";
    std::string::size_type pos = 0;
    std::string::size_type srclen = strsrc.size();
    std::string::size_type dstlen = strdst.size();
    while ((pos = kernel_name.find(strsrc, pos)) != std::string::npos) {
      kernel_name.replace(pos, srclen, strdst);
      pos += dstlen;
    }
    auto output_size = AnfAlgo::GetOutputTensorNum(node);
    for (size_t j = 0; j < output_size; ++j) {
      auto addr = AnfAlgo::GetOutputAddr(node, j);
      TypeId addr_type_id = addr->type_id();
      std::string addr_format = addr->format();
      ShapeVector int_shapes;
      if (trans_flag) {
        int_shapes = trans::GetRuntimePaddingShape(node, j);
      } else {
        auto shape = AnfAlgo::GetOutputDeviceShape(node, j);
        (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
                             [](size_t inner_item) { return SizeToInt(inner_item); });
      }
      auto type = AnfAlgo::GetOutputInferDataType(node, j);
      auto format = kOpFormat_DEFAULT;
      string filepath = dump_path + '/' + kernel_name + '_' + "output_" + std::to_string(j);
      DebugServices *debug_services = debugger->debug_services();
      TensorLoader *tensor_loader = debug_services->tensor_loader();
      std::string original_kernel_name = node->fullname_with_scope();
      size_t slot = j;
      auto ret = tensor_loader->DumpTensorToFile(original_kernel_name, trans_flag, filepath, format, int_shapes, type,
                                                 addr_type_id, addr_format, slot);
      if (!ret) {
        std::string error = "DumpTensorToFile Failed: flag:" + std::to_string(trans_flag) + ", path:" + filepath +
                            ", host_format:" + format + ".!";
      }
    }
  }
}

void DumpParameters(mindspore::session::KernelGraph *graph, const string &dump_path, DumpConfPtr dump_conf,
                    Debugger *debugger) {
  MS_EXCEPTION_IF_NULL(graph);
  MS_EXCEPTION_IF_NULL(dump_conf);
  bool trans_flag = dump_conf->trans_flag();
  const auto &parameters = graph->inputs();
  for (auto &item : parameters) {
    if (!item->isa<Parameter>()) {
      continue;
    }
    std::string parameter_name = item->fullname_with_scope();
    if (!dump_conf->IsKernelNeedDump(parameter_name)) {
      continue;
    }
    auto addr = AnfAlgo::GetOutputAddr(item, PARAMETER_OUTPUT_INDEX);
    TypeId addr_type_id = addr->type_id();
    std::string addr_format = addr->format();
    ShapeVector int_shapes;
    if (trans_flag) {
      int_shapes = trans::GetRuntimePaddingShape(item, PARAMETER_OUTPUT_INDEX);
    } else {
      auto shape = AnfAlgo::GetOutputDeviceShape(item, PARAMETER_OUTPUT_INDEX);
      (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
                           [](size_t inner_item) { return SizeToInt(inner_item); });
    }
    auto type = AnfAlgo::GetOutputInferDataType(item, PARAMETER_OUTPUT_INDEX);
    auto format = kOpFormat_DEFAULT;
    string filepath = dump_path + '/' + parameter_name + '_' + "output_0";
    DebugServices *debug_services = debugger->debug_services();
    TensorLoader *tensor_loader = debug_services->tensor_loader();
    std::string original_kernel_name = parameter_name;
    size_t slot = 0;
    auto ret = tensor_loader->DumpTensorToFile(original_kernel_name, trans_flag, filepath, format, int_shapes, type,
                                               addr_type_id, addr_format, slot);
    if (!ret) {
      std::string error = "DumpTensorToFile Failed: flag:" + std::to_string(trans_flag) + ", path:" + filepath +
                          ", host_format:" + format + ".!";
    }
  }
}
}  // namespace

bool GPUKernelRuntime::DumpData(mindspore::session::KernelGraph *graph, Debugger *debugger) {
  MS_EXCEPTION_IF_NULL(graph);
  MS_LOG(INFO) << "Start dump step";
  DumpConfPtr dump_conf = GetDumpConf();
  MS_EXCEPTION_IF_NULL(dump_conf);
  dump_conf->UpdataCurIter();
  bool dump_flag = dump_conf->dump_enable();
  if (!dump_flag) {
    MS_LOG(INFO) << "Dump flag is disable, pass dump step";
    return true;
  }
  uint32_t cur_iter = dump_conf->cur_iter();
  if (dump_conf->dump_iter() != 0) {
    if (cur_iter != dump_conf->dump_iter()) {
      return true;
    }
  }
  MS_LOG(INFO) << "Cur iter is " << cur_iter;
  std::string net_name = dump_conf->dump_net_name();
  std::string iterator = std::to_string(cur_iter);
  std::string dump_path = dump_conf->dump_path();
  if (dump_path.back() == '/') {
    dump_path = dump_path + net_name + '/' + iterator;
  } else {
    dump_path = dump_path + '/' + net_name + '/' + iterator;
  }
  // dump output
  DumpOutput(graph, dump_path, dump_conf, debugger);
  // dump parameters
  DumpParameters(graph, dump_path, dump_conf, debugger);
  return true;
}
#endif

#ifdef ENABLE_DEBUGGER
namespace {
void LoadKernelData(Debugger *debugger, const CNodePtr &kernel,
                    const std::vector<mindspore::kernel::AddressPtr> &kernel_inputs,
                    const std::vector<mindspore::kernel::AddressPtr> &kernel_workspaces,
                    const std::vector<mindspore::kernel::AddressPtr> &kernel_outputs, int exec_order, void *stream_ptr,
                    bool dump_enabled) {
  // check if we should read the kernel data
  bool read_data = false;
  std::string kernel_name = kernel->fullname_with_scope();
  if (debugger) {
    debugger->SetCurNode(kernel_name);
    if (dump_enabled) {
      read_data = true;
    } else if (debugger->debugger_enabled()) {
      read_data = debugger->ReadNodeDataRequired();
    }
  }
  if (!read_data) {
    return;
  }
  // get inputs
  auto input_size = AnfAlgo::GetInputTensorNum(kernel);
  for (size_t j = 0; j < input_size; ++j) {
    auto input_kernel = kernel->input(j + 1);
    std::string input_kernel_name = input_kernel->fullname_with_scope();
    auto addr = kernel_inputs[j];
    auto type = AnfAlgo::GetOutputInferDataType(input_kernel, PARAMETER_OUTPUT_INDEX);
    auto format = kOpFormat_DEFAULT;
    auto gpu_addr = std::make_unique<GPUDeviceAddress>(addr->addr, addr->size, format, type);
    string input_tensor_name = input_kernel_name + ':' + "0";
    ShapeVector int_shapes;
    auto shape = AnfAlgo::GetOutputDeviceShape(input_kernel, PARAMETER_OUTPUT_INDEX);
    (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
                         [](size_t inner_item) { return SizeToInt(inner_item); });
    auto ret = gpu_addr->LoadMemToHost(input_tensor_name, exec_order, format, int_shapes, type, 0, debugger, false);
    if (!ret) {
      MS_LOG(ERROR) << "LoadMemToHost:"
                    << ", tensor_name:" << input_tensor_name << ", host_format:" << format << ".!";
    }
  }
  // get outputs
  auto output_size = AnfAlgo::GetOutputTensorNum(kernel);
  for (size_t j = 0; j < output_size; ++j) {
    auto addr = kernel_outputs[j];
    auto type = AnfAlgo::GetOutputInferDataType(kernel, j);
    auto format = kOpFormat_DEFAULT;
    auto gpu_addr = std::make_unique<GPUDeviceAddress>(addr->addr, addr->size, format, type);
    string tensor_name = kernel_name + ':' + std::to_string(j);
    ShapeVector int_shapes;
    auto shape = AnfAlgo::GetOutputDeviceShape(kernel, j);
    (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
                         [](size_t inner_item) { return SizeToInt(inner_item); });
    auto ret = gpu_addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, j, debugger, false);
    if (!ret) {
      MS_LOG(ERROR) << "LoadMemToHost:"
                    << ", tensor_name:" << tensor_name << ", host_format:" << format << ".!";
    }
  }
  debugger->PostExecuteNode();
}

void UpdateStepNum(Debugger *debugger, bool dump_enabled) {
  if (debugger && (debugger->debugger_enabled() || dump_enabled)) {
    auto cur_step_num = debugger->step_num();
    cur_step_num = cur_step_num + 1;
    debugger->SetStepNum(cur_step_num);
  }
}

void LoadParameters(const session::KernelGraph *graph, Debugger *debugger, bool dump_enabled) {
  MS_EXCEPTION_IF_NULL(graph);
  if (!(debugger && dump_enabled)) {
    return;
  }
  const auto &parameters = graph->inputs();
  // for parameters, set its execution order to be 0;
  int exec_order = 0;
  for (auto &item : parameters) {
    if (!item->isa<Parameter>()) {
      continue;
    }
    std::string parameter_name = item->fullname_with_scope();
    auto addr = AnfAlgo::GetOutputAddr(item, PARAMETER_OUTPUT_INDEX);
    auto type = AnfAlgo::GetOutputInferDataType(item, PARAMETER_OUTPUT_INDEX);
    auto format = kOpFormat_DEFAULT;
    string tensor_name = parameter_name + ':' + "0";
    auto gpu_addr = dynamic_cast<const mindspore::device::gpu::GPUDeviceAddress *>(addr);
    ShapeVector int_shapes;
    auto shape = AnfAlgo::GetOutputDeviceShape(item, PARAMETER_OUTPUT_INDEX);
    (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
                         [](size_t inner_item) { return SizeToInt(inner_item); });
    auto ret = gpu_addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, 0, debugger, true);
    if (!ret) {
      MS_LOG(ERROR) << "LoadMemToHost:"
                    << ", tensor_name:" << tensor_name << ", host_format:" << format << ".!";
    }
  }
}

void ClearCurrentData(Debugger *debugger, bool dump_enabled) {
  if (debugger && (debugger->debugger_enabled() || dump_enabled)) {
    DebugServices *debug_services = debugger->debug_services();
    TensorLoader *tensor_loader = debug_services->tensor_loader();
    tensor_loader->EmptyCurrentTensor();
  }
}
}  // namespace
#endif

DeviceAddressPtr GPUKernelRuntime::CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
                                                       TypeId type_id) {
  return std::make_shared<GPUDeviceAddress>(device_ptr, device_size, format, type_id);
}

bool GPUKernelRuntime::InitDevice() {
  if (GPUDeviceManager::GetInstance().device_count() <= 0) {
    MS_LOG(ERROR) << "No GPU device found.";
    return false;
  }
  const void *collective_handle_ = CollectiveInitializer::instance().collective_handle();
  bool collective_inited = CollectiveInitializer::instance().collective_inited();
  if (collective_inited && collective_handle_ != nullptr) {
    auto get_local_rank_funcptr =
      reinterpret_cast<GetLocalRankId>(dlsym(const_cast<void *>(collective_handle_), "local_rank_id"));
    MS_EXCEPTION_IF_NULL(get_local_rank_funcptr);
    device_id_ = IntToUint((*get_local_rank_funcptr)());
  }
  if (!GPUDeviceManager::GetInstance().is_device_id_init()) {
    if (!GPUDeviceManager::GetInstance().set_cur_device_id(device_id_)) {
      MS_LOG(ERROR) << "Failed to set current device to " << SizeToInt(device_id_);
      return false;
    }
  }
  GPUDeviceManager::GetInstance().InitDevice();
  stream_ = GPUDeviceManager::GetInstance().default_stream();
  if (stream_ == nullptr) {
    MS_LOG(ERROR) << "No default CUDA stream found.";
    return false;
  }
  return true;
}

void GPUKernelRuntime::ReleaseDeviceRes() {
  // For dataset mode.
  if (GpuBufferMgr::GetInstance().IsInit()) {
    if (!GpuBufferMgr::GetInstance().IsClosed()) {
      if (!GpuBufferMgr::GetInstance().CloseNotify()) {
        MS_LOG(EXCEPTION) << "Could not close gpu data queue.";
      }
    }
    CHECK_OP_RET_WITH_EXCEPT(GpuBufferMgr::GetInstance().Destroy(), "Could not destroy gpu data queue.");
  }
  // Destroy remaining memory swap events and free host memory.
  for (auto &item : mem_swap_map_) {
    auto &mem_swap_manager = item.second;
    MS_EXCEPTION_IF_NULL(mem_swap_manager);
    if (mem_swap_manager->trigger_swap()) {
      mem_swap_manager->ClearSwapQueue(false);
      mem_swap_manager->ReleaseHostPinnedMem();
    }
  }
  GPUDeviceManager::GetInstance().ReleaseDevice();
  if (mem_manager_ != nullptr) {
    mem_manager_->FreeDeviceMemory();
  }
  kernel::KernelMeta *bin_map = kernel::KernelMeta::GetInstance();
  MS_EXCEPTION_IF_NULL(bin_map);
  bin_map->RemoveKernelCache();
}

void GPUKernelRuntime::ClearGraphRuntimeResource(uint32_t graph_id, const std::vector<AnfNodePtr> &inputs,
                                                 const std::unordered_set<ValueNodePtr> &value_nodes,
                                                 const std::vector<CNodePtr> &execution_order) {
  MS_LOG(INFO) << "Clear graph:" << graph_id << " GPU runtime resource";
  // Release the kernel resource.
  for (const auto &kernel : execution_order) {
    auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
    if (kernel_mod == nullptr) {
      continue;
    }
    kernel_mod->ReleaseResource();
  }
  // Clear the output address of graph.
  ClearOutputAddress(inputs, value_nodes, execution_order);
}

void GPUKernelRuntime::AssignMemory(session::KernelGraph *graph) {
  auto context_ptr = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context_ptr);
  MS_EXCEPTION_IF_NULL(mem_manager_);
  mem_manager_->ResetDynamicMemory();
  AssignStaticMemoryInput(graph);
  AssignStaticMemoryValueNode(graph);
  bool is_enable_dynamic_mem = context_ptr->get_param<bool>(MS_CTX_ENABLE_DYNAMIC_MEM_POOL);
  if (is_enable_dynamic_mem) {
    // Use the dynamic memory pool.
    InitKernelRefCount(graph);
    InitMemorySwapInfo(graph);
    InitKernelOutputAddress(graph);
    InitKernelWorkspaceAddress(graph);
    SaveGraphOutputNode(graph);
  } else {
    AssignDynamicMemory(graph);
  }
}

bool GPUKernelRuntime::Run(session::KernelGraph *graph, Debugger *debugger) {
  struct timeval start_time, end_time;
  (void)gettimeofday(&start_time, nullptr);
  bool ret = true;
  auto context_ptr = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context_ptr);
  bool is_enable_dynamic_mem = context_ptr->get_param<bool>(MS_CTX_ENABLE_DYNAMIC_MEM_POOL);
  bool is_enable_pynative_infer = context_ptr->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_INFER);
  if (is_enable_dynamic_mem && !is_enable_pynative_infer) {
    auto graph_id = graph->graph_id();
    auto iter = mem_swap_map_.find(graph_id);
    if (iter == mem_swap_map_.end()) {
      MS_LOG(EXCEPTION) << "Find memory swap map failed.";
    }
    mem_swap_manager_ = iter->second;
    MS_EXCEPTION_IF_NULL(mem_swap_manager_);
    auto mem_reuse_iter = mem_reuse_util_map_.find(graph_id);
    if (mem_reuse_iter == mem_reuse_util_map_.end()) {
      MS_LOG(EXCEPTION) << "Find memory reuse map failed.";
    }
    mem_reuse_util_ = mem_reuse_iter->second;
    MS_EXCEPTION_IF_NULL(mem_reuse_util_);
    ret = RunOneStep(graph, debugger);
  } else {
    ret = LaunchKernel(graph);
  }
  (void)gettimeofday(&end_time, nullptr);
  const uint64_t kUSecondInSecond = 1000000;
  uint64_t cost = kUSecondInSecond * static_cast<uint64_t>(end_time.tv_sec - start_time.tv_sec);
  cost += static_cast<uint64_t>(end_time.tv_usec - start_time.tv_usec);
  MS_LOG(DEBUG) << "GPU kernel runtime run graph in " << cost << " us";
  return ret;
}

bool GPUKernelRuntime::RunOneStep(const session::KernelGraph *graph, Debugger *debugger) {
  bool ret = true;
  auto graph_id = graph->graph_id();
  if (!is_first_step_map_[graph_id]) {
    // Normally run graph
    ret = LaunchKernelDynamic(graph, debugger);
  } else {
    // Mock run first step
    ret = LaunchKernelDynamic(graph, debugger, true, false);
    if (ret) {
      // Normally run graph
      ret = LaunchKernelDynamic(graph, debugger);
    } else {
      // Trigger memory swap
      ret = SearchMemSwapScheme(graph, debugger);
    }
    is_first_step_map_[graph_id] = false;
  }
  return ret;
}

bool GPUKernelRuntime::SearchMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger) {
  MS_LOG(WARNING) << "Run out of memory and try memory swapping, it may take some time, please wait a moment.";
  bool ret = false;
  ClearKernelOldOutputAndWorkspace(graph);
  if (!mem_swap_manager_->mem_swap_init()) {
    if (!mem_swap_manager_->Init(graph)) {
      return false;
    }
  }
  while (!ret) {
    if (!mem_swap_manager_->RetreatSwapInfo()) {
      return false;
    }
    ret = LaunchKernelDynamic(graph, debugger, true, false);
    if (!ret) {
      ClearKernelOldOutputAndWorkspace(graph);
    }
  }
  mem_swap_manager_->AssignHostMemory();
  // Time profiling
  ret = LaunchKernelDynamic(graph, debugger, false, true);
  if (!ret) {
    return ret;
  }
  return RefineMemSwapScheme(graph, debugger);
}

bool GPUKernelRuntime::RefineMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger) {
  MS_LOG(WARNING) << "Refine memory swap scheme, it may take some time, please wait a moment.";
  auto &kernels = graph->execution_order();
  for (const auto &kernel : kernels) {
    if (!mem_swap_manager_->QueryKernelTriggerSwapIn(kernel)) {
      continue;
    }
    size_t swap_in_task_num = mem_swap_manager_->QueryKernelTriggerSwapInTaskNum(kernel);
    for (size_t swap_in_task_idx = 0; swap_in_task_idx < swap_in_task_num; swap_in_task_idx++) {
      bool ret = false;
      while (!ret) {
        mem_swap_manager_->AdjustSwapInPos(kernel, swap_in_task_idx);
        ret = LaunchKernelDynamic(graph, debugger, true, false);
        if (!ret) {
          ClearKernelOldOutputAndWorkspace(graph);
          ClearSwapInfo(true);
        }
      }
    }
  }
  return true;
}

void GPUKernelRuntime::InitKernelRefCount(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  MemReuseUtilPtr mem_reuse_util_ptr = std::make_shared<memreuse::MemReuseUtil>();
  MS_EXCEPTION_IF_NULL(mem_reuse_util_ptr);
  // Init the kernel reference count.
  if (!mem_reuse_util_ptr->InitDynamicKernelRef(graph)) {
    MS_LOG(EXCEPTION) << "Init kernel reference count failed";
  }
  mem_reuse_util_ptr->SetKernelDefMap();
  mem_reuse_util_ptr->SetReuseRefCount();
  // Can't free the device address of graph output, so set the reference count of graph output specially.
  mem_reuse_util_ptr->SetGraphOutputRefCount();
  // Can't free the device address of summary nodes, so set the reference count of summary nodes specially.
  mem_reuse_util_ptr->SetSummaryNodesRefCount();
  auto graph_id = graph->graph_id();
  mem_reuse_util_map_[graph_id] = mem_reuse_util_ptr;
}

void GPUKernelRuntime::InitMemorySwapInfo(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  GPUMemCopyManagerPtr gpu_mem_copy_manager = std::make_shared<GPUMemCopyManager>();
  MS_EXCEPTION_IF_NULL(gpu_mem_copy_manager);
  MemSwapManagerPtr mem_swap_manager = std::make_shared<MemSwapManager>(gpu_mem_copy_manager);
  MS_EXCEPTION_IF_NULL(mem_swap_manager);
  auto graph_id = graph->graph_id();
  mem_swap_map_[graph_id] = mem_swap_manager;
  is_first_step_map_[graph_id] = true;
}

void GPUKernelRuntime::InitKernelOutputAddress(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  auto &kernels = graph->execution_order();
  for (const auto &kernel : kernels) {
    auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
    MS_EXCEPTION_IF_NULL(kernel_mod);
    auto output_sizes = kernel_mod->GetOutputSizeList();
    for (size_t i = 0; i < output_sizes.size(); ++i) {
      if (AnfAlgo::OutputAddrExist(kernel, i)) {
        continue;
      }
      std::string output_format = AnfAlgo::GetOutputFormat(kernel, i);
      auto output_type = AnfAlgo::GetOutputDeviceDataType(kernel, i);
      auto device_address = CreateDeviceAddress(nullptr, output_sizes[i], output_format, output_type);
      AnfAlgo::SetOutputAddr(device_address, i, kernel.get());
    }
  }
}

void GPUKernelRuntime::InitKernelWorkspaceAddress(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  auto &kernels = graph->execution_order();
  for (const auto &kernel : kernels) {
    auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
    MS_EXCEPTION_IF_NULL(kernel_mod);
    auto workspace_sizes = kernel_mod->GetWorkspaceSizeList();
    for (size_t i = 0; i < workspace_sizes.size(); ++i) {
      auto device_address = CreateDeviceAddress(nullptr, workspace_sizes[i], "", kTypeUnknown);
      AnfAlgo::SetWorkspaceAddr(device_address, i, kernel.get());
    }
  }
}

void GPUKernelRuntime::SaveGraphOutputNode(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  auto graph_id = graph->graph_id();
  const auto &output_nodes = AnfAlgo::GetAllOutput(graph->output(), {prim::kPrimTupleGetItem});
  for (const auto &node : output_nodes) {
    graph_output_map_[graph_id].insert(node);
  }
}

bool GPUKernelRuntime::IsGraphOutput(const session::KernelGraph *graph, const mindspore::AnfNodePtr &kernel) const {
  MS_EXCEPTION_IF_NULL(graph);
  auto graph_id = graph->graph_id();
  auto iter = graph_output_map_.find(graph_id);
  if (iter == graph_output_map_.end()) {
    MS_LOG(EXCEPTION) << "Find graph output info failed.";
  }
  auto &graph_output_set = iter->second;
  return (graph_output_set.find(kernel) != graph_output_set.end());
}

void GPUKernelRuntime::ClearKernelOldOutputAndWorkspace(const session::KernelGraph *graph) {
  ClearKernelOutputAddress(graph);
  ClearKernelWorkspaceAddress(graph);
}

void GPUKernelRuntime::ClearKernelOutputAddress(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  auto &kernels = graph->execution_order();
  for (const auto &kernel : kernels) {
    if (IsGraphOutput(graph, kernel)) {
      continue;
    }
    auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
    MS_EXCEPTION_IF_NULL(kernel_mod);
    auto output_sizes = kernel_mod->GetOutputSizeList();
    for (size_t i = 0; i < output_sizes.size(); ++i) {
      if (!AnfAlgo::OutputAddrExist(kernel, i)) {
        continue;
      }
      auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i, false);
      MS_EXCEPTION_IF_NULL(device_address);
      if (device_address->ptr_) {
        mem_manager_->FreeMemFromMemPool(device_address);
      }
      device_address->set_status(DeviceAddressStatus::kInDevice);
    }
  }
}

void GPUKernelRuntime::ClearKernelWorkspaceAddress(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  auto &kernels = graph->execution_order();
  for (const auto &kernel : kernels) {
    auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
    MS_EXCEPTION_IF_NULL(kernel_mod);
    auto workspace_sizes = kernel_mod->GetWorkspaceSizeList();
    for (size_t i = 0; i < workspace_sizes.size(); ++i) {
      auto device_address = AnfAlgo::GetMutableWorkspaceAddr(kernel, i);
      MS_EXCEPTION_IF_NULL(device_address);
      if (device_address->ptr_) {
        mem_manager_->FreeMemFromMemPool(device_address);
      }
    }
  }
}

bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, Debugger *debugger, bool mock,
                                           bool profiling) {
  MS_EXCEPTION_IF_NULL(graph);
  MS_EXCEPTION_IF_NULL(mem_reuse_util_);
  // Reset the reference count.
  mem_reuse_util_->ResetDynamicUsedRefCount();
  // The inputs and outputs memory of communication kernel need be continuous, so separate processing.
  AllocCommunicationOpDynamicRes(graph);
#ifdef ENABLE_DEBUGGER
  bool dump_enabled = GPUKernelRuntime::DumpDataEnabledIteration();
  if (!mock) {
    UpdateStepNum(debugger, dump_enabled);
  }
#endif
  auto &kernels = graph->execution_order();
  int exec_order = 1;
  auto profiler_inst = profiler::gpu::GPUProfiler::GetInstance();
  MS_EXCEPTION_IF_NULL(profiler_inst);
  for (const auto &kernel : kernels) {
    auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
    MS_EXCEPTION_IF_NULL(kernel_mod);
    AddressPtrList kernel_inputs;
    AddressPtrList kernel_workspaces;
    AddressPtrList kernel_outputs;
    auto ret = AllocKernelDynamicRes(*kernel_mod, kernel, &kernel_inputs, &kernel_workspaces, &kernel_outputs, mock);
    if (!ret) {
#ifdef ENABLE_DEBUGGER
      if (!mock) {
        // invalidate current data collected by the debugger
        ClearCurrentData(debugger, dump_enabled);
      }
#endif
      return false;
    }
    if (!mock) {
      if (!profiling) {
        if (profiler_inst->GetEnableFlag()) {
          profiler_inst->OpDataProducerBegin(kernel->fullname_with_scope(), stream_);
        }
        CHECK_OP_RET_WITH_EXCEPT(kernel_mod->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, stream_),
                                 "Launch kernel failed.");
        if (profiler_inst->GetEnableFlag()) {
          profiler_inst->OpDataProducerEnd();
          if (profiler_inst->GetSyncEnableFlag()) {
            CHECK_OP_RET_WITH_ERROR(SyncStream(), "Profiler SyncStream failed.");
          }
        }
      } else {
        LaunchKernelWithTimeProfiling(kernel, kernel_inputs, kernel_workspaces, kernel_outputs);
      }
#ifdef ENABLE_DEBUGGER
      // called once per kernel to collect the outputs to the kernel (does a SyncDeviceToHost)
      LoadKernelData(debugger, kernel, kernel_inputs, kernel_workspaces, kernel_outputs, exec_order, stream_,
                     dump_enabled);
#endif
    }
    exec_order = exec_order + 1;
    FreeKernelDynamicRes(kernel);
    if (!UpdateMemorySwapTask(kernel, mock, profiling)) {
#ifdef ENABLE_DEBUGGER
      if (!mock) {
        // invalidate current data collected by the debugger
        ClearCurrentData(debugger, dump_enabled);
      }
#endif
      return false;
    }
  }
  if (!mock) {
#ifdef ENABLE_DEBUGGER
    // collect weights and bias for dump mode
    LoadParameters(graph, debugger, dump_enabled);
#endif
    CHECK_OP_RET_WITH_EXCEPT(SyncStream(), "SyncStream failed.");
  }
  ClearSwapInfo(mock);
  return true;
}

void GPUKernelRuntime::LaunchKernelWithTimeProfiling(const AnfNodePtr &kernel, const AddressPtrList &inputs,
                                                     const AddressPtrList &workspace, const AddressPtrList &outputs) {
  auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
  MS_EXCEPTION_IF_NULL(kernel_mod);
  float cost_time = 0;
  DeviceEvent start = nullptr;
  DeviceEvent end = nullptr;
  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::CreateEvent(&start), "Failed to create event.");
  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::CreateEvent(&end), "Failed to create event.");

  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::RecordEvent(start, stream_), "Failed to record event to stream.");
  CHECK_OP_RET_WITH_EXCEPT(kernel_mod->Launch(inputs, workspace, outputs, stream_), "Launch kernel failed.");
  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::RecordEvent(end, stream_), "Failed to record event to stream.");

  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::SyncEvent(start), "Failed to sync event.");
  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::SyncEvent(end), "Failed to sync event.");
  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::ElapsedTime(&cost_time, start, end), "Failed to record elapsed time.");

  mem_swap_manager_->AddKernelExecutionPerform(kernel, cost_time);

  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::DestroyEvent(start), "Failed to destroy event.");
  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::DestroyEvent(end), "Failed to destroy event.");
}

bool GPUKernelRuntime::AddMemorySwapTask(const AnfNodePtr &kernel, bool mock, bool profiling) {
  MS_EXCEPTION_IF_NULL(mem_swap_manager_);
  const MemSwapInfoSet &mem_swap_info_set = mem_swap_manager_->QueryKernelMemSwapInfo(kernel);
  for (auto &mem_swap_info : mem_swap_info_set) {
    auto need_swap_kernel = mem_swap_manager_->QueryKernelByTopoOrder(mem_swap_info.topo_order_);
    MS_EXCEPTION_IF_NULL(need_swap_kernel);
    const HostAddress &host_address =
      mem_swap_manager_->QueryKernelHostAddr(need_swap_kernel, mem_swap_info.output_idx_);
    auto device_address = AnfAlgo::GetMutableOutputAddr(need_swap_kernel, mem_swap_info.output_idx_, false);

    if (mem_swap_info.swap_kind_ == SwapKind::kDeviceToHost) {
      if (mem_swap_manager_->QueryKernelHostAddrIsDirty(need_swap_kernel, mem_swap_info.output_idx_)) {
        mem_swap_manager_->AddMemSwapTask(SwapKind::kDeviceToHost, device_address, host_address, mock);
        mem_swap_manager_->AddKernelHostAddrIsDirty(need_swap_kernel, mem_swap_info.output_idx_, false);
      } else {
        mem_manager_->FreeMemFromMemPool(device_address);
        device_address->set_status(DeviceAddressStatus::kInHost);
      }
    } else if (mem_swap_info.swap_kind_ == SwapKind::kHostToDevice) {
      auto status = device_address->status();
      if (status == DeviceAddressStatus::kInDeviceToHost) {
        device_address->set_status(DeviceAddressStatus::kInDevice);
      } else if (status == DeviceAddressStatus::kInHost) {
        if (!device_address->ptr_ && !AttemptMallocMem(device_address, device_address->size_, mock)) {
          return false;
        }
        float cost_time = 0;
        mem_swap_manager_->AddMemSwapTask(SwapKind::kHostToDevice, device_address, host_address, mock, profiling,
                                          &cost_time);
        if (profiling) {
          mem_swap_manager_->AddKernelSwapPerform(need_swap_kernel, mem_swap_info.output_idx_,
                                                  std::make_pair(0, cost_time));
        }
      }
    }
  }
  return true;
}

bool GPUKernelRuntime::UpdateMemorySwapTask(const AnfNodePtr &kernel, bool mock, bool profiling) {
  MS_EXCEPTION_IF_NULL(mem_swap_manager_);
  if (!mem_swap_manager_->trigger_swap()) {
    return true;
  }
  if (mem_swap_manager_->QueryKernelTriggerSwap(kernel)) {
    if (!mock) {
      CHECK_OP_RET_WITH_EXCEPT(SyncStream(), "SyncStream failed.");
    }
    if (!AddMemorySwapTask(kernel, mock, profiling)) {
      return false;
    }
    if (!mock) {
      CHECK_OP_RET_WITH_EXCEPT(mem_swap_manager_->SyncMemCopyStream(SwapKind::kDeviceToHost), "SyncCopyStream failed.");
    }
  }
  return true;
}

void GPUKernelRuntime::UpdateHostSwapInQueue(const DeviceAddressPtr device_address, bool mock) {
  MS_EXCEPTION_IF_NULL(mem_swap_manager_);
  if (!mem_swap_manager_->trigger_swap()) {
    return;
  }
  while (auto device_address_swap_in = mem_swap_manager_->UpdateSwapQueue(SwapKind::kHostToDevice, mock)) {
    device_address_swap_in->set_status(DeviceAddressStatus::kInDevice);
  }

  auto status = device_address->status();
  switch (status) {
    case DeviceAddressStatus::kInDevice:
      break;
    case DeviceAddressStatus::kInDeviceToHost: {
      device_address->set_status(DeviceAddressStatus::kInDevice);
      break;
    }
    case DeviceAddressStatus::kInHostToDevice: {
      while (device_address->status() != DeviceAddressStatus::kInDevice) {
        while (auto device_address_swap_in = mem_swap_manager_->UpdateSwapQueue(SwapKind::kHostToDevice, mock)) {
          device_address_swap_in->set_status(DeviceAddressStatus::kInDevice);
        }
      }
      break;
    }
    case DeviceAddressStatus::kInHost:
      MS_LOG(WARNING) << "Unexpected device address status: " << status;
      break;
    default:
      MS_LOG(EXCEPTION) << "Invaild device address status: " << status;
  }
}

void GPUKernelRuntime::UpdateHostSwapOutQueue(bool mock) {
  MS_EXCEPTION_IF_NULL(mem_swap_manager_);
  if (!mem_swap_manager_->trigger_swap()) {
    return;
  }
  while (auto device_address_swap_out = mem_swap_manager_->UpdateSwapQueue(SwapKind::kDeviceToHost, mock)) {
    if (device_address_swap_out->status() == DeviceAddressStatus::kInDeviceToHost && device_address_swap_out->ptr_) {
      device_address_swap_out->set_status(DeviceAddressStatus::kInHost);
      mem_manager_->FreeMemFromMemPool(device_address_swap_out);
    }
  }
}

void GPUKernelRuntime::ClearSwapInfo(bool mock) {
  MS_EXCEPTION_IF_NULL(mem_swap_manager_);
  if (!mem_swap_manager_->trigger_swap()) {
    return;
  }
  mem_swap_manager_->ClearSwapQueue(mock);
  mem_swap_manager_->ResetHostAddrIsDirty();
}

bool GPUKernelRuntime::AttemptMallocMem(const DeviceAddressPtr &device_address, size_t size, bool mock) {
  MS_EXCEPTION_IF_NULL(mem_manager_);
  MS_EXCEPTION_IF_NULL(mem_swap_manager_);
  auto ret = mem_manager_->MallocMemFromMemPool(device_address, size);
  if (!ret) {
    if (!mem_swap_manager_->trigger_swap()) {
      return false;
    }
    if (!mock) {
      mem_swap_manager_->SyncMemCopyStream(SwapKind::kDeviceToHost);
    }
    UpdateHostSwapOutQueue(mock);
    ret = mem_manager_->MallocMemFromMemPool(device_address, size);
    if (!ret) {
      return false;
    }
  }
  return true;
}

bool GPUKernelRuntime::AllocKernelDynamicRes(const mindspore::kernel::KernelMod &kernel_mod,
                                             const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_inputs,
                                             AddressPtrList *kernel_workspaces, AddressPtrList *kernel_outputs,
                                             bool mock) {
  if (!AllocKernelInputDynamicRes(kernel, kernel_inputs, mock)) {
    return false;
  }
  if (!AllocKernelOutputDynamicRes(kernel_mod, kernel, kernel_outputs, mock)) {
    return false;
  }
  if (!AllocKernelWorkspaceDynamicRes(kernel_mod, kernel, kernel_workspaces, mock)) {
    return false;
  }
  return true;
}

bool GPUKernelRuntime::AllocKernelInputDynamicRes(const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_inputs,
                                                  bool mock) {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(kernel_inputs);
  MS_EXCEPTION_IF_NULL(mem_reuse_util_);
  for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) {
    DeviceAddressPtr device_address;
    if (mem_reuse_util_->is_all_nop_node()) {
      // Graph may be all nop nodes and not remove nop node, so this can not skip nop node.
      device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, false);
    } else {
      // Graph may be "nop node + depend + node", the input of node is the depend, so this case need skip nop node.
      device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, true);
    }
    MS_EXCEPTION_IF_NULL(device_address);
    UpdateHostSwapInQueue(device_address, mock);
    MS_EXCEPTION_IF_NULL(device_address->ptr_);
    kernel::AddressPtr input = std::make_shared<kernel::Address>();
    MS_EXCEPTION_IF_NULL(input);
    input->addr = device_address->ptr_;
    input->size = device_address->size_;
    kernel_inputs->emplace_back(input);
  }
  return true;
}

bool GPUKernelRuntime::AllocKernelOutputDynamicRes(const mindspore::kernel::KernelMod &kernel_mod,
                                                   const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_outputs,
                                                   bool mock) {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(kernel_outputs);
  UpdateHostSwapOutQueue(mock);
  auto output_sizes = kernel_mod.GetOutputSizeList();
  for (size_t i = 0; i < output_sizes.size(); ++i) {
    auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i, false);
    MS_EXCEPTION_IF_NULL(device_address);
    if (device_address->ptr_ == nullptr && !AttemptMallocMem(device_address, output_sizes[i], mock)) {
      return false;
    }
    kernel::AddressPtr output = std::make_shared<kernel::Address>();
    MS_EXCEPTION_IF_NULL(output);
    output->addr = device_address->ptr_;
    output->size = output_sizes[i];
    kernel_outputs->emplace_back(output);
  }
  return true;
}

bool GPUKernelRuntime::AllocKernelWorkspaceDynamicRes(const mindspore::kernel::KernelMod &kernel_mod,
                                                      const mindspore::AnfNodePtr &kernel,
                                                      AddressPtrList *kernel_workspaces, bool mock) {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(kernel_workspaces);
  auto workspace_sizes = kernel_mod.GetWorkspaceSizeList();
  for (size_t i = 0; i < workspace_sizes.size(); ++i) {
    if (workspace_sizes[i] == 0) {
      kernel_workspaces->emplace_back(nullptr);
      continue;
    }
    auto device_address = AnfAlgo::GetMutableWorkspaceAddr(kernel, i);
    if (device_address->ptr_ == nullptr && !AttemptMallocMem(device_address, workspace_sizes[i], mock)) {
      return false;
    }
    kernel::AddressPtr workspace = std::make_shared<kernel::Address>();
    MS_EXCEPTION_IF_NULL(workspace);
    workspace->addr = device_address->ptr_;
    workspace->size = workspace_sizes[i];
    kernel_workspaces->emplace_back(workspace);
  }
  return true;
}

void GPUKernelRuntime::AllocCommunicationOpDynamicRes(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  auto &kernels = graph->execution_order();
  for (auto &kernel : kernels) {
    MS_EXCEPTION_IF_NULL(kernel);
    if (AnfAlgo::IsCommunicationOp(kernel)) {
      AllocCommunicationOpInputDynamicRes(kernel);
      AllocCommunicationOpOutputDynamicRes(kernel);
    }
  }
}

void GPUKernelRuntime::AllocCommunicationOpInputDynamicRes(const mindspore::AnfNodePtr &kernel) {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(mem_reuse_util_);
  bool is_need_alloc_memory = false;
  bool is_need_free_memory = false;
  size_t total_size = 0;
  std::vector<size_t> size_list;
  DeviceAddressPtrList addr_list;
  auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
  MS_EXCEPTION_IF_NULL(kernel_mod);
  auto intput_sizes = kernel_mod->GetInputSizeList();
  for (size_t i = 0; i < intput_sizes.size(); ++i) {
    DeviceAddressPtr device_address;
    if (mem_reuse_util_->is_all_nop_node()) {
      // Graph may be all nop nodes and not remove nop node, so this can not skip nop node.
      device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, false);
    } else {
      // Graph may be "nop node + depend + node", the input of node is the depend, so this case need skip nop node.
      device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, true);
    }
    MS_EXCEPTION_IF_NULL(device_address);
    if (device_address->ptr_ == nullptr) {
      is_need_alloc_memory = true;
    } else {
      is_need_free_memory = true;
    }
    total_size += intput_sizes[i];
    size_list.emplace_back(intput_sizes[i]);
    addr_list.emplace_back(device_address);
  }
  AllocCommunicationOpMemory(is_need_alloc_memory, is_need_free_memory, addr_list, total_size, size_list);
}

void GPUKernelRuntime::AllocCommunicationOpOutputDynamicRes(const mindspore::AnfNodePtr &kernel) {
  MS_EXCEPTION_IF_NULL(kernel);
  bool is_need_alloc_memory = false;
  bool is_need_free_memory = false;
  size_t total_size = 0;
  std::vector<size_t> size_list;
  DeviceAddressPtrList addr_list;
  auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
  MS_EXCEPTION_IF_NULL(kernel_mod);
  auto output_sizes = kernel_mod->GetOutputSizeList();
  for (size_t i = 0; i < output_sizes.size(); ++i) {
    auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i, false);
    MS_EXCEPTION_IF_NULL(device_address);
    if (device_address->ptr_ == nullptr) {
      is_need_alloc_memory = true;
    } else {
      is_need_free_memory = true;
    }
    total_size += output_sizes[i];
    size_list.emplace_back(output_sizes[i]);
    addr_list.emplace_back(device_address);
  }
  AllocCommunicationOpMemory(is_need_alloc_memory, is_need_free_memory, addr_list, total_size, size_list);
}

void GPUKernelRuntime::AllocCommunicationOpMemory(bool is_need_alloc_memory, bool is_need_free_memory,
                                                  const DeviceAddressPtrList addr_list, size_t total_size,
                                                  std::vector<size_t> size_list) {
  MS_EXCEPTION_IF_NULL(mem_manager_);
  if (!is_need_alloc_memory) {
    return;
  }
  if (is_need_free_memory) {
    for (const auto &iter : addr_list) {
      MS_EXCEPTION_IF_NULL(iter);
      // Free the inputs/outputs of communication kernel which are not released.
      if (iter->ptr_ != nullptr) {
        mem_manager_->FreeMemFromMemPool(iter);
      }
    }
  }
  auto ret = mem_manager_->MallocContinuousMemFromMemPool(addr_list, total_size, size_list);
  if (!ret) {
    MS_LOG(EXCEPTION) << "Malloc device memory failed.";
  }
}

void GPUKernelRuntime::FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel) {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(mem_manager_);
  MS_EXCEPTION_IF_NULL(mem_reuse_util_);
  auto cnode = kernel->cast<CNodePtr>();
  MS_EXCEPTION_IF_NULL(cnode);
  if (AnfAlgo::IsCommunicationOp(kernel)) {
    return;
  }
  // Free the input of kernel by reference count.
  for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) {
    auto kernel_ref_count_ptr = mem_reuse_util_->GetKernelInputRef(cnode, i);
    if (kernel_ref_count_ptr == nullptr) {
      continue;
    }
    kernel_ref_count_ptr->ref_count_dynamic_use_--;
    if (kernel_ref_count_ptr->ref_count_dynamic_use_ < 0) {
      MS_LOG(EXCEPTION) << "Check dynamic reference count failed.";
    }
    if (kernel_ref_count_ptr->ref_count_dynamic_use_ == 0) {
      DeviceAddressPtr device_address;
      if (mem_reuse_util_->is_all_nop_node()) {
        // Graph may be all nop nodes and not remove nop node, so this can not skip nop node.
        device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, false);
      } else {
        // Graph may be "nop node + depend + node", the input of node is the depend, so this case need skip nop node.
        device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, true);
      }
      mem_manager_->FreeMemFromMemPool(device_address);
      device_address->set_status(DeviceAddressStatus::kInDevice);
    }
  }
  // Free the output of kernel, if output has no reference.
  for (size_t i = 0; i < AnfAlgo::GetOutputTensorNum(kernel); ++i) {
    auto kernel_ref_count_ptr = mem_reuse_util_->GetRef(cnode, i);
    if (kernel_ref_count_ptr == nullptr) {
      continue;
    }
    if (kernel_ref_count_ptr->ref_count_dynamic_use_ == 0) {
      auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i, false);
      mem_manager_->FreeMemFromMemPool(device_address);
      device_address->set_status(DeviceAddressStatus::kInDevice);
    }
  }
  // Free the workspace of kernel.
  auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
  MS_EXCEPTION_IF_NULL(kernel_mod);
  for (size_t i = 0; i < kernel_mod->GetWorkspaceSizeList().size(); ++i) {
    auto device_address = AnfAlgo::GetMutableWorkspaceAddr(kernel, i);
    MS_EXCEPTION_IF_NULL(device_address);
    if (device_address->ptr_) {
      mem_manager_->FreeMemFromMemPool(device_address);
    }
  }
}
}  // namespace gpu
}  // namespace device
}  // namespace mindspore
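A note on the control flow buried in this file: RunOneStep dry-runs a graph's first step with mock = true, which allocates memory without launching any kernels, and only searches for a memory-swap scheme if that probe fails. A minimal free-standing distillation of that logic (hypothetical sketch, not the member function itself):

#include <functional>

// Hypothetical distillation of GPUKernelRuntime::RunOneStep above: a mock
// launch probes whether device memory suffices before any kernel executes.
bool RunFirstStep(const std::function<bool(bool mock)> &launch_kernel_dynamic,
                  const std::function<bool()> &search_mem_swap_scheme) {
  if (launch_kernel_dynamic(/*mock=*/true)) {
    return launch_kernel_dynamic(/*mock=*/false);  // Memory fits: run normally.
  }
  return search_mem_swap_scheme();  // Out of memory: derive a swap schedule.
}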
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "runtime/device/gpu/gpu_kernel_runtime.h"
#include <algorithm>
#include "runtime/device/gpu/gpu_device_address.h"
#include "runtime/device/gpu/cuda_driver.h"
#include "runtime/device/gpu/gpu_buffer_mgr.h"
#include "runtime/device/gpu/gpu_device_manager.h"
#include "runtime/device/gpu/gpu_memory_allocator.h"
#include "runtime/device/gpu/distribution/collective_init.h"
#include "utils/convert_utils.h"
#include "utils/ms_context.h"
#include "runtime/device/kernel_runtime_manager.h"
#include "runtime/device/gpu/gpu_common.h"
#include "utils/ms_utils.h"
#include "runtime/device/gpu/gpu_memory_manager.h"
#include "backend/kernel_compiler/common_utils.h"
#include "runtime/device/gpu/gpu_memory_copy_manager.h"
#include "common/trans.h"
#include "ir/dtype.h"
#include "profiler/device/gpu/gpu_profiling.h"
#include "utils/shape_utils.h"
#ifdef ENABLE_DEBUGGER
#include "debug/debug_services.h"
#endif
namespace mindspore {
namespace device {
namespace gpu {
using mindspore::device::memswap::MemSwapInfoSet;
using mindspore::device::memswap::MemSwapManager;
using mindspore::device::memswap::SwapKind;
static const size_t PARAMETER_OUTPUT_INDEX = 0;

bool GPUKernelRuntime::SyncStream() { return GPUDeviceManager::GetInstance().SyncStream(stream_); }

bool GPUKernelRuntime::Init() {
  if (device_init_ == true) {
    GPUMemoryAllocator::GetInstance().CheckMaxDeviceMemory();
    return true;
  }
  bool ret = false;
#ifdef ENABLE_DUMP_E2E
  ret = SetDumpConf();
  if (!ret) {
    MS_LOG(INFO) << "No dump conf to set!";
  }
#endif
  ret = InitDevice();
  if (!ret) {
    MS_LOG(ERROR) << "InitDevice error.";
    return ret;
  }
  mem_manager_ = std::make_shared<GPUMemoryManager>();
  MS_EXCEPTION_IF_NULL(mem_manager_);
  mem_manager_->MallocDeviceMemory();
  const void *collective_handle_ = CollectiveInitializer::instance().collective_handle();
  bool collective_inited = CollectiveInitializer::instance().collective_inited();
  if (collective_inited && collective_handle_ != nullptr) {
    auto init_nccl_comm_funcptr =
      reinterpret_cast<InitNCCLComm>(dlsym(const_cast<void *>(collective_handle_), "InitNCCLComm"));
    MS_EXCEPTION_IF_NULL(init_nccl_comm_funcptr);
    (*init_nccl_comm_funcptr)();
  }
  device_init_ = true;
  return ret;
}
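Note: Init resolves the NCCL initialization entry point by name from an already-opened collective library handle. For readers unfamiliar with that pattern, a self-contained sketch follows; the library name and exported signature are assumptions for illustration only, not MindSpore specifics:

#include <dlfcn.h>
#include <iostream>

using InitFunc = bool (*)();  // assumed signature of the exported symbol

int main() {
  // Hypothetical library name; the real handle comes from CollectiveInitializer.
  void *handle = dlopen("libexample_collective.so", RTLD_NOW);
  if (handle == nullptr) {
    std::cerr << dlerror() << std::endl;
    return 1;
  }
  auto init = reinterpret_cast<InitFunc>(dlsym(handle, "InitNCCLComm"));
  if (init != nullptr) {
    (*init)();  // call through the resolved function pointer
  }
  dlclose(handle);
  return 0;
}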
#ifdef ENABLE_DUMP_E2E
namespace {
void DumpOutput(mindspore::session::KernelGraph *graph, const string &dump_path, DumpConfPtr dump_conf,
                Debugger *debugger) {
  MS_EXCEPTION_IF_NULL(graph);
  MS_EXCEPTION_IF_NULL(dump_conf);
  bool trans_flag = dump_conf->trans_flag();
  const auto &apply_kernels = graph->execution_order();
  for (const auto &node : apply_kernels) {
    MS_EXCEPTION_IF_NULL(node);
    auto node_name = AnfAlgo::GetCNodeName(node);
    std::string kernel_name = node->fullname_with_scope();
    if (!dump_conf->IsKernelNeedDump(kernel_name)) {
      continue;
    }
    const std::string strsrc = "/";
    const std::string strdst = "--";
    std::string::size_type pos = 0;
    std::string::size_type srclen = strsrc.size();
    std::string::size_type dstlen = strdst.size();
    while ((pos = kernel_name.find(strsrc, pos)) != std::string::npos) {
      kernel_name.replace(pos, srclen, strdst);
      pos += dstlen;
    }
    auto output_size = AnfAlgo::GetOutputTensorNum(node);
    for (size_t j = 0; j < output_size; ++j) {
      auto addr = AnfAlgo::GetOutputAddr(node, j);
      TypeId addr_type_id = addr->type_id();
      std::string addr_format = addr->format();
      ShapeVector int_shapes;
      if (trans_flag) {
        int_shapes = trans::GetRuntimePaddingShape(node, j);
      } else {
        auto shape = AnfAlgo::GetOutputDeviceShape(node, j);
        (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
                             [](size_t inner_item) { return SizeToInt(inner_item); });
      }
      auto type = AnfAlgo::GetOutputInferDataType(node, j);
      auto format = kOpFormat_DEFAULT;
      string filepath = dump_path + '/' + kernel_name + '_' + "output_" + std::to_string(j);
      DebugServices *debug_services = debugger->debug_services();
      TensorLoader *tensor_loader = debug_services->tensor_loader();
      std::string original_kernel_name = node->fullname_with_scope();
      size_t slot = j;
      auto ret = tensor_loader->DumpTensorToFile(original_kernel_name, trans_flag, filepath, format, int_shapes, type,
                                                 addr_type_id, addr_format, slot);
      if (!ret) {
        std::string error = "DumpTensorToFile Failed: flag:" + std::to_string(trans_flag) + ", path:" + filepath +
                            ", host_format:" + format + ".!";
        MS_LOG(ERROR) << error;
      }
    }
  }
}

void DumpParameters(mindspore::session::KernelGraph *graph, const string &dump_path, DumpConfPtr dump_conf,
                    Debugger *debugger) {
  MS_EXCEPTION_IF_NULL(graph);
  MS_EXCEPTION_IF_NULL(dump_conf);
  bool trans_flag = dump_conf->trans_flag();
  const auto &parameters = graph->inputs();
  for (auto &item : parameters) {
    if (!item->isa<Parameter>()) {
      continue;
    }
    std::string parameter_name = item->fullname_with_scope();
    if (!dump_conf->IsKernelNeedDump(parameter_name)) {
      continue;
    }
    auto addr = AnfAlgo::GetOutputAddr(item, PARAMETER_OUTPUT_INDEX);
    TypeId addr_type_id = addr->type_id();
    std::string addr_format = addr->format();
    ShapeVector int_shapes;
    if (trans_flag) {
      int_shapes = trans::GetRuntimePaddingShape(item, PARAMETER_OUTPUT_INDEX);
    } else {
      auto shape = AnfAlgo::GetOutputDeviceShape(item, PARAMETER_OUTPUT_INDEX);
      (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
                           [](size_t inner_item) { return SizeToInt(inner_item); });
    }
    auto type = AnfAlgo::GetOutputInferDataType(item, PARAMETER_OUTPUT_INDEX);
    auto format = kOpFormat_DEFAULT;
    string filepath = dump_path + '/' + parameter_name + '_' + "output_0";
    DebugServices *debug_services = debugger->debug_services();
    TensorLoader *tensor_loader = debug_services->tensor_loader();
    std::string original_kernel_name = parameter_name;
    size_t slot = 0;
    auto ret = tensor_loader->DumpTensorToFile(original_kernel_name, trans_flag, filepath, format, int_shapes, type,
                                               addr_type_id, addr_format, slot);
    if (!ret) {
      std::string error = "DumpTensorToFile Failed: flag:" + std::to_string(trans_flag) + ", path:" + filepath +
                          ", host_format:" + format + ".!";
      MS_LOG(ERROR) << error;
    }
  }
}
}  // namespace

bool GPUKernelRuntime::DumpData(mindspore::session::KernelGraph *graph, Debugger *debugger) {
  MS_EXCEPTION_IF_NULL(graph);
  MS_LOG(INFO) << "Start dump step";
  DumpConfPtr dump_conf = GetDumpConf();
  MS_EXCEPTION_IF_NULL(dump_conf);
  dump_conf->UpdataCurIter();
  bool dump_flag = dump_conf->dump_enable();
  if (!dump_flag) {
    MS_LOG(INFO) << "Dump flag is disable, pass dump step";
    return true;
  }
  uint32_t cur_iter = dump_conf->cur_iter();
  if (dump_conf->dump_iter() != 0) {
    if (cur_iter != dump_conf->dump_iter()) {
      return true;
    }
  }
  MS_LOG(INFO) << "Cur iter is " << cur_iter;
  std::string net_name = dump_conf->dump_net_name();
  std::string iterator = std::to_string(cur_iter);
  std::string dump_path = dump_conf->dump_path();
  if (dump_path.back() == '/') {
    dump_path = dump_path + net_name + '/' + iterator;
  } else {
    dump_path = dump_path + '/' + net_name + '/' + iterator;
  }
  // dump output
  DumpOutput(graph, dump_path, dump_conf, debugger);
  // dump parameters
  DumpParameters(graph, dump_path, dump_conf, debugger);
  return true;
}
#endif
#ifdef ENABLE_DEBUGGER
namespace {
void LoadKernelData(Debugger *debugger, const CNodePtr &kernel,
                    const std::vector<mindspore::kernel::AddressPtr> &kernel_inputs,
                    const std::vector<mindspore::kernel::AddressPtr> &kernel_workspaces,
                    const std::vector<mindspore::kernel::AddressPtr> &kernel_outputs, int exec_order, void *stream_ptr,
                    bool dump_enabled) {
  // check if we should read the kernel data
  bool read_data = false;
  std::string kernel_name = kernel->fullname_with_scope();
  if (debugger) {
    debugger->SetCurNode(kernel_name);
    if (dump_enabled) {
      read_data = true;
    } else if (debugger->debugger_enabled()) {
      read_data = debugger->ReadNodeDataRequired();
    }
  }
  if (!read_data) {
    return;
  }
  // get inputs
  auto input_size = AnfAlgo::GetInputTensorNum(kernel);
  for (size_t j = 0; j < input_size; ++j) {
    auto input_kernel = kernel->input(j + 1);
    std::string input_kernel_name = input_kernel->fullname_with_scope();
    auto addr = kernel_inputs[j];
    auto type = AnfAlgo::GetOutputInferDataType(input_kernel, PARAMETER_OUTPUT_INDEX);
    auto format = kOpFormat_DEFAULT;
    auto gpu_addr = std::make_unique<GPUDeviceAddress>(addr->addr, addr->size, format, type);
    string input_tensor_name = input_kernel_name + ':' + "0";
    ShapeVector int_shapes;
    auto shape = AnfAlgo::GetOutputDeviceShape(input_kernel, PARAMETER_OUTPUT_INDEX);
    (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
                         [](size_t inner_item) { return SizeToInt(inner_item); });
    auto ret = gpu_addr->LoadMemToHost(input_tensor_name, exec_order, format, int_shapes, type, 0, debugger, false);
    if (!ret) {
      MS_LOG(ERROR) << "LoadMemToHost:"
                    << ", tensor_name:" << input_tensor_name << ", host_format:" << format << ".!";
    }
  }
  // get outputs
  auto output_size = AnfAlgo::GetOutputTensorNum(kernel);
  for (size_t j = 0; j < output_size; ++j) {
    auto addr = kernel_outputs[j];
    auto type = AnfAlgo::GetOutputInferDataType(kernel, j);
    auto format = kOpFormat_DEFAULT;
    auto gpu_addr = std::make_unique<GPUDeviceAddress>(addr->addr, addr->size, format, type);
    string tensor_name = kernel_name + ':' + std::to_string(j);
    ShapeVector int_shapes;
    auto shape = AnfAlgo::GetOutputDeviceShape(kernel, j);
    (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
                         [](size_t inner_item) { return SizeToInt(inner_item); });
    auto ret = gpu_addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, j, debugger, false);
    if (!ret) {
      MS_LOG(ERROR) << "LoadMemToHost:"
                    << ", tensor_name:" << tensor_name << ", host_format:" << format << ".!";
    }
  }
  debugger->PostExecuteNode();
}

void UpdateStepNum(Debugger *debugger, bool dump_enabled) {
  if (debugger && (debugger->debugger_enabled() || dump_enabled)) {
    auto cur_step_num = debugger->step_num();
    cur_step_num = cur_step_num + 1;
    debugger->SetStepNum(cur_step_num);
  }
}

void LoadParameters(const session::KernelGraph *graph, Debugger *debugger, bool dump_enabled) {
  MS_EXCEPTION_IF_NULL(graph);
  if (!(debugger && dump_enabled)) {
    return;
  }
  const auto &parameters = graph->inputs();
  // for parameters, set its execution order to be 0;
  int exec_order = 0;
  for (auto &item : parameters) {
    if (!item->isa<Parameter>()) {
      continue;
    }
    std::string parameter_name = item->fullname_with_scope();
    auto addr = AnfAlgo::GetOutputAddr(item, PARAMETER_OUTPUT_INDEX);
    auto type = AnfAlgo::GetOutputInferDataType(item, PARAMETER_OUTPUT_INDEX);
    auto format = kOpFormat_DEFAULT;
    string tensor_name = parameter_name + ':' + "0";
    auto gpu_addr = dynamic_cast<const mindspore::device::gpu::GPUDeviceAddress *>(addr);
    ShapeVector int_shapes;
    auto shape = AnfAlgo::GetOutputDeviceShape(item, PARAMETER_OUTPUT_INDEX);
    (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
                         [](size_t inner_item) { return SizeToInt(inner_item); });
    auto ret = gpu_addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, 0, debugger, true);
    if (!ret) {
      MS_LOG(ERROR) << "LoadMemToHost:"
                    << ", tensor_name:" << tensor_name << ", host_format:" << format << ".!";
    }
  }
}

void ClearCurrentData(Debugger *debugger, bool dump_enabled) {
  if (debugger && (debugger->debugger_enabled() || dump_enabled)) {
    DebugServices *debug_services = debugger->debug_services();
    TensorLoader *tensor_loader = debug_services->tensor_loader();
    tensor_loader->EmptyCurrentTensor();
  }
}
}  // namespace
#endif
DeviceAddressPtr GPUKernelRuntime::CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
                                                       TypeId type_id) {
  return std::make_shared<GPUDeviceAddress>(device_ptr, device_size, format, type_id);
}

bool GPUKernelRuntime::InitDevice() {
  if (GPUDeviceManager::GetInstance().device_count() <= 0) {
    MS_LOG(ERROR) << "No GPU device found.";
    return false;
  }
  const void *collective_handle_ = CollectiveInitializer::instance().collective_handle();
  bool collective_inited = CollectiveInitializer::instance().collective_inited();
  if (collective_inited && collective_handle_ != nullptr) {
    auto get_local_rank_funcptr =
      reinterpret_cast<GetLocalRankId>(dlsym(const_cast<void *>(collective_handle_), "local_rank_id"));
    MS_EXCEPTION_IF_NULL(get_local_rank_funcptr);
    device_id_ = IntToUint((*get_local_rank_funcptr)());
  }
  if (!GPUDeviceManager::GetInstance().is_device_id_init()) {
    if (!GPUDeviceManager::GetInstance().set_cur_device_id(device_id_)) {
      MS_LOG(ERROR) << "Failed to set current device to " << SizeToInt(device_id_);
      return false;
    }
  }
  GPUDeviceManager::GetInstance().InitDevice();
  stream_ = GPUDeviceManager::GetInstance().default_stream();
  if (stream_ == nullptr) {
    MS_LOG(ERROR) << "No default CUDA stream found.";
    return false;
  }
  return true;
}
void GPUKernelRuntime::ReleaseDeviceRes() {
  // For dataset mode.
  if (GpuBufferMgr::GetInstance().IsInit()) {
    if (!GpuBufferMgr::GetInstance().IsClosed()) {
      if (!GpuBufferMgr::GetInstance().CloseNotify()) {
        MS_LOG(EXCEPTION) << "Could not close gpu data queue.";
      }
    }
    CHECK_OP_RET_WITH_EXCEPT(GpuBufferMgr::GetInstance().Destroy(), "Could not destroy gpu data queue.");
  }
  // Destroy remaining memory swap events and free host memory.
  for (auto &item : mem_swap_map_) {
    auto &mem_swap_manager = item.second;
    MS_EXCEPTION_IF_NULL(mem_swap_manager);
    if (mem_swap_manager->trigger_swap()) {
      mem_swap_manager->ClearSwapQueue(false);
      mem_swap_manager->ReleaseHostPinnedMem();
    }
  }
  GPUDeviceManager::GetInstance().ReleaseDevice();
  if (mem_manager_ != nullptr) {
    mem_manager_->FreeDeviceMemory();
  }
  kernel::KernelMeta *bin_map = kernel::KernelMeta::GetInstance();
  MS_EXCEPTION_IF_NULL(bin_map);
  bin_map->RemoveKernelCache();
}

void GPUKernelRuntime::ClearGraphRuntimeResource(uint32_t graph_id, const std::vector<AnfNodePtr> &inputs,
                                                 const std::unordered_set<ValueNodePtr> &value_nodes,
                                                 const std::vector<CNodePtr> &execution_order) {
  MS_LOG(INFO) << "Clear graph:" << graph_id << " GPU runtime resource";
  // Release the kernel resource.
  for (const auto &kernel : execution_order) {
    auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
    if (kernel_mod == nullptr) {
      continue;
    }
    kernel_mod->ReleaseResource();
  }
  // Clear the output address of graph.
  ClearOutputAddress(inputs, value_nodes, execution_order);
}

void GPUKernelRuntime::AssignMemory(session::KernelGraph *graph) {
  auto context_ptr = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context_ptr);
  MS_EXCEPTION_IF_NULL(mem_manager_);
  mem_manager_->ResetDynamicMemory();
  AssignStaticMemoryInput(graph);
  AssignStaticMemoryValueNode(graph);
  bool is_enable_dynamic_mem = context_ptr->get_param<bool>(MS_CTX_ENABLE_DYNAMIC_MEM_POOL);
  if (is_enable_dynamic_mem) {
    // Use the dynamic memory pool.
    InitKernelRefCount(graph);
    InitMemorySwapInfo(graph);
    InitKernelOutputAddress(graph);
    InitKernelWorkspaceAddress(graph);
    SaveGraphOutputNode(graph);
  } else {
    AssignDynamicMemory(graph);
  }
}
bool GPUKernelRuntime::Run(session::KernelGraph *graph, bool is_task_sink, Debugger *debugger) {
  struct timeval start_time, end_time;
  (void)gettimeofday(&start_time, nullptr);
  bool ret = true;
  auto context_ptr = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context_ptr);
  bool is_enable_dynamic_mem = context_ptr->get_param<bool>(MS_CTX_ENABLE_DYNAMIC_MEM_POOL);
  bool is_enable_pynative_infer = context_ptr->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_INFER);
  if (is_enable_dynamic_mem && !is_enable_pynative_infer) {
    auto graph_id = graph->graph_id();
    auto iter = mem_swap_map_.find(graph_id);
    if (iter == mem_swap_map_.end()) {
      MS_LOG(EXCEPTION) << "Find memory swap map failed.";
    }
    mem_swap_manager_ = iter->second;
    MS_EXCEPTION_IF_NULL(mem_swap_manager_);
    auto mem_reuse_iter = mem_reuse_util_map_.find(graph_id);
    if (mem_reuse_iter == mem_reuse_util_map_.end()) {
      MS_LOG(EXCEPTION) << "Find memory reuse map failed.";
    }
    mem_reuse_util_ = mem_reuse_iter->second;
    MS_EXCEPTION_IF_NULL(mem_reuse_util_);
    ret = RunOneStep(graph, debugger);
  } else {
    ret = LaunchKernel(graph);
  }
  (void)gettimeofday(&end_time, nullptr);
  const uint64_t kUSecondInSecond = 1000000;
  uint64_t cost = kUSecondInSecond * static_cast<uint64_t>(end_time.tv_sec - start_time.tv_sec);
  cost += static_cast<uint64_t>(end_time.tv_usec - start_time.tv_usec);
  MS_LOG(DEBUG) << "GPU kernel runtime run graph in " << cost << " us";
  return ret;
}
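Note: the elapsed-time computation in Run is a common idiom worth pulling out; a sketch grounded directly in the code above (the helper name is ours, not MindSpore's):

#include <sys/time.h>
#include <cstdint>

// Elapsed wall time in microseconds between two gettimeofday() samples,
// matching the cost computation in Run() above.
uint64_t ElapsedUs(const struct timeval &start, const struct timeval &end) {
  const uint64_t kUSecondInSecond = 1000000;
  uint64_t cost = kUSecondInSecond * static_cast<uint64_t>(end.tv_sec - start.tv_sec);
  cost += static_cast<uint64_t>(end.tv_usec - start.tv_usec);
  return cost;
}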
bool GPUKernelRuntime::RunOneStep(const session::KernelGraph *graph, Debugger *debugger) {
  bool ret = true;
  auto graph_id = graph->graph_id();
  if (!is_first_step_map_[graph_id]) {
    // Normally run graph
    ret = LaunchKernelDynamic(graph, debugger);
  } else {
    // Mock run first step
    ret = LaunchKernelDynamic(graph, debugger, true, false);
    if (ret) {
      // Normally run graph
      ret = LaunchKernelDynamic(graph, debugger);
    } else {
      // Trigger memory swap
      ret = SearchMemSwapScheme(graph, debugger);
    }
    is_first_step_map_[graph_id] = false;
  }
  return ret;
}

bool GPUKernelRuntime::SearchMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger) {
  MS_LOG(WARNING) << "Run out of memory and try memory swapping, it may take some time, please wait a moment.";
  bool ret = false;
  ClearKernelOldOutputAndWorkspace(graph);
  if (!mem_swap_manager_->mem_swap_init()) {
    if (!mem_swap_manager_->Init(graph)) {
      return false;
    }
  }
  while (!ret) {
    if (!mem_swap_manager_->RetreatSwapInfo()) {
      return false;
    }
    ret = LaunchKernelDynamic(graph, debugger, true, false);
    if (!ret) {
      ClearKernelOldOutputAndWorkspace(graph);
    }
  }
  mem_swap_manager_->AssignHostMemory();
  // Time profiling
  ret = LaunchKernelDynamic(graph, debugger, false, true);
  if (!ret) {
    return ret;
  }
  return RefineMemSwapScheme(graph, debugger);
}

bool GPUKernelRuntime::RefineMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger) {
  MS_LOG(WARNING) << "Refine memory swap scheme, it may take some time, please wait a moment.";
  auto &kernels = graph->execution_order();
  for (const auto &kernel : kernels) {
    if (!mem_swap_manager_->QueryKernelTriggerSwapIn(kernel)) {
      continue;
    }
    size_t swap_in_task_num = mem_swap_manager_->QueryKernelTriggerSwapInTaskNum(kernel);
    for (size_t swap_in_task_idx = 0; swap_in_task_idx < swap_in_task_num; swap_in_task_idx++) {
      bool ret = false;
      while (!ret) {
        mem_swap_manager_->AdjustSwapInPos(kernel, swap_in_task_idx);
        ret = LaunchKernelDynamic(graph, debugger, true, false);
        if (!ret) {
          ClearKernelOldOutputAndWorkspace(graph);
          ClearSwapInfo(true);
        }
      }
    }
  }
  return true;
}
void GPUKernelRuntime::InitKernelRefCount(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  MemReuseUtilPtr mem_reuse_util_ptr = std::make_shared<memreuse::MemReuseUtil>();
  MS_EXCEPTION_IF_NULL(mem_reuse_util_ptr);
  // Init the kernel reference count.
  if (!mem_reuse_util_ptr->InitDynamicKernelRef(graph)) {
    MS_LOG(EXCEPTION) << "Init kernel reference count failed";
  }
  mem_reuse_util_ptr->SetKernelDefMap();
  mem_reuse_util_ptr->SetReuseRefCount();
  // Can't free the device address of graph output, so set the reference count of graph output specially.
  mem_reuse_util_ptr->SetGraphOutputRefCount();
  // Can't free the device address of summary nodes, so set the reference count of summary nodes specially.
  mem_reuse_util_ptr->SetSummaryNodesRefCount();
  auto graph_id = graph->graph_id();
  mem_reuse_util_map_[graph_id] = mem_reuse_util_ptr;
}

void GPUKernelRuntime::InitMemorySwapInfo(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  GPUMemCopyManagerPtr gpu_mem_copy_manager = std::make_shared<GPUMemCopyManager>();
  MS_EXCEPTION_IF_NULL(gpu_mem_copy_manager);
  MemSwapManagerPtr mem_swap_manager = std::make_shared<MemSwapManager>(gpu_mem_copy_manager);
  MS_EXCEPTION_IF_NULL(mem_swap_manager);
  auto graph_id = graph->graph_id();
  mem_swap_map_[graph_id] = mem_swap_manager;
  is_first_step_map_[graph_id] = true;
}

void GPUKernelRuntime::InitKernelOutputAddress(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  auto &kernels = graph->execution_order();
  for (const auto &kernel : kernels) {
    auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
    MS_EXCEPTION_IF_NULL(kernel_mod);
    auto output_sizes = kernel_mod->GetOutputSizeList();
    for (size_t i = 0; i < output_sizes.size(); ++i) {
      if (AnfAlgo::OutputAddrExist(kernel, i)) {
        continue;
      }
      std::string output_format = AnfAlgo::GetOutputFormat(kernel, i);
      auto output_type = AnfAlgo::GetOutputDeviceDataType(kernel, i);
      auto device_address = CreateDeviceAddress(nullptr, output_sizes[i], output_format, output_type);
      AnfAlgo::SetOutputAddr(device_address, i, kernel.get());
    }
  }
}

void GPUKernelRuntime::InitKernelWorkspaceAddress(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  auto &kernels = graph->execution_order();
  for (const auto &kernel : kernels) {
    auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
    MS_EXCEPTION_IF_NULL(kernel_mod);
    auto workspace_sizes = kernel_mod->GetWorkspaceSizeList();
    for (size_t i = 0; i < workspace_sizes.size(); ++i) {
      auto device_address = CreateDeviceAddress(nullptr, workspace_sizes[i], "", kTypeUnknown);
      AnfAlgo::SetWorkspaceAddr(device_address, i, kernel.get());
    }
  }
}

void GPUKernelRuntime::SaveGraphOutputNode(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  auto graph_id = graph->graph_id();
  const auto &output_nodes = AnfAlgo::GetAllOutput(graph->output(), {prim::kPrimTupleGetItem});
  for (const auto &node : output_nodes) {
    graph_output_map_[graph_id].insert(node);
  }
}

bool GPUKernelRuntime::IsGraphOutput(const session::KernelGraph *graph, const mindspore::AnfNodePtr &kernel) const {
  MS_EXCEPTION_IF_NULL(graph);
  auto graph_id = graph->graph_id();
  auto iter = graph_output_map_.find(graph_id);
  if (iter == graph_output_map_.end()) {
    MS_LOG(EXCEPTION) << "Find graph output info failed.";
  }
  auto &graph_output_set = iter->second;
  return (graph_output_set.find(kernel) != graph_output_set.end());
}

void GPUKernelRuntime::ClearKernelOldOutputAndWorkspace(const session::KernelGraph *graph) {
  ClearKernelOutputAddress(graph);
  ClearKernelWorkspaceAddress(graph);
}

void GPUKernelRuntime::ClearKernelOutputAddress(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  auto &kernels = graph->execution_order();
  for (const auto &kernel : kernels) {
    if (IsGraphOutput(graph, kernel)) {
      continue;
    }
    auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
    MS_EXCEPTION_IF_NULL(kernel_mod);
    auto output_sizes = kernel_mod->GetOutputSizeList();
    for (size_t i = 0; i < output_sizes.size(); ++i) {
      if (!AnfAlgo::OutputAddrExist(kernel, i)) {
        continue;
      }
      auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i, false);
      MS_EXCEPTION_IF_NULL(device_address);
      if (device_address->ptr_) {
        mem_manager_->FreeMemFromMemPool(device_address);
      }
      device_address->set_status(DeviceAddressStatus::kInDevice);
    }
  }
}

void GPUKernelRuntime::ClearKernelWorkspaceAddress(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  auto &kernels = graph->execution_order();
  for (const auto &kernel : kernels) {
    auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
    MS_EXCEPTION_IF_NULL(kernel_mod);
    auto workspace_sizes = kernel_mod->GetWorkspaceSizeList();
    for (size_t i = 0; i < workspace_sizes.size(); ++i) {
      auto device_address = AnfAlgo::GetMutableWorkspaceAddr(kernel, i);
      MS_EXCEPTION_IF_NULL(device_address);
      if (device_address->ptr_) {
        mem_manager_->FreeMemFromMemPool(device_address);
      }
    }
  }
}
bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, Debugger *debugger, bool mock,
                                           bool profiling) {
  MS_EXCEPTION_IF_NULL(graph);
  MS_EXCEPTION_IF_NULL(mem_reuse_util_);
  // Reset the reference count.
  mem_reuse_util_->ResetDynamicUsedRefCount();
  // The inputs and outputs memory of communication kernel need be continuous, so separate processing.
  AllocCommunicationOpDynamicRes(graph);
#ifdef ENABLE_DEBUGGER
  bool dump_enabled = GPUKernelRuntime::DumpDataEnabledIteration();
  if (!mock) {
    UpdateStepNum(debugger, dump_enabled);
  }
#endif
  auto &kernels = graph->execution_order();
  int exec_order = 1;
  auto profiler_inst = profiler::gpu::GPUProfiler::GetInstance();
  MS_EXCEPTION_IF_NULL(profiler_inst);
  for (const auto &kernel : kernels) {
    auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
    MS_EXCEPTION_IF_NULL(kernel_mod);
    AddressPtrList kernel_inputs;
    AddressPtrList kernel_workspaces;
    AddressPtrList kernel_outputs;
    auto ret = AllocKernelDynamicRes(*kernel_mod, kernel, &kernel_inputs, &kernel_workspaces, &kernel_outputs, mock);
    if (!ret) {
#ifdef ENABLE_DEBUGGER
      if (!mock) {
        // invalidate current data collected by the debugger
        ClearCurrentData(debugger, dump_enabled);
      }
#endif
      return false;
    }
    if (!mock) {
      if (!profiling) {
        if (profiler_inst->GetEnableFlag()) {
          profiler_inst->OpDataProducerBegin(kernel->fullname_with_scope(), stream_);
        }
        CHECK_OP_RET_WITH_EXCEPT(kernel_mod->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, stream_),
                                 "Launch kernel failed.");
        if (profiler_inst->GetEnableFlag()) {
          profiler_inst->OpDataProducerEnd();
          if (profiler_inst->GetSyncEnableFlag()) {
            CHECK_OP_RET_WITH_ERROR(SyncStream(), "Profiler SyncStream failed.");
          }
        }
      } else {
        LaunchKernelWithTimeProfiling(kernel, kernel_inputs, kernel_workspaces, kernel_outputs);
      }
#ifdef ENABLE_DEBUGGER
      // called once per kernel to collect the outputs to the kernel (does a SyncDeviceToHost)
      LoadKernelData(debugger, kernel, kernel_inputs, kernel_workspaces, kernel_outputs, exec_order, stream_,
                     dump_enabled);
#endif
    }
    exec_order = exec_order + 1;
    FreeKernelDynamicRes(kernel);
    if (!UpdateMemorySwapTask(kernel, mock, profiling)) {
#ifdef ENABLE_DEBUGGER
      if (!mock) {
        // invalidate current data collected by the debugger
        ClearCurrentData(debugger, dump_enabled);
      }
#endif
      return false;
    }
  }
  if (!mock) {
#ifdef ENABLE_DEBUGGER
    // collect weights and bias for dump mode
    LoadParameters(graph, debugger, dump_enabled);
#endif
    CHECK_OP_RET_WITH_EXCEPT(SyncStream(), "SyncStream failed.");
  }
  ClearSwapInfo(mock);
  return true;
}
void GPUKernelRuntime::LaunchKernelWithTimeProfiling(const AnfNodePtr &kernel, const AddressPtrList &inputs,
                                                     const AddressPtrList &workspace, const AddressPtrList &outputs) {
  auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
  MS_EXCEPTION_IF_NULL(kernel_mod);
  float cost_time = 0;
  DeviceEvent start = nullptr;
  DeviceEvent end = nullptr;
  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::CreateEvent(&start), "Failed to create event.");
  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::CreateEvent(&end), "Failed to create event.");
  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::RecordEvent(start, stream_), "Failed to record event to stream.");
  CHECK_OP_RET_WITH_EXCEPT(kernel_mod->Launch(inputs, workspace, outputs, stream_), "Launch kernel failed.");
  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::RecordEvent(end, stream_), "Failed to record event to stream.");
  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::SyncEvent(start), "Failed to sync event.");
  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::SyncEvent(end), "Failed to sync event.");
  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::ElapsedTime(&cost_time, start, end), "Failed to record elapsed time.");
  mem_swap_manager_->AddKernelExecutionPerform(kernel, cost_time);
  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::DestroyEvent(start), "Failed to destroy event.");
  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::DestroyEvent(end), "Failed to destroy event.");
}
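Note: the CudaDriver calls above presumably wrap the standard CUDA event API. For reference, the same device-side timing pattern with the raw CUDA runtime looks like this (a sketch, not MindSpore code):

#include <cuda_runtime.h>

// Measure the device-side duration of work enqueued on `stream`.
float TimeWorkOnStream(cudaStream_t stream, void (*enqueue)(cudaStream_t)) {
  cudaEvent_t start = nullptr;
  cudaEvent_t stop = nullptr;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  cudaEventRecord(start, stream);          // timestamp before the kernel
  enqueue(stream);                         // launch the work to be measured
  cudaEventRecord(stop, stream);           // timestamp after the kernel
  cudaEventSynchronize(stop);              // wait until both events have completed
  float ms = 0.0f;
  cudaEventElapsedTime(&ms, start, stop);  // elapsed time in milliseconds
  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  return ms;
}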
bool GPUKernelRuntime::AddMemorySwapTask(const AnfNodePtr &kernel, bool mock, bool profiling) {
  MS_EXCEPTION_IF_NULL(mem_swap_manager_);
  const MemSwapInfoSet &mem_swap_info_set = mem_swap_manager_->QueryKernelMemSwapInfo(kernel);
  for (auto &mem_swap_info : mem_swap_info_set) {
    auto need_swap_kernel = mem_swap_manager_->QueryKernelByTopoOrder(mem_swap_info.topo_order_);
    MS_EXCEPTION_IF_NULL(need_swap_kernel);
    const HostAddress &host_address =
      mem_swap_manager_->QueryKernelHostAddr(need_swap_kernel, mem_swap_info.output_idx_);
    auto device_address = AnfAlgo::GetMutableOutputAddr(need_swap_kernel, mem_swap_info.output_idx_, false);
    if (mem_swap_info.swap_kind_ == SwapKind::kDeviceToHost) {
      if (mem_swap_manager_->QueryKernelHostAddrIsDirty(need_swap_kernel, mem_swap_info.output_idx_)) {
        mem_swap_manager_->AddMemSwapTask(SwapKind::kDeviceToHost, device_address, host_address, mock);
        mem_swap_manager_->AddKernelHostAddrIsDirty(need_swap_kernel, mem_swap_info.output_idx_, false);
      } else {
        mem_manager_->FreeMemFromMemPool(device_address);
        device_address->set_status(DeviceAddressStatus::kInHost);
      }
    } else if (mem_swap_info.swap_kind_ == SwapKind::kHostToDevice) {
      auto status = device_address->status();
      if (status == DeviceAddressStatus::kInDeviceToHost) {
        device_address->set_status(DeviceAddressStatus::kInDevice);
      } else if (status == DeviceAddressStatus::kInHost) {
        if (!device_address->ptr_ && !AttemptMallocMem(device_address, device_address->size_, mock)) {
          return false;
        }
        float cost_time = 0;
        mem_swap_manager_->AddMemSwapTask(SwapKind::kHostToDevice, device_address, host_address, mock, profiling,
                                          &cost_time);
        if (profiling) {
          mem_swap_manager_->AddKernelSwapPerform(need_swap_kernel, mem_swap_info.output_idx_,
                                                  std::make_pair(0, cost_time));
        }
      }
    }
  }
  return true;
}

bool GPUKernelRuntime::UpdateMemorySwapTask(const AnfNodePtr &kernel, bool mock, bool profiling) {
  MS_EXCEPTION_IF_NULL(mem_swap_manager_);
  if (!mem_swap_manager_->trigger_swap()) {
    return true;
  }
  if (mem_swap_manager_->QueryKernelTriggerSwap(kernel)) {
    if (!mock) {
      CHECK_OP_RET_WITH_EXCEPT(SyncStream(), "SyncStream failed.");
    }
    if (!AddMemorySwapTask(kernel, mock, profiling)) {
      return false;
    }
    if (!mock) {
      CHECK_OP_RET_WITH_EXCEPT(mem_swap_manager_->SyncMemCopyStream(SwapKind::kDeviceToHost), "SyncCopyStream failed.");
    }
  }
  return true;
}
void GPUKernelRuntime::UpdateHostSwapInQueue(const DeviceAddressPtr device_address, bool mock) {
  MS_EXCEPTION_IF_NULL(mem_swap_manager_);
  if (!mem_swap_manager_->trigger_swap()) {
    return;
  }
  while (auto device_address_swap_in = mem_swap_manager_->UpdateSwapQueue(SwapKind::kHostToDevice, mock)) {
    device_address_swap_in->set_status(DeviceAddressStatus::kInDevice);
  }
  auto status = device_address->status();
  switch (status) {
    case DeviceAddressStatus::kInDevice:
      break;
    case DeviceAddressStatus::kInDeviceToHost: {
      device_address->set_status(DeviceAddressStatus::kInDevice);
      break;
    }
    case DeviceAddressStatus::kInHostToDevice: {
      while (device_address->status() != DeviceAddressStatus::kInDevice) {
        while (auto device_address_swap_in = mem_swap_manager_->UpdateSwapQueue(SwapKind::kHostToDevice, mock)) {
          device_address_swap_in->set_status(DeviceAddressStatus::kInDevice);
        }
      }
      break;
    }
    case DeviceAddressStatus::kInHost:
      MS_LOG(WARNING) << "Unexpected device address status: " << status;
      break;
    default:
      MS_LOG(EXCEPTION) << "Invalid device address status: " << status;
  }
}
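Note: UpdateHostSwapInQueue is effectively a state machine over DeviceAddressStatus. A compact sketch of the transitions, with the enum re-declared locally for illustration (the real runtime polls the copy queue until a swap-in lands rather than assigning the status directly):

#include <cstdio>

// The four states a device address can be in during memory swapping, as used above.
enum class DeviceAddressStatus { kInDevice, kInDeviceToHost, kInHostToDevice, kInHost };

void EnsureInDevice(DeviceAddressStatus *status) {
  switch (*status) {
    case DeviceAddressStatus::kInDevice:
      break;  // already resident; nothing to do
    case DeviceAddressStatus::kInDeviceToHost:
      // A pending swap-out has not copied yet; cancel it by marking the data in-device.
      *status = DeviceAddressStatus::kInDevice;
      break;
    case DeviceAddressStatus::kInHostToDevice:
      // A swap-in is in flight; the runtime drains the host-to-device queue until it completes.
      *status = DeviceAddressStatus::kInDevice;
      break;
    case DeviceAddressStatus::kInHost:
      // The data was swapped out and no swap-in was scheduled; this should not happen here.
      std::fprintf(stderr, "Unexpected status: data still resident on host\n");
      break;
  }
}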
void GPUKernelRuntime::UpdateHostSwapOutQueue(bool mock) {
  MS_EXCEPTION_IF_NULL(mem_swap_manager_);
  if (!mem_swap_manager_->trigger_swap()) {
    return;
  }
  while (auto device_address_swap_out = mem_swap_manager_->UpdateSwapQueue(SwapKind::kDeviceToHost, mock)) {
    if (device_address_swap_out->status() == DeviceAddressStatus::kInDeviceToHost && device_address_swap_out->ptr_) {
      device_address_swap_out->set_status(DeviceAddressStatus::kInHost);
      mem_manager_->FreeMemFromMemPool(device_address_swap_out);
    }
  }
}

void GPUKernelRuntime::ClearSwapInfo(bool mock) {
  MS_EXCEPTION_IF_NULL(mem_swap_manager_);
  if (!mem_swap_manager_->trigger_swap()) {
    return;
  }
  mem_swap_manager_->ClearSwapQueue(mock);
  mem_swap_manager_->ResetHostAddrIsDirty();
}

bool GPUKernelRuntime::AttemptMallocMem(const DeviceAddressPtr &device_address, size_t size, bool mock) {
  MS_EXCEPTION_IF_NULL(mem_manager_);
  MS_EXCEPTION_IF_NULL(mem_swap_manager_);
  auto ret = mem_manager_->MallocMemFromMemPool(device_address, size);
  if (!ret) {
    if (!mem_swap_manager_->trigger_swap()) {
      return false;
    }
    if (!mock) {
      mem_swap_manager_->SyncMemCopyStream(SwapKind::kDeviceToHost);
    }
    UpdateHostSwapOutQueue(mock);
    ret = mem_manager_->MallocMemFromMemPool(device_address, size);
    if (!ret) {
      return false;
    }
  }
  return true;
}
bool GPUKernelRuntime::AllocKernelDynamicRes(const mindspore::kernel::KernelMod &kernel_mod,
                                             const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_inputs,
                                             AddressPtrList *kernel_workspaces, AddressPtrList *kernel_outputs,
                                             bool mock) {
  if (!AllocKernelInputDynamicRes(kernel, kernel_inputs, mock)) {
    return false;
  }
  if (!AllocKernelOutputDynamicRes(kernel_mod, kernel, kernel_outputs, mock)) {
    return false;
  }
  if (!AllocKernelWorkspaceDynamicRes(kernel_mod, kernel, kernel_workspaces, mock)) {
    return false;
  }
  return true;
}

bool GPUKernelRuntime::AllocKernelInputDynamicRes(const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_inputs,
                                                  bool mock) {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(kernel_inputs);
  MS_EXCEPTION_IF_NULL(mem_reuse_util_);
  for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) {
    DeviceAddressPtr device_address;
    if (mem_reuse_util_->is_all_nop_node()) {
      // Graph may be all nop nodes and not remove nop node, so this can not skip nop node.
      device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, false);
    } else {
      // Graph may be "nop node + depend + node", the input of node is the depend, so this case need skip nop node.
      device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, true);
    }
    MS_EXCEPTION_IF_NULL(device_address);
    UpdateHostSwapInQueue(device_address, mock);
    MS_EXCEPTION_IF_NULL(device_address->ptr_);
    kernel::AddressPtr input = std::make_shared<kernel::Address>();
    MS_EXCEPTION_IF_NULL(input);
    input->addr = device_address->ptr_;
    input->size = device_address->size_;
    kernel_inputs->emplace_back(input);
  }
  return true;
}

bool GPUKernelRuntime::AllocKernelOutputDynamicRes(const mindspore::kernel::KernelMod &kernel_mod,
                                                   const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_outputs,
                                                   bool mock) {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(kernel_outputs);
  UpdateHostSwapOutQueue(mock);
  auto output_sizes = kernel_mod.GetOutputSizeList();
  for (size_t i = 0; i < output_sizes.size(); ++i) {
    auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i, false);
    MS_EXCEPTION_IF_NULL(device_address);
    if (device_address->ptr_ == nullptr && !AttemptMallocMem(device_address, output_sizes[i], mock)) {
      return false;
    }
    kernel::AddressPtr output = std::make_shared<kernel::Address>();
    MS_EXCEPTION_IF_NULL(output);
    output->addr = device_address->ptr_;
    output->size = output_sizes[i];
    kernel_outputs->emplace_back(output);
  }
  return true;
}

bool GPUKernelRuntime::AllocKernelWorkspaceDynamicRes(const mindspore::kernel::KernelMod &kernel_mod,
                                                      const mindspore::AnfNodePtr &kernel,
                                                      AddressPtrList *kernel_workspaces, bool mock) {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(kernel_workspaces);
  auto workspace_sizes = kernel_mod.GetWorkspaceSizeList();
  for (size_t i = 0; i < workspace_sizes.size(); ++i) {
    if (workspace_sizes[i] == 0) {
      kernel_workspaces->emplace_back(nullptr);
      continue;
    }
    auto device_address = AnfAlgo::GetMutableWorkspaceAddr(kernel, i);
    if (device_address->ptr_ == nullptr && !AttemptMallocMem(device_address, workspace_sizes[i], mock)) {
      return false;
    }
    kernel::AddressPtr workspace = std::make_shared<kernel::Address>();
    MS_EXCEPTION_IF_NULL(workspace);
    workspace->addr = device_address->ptr_;
    workspace->size = workspace_sizes[i];
    kernel_workspaces->emplace_back(workspace);
  }
  return true;
}

void GPUKernelRuntime::AllocCommunicationOpDynamicRes(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  auto &kernels = graph->execution_order();
  for (auto &kernel : kernels) {
    MS_EXCEPTION_IF_NULL(kernel);
    if (AnfAlgo::IsCommunicationOp(kernel)) {
      AllocCommunicationOpInputDynamicRes(kernel);
      AllocCommunicationOpOutputDynamicRes(kernel);
    }
  }
}
void GPUKernelRuntime::AllocCommunicationOpInputDynamicRes(const mindspore::AnfNodePtr &kernel) {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(mem_reuse_util_);
  bool is_need_alloc_memory = false;
  bool is_need_free_memory = false;
  size_t total_size = 0;
  std::vector<size_t> size_list;
  DeviceAddressPtrList addr_list;
  auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
  MS_EXCEPTION_IF_NULL(kernel_mod);
  auto input_sizes = kernel_mod->GetInputSizeList();
  for (size_t i = 0; i < input_sizes.size(); ++i) {
    DeviceAddressPtr device_address;
    if (mem_reuse_util_->is_all_nop_node()) {
      // Graph may be all nop nodes and not remove nop node, so this can not skip nop node.
      device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, false);
    } else {
      // Graph may be "nop node + depend + node", the input of node is the depend, so this case need skip nop node.
      device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, true);
    }
    MS_EXCEPTION_IF_NULL(device_address);
    if (device_address->ptr_ == nullptr) {
      is_need_alloc_memory = true;
    } else {
      is_need_free_memory = true;
    }
    total_size += input_sizes[i];
    size_list.emplace_back(input_sizes[i]);
    addr_list.emplace_back(device_address);
  }
  AllocCommunicationOpMemory(is_need_alloc_memory, is_need_free_memory, addr_list, total_size, size_list);
}

void GPUKernelRuntime::AllocCommunicationOpOutputDynamicRes(const mindspore::AnfNodePtr &kernel) {
  MS_EXCEPTION_IF_NULL(kernel);
  bool is_need_alloc_memory = false;
  bool is_need_free_memory = false;
  size_t total_size = 0;
  std::vector<size_t> size_list;
  DeviceAddressPtrList addr_list;
  auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
  MS_EXCEPTION_IF_NULL(kernel_mod);
  auto output_sizes = kernel_mod->GetOutputSizeList();
  for (size_t i = 0; i < output_sizes.size(); ++i) {
    auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i, false);
    MS_EXCEPTION_IF_NULL(device_address);
    if (device_address->ptr_ == nullptr) {
      is_need_alloc_memory = true;
    } else {
      is_need_free_memory = true;
    }
    total_size += output_sizes[i];
    size_list.emplace_back(output_sizes[i]);
    addr_list.emplace_back(device_address);
  }
  AllocCommunicationOpMemory(is_need_alloc_memory, is_need_free_memory, addr_list, total_size, size_list);
}
void GPUKernelRuntime::AllocCommunicationOpMemory(bool is_need_alloc_memory, bool is_need_free_memory,
                                                  const DeviceAddressPtrList addr_list, size_t total_size,
                                                  std::vector<size_t> size_list) {
  MS_EXCEPTION_IF_NULL(mem_manager_);
  if (!is_need_alloc_memory) {
    return;
  }
  if (is_need_free_memory) {
    for (const auto &iter : addr_list) {
      MS_EXCEPTION_IF_NULL(iter);
      // Free the inputs/outputs of communication kernel which are not released.
      if (iter->ptr_ != nullptr) {
        mem_manager_->FreeMemFromMemPool(iter);
      }
    }
  }
  auto ret = mem_manager_->MallocContinuousMemFromMemPool(addr_list, total_size, size_list);
  if (!ret) {
    MS_LOG(EXCEPTION) << "Malloc device memory failed.";
  }
}
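Note: communication kernels (e.g. NCCL collectives) require their input/output buffers to be contiguous, which is why the function above allocates one block for the whole address list rather than per tensor. A minimal sketch of the slicing idea, assuming a plain byte buffer in place of the memory pool (all names here are illustrative):

#include <cstddef>
#include <cstdint>
#include <vector>

// Hand out per-tensor pointers at successive offsets into one contiguous block.
// This mirrors the idea behind MallocContinuousMemFromMemPool; the pool call
// itself is a MindSpore internal, so a raw base pointer stands in for it here.
std::vector<uint8_t *> SliceContiguousBlock(uint8_t *base, const std::vector<size_t> &size_list) {
  std::vector<uint8_t *> addrs;
  size_t offset = 0;
  for (size_t size : size_list) {
    addrs.push_back(base + offset);  // tensor i starts where tensor i-1 ends
    offset += size;
  }
  return addrs;
}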
void GPUKernelRuntime::FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel) {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(mem_manager_);
  MS_EXCEPTION_IF_NULL(mem_reuse_util_);
  auto cnode = kernel->cast<CNodePtr>();
  MS_EXCEPTION_IF_NULL(cnode);
  if (AnfAlgo::IsCommunicationOp(kernel)) {
    return;
  }
  // Free the input of kernel by reference count.
  for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) {
    auto kernel_ref_count_ptr = mem_reuse_util_->GetKernelInputRef(cnode, i);
    if (kernel_ref_count_ptr == nullptr) {
      continue;
    }
    kernel_ref_count_ptr->ref_count_dynamic_use_--;
    if (kernel_ref_count_ptr->ref_count_dynamic_use_ < 0) {
      MS_LOG(EXCEPTION) << "Check dynamic reference count failed.";
    }
    if (kernel_ref_count_ptr->ref_count_dynamic_use_ == 0) {
      DeviceAddressPtr device_address;
      if (mem_reuse_util_->is_all_nop_node()) {
        // Graph may be all nop nodes and not remove nop node, so this can not skip nop node.
        device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, false);
      } else {
        // Graph may be "nop node + depend + node", the input of node is the depend, so this case need skip nop node.
        device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, true);
      }
      mem_manager_->FreeMemFromMemPool(device_address);
      device_address->set_status(DeviceAddressStatus::kInDevice);
    }
  }
  // Free the output of kernel, if output has no reference.
  for (size_t i = 0; i < AnfAlgo::GetOutputTensorNum(kernel); ++i) {
    auto kernel_ref_count_ptr = mem_reuse_util_->GetRef(cnode, i);
    if (kernel_ref_count_ptr == nullptr) {
      continue;
    }
    if (kernel_ref_count_ptr->ref_count_dynamic_use_ == 0) {
      auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i, false);
      mem_manager_->FreeMemFromMemPool(device_address);
      device_address->set_status(DeviceAddressStatus::kInDevice);
    }
  }
  // Free the workspace of kernel.
  auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
  MS_EXCEPTION_IF_NULL(kernel_mod);
  for (size_t i = 0; i < kernel_mod->GetWorkspaceSizeList().size(); ++i) {
    auto device_address = AnfAlgo::GetMutableWorkspaceAddr(kernel, i);
    MS_EXCEPTION_IF_NULL(device_address);
    if (device_address->ptr_) {
      mem_manager_->FreeMemFromMemPool(device_address);
    }
  }
}
}  // namespace gpu
}  // namespace device
}  // namespace mindspore
mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.h View file @ bbafa9db
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_KERNEL_RUNTIME_H_
#define MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_KERNEL_RUNTIME_H_
#include <string>
#include <memory>
#include <vector>
#include <set>
#include <utility>
#include <unordered_map>
#include <unordered_set>
#include "runtime/device/kernel_runtime.h"
#include "runtime/device/kernel_runtime_manager.h"
#include "backend/optimizer/mem_reuse/mem_swap_manager.h"
namespace mindspore {
namespace device {
namespace gpu {
using mindspore::device::memswap::MemSwapManagerPtr;
class GPUKernelRuntime : public KernelRuntime {
 public:
  GPUKernelRuntime() = default;
  ~GPUKernelRuntime() override = default;
  bool Init() override;
  void ReleaseDeviceRes() override;
  void ClearGraphRuntimeResource(uint32_t graph_id, const std::vector<AnfNodePtr> &inputs,
                                 const std::unordered_set<ValueNodePtr> &value_nodes,
                                 const std::vector<CNodePtr> &execution_order) override;
  void AssignMemory(session::KernelGraph *graph) override;
  bool Run(session::KernelGraph *graph, Debugger *debugger = nullptr) override;
#ifdef ENABLE_DUMP_E2E
  bool DumpData(session::KernelGraph *graph, Debugger *debugger = nullptr) override;
#endif

 protected:
  DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
                                       TypeId type_id) override;
  bool SyncStream() override;

 private:
  GPUKernelRuntime(const GPUKernelRuntime &);
  GPUKernelRuntime &operator=(const GPUKernelRuntime &);
  bool InitDevice();
  bool device_init_{false};
  // The related functions and members for using dynamic memory pool.
  void InitKernelRefCount(const session::KernelGraph *graph);
  void InitKernelOutputAddress(const session::KernelGraph *graph);
  void InitKernelWorkspaceAddress(const session::KernelGraph *graph);
  void InitMemorySwapInfo(const session::KernelGraph *graph);
  void SaveGraphOutputNode(const session::KernelGraph *graph);
  bool IsGraphOutput(const session::KernelGraph *graph, const mindspore::AnfNodePtr &kernel) const;
  void ClearKernelOutputAddress(const session::KernelGraph *graph);
  void ClearKernelWorkspaceAddress(const session::KernelGraph *graph);
  void ClearKernelOldOutputAndWorkspace(const session::KernelGraph *graph);
  bool RunOneStep(const session::KernelGraph *graph, Debugger *debugger = nullptr);
  bool SearchMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger = nullptr);
  bool RefineMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger = nullptr);
  bool LaunchKernelDynamic(const session::KernelGraph *graph, Debugger *debugger = nullptr, bool mock = false,
                           bool profiling = false);
  void LaunchKernelWithTimeProfiling(const AnfNodePtr &kernel, const AddressPtrList &inputs,
                                     const AddressPtrList &workspace, const AddressPtrList &outputs);
  bool AttemptMallocMem(const DeviceAddressPtr &device_address, size_t size, bool mock);
  bool AllocKernelDynamicRes(const mindspore::kernel::KernelMod &kernel_mod, const mindspore::AnfNodePtr &kernel,
                             AddressPtrList *kernel_inputs, AddressPtrList *kernel_workspaces,
                             AddressPtrList *kernel_outputs, bool mock);
  bool AllocKernelInputDynamicRes(const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_inputs, bool mock);
  bool AllocKernelOutputDynamicRes(const mindspore::kernel::KernelMod &kernel_mod, const mindspore::AnfNodePtr &kernel,
                                   AddressPtrList *kernel_outputs, bool mock);
  bool AllocKernelWorkspaceDynamicRes(const mindspore::kernel::KernelMod &kernel_mod,
                                      const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_workspaces,
                                      bool mock);
  void AllocCommunicationOpDynamicRes(const session::KernelGraph *graph);
  void AllocCommunicationOpInputDynamicRes(const mindspore::AnfNodePtr &kernel);
  void AllocCommunicationOpOutputDynamicRes(const mindspore::AnfNodePtr &kernel);
  void AllocCommunicationOpMemory(bool is_need_alloc_memory, bool is_need_free_memory,
                                  const DeviceAddressPtrList addr_list, size_t total_size,
                                  std::vector<size_t> size_list);
  void FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel);
  bool UpdateMemorySwapTask(const AnfNodePtr &kernel, bool mock, bool profiling);
  bool AddMemorySwapTask(const AnfNodePtr &kernel, bool mock, bool profiling);
  void UpdateHostSwapInQueue(const DeviceAddressPtr device_address, bool mock);
  void UpdateHostSwapOutQueue(bool mock);
  void ClearSwapInfo(bool mock);
  std::unordered_map<uint32_t, MemReuseUtilPtr> mem_reuse_util_map_;
  std::unordered_map<uint32_t, MemSwapManagerPtr> mem_swap_map_;
  std::unordered_map<uint32_t, bool> is_first_step_map_;
  std::unordered_map<uint32_t, std::set<AnfNodePtr>> graph_output_map_;
  MemReuseUtilPtr mem_reuse_util_{nullptr};
  MemSwapManagerPtr mem_swap_manager_{nullptr};
};
MS_REG_KERNEL_RUNTIME(kGPUDevice, GPUKernelRuntime);
}  // namespace gpu
}  // namespace device
}  // namespace mindspore
#endif  // MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_KERNEL_RUNTIME_H_
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_KERNEL_RUNTIME_H_
#define MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_KERNEL_RUNTIME_H_
#include <string>
#include <memory>
#include <vector>
#include <set>
#include <utility>
#include <unordered_map>
#include <unordered_set>
#include "runtime/device/kernel_runtime.h"
#include "runtime/device/kernel_runtime_manager.h"
#include "backend/optimizer/mem_reuse/mem_swap_manager.h"
namespace mindspore {
namespace device {
namespace gpu {
using mindspore::device::memswap::MemSwapManagerPtr;
class GPUKernelRuntime : public KernelRuntime {
 public:
  GPUKernelRuntime() = default;
  ~GPUKernelRuntime() override = default;
  bool Init() override;
  void ReleaseDeviceRes() override;
  void ClearGraphRuntimeResource(uint32_t graph_id, const std::vector<AnfNodePtr> &inputs,
                                 const std::unordered_set<ValueNodePtr> &value_nodes,
                                 const std::vector<CNodePtr> &execution_order) override;
  void AssignMemory(session::KernelGraph *graph) override;
  bool Run(session::KernelGraph *graph, bool is_task_sink, Debugger *debugger = nullptr) override;
#ifdef ENABLE_DUMP_E2E
  bool DumpData(session::KernelGraph *graph, Debugger *debugger = nullptr) override;
#endif

 protected:
  DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
                                       TypeId type_id) override;
  bool SyncStream() override;

 private:
  GPUKernelRuntime(const GPUKernelRuntime &);
  GPUKernelRuntime &operator=(const GPUKernelRuntime &);
  bool InitDevice();
  bool device_init_{false};
  // The related functions and members for using dynamic memory pool.
  void InitKernelRefCount(const session::KernelGraph *graph);
  void InitKernelOutputAddress(const session::KernelGraph *graph);
  void InitKernelWorkspaceAddress(const session::KernelGraph *graph);
  void InitMemorySwapInfo(const session::KernelGraph *graph);
  void SaveGraphOutputNode(const session::KernelGraph *graph);
  bool IsGraphOutput(const session::KernelGraph *graph, const mindspore::AnfNodePtr &kernel) const;
  void ClearKernelOutputAddress(const session::KernelGraph *graph);
  void ClearKernelWorkspaceAddress(const session::KernelGraph *graph);
  void ClearKernelOldOutputAndWorkspace(const session::KernelGraph *graph);
  bool RunOneStep(const session::KernelGraph *graph, Debugger *debugger = nullptr);
  bool SearchMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger = nullptr);
  bool RefineMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger = nullptr);
  bool LaunchKernelDynamic(const session::KernelGraph *graph, Debugger *debugger = nullptr, bool mock = false,
                           bool profiling = false);
  void LaunchKernelWithTimeProfiling(const AnfNodePtr &kernel, const AddressPtrList &inputs,
                                     const AddressPtrList &workspace, const AddressPtrList &outputs);
  bool AttemptMallocMem(const DeviceAddressPtr &device_address, size_t size, bool mock);
  bool AllocKernelDynamicRes(const mindspore::kernel::KernelMod &kernel_mod, const mindspore::AnfNodePtr &kernel,
                             AddressPtrList *kernel_inputs, AddressPtrList *kernel_workspaces,
                             AddressPtrList *kernel_outputs, bool mock);
  bool AllocKernelInputDynamicRes(const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_inputs, bool mock);
  bool AllocKernelOutputDynamicRes(const mindspore::kernel::KernelMod &kernel_mod, const mindspore::AnfNodePtr &kernel,
                                   AddressPtrList *kernel_outputs, bool mock);
  bool AllocKernelWorkspaceDynamicRes(const mindspore::kernel::KernelMod &kernel_mod,
                                      const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_workspaces,
                                      bool mock);
  void AllocCommunicationOpDynamicRes(const session::KernelGraph *graph);
  void AllocCommunicationOpInputDynamicRes(const mindspore::AnfNodePtr &kernel);
  void AllocCommunicationOpOutputDynamicRes(const mindspore::AnfNodePtr &kernel);
  void AllocCommunicationOpMemory(bool is_need_alloc_memory, bool is_need_free_memory,
                                  const DeviceAddressPtrList addr_list, size_t total_size,
                                  std::vector<size_t> size_list);
  void FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel);
  bool UpdateMemorySwapTask(const AnfNodePtr &kernel, bool mock, bool profiling);
  bool AddMemorySwapTask(const AnfNodePtr &kernel, bool mock, bool profiling);
  void UpdateHostSwapInQueue(const DeviceAddressPtr device_address, bool mock);
  void UpdateHostSwapOutQueue(bool mock);
  void ClearSwapInfo(bool mock);
  std::unordered_map<uint32_t, MemReuseUtilPtr> mem_reuse_util_map_;
  std::unordered_map<uint32_t, MemSwapManagerPtr> mem_swap_map_;
  std::unordered_map<uint32_t, bool> is_first_step_map_;
  std::unordered_map<uint32_t, std::set<AnfNodePtr>> graph_output_map_;
  MemReuseUtilPtr mem_reuse_util_{nullptr};
  MemSwapManagerPtr mem_swap_manager_{nullptr};
};
MS_REG_KERNEL_RUNTIME(kGPUDevice, GPUKernelRuntime);
}  // namespace gpu
}  // namespace device
}  // namespace mindspore
#endif  // MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_KERNEL_RUNTIME_H_
mindspore/ccsrc/runtime/device/kernel_runtime.cc View file @ bbafa9db
...
@@ -40,7 +40,7 @@ KernelRuntime::~KernelRuntime() {
 #endif
 }
 
-bool KernelRuntime::Load(session::KernelGraph *graph) { return true; }
+bool KernelRuntime::Load(session::KernelGraph *graph, bool is_task_sink) { return true; }
 
 bool KernelRuntime::DumpData(mindspore::session::KernelGraph *graph, Debugger *debugger) {
   if (graph != nullptr) {
...
mindspore/ccsrc/runtime/device/kernel_runtime.h View file @ bbafa9db
...
@@ -59,8 +59,8 @@ class KernelRuntime {
   bool DumpDataEnabled();
   bool DumpDataEnabledIteration();
   virtual bool DumpData(session::KernelGraph *graph, Debugger *debugger = nullptr);
-  virtual bool Load(session::KernelGraph *graph);
-  virtual bool Run(session::KernelGraph *graph, Debugger *debugger = nullptr) = 0;
+  virtual bool Load(session::KernelGraph *graph, bool is_task_sink);
+  virtual bool Run(session::KernelGraph *graph, bool is_task_sink, Debugger *debugger = nullptr) = 0;
   bool LaunchKernel(const session::KernelGraph *graph);
   bool LaunchTaskBasedOnSingleKernel(kernel::KernelModPtr kernel_mod_ptr, const AddressPtrList &kernel_inputs,
                                      const AddressPtrList &kernel_outputs,
...
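Note: with the signature change above, callers pass is_task_sink through the unified Run entry point. A hypothetical call site for illustration (the runtime lookup and the flag value are assumptions, not part of this diff):

// Sketch: obtain the registered GPU runtime and run one step of the graph.
bool RunGraphOnGpu(session::KernelGraph *kernel_graph, uint32_t device_id) {
  auto runtime = device::KernelRuntimeManager::Instance().GetKernelRuntime(kGPUDevice, device_id);
  MS_EXCEPTION_IF_NULL(runtime);
  // The GPU backend launches kernel by kernel here, so task sink is disabled (an assumption).
  return runtime->Run(kernel_graph, /*is_task_sink=*/false);
}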