Commit c0070d3d — magicwindyyd/mindspore (forked from MindSpore/mindspore)
Authored on Sep 07, 2020 by Zhang Qinghua
Parent: 77dd91a6

    Use the unified Execute function to run Graph or Single Op Graph.
Showing 12 changed files with 1,291 additions and 1,298 deletions (+1291 / -1298)
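The commit collapses the per-mode run helpers into a single Execute entry point that tells the kernel runtime whether task sink may be used. A minimal sketch of that dispatch shape, with stand-in types (KernelGraph, ContextEnableTaskSink, and this Run signature are placeholders for illustration, not the MindSpore API):

    #include <iostream>
    #include <memory>

    // Hypothetical stand-ins; only the control flow mirrors the commit.
    struct KernelGraph {};
    bool ContextEnableTaskSink() { return true; }  // cf. get_param<bool>(MS_CTX_ENABLE_TASK_SINK)

    struct KernelRuntime {
      // After the commit, Run() takes the task-sink decision from the caller.
      bool Run(KernelGraph *graph, bool is_task_sink) {
        (void)graph;
        std::cout << (is_task_sink ? "run graph as a sunk task\n" : "launch kernels one by one\n");
        return true;
      }
    };

    // Session-level Execute: a whole graph may use task sink (is_task = true);
    // a single-op graph never does (is_task = false).
    bool Execute(KernelRuntime *rt, const std::shared_ptr<KernelGraph> &graph, bool is_task) {
      bool is_task_sink = is_task && ContextEnableTaskSink();
      return rt->Run(graph.get(), is_task_sink);
    }

    int main() {
      KernelRuntime rt;
      auto graph = std::make_shared<KernelGraph>();
      Execute(&rt, graph, true);   // RunGraph path
      Execute(&rt, graph, false);  // RunOp path
      return 0;
    }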
Changed files:

  mindspore/ccsrc/backend/session/ascend_session.cc                (+14 / -16)
  mindspore/ccsrc/backend/session/ascend_session.h                 (+3 / -2)
  mindspore/ccsrc/backend/session/cpu_session.cc                   (+1 / -1)
  mindspore/ccsrc/backend/session/gpu_session.cc                   (+2 / -2)
  mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc   (+2 / -8)
  mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h    (+2 / -2)
  mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.cc         (+1 / -1)
  mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.h          (+1 / -1)
  mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc         (+1150 / -1150)
  mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.h          (+112 / -112)
  mindspore/ccsrc/runtime/device/kernel_runtime.cc                 (+1 / -1)
  mindspore/ccsrc/runtime/device/kernel_runtime.h                  (+2 / -2)
mindspore/ccsrc/backend/session/ascend_session.cc

@@ -318,7 +318,7 @@ void AscendSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::
 #endif
   {
     // run task on device
-    Execute(kernel_graph);
+    Execute(kernel_graph, true);
   }
   // summary
   Summary(kernel_graph.get());

@@ -348,17 +348,6 @@ void AscendSession::RunOpHardwareOptimize(const std::shared_ptr<session::KernelG
   MS_LOG(INFO) << "Finish";
 }
-void AscendSession::RunOpExecTask(const std::shared_ptr<KernelGraph> &kernel_graph) const {
-  MS_LOG(INFO) << "Start!";
-  auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
-  MS_EXCEPTION_IF_NULL(runtime_instance);
-  bool ret_ok = runtime_instance->LaunchKernel(kernel_graph.get());
-  if (!ret_ok) {
-    MS_LOG(EXCEPTION) << "Run task error!";
-  }
-  MS_LOG(INFO) << "Finish!";
-}
 bool AscendSession::GraphCacheExist(const GraphInfo &graph_info) const {
   return run_op_graphs_.find(graph_info) != run_op_graphs_.end();
 }

@@ -398,7 +387,7 @@ void AscendSession::RunOp(const OpRunInfo &op_run_info, const GraphInfo &graph_i
   // load input data to device
   LoadInputData(graph, input_tensors);
   // run op
-  RunOpExecTask(graph);
+  Execute(graph, false);
   // get output
   if (op_run_info.value != nullptr) {
     std::vector<tensor::TensorPtr> pre_output_tensors;

@@ -552,21 +541,30 @@ void AscendSession::RunOpMemoryClear(const KernelGraph *kernel_graph) const {
 void AscendSession::Load(const std::shared_ptr<KernelGraph> &kernel_graph) const {
   MS_LOG(INFO) << "Start!";
+  auto context_ptr = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(context_ptr);
+  bool is_task_sink = context_ptr->get_param<bool>(MS_CTX_ENABLE_TASK_SINK);
   (void)device::KernelAdjust::GetInstance().StepLoadCtrlInputs(kernel_graph);
   auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
   MS_EXCEPTION_IF_NULL(runtime_instance);
-  bool ret_ok = runtime_instance->Load(kernel_graph.get());
+  bool ret_ok = runtime_instance->Load(kernel_graph.get(), is_task_sink);
   if (!ret_ok) {
     MS_LOG(EXCEPTION) << "Load task error!";
   }
   MS_LOG(INFO) << "Finish!";
 }
-void AscendSession::Execute(const std::shared_ptr<KernelGraph> &kernel_graph) const {
+void AscendSession::Execute(const std::shared_ptr<KernelGraph> &kernel_graph, bool is_task) const {
   MS_LOG(INFO) << "Start!";
+  bool is_task_sink = false;
+  if (is_task) {
+    auto context_ptr = MsContext::GetInstance();
+    MS_EXCEPTION_IF_NULL(context_ptr);
+    is_task_sink = context_ptr->get_param<bool>(MS_CTX_ENABLE_TASK_SINK);
+  }
   auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
   MS_EXCEPTION_IF_NULL(runtime_instance);
-  bool ret_ok = runtime_instance->Run(kernel_graph.get());
+  bool ret_ok = runtime_instance->Run(kernel_graph.get(), is_task_sink);
   if (!ret_ok) {
     MS_LOG(EXCEPTION) << "run task error!";
   }
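With these hunks, graph-mode and single-op execution converge on one entry point: RunGraph calls Execute(kernel_graph, true) (task sink allowed when MS_CTX_ENABLE_TASK_SINK is set), RunOp calls Execute(graph, false), and the per-op RunOpExecTask helper disappears. The is_task_sink decision now travels into KernelRuntime::Run and Load as an argument instead of being re-read from MsContext inside the runtime.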
mindspore/ccsrc/backend/session/ascend_session.h

@@ -13,8 +13,10 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 #ifndef MINDSPORE_CCSRC_BACKEND_SESSION_ASCEND_SESSION_H
 #define MINDSPORE_CCSRC_BACKEND_SESSION_ASCEND_SESSION_H
 #include <unordered_map>
 #include <string>
 #include <memory>

@@ -82,13 +84,12 @@ class AscendSession : public SessionBasic {
                         KernelGraph *kernel_graph) const;
   void RunOpMemoryClear(const KernelGraph *kernel_graph) const;
   void Load(const std::shared_ptr<KernelGraph> &kernel_graph) const;
-  void Execute(const std::shared_ptr<KernelGraph> &kernel_graph) const;
+  void Execute(const std::shared_ptr<KernelGraph> &kernel_graph, bool is_task) const;
   void Dump(const std::shared_ptr<KernelGraph> &kernel_graph) const;
   void DumpAllGraphs(const std::vector<KernelGraphPtr> &all_graphs);
   void LoadTensor(const std::shared_ptr<KernelGraph> &kernel_graph) const;
   // below functions are used for run op
   void RunOpHardwareOptimize(const std::shared_ptr<session::KernelGraph> &kernel_graph) const;
-  void RunOpExecTask(const std::shared_ptr<KernelGraph> &kernel_graph) const;
   static void BackendOptimization(const std::vector<KernelGraphPtr> &all_graphs);
   static void LinkChildGraphs(NotNull<KernelGraphPtr> graph);
mindspore/ccsrc/backend/session/cpu_session.cc

@@ -118,7 +118,7 @@ void CPUSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::Ten
     debugger_->PreExecute(kernel_graph);
   }
 #endif
-  bool ret = runtime_.Run(kernel_graph.get());
+  bool ret = runtime_.Run(kernel_graph.get(), false);
   if (!ret) {
     MS_LOG(EXCEPTION) << "Run graph failed";
   }
mindspore/ccsrc/backend/session/gpu_session.cc

@@ -191,9 +191,9 @@ void GPUSession::Execute(const std::shared_ptr<KernelGraph> &kernel_graph) const
   auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
   MS_EXCEPTION_IF_NULL(runtime_instance);
 #ifdef ENABLE_DEBUGGER
-  if (!runtime_instance->Run(kernel_graph.get(), debugger_.get())) {
+  if (!runtime_instance->Run(kernel_graph.get(), false, debugger_.get())) {
 #else
-  if (!runtime_instance->Run(kernel_graph.get())) {
+  if (!runtime_instance->Run(kernel_graph.get(), false)) {
 #endif
     MS_LOG(EXCEPTION) << "GPU execute graph failed!";
   }
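Both non-Ascend sessions pass false for the new is_task_sink argument, which preserves the old behavior: the CPU runtime never consulted the task-sink flag, and GPU execution never went through the task-sink path, which appears to be specific to the Ascend backend.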
mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc

@@ -454,10 +454,7 @@ DeviceAddressPtr AscendKernelRuntime::CreateDeviceAddress(void *device_ptr, size
   return std::make_shared<AscendDeviceAddress>(device_ptr, device_size, format, type_id);
 }
-bool AscendKernelRuntime::Load(session::KernelGraph *graph) {
-  auto context_ptr = MsContext::GetInstance();
-  MS_EXCEPTION_IF_NULL(context_ptr);
-  bool is_task_sink = context_ptr->get_param<bool>(MS_CTX_ENABLE_TASK_SINK);
+bool AscendKernelRuntime::Load(session::KernelGraph *graph, bool is_task_sink) {
   if (!is_task_sink) {
     return true;
   }

@@ -609,17 +606,14 @@ void AscendKernelRuntime::DebugTaskIdName(GraphId graph_id) {
   }
 }
-bool AscendKernelRuntime::Run(session::KernelGraph *graph, Debugger *debugger) {
+bool AscendKernelRuntime::Run(session::KernelGraph *graph, bool is_task_sink, Debugger *debugger) {
   bool ret = false;
-  auto context_ptr = MsContext::GetInstance();
-  MS_EXCEPTION_IF_NULL(context_ptr);
 #if defined(_WIN32) || defined(_WIN64)
   auto start_time = std::chrono::steady_clock::now();
 #else
   struct timeval start_time, end_time;
   (void)gettimeofday(&start_time, nullptr);
 #endif
-  bool is_task_sink = context_ptr->get_param<bool>(MS_CTX_ENABLE_TASK_SINK);
   if (is_task_sink) {
     ret = RunTask(graph);
   } else {
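Run keeps its platform-conditional wall-clock measurement: std::chrono::steady_clock on Windows and gettimeofday elsewhere. A minimal sketch of the same microsecond measurement done portably with std::chrono alone (illustrative only, not part of the commit):

    #include <chrono>
    #include <cstdint>
    #include <iostream>

    int main() {
      auto start_time = std::chrono::steady_clock::now();
      // ... launch the graph here ...
      auto end_time = std::chrono::steady_clock::now();
      // duration_cast truncates to whole microseconds, matching the us log unit above.
      uint64_t cost_us =
        std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
      std::cout << "run graph in " << cost_us << " us\n";
      return 0;
    }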
mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h

@@ -44,8 +44,8 @@ class AscendKernelRuntime : public KernelRuntime {
   bool GenTask(const session::KernelGraph *graph);
   bool LoadTask(const session::KernelGraph *graph);
   bool RunTask(const session::KernelGraph *graph);
-  bool Load(session::KernelGraph *graph) override;
-  bool Run(session::KernelGraph *graph, Debugger *debugger = nullptr) override;
+  bool Load(session::KernelGraph *graph, bool is_task_sink) override;
+  bool Run(session::KernelGraph *graph, bool is_task_sink, Debugger *debugger = nullptr) override;
   void ClearGraphRuntimeResource(uint32_t graph_id, const std::vector<AnfNodePtr> &inputs,
                                  const std::unordered_set<ValueNodePtr> &value_nodes,
                                  const std::vector<CNodePtr> &execution_order) override;
mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.cc

@@ -287,7 +287,7 @@ void CPUKernelRuntime::DecreaseSummaryRefCount(const session::NamedSummaryOutput
   resource_manager_.DecreaseSummaryRefCount(summary_outputs);
 }
-bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph, Debugger *debugger) {
+bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph, bool is_task_sink, Debugger *debugger) {
   MS_EXCEPTION_IF_NULL(kernel_graph);
   resource_manager_.IncreaseAddressRefCount(kernel_graph);
mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.h

@@ -36,7 +36,7 @@ class CPUKernelRuntime : public KernelRuntime {
   ~CPUKernelRuntime() override = default;
   bool Init() override { return true; }
-  bool Run(session::KernelGraph *graph, Debugger *debugger = nullptr) override;
+  bool Run(session::KernelGraph *graph, bool is_task_sink, Debugger *debugger = nullptr) override;
   void AssignKernelAddress(session::KernelGraph *kernel_graph);
   void BindInputOutput(session::KernelGraph *kernel_graph, const std::vector<tensor::TensorPtr> &inputs,
                        VectorRef *outputs);
mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc

/**
 * Copyright 2019 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "runtime/device/gpu/gpu_kernel_runtime.h"
#include <algorithm>
#include "runtime/device/gpu/gpu_device_address.h"
#include "runtime/device/gpu/cuda_driver.h"
#include "runtime/device/gpu/gpu_buffer_mgr.h"
#include "runtime/device/gpu/gpu_device_manager.h"
#include "runtime/device/gpu/gpu_memory_allocator.h"
#include "runtime/device/gpu/distribution/collective_init.h"
#include "utils/convert_utils.h"
#include "utils/ms_context.h"
#include "runtime/device/kernel_runtime_manager.h"
#include "runtime/device/gpu/gpu_common.h"
#include "utils/ms_utils.h"
#include "runtime/device/gpu/gpu_memory_manager.h"
#include "backend/kernel_compiler/common_utils.h"
#include "runtime/device/gpu/gpu_memory_copy_manager.h"
#include "common/trans.h"
#include "ir/dtype.h"
#include "profiler/device/gpu/gpu_profiling.h"
#include "utils/shape_utils.h"
#ifdef ENABLE_DEBUGGER
#include "debug/debug_services.h"
#endif

namespace mindspore {
namespace device {
namespace gpu {
using mindspore::device::memswap::MemSwapInfoSet;
using mindspore::device::memswap::MemSwapManager;
using mindspore::device::memswap::SwapKind;
static const size_t PARAMETER_OUTPUT_INDEX = 0;

bool GPUKernelRuntime::SyncStream() { return GPUDeviceManager::GetInstance().SyncStream(stream_); }

bool GPUKernelRuntime::Init() {
  if (device_init_ == true) {
    GPUMemoryAllocator::GetInstance().CheckMaxDeviceMemory();
    return true;
  }
  bool ret = false;
#ifdef ENABLE_DUMP_E2E
  ret = SetDumpConf();
  if (!ret) {
    MS_LOG(INFO) << "No dump conf to set!";
  }
#endif
  ret = InitDevice();
  if (!ret) {
    MS_LOG(ERROR) << "InitDevice error.";
    return ret;
  }
  mem_manager_ = std::make_shared<GPUMemoryManager>();
  MS_EXCEPTION_IF_NULL(mem_manager_);
  mem_manager_->MallocDeviceMemory();
  const void *collective_handle_ = CollectiveInitializer::instance().collective_handle();
  bool collective_inited = CollectiveInitializer::instance().collective_inited();
  if (collective_inited && collective_handle_ != nullptr) {
    auto init_nccl_comm_funcptr =
      reinterpret_cast<InitNCCLComm>(dlsym(const_cast<void *>(collective_handle_), "InitNCCLComm"));
    MS_EXCEPTION_IF_NULL(init_nccl_comm_funcptr);
    (*init_nccl_comm_funcptr)();
  }
  device_init_ = true;
  return ret;
}

#ifdef ENABLE_DUMP_E2E
namespace {
void DumpOutput(mindspore::session::KernelGraph *graph, const string &dump_path, DumpConfPtr dump_conf,
                Debugger *debugger) {
  MS_EXCEPTION_IF_NULL(graph);
  MS_EXCEPTION_IF_NULL(dump_conf);
  bool trans_flag = dump_conf->trans_flag();
  const auto &apply_kernels = graph->execution_order();
  for (const auto &node : apply_kernels) {
    MS_EXCEPTION_IF_NULL(node);
    auto node_name = AnfAlgo::GetCNodeName(node);
    std::string kernel_name = node->fullname_with_scope();
    if (!dump_conf->IsKernelNeedDump(kernel_name)) {
      continue;
    }
    const std::string strsrc = "/";
    const std::string strdst = "--";
    std::string::size_type pos = 0;
    std::string::size_type srclen = strsrc.size();
    std::string::size_type dstlen = strdst.size();
    while ((pos = kernel_name.find(strsrc, pos)) != std::string::npos) {
      kernel_name.replace(pos, srclen, strdst);
      pos += dstlen;
    }
    auto output_size = AnfAlgo::GetOutputTensorNum(node);
    for (size_t j = 0; j < output_size; ++j) {
      auto addr = AnfAlgo::GetOutputAddr(node, j);
      TypeId addr_type_id = addr->type_id();
      std::string addr_format = addr->format();
      ShapeVector int_shapes;
      if (trans_flag) {
        int_shapes = trans::GetRuntimePaddingShape(node, j);
      } else {
        auto shape = AnfAlgo::GetOutputDeviceShape(node, j);
        (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
                             [](size_t inner_item) { return SizeToInt(inner_item); });
      }
      auto type = AnfAlgo::GetOutputInferDataType(node, j);
      auto format = kOpFormat_DEFAULT;
      string filepath = dump_path + '/' + kernel_name + '_' + "output_" + std::to_string(j);
      DebugServices *debug_services = debugger->debug_services();
      TensorLoader *tensor_loader = debug_services->tensor_loader();
      std::string original_kernel_name = node->fullname_with_scope();
      size_t slot = j;
      auto ret = tensor_loader->DumpTensorToFile(original_kernel_name, trans_flag, filepath, format, int_shapes, type,
                                                 addr_type_id, addr_format, slot);
      if (!ret) {
        std::string error = "DumpTensorToFile Failed: flag:" + std::to_string(trans_flag) + ", path:" + filepath +
                            ", host_format:" + format + ".!";
      }
    }
  }
}

void DumpParameters(mindspore::session::KernelGraph *graph, const string &dump_path, DumpConfPtr dump_conf,
                    Debugger *debugger) {
  MS_EXCEPTION_IF_NULL(graph);
  MS_EXCEPTION_IF_NULL(dump_conf);
  bool trans_flag = dump_conf->trans_flag();
  const auto &parameters = graph->inputs();
  for (auto &item : parameters) {
    if (!item->isa<Parameter>()) {
      continue;
    }
    std::string parameter_name = item->fullname_with_scope();
    if (!dump_conf->IsKernelNeedDump(parameter_name)) {
      continue;
    }
    auto addr = AnfAlgo::GetOutputAddr(item, PARAMETER_OUTPUT_INDEX);
    TypeId addr_type_id = addr->type_id();
    std::string addr_format = addr->format();
    ShapeVector int_shapes;
    if (trans_flag) {
      int_shapes = trans::GetRuntimePaddingShape(item, PARAMETER_OUTPUT_INDEX);
    } else {
      auto shape = AnfAlgo::GetOutputDeviceShape(item, PARAMETER_OUTPUT_INDEX);
      (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
                           [](size_t inner_item) { return SizeToInt(inner_item); });
    }
    auto type = AnfAlgo::GetOutputInferDataType(item, PARAMETER_OUTPUT_INDEX);
    auto format = kOpFormat_DEFAULT;
    string filepath = dump_path + '/' + parameter_name + '_' + "output_0";
    DebugServices *debug_services = debugger->debug_services();
    TensorLoader *tensor_loader = debug_services->tensor_loader();
    std::string original_kernel_name = parameter_name;
    size_t slot = 0;
    auto ret = tensor_loader->DumpTensorToFile(original_kernel_name, trans_flag, filepath, format, int_shapes, type,
                                               addr_type_id, addr_format, slot);
    if (!ret) {
      std::string error = "DumpTensorToFile Failed: flag:" + std::to_string(trans_flag) + ", path:" + filepath +
                          ", host_format:" + format + ".!";
    }
  }
}
}  // namespace

bool GPUKernelRuntime::DumpData(mindspore::session::KernelGraph *graph, Debugger *debugger) {
  MS_EXCEPTION_IF_NULL(graph);
  MS_LOG(INFO) << "Start dump step";
  DumpConfPtr dump_conf = GetDumpConf();
  MS_EXCEPTION_IF_NULL(dump_conf);
  dump_conf->UpdataCurIter();
  bool dump_flag = dump_conf->dump_enable();
  if (!dump_flag) {
    MS_LOG(INFO) << "Dump flag is disable, pass dump step";
    return true;
  }
  uint32_t cur_iter = dump_conf->cur_iter();
  if (dump_conf->dump_iter() != 0) {
    if (cur_iter != dump_conf->dump_iter()) {
      return true;
    }
  }
  MS_LOG(INFO) << "Cur iter is " << cur_iter;
  std::string net_name = dump_conf->dump_net_name();
  std::string iterator = std::to_string(cur_iter);
  std::string dump_path = dump_conf->dump_path();
  if (dump_path.back() == '/') {
    dump_path = dump_path + net_name + '/' + iterator;
  } else {
    dump_path = dump_path + '/' + net_name + '/' + iterator;
  }
  // dump output
  DumpOutput(graph, dump_path, dump_conf, debugger);
  // dump parameters
  DumpParameters(graph, dump_path, dump_conf, debugger);
  return true;
}
#endif

#ifdef ENABLE_DEBUGGER
namespace {
void LoadKernelData(Debugger *debugger, const CNodePtr &kernel,
                    const std::vector<mindspore::kernel::AddressPtr> &kernel_inputs,
                    const std::vector<mindspore::kernel::AddressPtr> &kernel_workspaces,
                    const std::vector<mindspore::kernel::AddressPtr> &kernel_outputs, int exec_order, void *stream_ptr,
                    bool dump_enabled) {
  // check if we should read the kernel data
  bool read_data = false;
  std::string kernel_name = kernel->fullname_with_scope();
  if (debugger) {
    debugger->SetCurNode(kernel_name);
    if (dump_enabled) {
      read_data = true;
    } else if (debugger->debugger_enabled()) {
      read_data = debugger->ReadNodeDataRequired();
    }
  }
  if (!read_data) {
    return;
  }
  // get inputs
  auto input_size = AnfAlgo::GetInputTensorNum(kernel);
  for (size_t j = 0; j < input_size; ++j) {
    auto input_kernel = kernel->input(j + 1);
    std::string input_kernel_name = input_kernel->fullname_with_scope();
    auto addr = kernel_inputs[j];
    auto type = AnfAlgo::GetOutputInferDataType(input_kernel, PARAMETER_OUTPUT_INDEX);
    auto format = kOpFormat_DEFAULT;
    auto gpu_addr = std::make_unique<GPUDeviceAddress>(addr->addr, addr->size, format, type);
    string input_tensor_name = input_kernel_name + ':' + "0";
    ShapeVector int_shapes;
    auto shape = AnfAlgo::GetOutputDeviceShape(input_kernel, PARAMETER_OUTPUT_INDEX);
    (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
                         [](size_t inner_item) { return SizeToInt(inner_item); });
    auto ret = gpu_addr->LoadMemToHost(input_tensor_name, exec_order, format, int_shapes, type, 0, debugger, false);
    if (!ret) {
      MS_LOG(ERROR) << "LoadMemToHost:"
                    << ", tensor_name:" << input_tensor_name << ", host_format:" << format << ".!";
    }
  }
  // get outputs
  auto output_size = AnfAlgo::GetOutputTensorNum(kernel);
  for (size_t j = 0; j < output_size; ++j) {
    auto addr = kernel_outputs[j];
    auto type = AnfAlgo::GetOutputInferDataType(kernel, j);
    auto format = kOpFormat_DEFAULT;
    auto gpu_addr = std::make_unique<GPUDeviceAddress>(addr->addr, addr->size, format, type);
    string tensor_name = kernel_name + ':' + std::to_string(j);
    ShapeVector int_shapes;
    auto shape = AnfAlgo::GetOutputDeviceShape(kernel, j);
    (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
                         [](size_t inner_item) { return SizeToInt(inner_item); });
    auto ret = gpu_addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, j, debugger, false);
    if (!ret) {
      MS_LOG(ERROR) << "LoadMemToHost:"
                    << ", tensor_name:" << tensor_name << ", host_format:" << format << ".!";
    }
  }
  debugger->PostExecuteNode();
}

void UpdateStepNum(Debugger *debugger, bool dump_enabled) {
  if (debugger && (debugger->debugger_enabled() || dump_enabled)) {
    auto cur_step_num = debugger->step_num();
    cur_step_num = cur_step_num + 1;
    debugger->SetStepNum(cur_step_num);
  }
}

void LoadParameters(const session::KernelGraph *graph, Debugger *debugger, bool dump_enabled) {
  MS_EXCEPTION_IF_NULL(graph);
  if (!(debugger && dump_enabled)) {
    return;
  }
  const auto &parameters = graph->inputs();
  // for parameters, set its execution order to be 0;
  int exec_order = 0;
  for (auto &item : parameters) {
    if (!item->isa<Parameter>()) {
      continue;
    }
    std::string parameter_name = item->fullname_with_scope();
    auto addr = AnfAlgo::GetOutputAddr(item, PARAMETER_OUTPUT_INDEX);
    auto type = AnfAlgo::GetOutputInferDataType(item, PARAMETER_OUTPUT_INDEX);
    auto format = kOpFormat_DEFAULT;
    string tensor_name = parameter_name + ':' + "0";
    auto gpu_addr = dynamic_cast<const mindspore::device::gpu::GPUDeviceAddress *>(addr);
    ShapeVector int_shapes;
    auto shape = AnfAlgo::GetOutputDeviceShape(item, PARAMETER_OUTPUT_INDEX);
    (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
                         [](size_t inner_item) { return SizeToInt(inner_item); });
    auto ret = gpu_addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, 0, debugger, true);
    if (!ret) {
      MS_LOG(ERROR) << "LoadMemToHost:"
                    << ", tensor_name:" << tensor_name << ", host_format:" << format << ".!";
    }
  }
}

void ClearCurrentData(Debugger *debugger, bool dump_enabled) {
  if (debugger && (debugger->debugger_enabled() || dump_enabled)) {
    DebugServices *debug_services = debugger->debug_services();
    TensorLoader *tensor_loader = debug_services->tensor_loader();
    tensor_loader->EmptyCurrentTensor();
  }
}
}  // namespace
#endif

DeviceAddressPtr GPUKernelRuntime::CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
                                                       TypeId type_id) {
  return std::make_shared<GPUDeviceAddress>(device_ptr, device_size, format, type_id);
}

bool GPUKernelRuntime::InitDevice() {
  if (GPUDeviceManager::GetInstance().device_count() <= 0) {
    MS_LOG(ERROR) << "No GPU device found.";
    return false;
  }
  const void *collective_handle_ = CollectiveInitializer::instance().collective_handle();
  bool collective_inited = CollectiveInitializer::instance().collective_inited();
  if (collective_inited && collective_handle_ != nullptr) {
    auto get_local_rank_funcptr =
      reinterpret_cast<GetLocalRankId>(dlsym(const_cast<void *>(collective_handle_), "local_rank_id"));
    MS_EXCEPTION_IF_NULL(get_local_rank_funcptr);
    device_id_ = IntToUint((*get_local_rank_funcptr)());
  }
  if (!GPUDeviceManager::GetInstance().is_device_id_init()) {
    if (!GPUDeviceManager::GetInstance().set_cur_device_id(device_id_)) {
      MS_LOG(ERROR) << "Failed to set current device to " << SizeToInt(device_id_);
      return false;
    }
  }
  GPUDeviceManager::GetInstance().InitDevice();
  stream_ = GPUDeviceManager::GetInstance().default_stream();
  if (stream_ == nullptr) {
    MS_LOG(ERROR) << "No default CUDA stream found.";
    return false;
  }
  return true;
}

void GPUKernelRuntime::ReleaseDeviceRes() {
  // For dataset mode.
  if (GpuBufferMgr::GetInstance().IsInit()) {
    if (!GpuBufferMgr::GetInstance().IsClosed()) {
      if (!GpuBufferMgr::GetInstance().CloseNotify()) {
        MS_LOG(EXCEPTION) << "Could not close gpu data queue.";
      }
    }
    CHECK_OP_RET_WITH_EXCEPT(GpuBufferMgr::GetInstance().Destroy(), "Could not destroy gpu data queue.");
  }
  // Destroy remaining memory swap events and free host memory.
  for (auto &item : mem_swap_map_) {
    auto &mem_swap_manager = item.second;
    MS_EXCEPTION_IF_NULL(mem_swap_manager);
    if (mem_swap_manager->trigger_swap()) {
      mem_swap_manager->ClearSwapQueue(false);
      mem_swap_manager->ReleaseHostPinnedMem();
    }
  }
  GPUDeviceManager::GetInstance().ReleaseDevice();
  if (mem_manager_ != nullptr) {
    mem_manager_->FreeDeviceMemory();
  }
  kernel::KernelMeta *bin_map = kernel::KernelMeta::GetInstance();
  MS_EXCEPTION_IF_NULL(bin_map);
  bin_map->RemoveKernelCache();
}

void GPUKernelRuntime::ClearGraphRuntimeResource(uint32_t graph_id, const std::vector<AnfNodePtr> &inputs,
                                                 const std::unordered_set<ValueNodePtr> &value_nodes,
                                                 const std::vector<CNodePtr> &execution_order) {
  MS_LOG(INFO) << "Clear graph:" << graph_id << " GPU runtime resource";
  // Release the kernel resource.
  for (const auto &kernel : execution_order) {
    auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
    if (kernel_mod == nullptr) {
      continue;
    }
    kernel_mod->ReleaseResource();
  }
  // Clear the output address of graph.
  ClearOutputAddress(inputs, value_nodes, execution_order);
}

void GPUKernelRuntime::AssignMemory(session::KernelGraph *graph) {
  auto context_ptr = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context_ptr);
  MS_EXCEPTION_IF_NULL(mem_manager_);
  mem_manager_->ResetDynamicMemory();
  AssignStaticMemoryInput(graph);
  AssignStaticMemoryValueNode(graph);
  bool is_enable_dynamic_mem = context_ptr->get_param<bool>(MS_CTX_ENABLE_DYNAMIC_MEM_POOL);
  if (is_enable_dynamic_mem) {
    // Use the dynamic memory pool.
    InitKernelRefCount(graph);
    InitMemorySwapInfo(graph);
    InitKernelOutputAddress(graph);
    InitKernelWorkspaceAddress(graph);
    SaveGraphOutputNode(graph);
  } else {
    AssignDynamicMemory(graph);
  }
}

bool GPUKernelRuntime::Run(session::KernelGraph *graph, Debugger *debugger) {
  struct timeval start_time, end_time;
  (void)gettimeofday(&start_time, nullptr);
  bool ret = true;
  auto context_ptr = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context_ptr);
  bool is_enable_dynamic_mem = context_ptr->get_param<bool>(MS_CTX_ENABLE_DYNAMIC_MEM_POOL);
  bool is_enable_pynative_infer = context_ptr->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_INFER);
  if (is_enable_dynamic_mem && !is_enable_pynative_infer) {
    auto graph_id = graph->graph_id();
    auto iter = mem_swap_map_.find(graph_id);
    if (iter == mem_swap_map_.end()) {
      MS_LOG(EXCEPTION) << "Find memory swap map failed.";
    }
    mem_swap_manager_ = iter->second;
    MS_EXCEPTION_IF_NULL(mem_swap_manager_);
    auto mem_reuse_iter = mem_reuse_util_map_.find(graph_id);
    if (mem_reuse_iter == mem_reuse_util_map_.end()) {
      MS_LOG(EXCEPTION) << "Find memory reuse map failed.";
    }
    mem_reuse_util_ = mem_reuse_iter->second;
    MS_EXCEPTION_IF_NULL(mem_reuse_util_);
    ret = RunOneStep(graph, debugger);
  } else {
    ret = LaunchKernel(graph);
  }
  (void)gettimeofday(&end_time, nullptr);
  const uint64_t kUSecondInSecond = 1000000;
  uint64_t cost = kUSecondInSecond * static_cast<uint64_t>(end_time.tv_sec - start_time.tv_sec);
  cost += static_cast<uint64_t>(end_time.tv_usec - start_time.tv_usec);
  MS_LOG(DEBUG) << "GPU kernel runtime run graph in " << cost << " us";
  return ret;
}

bool GPUKernelRuntime::RunOneStep(const session::KernelGraph *graph, Debugger *debugger) {
  bool ret = true;
  auto graph_id = graph->graph_id();
  if (!is_first_step_map_[graph_id]) {
    // Normally run graph
    ret = LaunchKernelDynamic(graph, debugger);
  } else {
    // Mock run first step
    ret = LaunchKernelDynamic(graph, debugger, true, false);
    if (ret) {
      // Normally run graph
      ret = LaunchKernelDynamic(graph, debugger);
    } else {
      // Trigger memory swap
      ret = SearchMemSwapScheme(graph, debugger);
    }
    is_first_step_map_[graph_id] = false;
  }
  return ret;
}

bool GPUKernelRuntime::SearchMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger) {
  MS_LOG(WARNING) << "Run out of memory and try memory swapping, it may take some time, please wait a moment.";
  bool ret = false;
  ClearKernelOldOutputAndWorkspace(graph);
  if (!mem_swap_manager_->mem_swap_init()) {
    if (!mem_swap_manager_->Init(graph)) {
      return false;
    }
  }
  while (!ret) {
    if (!mem_swap_manager_->RetreatSwapInfo()) {
      return false;
    }
    ret = LaunchKernelDynamic(graph, debugger, true, false);
    if (!ret) {
      ClearKernelOldOutputAndWorkspace(graph);
    }
  }
  mem_swap_manager_->AssignHostMemory();
  // Time profiling
  ret = LaunchKernelDynamic(graph, debugger, false, true);
  if (!ret) {
    return ret;
  }
  return RefineMemSwapScheme(graph, debugger);
}

bool GPUKernelRuntime::RefineMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger) {
  MS_LOG(WARNING) << "Refine memory swap scheme, it may take some time, please wait a moment.";
  auto &kernels = graph->execution_order();
  for (const auto &kernel : kernels) {
    if (!mem_swap_manager_->QueryKernelTriggerSwapIn(kernel)) {
      continue;
    }
    size_t swap_in_task_num = mem_swap_manager_->QueryKernelTriggerSwapInTaskNum(kernel);
    for (size_t swap_in_task_idx = 0; swap_in_task_idx < swap_in_task_num; swap_in_task_idx++) {
      bool ret = false;
      while (!ret) {
        mem_swap_manager_->AdjustSwapInPos(kernel, swap_in_task_idx);
        ret = LaunchKernelDynamic(graph, debugger, true, false);
        if (!ret) {
          ClearKernelOldOutputAndWorkspace(graph);
          ClearSwapInfo(true);
        }
      }
    }
  }
  return true;
}

void GPUKernelRuntime::InitKernelRefCount(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  MemReuseUtilPtr mem_reuse_util_ptr = std::make_shared<memreuse::MemReuseUtil>();
  MS_EXCEPTION_IF_NULL(mem_reuse_util_ptr);
  // Init the kernel reference count.
  if (!mem_reuse_util_ptr->InitDynamicKernelRef(graph)) {
    MS_LOG(EXCEPTION) << "Init kernel reference count failed";
  }
  mem_reuse_util_ptr->SetKernelDefMap();
  mem_reuse_util_ptr->SetReuseRefCount();
  // Can't free the device address of graph output, so set the reference count of graph output specially.
  mem_reuse_util_ptr->SetGraphOutputRefCount();
  // Can't free the device address of summary nodes, so set the reference count of summary nodes specially.
  mem_reuse_util_ptr->SetSummaryNodesRefCount();
  auto graph_id = graph->graph_id();
  mem_reuse_util_map_[graph_id] = mem_reuse_util_ptr;
}

void GPUKernelRuntime::InitMemorySwapInfo(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  GPUMemCopyManagerPtr gpu_mem_copy_manager = std::make_shared<GPUMemCopyManager>();
  MS_EXCEPTION_IF_NULL(gpu_mem_copy_manager);
  MemSwapManagerPtr mem_swap_manager = std::make_shared<MemSwapManager>(gpu_mem_copy_manager);
  MS_EXCEPTION_IF_NULL(mem_swap_manager);
  auto graph_id = graph->graph_id();
  mem_swap_map_[graph_id] = mem_swap_manager;
  is_first_step_map_[graph_id] = true;
}

void GPUKernelRuntime::InitKernelOutputAddress(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  auto &kernels = graph->execution_order();
  for (const auto &kernel : kernels) {
    auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
    MS_EXCEPTION_IF_NULL(kernel_mod);
    auto output_sizes = kernel_mod->GetOutputSizeList();
    for (size_t i = 0; i < output_sizes.size(); ++i) {
      if (AnfAlgo::OutputAddrExist(kernel, i)) {
        continue;
      }
      std::string output_format = AnfAlgo::GetOutputFormat(kernel, i);
      auto output_type = AnfAlgo::GetOutputDeviceDataType(kernel, i);
      auto device_address = CreateDeviceAddress(nullptr, output_sizes[i], output_format, output_type);
      AnfAlgo::SetOutputAddr(device_address, i, kernel.get());
    }
  }
}

void GPUKernelRuntime::InitKernelWorkspaceAddress(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  auto &kernels = graph->execution_order();
  for (const auto &kernel : kernels) {
    auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
    MS_EXCEPTION_IF_NULL(kernel_mod);
    auto workspace_sizes = kernel_mod->GetWorkspaceSizeList();
    for (size_t i = 0; i < workspace_sizes.size(); ++i) {
      auto device_address = CreateDeviceAddress(nullptr, workspace_sizes[i], "", kTypeUnknown);
      AnfAlgo::SetWorkspaceAddr(device_address, i, kernel.get());
    }
  }
}

void GPUKernelRuntime::SaveGraphOutputNode(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  auto graph_id = graph->graph_id();
  const auto &output_nodes = AnfAlgo::GetAllOutput(graph->output(), {prim::kPrimTupleGetItem});
  for (const auto &node : output_nodes) {
    graph_output_map_[graph_id].insert(node);
  }
}

bool GPUKernelRuntime::IsGraphOutput(const session::KernelGraph *graph, const mindspore::AnfNodePtr &kernel) const {
  MS_EXCEPTION_IF_NULL(graph);
  auto graph_id = graph->graph_id();
  auto iter = graph_output_map_.find(graph_id);
  if (iter == graph_output_map_.end()) {
    MS_LOG(EXCEPTION) << "Find graph output info failed.";
  }
  auto &graph_output_set = iter->second;
  return (graph_output_set.find(kernel) != graph_output_set.end());
}

void GPUKernelRuntime::ClearKernelOldOutputAndWorkspace(const session::KernelGraph *graph) {
  ClearKernelOutputAddress(graph);
  ClearKernelWorkspaceAddress(graph);
}

void GPUKernelRuntime::ClearKernelOutputAddress(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  auto &kernels = graph->execution_order();
  for (const auto &kernel : kernels) {
    if (IsGraphOutput(graph, kernel)) {
      continue;
    }
    auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
    MS_EXCEPTION_IF_NULL(kernel_mod);
    auto output_sizes = kernel_mod->GetOutputSizeList();
    for (size_t i = 0; i < output_sizes.size(); ++i) {
      if (!AnfAlgo::OutputAddrExist(kernel, i)) {
        continue;
      }
      auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i, false);
      MS_EXCEPTION_IF_NULL(device_address);
      if (device_address->ptr_) {
        mem_manager_->FreeMemFromMemPool(device_address);
      }
      device_address->set_status(DeviceAddressStatus::kInDevice);
    }
  }
}

void GPUKernelRuntime::ClearKernelWorkspaceAddress(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  auto &kernels = graph->execution_order();
  for (const auto &kernel : kernels) {
    auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
    MS_EXCEPTION_IF_NULL(kernel_mod);
    auto workspace_sizes = kernel_mod->GetWorkspaceSizeList();
    for (size_t i = 0; i < workspace_sizes.size(); ++i) {
      auto device_address = AnfAlgo::GetMutableWorkspaceAddr(kernel, i);
      MS_EXCEPTION_IF_NULL(device_address);
      if (device_address->ptr_) {
        mem_manager_->FreeMemFromMemPool(device_address);
      }
    }
  }
}

bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, Debugger *debugger, bool mock,
                                           bool profiling) {
  MS_EXCEPTION_IF_NULL(graph);
  MS_EXCEPTION_IF_NULL(mem_reuse_util_);
  // Reset the reference count.
  mem_reuse_util_->ResetDynamicUsedRefCount();
  // The inputs and outputs memory of communication kernel need be continuous, so separate processing.
  AllocCommunicationOpDynamicRes(graph);
#ifdef ENABLE_DEBUGGER
  bool dump_enabled = GPUKernelRuntime::DumpDataEnabledIteration();
  if (!mock) {
    UpdateStepNum(debugger, dump_enabled);
  }
#endif
  auto &kernels = graph->execution_order();
  int exec_order = 1;
  auto profiler_inst = profiler::gpu::GPUProfiler::GetInstance();
  MS_EXCEPTION_IF_NULL(profiler_inst);
  for (const auto &kernel : kernels) {
    auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
    MS_EXCEPTION_IF_NULL(kernel_mod);
    AddressPtrList kernel_inputs;
    AddressPtrList kernel_workspaces;
    AddressPtrList kernel_outputs;
    auto ret = AllocKernelDynamicRes(*kernel_mod, kernel, &kernel_inputs, &kernel_workspaces, &kernel_outputs, mock);
    if (!ret) {
#ifdef ENABLE_DEBUGGER
      if (!mock) {
        // invalidate current data collected by the debugger
        ClearCurrentData(debugger, dump_enabled);
      }
#endif
      return false;
    }
    if (!mock) {
      if (!profiling) {
        if (profiler_inst->GetEnableFlag()) {
          profiler_inst->OpDataProducerBegin(kernel->fullname_with_scope(), stream_);
        }
        CHECK_OP_RET_WITH_EXCEPT(kernel_mod->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, stream_),
                                 "Launch kernel failed.");
        if (profiler_inst->GetEnableFlag()) {
          profiler_inst->OpDataProducerEnd();
          if (profiler_inst->GetSyncEnableFlag()) {
            CHECK_OP_RET_WITH_ERROR(SyncStream(), "Profiler SyncStream failed.");
          }
        }
      } else {
        LaunchKernelWithTimeProfiling(kernel, kernel_inputs, kernel_workspaces, kernel_outputs);
      }
#ifdef ENABLE_DEBUGGER
      // called once per kernel to collect the outputs to the kernel (does a SyncDeviceToHost)
      LoadKernelData(debugger, kernel, kernel_inputs, kernel_workspaces, kernel_outputs, exec_order, stream_,
                     dump_enabled);
#endif
    }
    exec_order = exec_order + 1;
    FreeKernelDynamicRes(kernel);
    if (!UpdateMemorySwapTask(kernel, mock, profiling)) {
#ifdef ENABLE_DEBUGGER
      if (!mock) {
        // invalidate current data collected by the debugger
        ClearCurrentData(debugger, dump_enabled);
      }
#endif
      return false;
    }
  }
  if (!mock) {
#ifdef ENABLE_DEBUGGER
    // collect weights and bias for dump mode
    LoadParameters(graph, debugger, dump_enabled);
#endif
    CHECK_OP_RET_WITH_EXCEPT(SyncStream(), "SyncStream failed.");
  }
  ClearSwapInfo(mock);
  return true;
}

void GPUKernelRuntime::LaunchKernelWithTimeProfiling(const AnfNodePtr &kernel, const AddressPtrList &inputs,
                                                     const AddressPtrList &workspace, const AddressPtrList &outputs) {
  auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
  MS_EXCEPTION_IF_NULL(kernel_mod);
  float cost_time = 0;
  DeviceEvent start = nullptr;
  DeviceEvent end = nullptr;
  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::CreateEvent(&start), "Failed to create event.");
  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::CreateEvent(&end), "Failed to create event.");
  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::RecordEvent(start, stream_), "Failed to record event to stream.");
  CHECK_OP_RET_WITH_EXCEPT(kernel_mod->Launch(inputs, workspace, outputs, stream_), "Launch kernel failed.");
  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::RecordEvent(end, stream_), "Failed to record event to stream.");
  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::SyncEvent(start), "Failed to sync event.");
  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::SyncEvent(end), "Failed to sync event.");
  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::ElapsedTime(&cost_time, start, end), "Failed to record elapsed time.");
  mem_swap_manager_->AddKernelExecutionPerform(kernel, cost_time);
  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::DestroyEvent(start), "Failed to destroy event.");
  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::DestroyEvent(end), "Failed to destroy event.");
}

bool GPUKernelRuntime::AddMemorySwapTask(const AnfNodePtr &kernel, bool mock, bool profiling) {
  MS_EXCEPTION_IF_NULL(mem_swap_manager_);
  const MemSwapInfoSet &mem_swap_info_set = mem_swap_manager_->QueryKernelMemSwapInfo(kernel);
  for (auto &mem_swap_info : mem_swap_info_set) {
    auto need_swap_kernel = mem_swap_manager_->QueryKernelByTopoOrder(mem_swap_info.topo_order_);
    MS_EXCEPTION_IF_NULL(need_swap_kernel);
    const HostAddress &host_address =
      mem_swap_manager_->QueryKernelHostAddr(need_swap_kernel, mem_swap_info.output_idx_);
    auto device_address = AnfAlgo::GetMutableOutputAddr(need_swap_kernel, mem_swap_info.output_idx_, false);
    if (mem_swap_info.swap_kind_ == SwapKind::kDeviceToHost) {
      if (mem_swap_manager_->QueryKernelHostAddrIsDirty(need_swap_kernel, mem_swap_info.output_idx_)) {
        mem_swap_manager_->AddMemSwapTask(SwapKind::kDeviceToHost, device_address, host_address, mock);
        mem_swap_manager_->AddKernelHostAddrIsDirty(need_swap_kernel, mem_swap_info.output_idx_, false);
      } else {
        mem_manager_->FreeMemFromMemPool(device_address);
        device_address->set_status(DeviceAddressStatus::kInHost);
      }
    } else if (mem_swap_info.swap_kind_ == SwapKind::kHostToDevice) {
      auto status = device_address->status();
      if (status == DeviceAddressStatus::kInDeviceToHost) {
        device_address->set_status(DeviceAddressStatus::kInDevice);
      } else if (status == DeviceAddressStatus::kInHost) {
        if (!device_address->ptr_ && !AttemptMallocMem(device_address, device_address->size_, mock)) {
          return false;
        }
        float cost_time = 0;
        mem_swap_manager_->AddMemSwapTask(SwapKind::kHostToDevice, device_address, host_address, mock, profiling,
                                          &cost_time);
        if (profiling) {
          mem_swap_manager_->AddKernelSwapPerform(need_swap_kernel, mem_swap_info.output_idx_,
                                                  std::make_pair(0, cost_time));
        }
      }
    }
  }
  return true;
}

bool GPUKernelRuntime::UpdateMemorySwapTask(const AnfNodePtr &kernel, bool mock, bool profiling) {
  MS_EXCEPTION_IF_NULL(mem_swap_manager_);
  if (!mem_swap_manager_->trigger_swap()) {
    return true;
  }
  if (mem_swap_manager_->QueryKernelTriggerSwap(kernel)) {
    if (!mock) {
      CHECK_OP_RET_WITH_EXCEPT(SyncStream(), "SyncStream failed.");
    }
    if (!AddMemorySwapTask(kernel, mock, profiling)) {
      return false;
    }
    if (!mock) {
      CHECK_OP_RET_WITH_EXCEPT(mem_swap_manager_->SyncMemCopyStream(SwapKind::kDeviceToHost), "SyncCopyStream failed.");
    }
  }
  return true;
}

void GPUKernelRuntime::UpdateHostSwapInQueue(const DeviceAddressPtr device_address, bool mock) {
  MS_EXCEPTION_IF_NULL(mem_swap_manager_);
  if (!mem_swap_manager_->trigger_swap()) {
    return;
  }
  while (auto device_address_swap_in = mem_swap_manager_->UpdateSwapQueue(SwapKind::kHostToDevice, mock)) {
    device_address_swap_in->set_status(DeviceAddressStatus::kInDevice);
  }
  auto status = device_address->status();
  switch (status) {
    case DeviceAddressStatus::kInDevice:
      break;
    case DeviceAddressStatus::kInDeviceToHost: {
      device_address->set_status(DeviceAddressStatus::kInDevice);
      break;
    }
    case DeviceAddressStatus::kInHostToDevice: {
      while (device_address->status() != DeviceAddressStatus::kInDevice) {
        while (auto device_address_swap_in = mem_swap_manager_->UpdateSwapQueue(SwapKind::kHostToDevice, mock)) {
          device_address_swap_in->set_status(DeviceAddressStatus::kInDevice);
        }
      }
      break;
    }
    case DeviceAddressStatus::kInHost:
      MS_LOG(WARNING) << "Unexpected device address status: " << status;
      break;
    default:
      MS_LOG(EXCEPTION) << "Invaild device address status: " << status;
  }
}

void GPUKernelRuntime::UpdateHostSwapOutQueue(bool mock) {
  MS_EXCEPTION_IF_NULL(mem_swap_manager_);
  if (!mem_swap_manager_->trigger_swap()) {
    return;
  }
  while (auto device_address_swap_out = mem_swap_manager_->UpdateSwapQueue(SwapKind::kDeviceToHost, mock)) {
    if (device_address_swap_out->status() == DeviceAddressStatus::kInDeviceToHost && device_address_swap_out->ptr_) {
      device_address_swap_out->set_status(DeviceAddressStatus::kInHost);
      mem_manager_->FreeMemFromMemPool(device_address_swap_out);
    }
  }
}

void GPUKernelRuntime::ClearSwapInfo(bool mock) {
  MS_EXCEPTION_IF_NULL(mem_swap_manager_);
  if (!mem_swap_manager_->trigger_swap()) {
    return;
  }
  mem_swap_manager_->ClearSwapQueue(mock);
  mem_swap_manager_->ResetHostAddrIsDirty();
}

bool GPUKernelRuntime::AttemptMallocMem(const DeviceAddressPtr &device_address, size_t size, bool mock) {
  MS_EXCEPTION_IF_NULL(mem_manager_);
  MS_EXCEPTION_IF_NULL(mem_swap_manager_);
  auto ret = mem_manager_->MallocMemFromMemPool(device_address, size);
  if (!ret) {
    if (!mem_swap_manager_->trigger_swap()) {
      return false;
    }
    if (!mock) {
      mem_swap_manager_->SyncMemCopyStream(SwapKind::kDeviceToHost);
    }
    UpdateHostSwapOutQueue(mock);
    ret = mem_manager_->MallocMemFromMemPool(device_address, size);
    if (!ret) {
      return false;
    }
  }
  return true;
}

bool GPUKernelRuntime::AllocKernelDynamicRes(const mindspore::kernel::KernelMod &kernel_mod,
                                             const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_inputs,
                                             AddressPtrList *kernel_workspaces, AddressPtrList *kernel_outputs,
                                             bool mock) {
  if (!AllocKernelInputDynamicRes(kernel, kernel_inputs, mock)) {
    return false;
  }
  if (!AllocKernelOutputDynamicRes(kernel_mod, kernel, kernel_outputs, mock)) {
    return false;
  }
  if (!AllocKernelWorkspaceDynamicRes(kernel_mod, kernel, kernel_workspaces, mock)) {
    return false;
  }
  return true;
}

bool GPUKernelRuntime::AllocKernelInputDynamicRes(const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_inputs,
                                                  bool mock) {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(kernel_inputs);
  MS_EXCEPTION_IF_NULL(mem_reuse_util_);
  for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) {
    DeviceAddressPtr device_address;
    if (mem_reuse_util_->is_all_nop_node()) {
      // Graph may be all nop nodes and not remove nop node, so this can not skip nop node.
      device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, false);
    } else {
      // Graph may be "nop node + depend + node", the input of node is the depend, so this case need skip nop node.
      device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, true);
    }
    MS_EXCEPTION_IF_NULL(device_address);
    UpdateHostSwapInQueue(device_address, mock);
    MS_EXCEPTION_IF_NULL(device_address->ptr_);
    kernel::AddressPtr input = std::make_shared<kernel::Address>();
    MS_EXCEPTION_IF_NULL(input);
    input->addr = device_address->ptr_;
    input->size = device_address->size_;
    kernel_inputs->emplace_back(input);
  }
  return true;
}

bool GPUKernelRuntime::AllocKernelOutputDynamicRes(const mindspore::kernel::KernelMod &kernel_mod,
                                                   const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_outputs,
                                                   bool mock) {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(kernel_outputs);
  UpdateHostSwapOutQueue(mock);
  auto output_sizes = kernel_mod.GetOutputSizeList();
  for (size_t i = 0; i < output_sizes.size(); ++i) {
    auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i, false);
    MS_EXCEPTION_IF_NULL(device_address);
    if (device_address->ptr_ == nullptr && !AttemptMallocMem(device_address, output_sizes[i], mock)) {
      return false;
    }
    kernel::AddressPtr output = std::make_shared<kernel::Address>();
    MS_EXCEPTION_IF_NULL(output);
    output->addr = device_address->ptr_;
    output->size = output_sizes[i];
    kernel_outputs->emplace_back(output);
  }
  return true;
}

bool GPUKernelRuntime::AllocKernelWorkspaceDynamicRes(const mindspore::kernel::KernelMod &kernel_mod,
                                                      const mindspore::AnfNodePtr &kernel,
                                                      AddressPtrList *kernel_workspaces, bool mock) {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(kernel_workspaces);
  auto workspace_sizes = kernel_mod.GetWorkspaceSizeList();
  for (size_t i = 0; i < workspace_sizes.size(); ++i) {
    if (workspace_sizes[i] == 0) {
      kernel_workspaces->emplace_back(nullptr);
      continue;
    }
    auto device_address = AnfAlgo::GetMutableWorkspaceAddr(kernel, i);
    if (device_address->ptr_ == nullptr && !AttemptMallocMem(device_address, workspace_sizes[i], mock)) {
      return false;
    }
    kernel::AddressPtr workspace = std::make_shared<kernel::Address>();
    MS_EXCEPTION_IF_NULL(workspace);
    workspace->addr = device_address->ptr_;
    workspace->size = workspace_sizes[i];
    kernel_workspaces->emplace_back(workspace);
  }
  return true;
}

void GPUKernelRuntime::AllocCommunicationOpDynamicRes(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  auto &kernels = graph->execution_order();
  for (auto &kernel : kernels) {
    MS_EXCEPTION_IF_NULL(kernel);
    if (AnfAlgo::IsCommunicationOp(kernel)) {
      AllocCommunicationOpInputDynamicRes(kernel);
      AllocCommunicationOpOutputDynamicRes(kernel);
    }
  }
}

void GPUKernelRuntime::AllocCommunicationOpInputDynamicRes(const mindspore::AnfNodePtr &kernel) {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(mem_reuse_util_);
  bool is_need_alloc_memory = false;
  bool is_need_free_memory = false;
  size_t total_size = 0;
  std::vector<size_t> size_list;
  DeviceAddressPtrList addr_list;
  auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
  MS_EXCEPTION_IF_NULL(kernel_mod);
  auto intput_sizes = kernel_mod->GetInputSizeList();
  for (size_t i = 0; i < intput_sizes.size(); ++i) {
    DeviceAddressPtr device_address;
    if (mem_reuse_util_->is_all_nop_node()) {
      // Graph may be all nop nodes and not remove nop node, so this can not skip nop node.
      device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, false);
    } else {
      // Graph may be "nop node + depend + node", the input of node is the depend, so this case need skip nop node.
      device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, true);
    }
    MS_EXCEPTION_IF_NULL(device_address);
    if (device_address->ptr_ == nullptr) {
      is_need_alloc_memory = true;
    } else {
      is_need_free_memory = true;
    }
    total_size += intput_sizes[i];
    size_list.emplace_back(intput_sizes[i]);
    addr_list.emplace_back(device_address);
  }
  AllocCommunicationOpMemory(is_need_alloc_memory, is_need_free_memory, addr_list, total_size, size_list);
}

void GPUKernelRuntime::AllocCommunicationOpOutputDynamicRes(const mindspore::AnfNodePtr &kernel) {
  MS_EXCEPTION_IF_NULL(kernel);
  bool is_need_alloc_memory = false;
  bool is_need_free_memory = false;
  size_t total_size = 0;
  std::vector<size_t> size_list;
  DeviceAddressPtrList addr_list;
  auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
  MS_EXCEPTION_IF_NULL(kernel_mod);
  auto output_sizes = kernel_mod->GetOutputSizeList();
  for (size_t i = 0; i < output_sizes.size(); ++i) {
    auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i, false);
    MS_EXCEPTION_IF_NULL(device_address);
    if (device_address->ptr_ == nullptr) {
      is_need_alloc_memory = true;
    } else {
      is_need_free_memory = true;
    }
    total_size += output_sizes[i];
    size_list.emplace_back(output_sizes[i]);
    addr_list.emplace_back(device_address);
  }
  AllocCommunicationOpMemory(is_need_alloc_memory, is_need_free_memory, addr_list, total_size, size_list);
}

void GPUKernelRuntime::AllocCommunicationOpMemory(bool is_need_alloc_memory, bool is_need_free_memory,
                                                  const DeviceAddressPtrList addr_list, size_t total_size,
                                                  std::vector<size_t> size_list) {
  MS_EXCEPTION_IF_NULL(mem_manager_);
  if (!is_need_alloc_memory) {
    return;
  }
  if (is_need_free_memory) {
    for (const auto &iter : addr_list) {
      MS_EXCEPTION_IF_NULL(iter);
      // Free the inputs/outputs of communication kernel which are not released.
      if (iter->ptr_ != nullptr) {
        mem_manager_->FreeMemFromMemPool(iter);
      }
    }
  }
  auto ret = mem_manager_->MallocContinuousMemFromMemPool(addr_list, total_size, size_list);
  if (!ret) {
    MS_LOG(EXCEPTION) << "Malloc device memory failed.";
  }
}

void GPUKernelRuntime::FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel) {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(mem_manager_);
  MS_EXCEPTION_IF_NULL(mem_reuse_util_);
  auto cnode = kernel->cast<CNodePtr>();
  MS_EXCEPTION_IF_NULL(cnode);
  if (AnfAlgo::IsCommunicationOp(kernel)) {
    return;
  }
  // Free the input of kernel by reference count.
  for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) {
    auto kernel_ref_count_ptr = mem_reuse_util_->GetKernelInputRef(cnode, i);
    if (kernel_ref_count_ptr == nullptr) {
      continue;
    }
    kernel_ref_count_ptr->ref_count_dynamic_use_--;
    if (kernel_ref_count_ptr->ref_count_dynamic_use_ < 0) {
      MS_LOG(EXCEPTION) << "Check dynamic reference count failed.";
    }
    if (kernel_ref_count_ptr->ref_count_dynamic_use_ == 0) {
      DeviceAddressPtr device_address;
      if (mem_reuse_util_->is_all_nop_node()) {
        // Graph may be all nop nodes and not remove nop node, so this can not skip nop node.
        device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, false);
      } else {
        // Graph may be "nop node + depend + node", the input of node is the depend, so this case need skip nop node.
        device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, true);
      }
      mem_manager_->FreeMemFromMemPool(device_address);
      device_address->set_status(DeviceAddressStatus::kInDevice);
    }
  }
  // Free the output of kernel, if output has no reference.
  for (size_t i = 0; i < AnfAlgo::GetOutputTensorNum(kernel); ++i) {
    auto kernel_ref_count_ptr = mem_reuse_util_->GetRef(cnode, i);
    if (kernel_ref_count_ptr == nullptr) {
      continue;
    }
    if (kernel_ref_count_ptr->ref_count_dynamic_use_ == 0) {
      auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i, false);
      mem_manager_->FreeMemFromMemPool(device_address);
      device_address->set_status(DeviceAddressStatus::kInDevice);
    }
  }
  // Free the workspace of kernel.
  auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
  MS_EXCEPTION_IF_NULL(kernel_mod);
  for (size_t i = 0; i < kernel_mod->GetWorkspaceSizeList().size(); ++i) {
    auto device_address = AnfAlgo::GetMutableWorkspaceAddr(kernel, i);
    MS_EXCEPTION_IF_NULL(device_address);
    if (device_address->ptr_) {
      mem_manager_->FreeMemFromMemPool(device_address);
    }
  }
}
}  // namespace gpu
}  // namespace device
}  // namespace mindspore
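The dynamic-memory path above is easiest to read as a two-phase protocol: the first step of a graph is mock-executed (allocation only, no kernel launches), and if that fails, SearchMemSwapScheme retreats swap positions until a mock run fits before profiling and refining. A compact sketch of the first-step control flow, with stand-in functions (LaunchDynamic and SearchSwapScheme are placeholders, not MindSpore APIs):

    #include <cstdint>
    #include <iostream>
    #include <map>

    // Stand-ins for LaunchKernelDynamic and SearchMemSwapScheme.
    bool LaunchDynamic(bool mock, bool profiling) { (void)mock; (void)profiling; return true; }
    bool SearchSwapScheme() { return true; }

    // First call does a mock pass (mock = true) to test allocation; on failure it
    // falls back to searching a swap scheme, otherwise it runs the graph for real.
    bool RunOneStep(uint32_t graph_id, std::map<uint32_t, bool> *is_first_step) {
      bool ret = true;
      if (!(*is_first_step)[graph_id]) {
        ret = LaunchDynamic(false, false);  // normal run
      } else {
        ret = LaunchDynamic(true, false);   // mock run
        ret = ret ? LaunchDynamic(false, false) : SearchSwapScheme();
        (*is_first_step)[graph_id] = false;
      }
      return ret;
    }

    int main() {
      std::map<uint32_t, bool> first{{0, true}};
      std::cout << RunOneStep(0, &first) << ' ' << RunOneStep(0, &first) << '\n';
      return 0;
    }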
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "runtime/device/gpu/gpu_kernel_runtime.h"
#include <algorithm>
#include "runtime/device/gpu/gpu_device_address.h"
#include "runtime/device/gpu/cuda_driver.h"
#include "runtime/device/gpu/gpu_buffer_mgr.h"
#include "runtime/device/gpu/gpu_device_manager.h"
#include "runtime/device/gpu/gpu_memory_allocator.h"
#include "runtime/device/gpu/distribution/collective_init.h"
#include "utils/convert_utils.h"
#include "utils/ms_context.h"
#include "runtime/device/kernel_runtime_manager.h"
#include "runtime/device/gpu/gpu_common.h"
#include "utils/ms_utils.h"
#include "runtime/device/gpu/gpu_memory_manager.h"
#include "backend/kernel_compiler/common_utils.h"
#include "runtime/device/gpu/gpu_memory_copy_manager.h"
#include "common/trans.h"
#include "ir/dtype.h"
#include "profiler/device/gpu/gpu_profiling.h"
#include "utils/shape_utils.h"
#ifdef ENABLE_DEBUGGER
#include "debug/debug_services.h"
#endif
namespace mindspore {
namespace device {
namespace gpu {
using mindspore::device::memswap::MemSwapInfoSet;
using mindspore::device::memswap::MemSwapManager;
using mindspore::device::memswap::SwapKind;
static const size_t PARAMETER_OUTPUT_INDEX = 0;

bool GPUKernelRuntime::SyncStream() { return GPUDeviceManager::GetInstance().SyncStream(stream_); }

bool GPUKernelRuntime::Init() {
  if (device_init_ == true) {
    GPUMemoryAllocator::GetInstance().CheckMaxDeviceMemory();
    return true;
  }
  bool ret = false;
#ifdef ENABLE_DUMP_E2E
  ret = SetDumpConf();
  if (!ret) {
    MS_LOG(INFO) << "No dump conf to set!";
  }
#endif
  ret = InitDevice();
  if (!ret) {
    MS_LOG(ERROR) << "InitDevice error.";
    return ret;
  }
  mem_manager_ = std::make_shared<GPUMemoryManager>();
  MS_EXCEPTION_IF_NULL(mem_manager_);
  mem_manager_->MallocDeviceMemory();
  const void *collective_handle_ = CollectiveInitializer::instance().collective_handle();
  bool collective_inited = CollectiveInitializer::instance().collective_inited();
  if (collective_inited && collective_handle_ != nullptr) {
    auto init_nccl_comm_funcptr =
      reinterpret_cast<InitNCCLComm>(dlsym(const_cast<void *>(collective_handle_), "InitNCCLComm"));
    MS_EXCEPTION_IF_NULL(init_nccl_comm_funcptr);
    (*init_nccl_comm_funcptr)();
  }
  device_init_ = true;
  return ret;
}
#ifdef ENABLE_DUMP_E2E
namespace {
void DumpOutput(mindspore::session::KernelGraph *graph, const string &dump_path, DumpConfPtr dump_conf,
                Debugger *debugger) {
  MS_EXCEPTION_IF_NULL(graph);
  MS_EXCEPTION_IF_NULL(dump_conf);
  bool trans_flag = dump_conf->trans_flag();
  const auto &apply_kernels = graph->execution_order();
  for (const auto &node : apply_kernels) {
    MS_EXCEPTION_IF_NULL(node);
    auto node_name = AnfAlgo::GetCNodeName(node);
    std::string kernel_name = node->fullname_with_scope();
    if (!dump_conf->IsKernelNeedDump(kernel_name)) {
      continue;
    }
    const std::string strsrc = "/";
    const std::string strdst = "--";
    std::string::size_type pos = 0;
    std::string::size_type srclen = strsrc.size();
    std::string::size_type dstlen = strdst.size();
    while ((pos = kernel_name.find(strsrc, pos)) != std::string::npos) {
      kernel_name.replace(pos, srclen, strdst);
      pos += dstlen;
    }
    auto output_size = AnfAlgo::GetOutputTensorNum(node);
    for (size_t j = 0; j < output_size; ++j) {
      auto addr = AnfAlgo::GetOutputAddr(node, j);
      TypeId addr_type_id = addr->type_id();
      std::string addr_format = addr->format();
      ShapeVector int_shapes;
      if (trans_flag) {
        int_shapes = trans::GetRuntimePaddingShape(node, j);
      } else {
        auto shape = AnfAlgo::GetOutputDeviceShape(node, j);
        (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
                             [](size_t inner_item) { return SizeToInt(inner_item); });
      }
      auto type = AnfAlgo::GetOutputInferDataType(node, j);
      auto format = kOpFormat_DEFAULT;
      string filepath = dump_path + '/' + kernel_name + '_' + "output_" + std::to_string(j);
      DebugServices *debug_services = debugger->debug_services();
      TensorLoader *tensor_loader = debug_services->tensor_loader();
      std::string original_kernel_name = node->fullname_with_scope();
      size_t slot = j;
      auto ret = tensor_loader->DumpTensorToFile(original_kernel_name, trans_flag, filepath, format, int_shapes, type,
                                                 addr_type_id, addr_format, slot);
      if (!ret) {
        std::string error = "DumpTensorToFile Failed: flag:" + std::to_string(trans_flag) + ", path:" + filepath +
                            ", host_format:" + format + ".!";
      }
    }
  }
}

void DumpParameters(mindspore::session::KernelGraph *graph, const string &dump_path, DumpConfPtr dump_conf,
                    Debugger *debugger) {
  MS_EXCEPTION_IF_NULL(graph);
  MS_EXCEPTION_IF_NULL(dump_conf);
  bool trans_flag = dump_conf->trans_flag();
  const auto &parameters = graph->inputs();
  for (auto &item : parameters) {
    if (!item->isa<Parameter>()) {
      continue;
    }
    std::string parameter_name = item->fullname_with_scope();
    if (!dump_conf->IsKernelNeedDump(parameter_name)) {
      continue;
    }
    auto addr = AnfAlgo::GetOutputAddr(item, PARAMETER_OUTPUT_INDEX);
    TypeId addr_type_id = addr->type_id();
    std::string addr_format = addr->format();
    ShapeVector int_shapes;
    if (trans_flag) {
      int_shapes = trans::GetRuntimePaddingShape(item, PARAMETER_OUTPUT_INDEX);
    } else {
      auto shape = AnfAlgo::GetOutputDeviceShape(item, PARAMETER_OUTPUT_INDEX);
      (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
                           [](size_t inner_item) { return SizeToInt(inner_item); });
    }
    auto type = AnfAlgo::GetOutputInferDataType(item, PARAMETER_OUTPUT_INDEX);
    auto format = kOpFormat_DEFAULT;
    string filepath = dump_path + '/' + parameter_name + '_' + "output_0";
    DebugServices *debug_services = debugger->debug_services();
    TensorLoader *tensor_loader = debug_services->tensor_loader();
    std::string original_kernel_name = parameter_name;
    size_t slot = 0;
    auto ret = tensor_loader->DumpTensorToFile(original_kernel_name, trans_flag, filepath, format, int_shapes, type,
                                               addr_type_id, addr_format, slot);
    if (!ret) {
      std::string error = "DumpTensorToFile Failed: flag:" + std::to_string(trans_flag) + ", path:" + filepath +
                          ", host_format:" + format + ".!";
    }
  }
}
}  // namespace

bool GPUKernelRuntime::DumpData(mindspore::session::KernelGraph *graph, Debugger *debugger) {
  MS_EXCEPTION_IF_NULL(graph);
  MS_LOG(INFO) << "Start dump step";
  DumpConfPtr dump_conf = GetDumpConf();
  MS_EXCEPTION_IF_NULL(dump_conf);
  dump_conf->UpdataCurIter();
  bool dump_flag = dump_conf->dump_enable();
  if (!dump_flag) {
    MS_LOG(INFO) << "Dump flag is disable, pass dump step";
    return true;
  }
  uint32_t cur_iter = dump_conf->cur_iter();
  if (dump_conf->dump_iter() != 0) {
    if (cur_iter != dump_conf->dump_iter()) {
      return true;
    }
  }
  MS_LOG(INFO) << "Cur iter is " << cur_iter;
  std::string net_name = dump_conf->dump_net_name();
  std::string iterator = std::to_string(cur_iter);
  std::string dump_path = dump_conf->dump_path();
  if (dump_path.back() == '/') {
    dump_path = dump_path + net_name + '/' + iterator;
  } else {
    dump_path = dump_path + '/' + net_name + '/' + iterator;
  }
  // dump output
  DumpOutput(graph, dump_path, dump_conf, debugger);
  // dump parameters
  DumpParameters(graph, dump_path, dump_conf, debugger);
  return true;
}
#endif
#ifdef ENABLE_DEBUGGER
namespace {
void LoadKernelData(Debugger *debugger, const CNodePtr &kernel,
                    const std::vector<mindspore::kernel::AddressPtr> &kernel_inputs,
                    const std::vector<mindspore::kernel::AddressPtr> &kernel_workspaces,
                    const std::vector<mindspore::kernel::AddressPtr> &kernel_outputs, int exec_order, void *stream_ptr,
                    bool dump_enabled) {
  // check if we should read the kernel data
  bool read_data = false;
  std::string kernel_name = kernel->fullname_with_scope();
  if (debugger) {
    debugger->SetCurNode(kernel_name);
    if (dump_enabled) {
      read_data = true;
    } else if (debugger->debugger_enabled()) {
      read_data = debugger->ReadNodeDataRequired();
    }
  }
  if (!read_data) {
    return;
  }
  // get inputs
  auto input_size = AnfAlgo::GetInputTensorNum(kernel);
  for (size_t j = 0; j < input_size; ++j) {
    auto input_kernel = kernel->input(j + 1);
    std::string input_kernel_name = input_kernel->fullname_with_scope();
    auto addr = kernel_inputs[j];
    auto type = AnfAlgo::GetOutputInferDataType(input_kernel, PARAMETER_OUTPUT_INDEX);
    auto format = kOpFormat_DEFAULT;
    auto gpu_addr = std::make_unique<GPUDeviceAddress>(addr->addr, addr->size, format, type);
    string input_tensor_name = input_kernel_name + ':' + "0";
    ShapeVector int_shapes;
    auto shape = AnfAlgo::GetOutputDeviceShape(input_kernel, PARAMETER_OUTPUT_INDEX);
    (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
                         [](size_t inner_item) { return SizeToInt(inner_item); });
    auto ret = gpu_addr->LoadMemToHost(input_tensor_name, exec_order, format, int_shapes, type, 0, debugger, false);
    if (!ret) {
      MS_LOG(ERROR) << "LoadMemToHost:"
                    << ", tensor_name:" << input_tensor_name << ", host_format:" << format << ".!";
    }
  }
  // get outputs
  auto output_size = AnfAlgo::GetOutputTensorNum(kernel);
  for (size_t j = 0; j < output_size; ++j) {
    auto addr = kernel_outputs[j];
    auto type = AnfAlgo::GetOutputInferDataType(kernel, j);
    auto format = kOpFormat_DEFAULT;
    auto gpu_addr = std::make_unique<GPUDeviceAddress>(addr->addr, addr->size, format, type);
    string tensor_name = kernel_name + ':' + std::to_string(j);
    ShapeVector int_shapes;
    auto shape = AnfAlgo::GetOutputDeviceShape(kernel, j);
    (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
                         [](size_t inner_item) { return SizeToInt(inner_item); });
    auto ret = gpu_addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, j, debugger, false);
    if (!ret) {
      MS_LOG(ERROR) << "LoadMemToHost:"
                    << ", tensor_name:" << tensor_name << ", host_format:" << format << ".!";
    }
  }
  debugger->PostExecuteNode();
}

void UpdateStepNum(Debugger *debugger, bool dump_enabled) {
  if (debugger && (debugger->debugger_enabled() || dump_enabled)) {
    auto cur_step_num = debugger->step_num();
    cur_step_num = cur_step_num + 1;
    debugger->SetStepNum(cur_step_num);
  }
}

void LoadParameters(const session::KernelGraph *graph, Debugger *debugger, bool dump_enabled) {
  MS_EXCEPTION_IF_NULL(graph);
  if (!(debugger && dump_enabled)) {
    return;
  }
  const auto &parameters = graph->inputs();
  // for parameters, set its execution order to be 0;
  int exec_order = 0;
  for (auto &item : parameters) {
    if (!item->isa<Parameter>()) {
      continue;
    }
    std::string parameter_name = item->fullname_with_scope();
    auto addr = AnfAlgo::GetOutputAddr(item, PARAMETER_OUTPUT_INDEX);
    auto type = AnfAlgo::GetOutputInferDataType(item, PARAMETER_OUTPUT_INDEX);
    auto format = kOpFormat_DEFAULT;
    string tensor_name = parameter_name + ':' + "0";
    auto gpu_addr = dynamic_cast<const mindspore::device::gpu::GPUDeviceAddress *>(addr);
    ShapeVector int_shapes;
    auto shape = AnfAlgo::GetOutputDeviceShape(item, PARAMETER_OUTPUT_INDEX);
    (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
                         [](size_t inner_item) { return SizeToInt(inner_item); });
    auto ret = gpu_addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, 0, debugger, true);
    if (!ret) {
      MS_LOG(ERROR) << "LoadMemToHost:"
                    << ", tensor_name:" << tensor_name << ", host_format:" << format << ".!";
    }
  }
}

void ClearCurrentData(Debugger *debugger, bool dump_enabled) {
  if (debugger && (debugger->debugger_enabled() || dump_enabled)) {
    DebugServices *debug_services = debugger->debug_services();
    TensorLoader *tensor_loader = debug_services->tensor_loader();
    tensor_loader->EmptyCurrentTensor();
  }
}
}  // namespace
#endif
DeviceAddressPtr GPUKernelRuntime::CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
                                                       TypeId type_id) {
  return std::make_shared<GPUDeviceAddress>(device_ptr, device_size, format, type_id);
}

bool GPUKernelRuntime::InitDevice() {
  if (GPUDeviceManager::GetInstance().device_count() <= 0) {
    MS_LOG(ERROR) << "No GPU device found.";
    return false;
  }
  const void *collective_handle_ = CollectiveInitializer::instance().collective_handle();
  bool collective_inited = CollectiveInitializer::instance().collective_inited();
  if (collective_inited && collective_handle_ != nullptr) {
    auto get_local_rank_funcptr =
      reinterpret_cast<GetLocalRankId>(dlsym(const_cast<void *>(collective_handle_), "local_rank_id"));
    MS_EXCEPTION_IF_NULL(get_local_rank_funcptr);
    device_id_ = IntToUint((*get_local_rank_funcptr)());
  }
  if (!GPUDeviceManager::GetInstance().is_device_id_init()) {
    if (!GPUDeviceManager::GetInstance().set_cur_device_id(device_id_)) {
      MS_LOG(ERROR) << "Failed to set current device to " << SizeToInt(device_id_);
      return false;
    }
  }
  GPUDeviceManager::GetInstance().InitDevice();
  stream_ = GPUDeviceManager::GetInstance().default_stream();
  if (stream_ == nullptr) {
    MS_LOG(ERROR) << "No default CUDA stream found.";
    return false;
  }
  return true;
}

void GPUKernelRuntime::ReleaseDeviceRes() {
  // For dataset mode.
  if (GpuBufferMgr::GetInstance().IsInit()) {
    if (!GpuBufferMgr::GetInstance().IsClosed()) {
      if (!GpuBufferMgr::GetInstance().CloseNotify()) {
        MS_LOG(EXCEPTION) << "Could not close gpu data queue.";
      }
    }
    CHECK_OP_RET_WITH_EXCEPT(GpuBufferMgr::GetInstance().Destroy(), "Could not destroy gpu data queue.");
  }
  // Destroy remaining memory swap events and free host memory.
  for (auto &item : mem_swap_map_) {
    auto &mem_swap_manager = item.second;
    MS_EXCEPTION_IF_NULL(mem_swap_manager);
    if (mem_swap_manager->trigger_swap()) {
      mem_swap_manager->ClearSwapQueue(false);
      mem_swap_manager->ReleaseHostPinnedMem();
    }
  }
  GPUDeviceManager::GetInstance().ReleaseDevice();
  if (mem_manager_ != nullptr) {
    mem_manager_->FreeDeviceMemory();
  }
  kernel::KernelMeta *bin_map = kernel::KernelMeta::GetInstance();
  MS_EXCEPTION_IF_NULL(bin_map);
  bin_map->RemoveKernelCache();
}

void GPUKernelRuntime::ClearGraphRuntimeResource(uint32_t graph_id, const std::vector<AnfNodePtr> &inputs,
                                                 const std::unordered_set<ValueNodePtr> &value_nodes,
                                                 const std::vector<CNodePtr> &execution_order) {
  MS_LOG(INFO) << "Clear graph:" << graph_id << " GPU runtime resource";
  // Release the kernel resource.
  for (const auto &kernel : execution_order) {
    auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
    if (kernel_mod == nullptr) {
      continue;
    }
    kernel_mod->ReleaseResource();
  }
  // Clear the output address of graph.
  ClearOutputAddress(inputs, value_nodes, execution_order);
}

void GPUKernelRuntime::AssignMemory(session::KernelGraph *graph) {
  auto context_ptr = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context_ptr);
  MS_EXCEPTION_IF_NULL(mem_manager_);
  mem_manager_->ResetDynamicMemory();
  AssignStaticMemoryInput(graph);
  AssignStaticMemoryValueNode(graph);
  bool is_enable_dynamic_mem = context_ptr->get_param<bool>(MS_CTX_ENABLE_DYNAMIC_MEM_POOL);
  if (is_enable_dynamic_mem) {
    // Use the dynamic memory pool.
    InitKernelRefCount(graph);
    InitMemorySwapInfo(graph);
    InitKernelOutputAddress(graph);
    InitKernelWorkspaceAddress(graph);
    SaveGraphOutputNode(graph);
  } else {
    AssignDynamicMemory(graph);
  }
}

bool GPUKernelRuntime::Run(session::KernelGraph *graph, bool is_task_sink, Debugger *debugger) {
  struct timeval start_time, end_time;
  (void)gettimeofday(&start_time, nullptr);
  bool ret = true;
  auto context_ptr = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context_ptr);
  bool is_enable_dynamic_mem = context_ptr->get_param<bool>(MS_CTX_ENABLE_DYNAMIC_MEM_POOL);
  bool is_enable_pynative_infer = context_ptr->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_INFER);
  if (is_enable_dynamic_mem && !is_enable_pynative_infer) {
    auto graph_id = graph->graph_id();
    auto iter = mem_swap_map_.find(graph_id);
    if (iter == mem_swap_map_.end()) {
      MS_LOG(EXCEPTION) << "Find memory swap map failed.";
    }
    mem_swap_manager_ = iter->second;
    MS_EXCEPTION_IF_NULL(mem_swap_manager_);
    auto mem_reuse_iter = mem_reuse_util_map_.find(graph_id);
    if (mem_reuse_iter == mem_reuse_util_map_.end()) {
      MS_LOG(EXCEPTION) << "Find memory reuse map failed.";
    }
    mem_reuse_util_ = mem_reuse_iter->second;
    MS_EXCEPTION_IF_NULL(mem_reuse_util_);
    ret = RunOneStep(graph, debugger);
  } else {
    ret = LaunchKernel(graph);
  }
  (void)gettimeofday(&end_time, nullptr);
  const uint64_t kUSecondInSecond = 1000000;
  uint64_t cost = kUSecondInSecond * static_cast<uint64_t>(end_time.tv_sec - start_time.tv_sec);
  cost += static_cast<uint64_t>(end_time.tv_usec - start_time.tv_usec);
  MS_LOG(DEBUG) << "GPU kernel runtime run graph in " << cost << " us";
  return ret;
}
bool GPUKernelRuntime::RunOneStep(const session::KernelGraph *graph, Debugger *debugger) {
  bool ret = true;
  auto graph_id = graph->graph_id();
  if (!is_first_step_map_[graph_id]) {
    // Normally run graph
    ret = LaunchKernelDynamic(graph, debugger);
  } else {
    // Mock run first step
    ret = LaunchKernelDynamic(graph, debugger, true, false);
    if (ret) {
      // Normally run graph
      ret = LaunchKernelDynamic(graph, debugger);
    } else {
      // Trigger memory swap
      ret = SearchMemSwapScheme(graph, debugger);
    }
    is_first_step_map_[graph_id] = false;
  }
  return ret;
}
bool GPUKernelRuntime::SearchMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger) {
  MS_LOG(WARNING) << "Run out of memory and try memory swapping, it may take some time, please wait a moment.";
  bool ret = false;
  ClearKernelOldOutputAndWorkspace(graph);
  if (!mem_swap_manager_->mem_swap_init()) {
    if (!mem_swap_manager_->Init(graph)) {
      return false;
    }
  }
  while (!ret) {
    if (!mem_swap_manager_->RetreatSwapInfo()) {
      return false;
    }
    ret = LaunchKernelDynamic(graph, debugger, true, false);
    if (!ret) {
      ClearKernelOldOutputAndWorkspace(graph);
    }
  }
  mem_swap_manager_->AssignHostMemory();
  // Time profiling
  ret = LaunchKernelDynamic(graph, debugger, false, true);
  if (!ret) {
    return ret;
  }
  return RefineMemSwapScheme(graph, debugger);
}

bool GPUKernelRuntime::RefineMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger) {
  MS_LOG(WARNING) << "Refine memory swap scheme, it may take some time, please wait a moment.";
  auto &kernels = graph->execution_order();
  for (const auto &kernel : kernels) {
    if (!mem_swap_manager_->QueryKernelTriggerSwapIn(kernel)) {
      continue;
    }
    size_t swap_in_task_num = mem_swap_manager_->QueryKernelTriggerSwapInTaskNum(kernel);
    for (size_t swap_in_task_idx = 0; swap_in_task_idx < swap_in_task_num; swap_in_task_idx++) {
      bool ret = false;
      while (!ret) {
        mem_swap_manager_->AdjustSwapInPos(kernel, swap_in_task_idx);
        ret = LaunchKernelDynamic(graph, debugger, true, false);
        if (!ret) {
          ClearKernelOldOutputAndWorkspace(graph);
          ClearSwapInfo(true);
        }
      }
    }
  }
  return true;
}
void GPUKernelRuntime::InitKernelRefCount(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  MemReuseUtilPtr mem_reuse_util_ptr = std::make_shared<memreuse::MemReuseUtil>();
  MS_EXCEPTION_IF_NULL(mem_reuse_util_ptr);
  // Init the kernel reference count.
  if (!mem_reuse_util_ptr->InitDynamicKernelRef(graph)) {
    MS_LOG(EXCEPTION) << "Init kernel reference count failed";
  }
  mem_reuse_util_ptr->SetKernelDefMap();
  mem_reuse_util_ptr->SetReuseRefCount();
  // Can't free the device address of graph output, so set the reference count of graph output specially.
  mem_reuse_util_ptr->SetGraphOutputRefCount();
  // Can't free the device address of summary nodes, so set the reference count of summary nodes specially.
  mem_reuse_util_ptr->SetSummaryNodesRefCount();
  auto graph_id = graph->graph_id();
  mem_reuse_util_map_[graph_id] = mem_reuse_util_ptr;
}

void GPUKernelRuntime::InitMemorySwapInfo(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  GPUMemCopyManagerPtr gpu_mem_copy_manager = std::make_shared<GPUMemCopyManager>();
  MS_EXCEPTION_IF_NULL(gpu_mem_copy_manager);
  MemSwapManagerPtr mem_swap_manager = std::make_shared<MemSwapManager>(gpu_mem_copy_manager);
  MS_EXCEPTION_IF_NULL(mem_swap_manager);
  auto graph_id = graph->graph_id();
  mem_swap_map_[graph_id] = mem_swap_manager;
  is_first_step_map_[graph_id] = true;
}

void GPUKernelRuntime::InitKernelOutputAddress(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  auto &kernels = graph->execution_order();
  for (const auto &kernel : kernels) {
    auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
    MS_EXCEPTION_IF_NULL(kernel_mod);
    auto output_sizes = kernel_mod->GetOutputSizeList();
    for (size_t i = 0; i < output_sizes.size(); ++i) {
      if (AnfAlgo::OutputAddrExist(kernel, i)) {
        continue;
      }
      std::string output_format = AnfAlgo::GetOutputFormat(kernel, i);
      auto output_type = AnfAlgo::GetOutputDeviceDataType(kernel, i);
      auto device_address = CreateDeviceAddress(nullptr, output_sizes[i], output_format, output_type);
      AnfAlgo::SetOutputAddr(device_address, i, kernel.get());
    }
  }
}

void GPUKernelRuntime::InitKernelWorkspaceAddress(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  auto &kernels = graph->execution_order();
  for (const auto &kernel : kernels) {
    auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
    MS_EXCEPTION_IF_NULL(kernel_mod);
    auto workspace_sizes = kernel_mod->GetWorkspaceSizeList();
    for (size_t i = 0; i < workspace_sizes.size(); ++i) {
      auto device_address = CreateDeviceAddress(nullptr, workspace_sizes[i], "", kTypeUnknown);
      AnfAlgo::SetWorkspaceAddr(device_address, i, kernel.get());
    }
  }
}

void GPUKernelRuntime::SaveGraphOutputNode(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  auto graph_id = graph->graph_id();
  const auto &output_nodes = AnfAlgo::GetAllOutput(graph->output(), {prim::kPrimTupleGetItem});
  for (const auto &node : output_nodes) {
    graph_output_map_[graph_id].insert(node);
  }
}

bool GPUKernelRuntime::IsGraphOutput(const session::KernelGraph *graph, const mindspore::AnfNodePtr &kernel) const {
  MS_EXCEPTION_IF_NULL(graph);
  auto graph_id = graph->graph_id();
  auto iter = graph_output_map_.find(graph_id);
  if (iter == graph_output_map_.end()) {
    MS_LOG(EXCEPTION) << "Find graph output info failed.";
  }
  auto &graph_output_set = iter->second;
  return (graph_output_set.find(kernel) != graph_output_set.end());
}

void GPUKernelRuntime::ClearKernelOldOutputAndWorkspace(const session::KernelGraph *graph) {
  ClearKernelOutputAddress(graph);
  ClearKernelWorkspaceAddress(graph);
}

void GPUKernelRuntime::ClearKernelOutputAddress(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  auto &kernels = graph->execution_order();
  for (const auto &kernel : kernels) {
    if (IsGraphOutput(graph, kernel)) {
      continue;
    }
    auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
    MS_EXCEPTION_IF_NULL(kernel_mod);
    auto output_sizes = kernel_mod->GetOutputSizeList();
    for (size_t i = 0; i < output_sizes.size(); ++i) {
      if (!AnfAlgo::OutputAddrExist(kernel, i)) {
        continue;
      }
      auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i, false);
      MS_EXCEPTION_IF_NULL(device_address);
      if (device_address->ptr_) {
        mem_manager_->FreeMemFromMemPool(device_address);
      }
      device_address->set_status(DeviceAddressStatus::kInDevice);
    }
  }
}

void GPUKernelRuntime::ClearKernelWorkspaceAddress(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  auto &kernels = graph->execution_order();
  for (const auto &kernel : kernels) {
    auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
    MS_EXCEPTION_IF_NULL(kernel_mod);
    auto workspace_sizes = kernel_mod->GetWorkspaceSizeList();
    for (size_t i = 0; i < workspace_sizes.size(); ++i) {
      auto device_address = AnfAlgo::GetMutableWorkspaceAddr(kernel, i);
      MS_EXCEPTION_IF_NULL(device_address);
      if (device_address->ptr_) {
        mem_manager_->FreeMemFromMemPool(device_address);
      }
    }
  }
}
bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, Debugger *debugger, bool mock,
                                           bool profiling) {
  MS_EXCEPTION_IF_NULL(graph);
  MS_EXCEPTION_IF_NULL(mem_reuse_util_);
  // Reset the reference count.
  mem_reuse_util_->ResetDynamicUsedRefCount();
  // The inputs and outputs memory of communication kernel need be continuous, so separate processing.
  AllocCommunicationOpDynamicRes(graph);
#ifdef ENABLE_DEBUGGER
  bool dump_enabled = GPUKernelRuntime::DumpDataEnabledIteration();
  if (!mock) {
    UpdateStepNum(debugger, dump_enabled);
  }
#endif
  auto &kernels = graph->execution_order();
  int exec_order = 1;
  auto profiler_inst = profiler::gpu::GPUProfiler::GetInstance();
  MS_EXCEPTION_IF_NULL(profiler_inst);
  for (const auto &kernel : kernels) {
    auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
    MS_EXCEPTION_IF_NULL(kernel_mod);
    AddressPtrList kernel_inputs;
    AddressPtrList kernel_workspaces;
    AddressPtrList kernel_outputs;
    auto ret = AllocKernelDynamicRes(*kernel_mod, kernel, &kernel_inputs, &kernel_workspaces, &kernel_outputs, mock);
    if (!ret) {
#ifdef ENABLE_DEBUGGER
      if (!mock) {
        // invalidate current data collected by the debugger
        ClearCurrentData(debugger, dump_enabled);
      }
#endif
      return false;
    }
    if (!mock) {
      if (!profiling) {
        if (profiler_inst->GetEnableFlag()) {
          profiler_inst->OpDataProducerBegin(kernel->fullname_with_scope(), stream_);
        }
        CHECK_OP_RET_WITH_EXCEPT(kernel_mod->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, stream_),
                                 "Launch kernel failed.");
        if (profiler_inst->GetEnableFlag()) {
          profiler_inst->OpDataProducerEnd();
          if (profiler_inst->GetSyncEnableFlag()) {
            CHECK_OP_RET_WITH_ERROR(SyncStream(), "Profiler SyncStream failed.");
          }
        }
      } else {
        LaunchKernelWithTimeProfiling(kernel, kernel_inputs, kernel_workspaces, kernel_outputs);
      }
#ifdef ENABLE_DEBUGGER
      // called once per kernel to collect the outputs to the kernel (does a SyncDeviceToHost)
      LoadKernelData(debugger, kernel, kernel_inputs, kernel_workspaces, kernel_outputs, exec_order, stream_,
                     dump_enabled);
#endif
    }
    exec_order = exec_order + 1;
    FreeKernelDynamicRes(kernel);
    if (!UpdateMemorySwapTask(kernel, mock, profiling)) {
#ifdef ENABLE_DEBUGGER
      if (!mock) {
        // invalidate current data collected by the debugger
        ClearCurrentData(debugger, dump_enabled);
      }
#endif
      return false;
    }
  }
  if (!mock) {
#ifdef ENABLE_DEBUGGER
    // collect weights and bias for dump mode
    LoadParameters(graph, debugger, dump_enabled);
#endif
    CHECK_OP_RET_WITH_EXCEPT(SyncStream(), "SyncStream failed.");
  }
  ClearSwapInfo(mock);
  return true;
}
void GPUKernelRuntime::LaunchKernelWithTimeProfiling(const AnfNodePtr &kernel, const AddressPtrList &inputs,
                                                     const AddressPtrList &workspace, const AddressPtrList &outputs) {
  auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
  MS_EXCEPTION_IF_NULL(kernel_mod);
  float cost_time = 0;
  DeviceEvent start = nullptr;
  DeviceEvent end = nullptr;
  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::CreateEvent(&start), "Failed to create event.");
  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::CreateEvent(&end), "Failed to create event.");
  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::RecordEvent(start, stream_), "Failed to record event to stream.");
  CHECK_OP_RET_WITH_EXCEPT(kernel_mod->Launch(inputs, workspace, outputs, stream_), "Launch kernel failed.");
  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::RecordEvent(end, stream_), "Failed to record event to stream.");
  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::SyncEvent(start), "Failed to sync event.");
  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::SyncEvent(end), "Failed to sync event.");
  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::ElapsedTime(&cost_time, start, end), "Failed to record elapsed time.");
  mem_swap_manager_->AddKernelExecutionPerform(kernel, cost_time);
  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::DestroyEvent(start), "Failed to destroy event.");
  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::DestroyEvent(end), "Failed to destroy event.");
}
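// For reference, the same event-based timing pattern expressed directly against the
// CUDA runtime API. This is an illustrative sketch, not code from this file: the
// function above goes through MindSpore's CudaDriver wrapper instead, and the helper
// name TimeKernelMs and its launch-callback parameter are made up here.
//
// #include <cuda_runtime.h>
//
// float TimeKernelMs(void (*launch)(cudaStream_t), cudaStream_t stream) {
//   cudaEvent_t start = nullptr;
//   cudaEvent_t stop = nullptr;
//   cudaEventCreate(&start);
//   cudaEventCreate(&stop);
//   cudaEventRecord(start, stream);  // mark the point just before the work
//   launch(stream);                  // enqueue the kernel being measured
//   cudaEventRecord(stop, stream);   // mark the point just after the work
//   cudaEventSynchronize(stop);      // wait until the stop event has completed
//   float ms = 0.0f;
//   cudaEventElapsedTime(&ms, start, stop);  // GPU-side elapsed time in milliseconds
//   cudaEventDestroy(start);
//   cudaEventDestroy(stop);
//   return ms;
// }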
bool GPUKernelRuntime::AddMemorySwapTask(const AnfNodePtr &kernel, bool mock, bool profiling) {
  MS_EXCEPTION_IF_NULL(mem_swap_manager_);
  const MemSwapInfoSet &mem_swap_info_set = mem_swap_manager_->QueryKernelMemSwapInfo(kernel);
  for (auto &mem_swap_info : mem_swap_info_set) {
    auto need_swap_kernel = mem_swap_manager_->QueryKernelByTopoOrder(mem_swap_info.topo_order_);
    MS_EXCEPTION_IF_NULL(need_swap_kernel);
    const HostAddress &host_address =
      mem_swap_manager_->QueryKernelHostAddr(need_swap_kernel, mem_swap_info.output_idx_);
    auto device_address = AnfAlgo::GetMutableOutputAddr(need_swap_kernel, mem_swap_info.output_idx_, false);
    if (mem_swap_info.swap_kind_ == SwapKind::kDeviceToHost) {
      if (mem_swap_manager_->QueryKernelHostAddrIsDirty(need_swap_kernel, mem_swap_info.output_idx_)) {
        mem_swap_manager_->AddMemSwapTask(SwapKind::kDeviceToHost, device_address, host_address, mock);
        mem_swap_manager_->AddKernelHostAddrIsDirty(need_swap_kernel, mem_swap_info.output_idx_, false);
      } else {
        mem_manager_->FreeMemFromMemPool(device_address);
        device_address->set_status(DeviceAddressStatus::kInHost);
      }
    } else if (mem_swap_info.swap_kind_ == SwapKind::kHostToDevice) {
      auto status = device_address->status();
      if (status == DeviceAddressStatus::kInDeviceToHost) {
        device_address->set_status(DeviceAddressStatus::kInDevice);
      } else if (status == DeviceAddressStatus::kInHost) {
        if (!device_address->ptr_ && !AttemptMallocMem(device_address, device_address->size_, mock)) {
          return false;
        }
        float cost_time = 0;
        mem_swap_manager_->AddMemSwapTask(SwapKind::kHostToDevice, device_address, host_address, mock, profiling,
                                          &cost_time);
        if (profiling) {
          mem_swap_manager_->AddKernelSwapPerform(need_swap_kernel, mem_swap_info.output_idx_,
                                                  std::make_pair(0, cost_time));
        }
      }
    }
  }
  return true;
}

bool GPUKernelRuntime::UpdateMemorySwapTask(const AnfNodePtr &kernel, bool mock, bool profiling) {
  MS_EXCEPTION_IF_NULL(mem_swap_manager_);
  if (!mem_swap_manager_->trigger_swap()) {
    return true;
  }
  if (mem_swap_manager_->QueryKernelTriggerSwap(kernel)) {
    if (!mock) {
      CHECK_OP_RET_WITH_EXCEPT(SyncStream(), "SyncStream failed.");
    }
    if (!AddMemorySwapTask(kernel, mock, profiling)) {
      return false;
    }
    if (!mock) {
      CHECK_OP_RET_WITH_EXCEPT(mem_swap_manager_->SyncMemCopyStream(SwapKind::kDeviceToHost), "SyncCopyStream failed.");
    }
  }
  return true;
}
void GPUKernelRuntime::UpdateHostSwapInQueue(const DeviceAddressPtr device_address, bool mock) {
  MS_EXCEPTION_IF_NULL(mem_swap_manager_);
  if (!mem_swap_manager_->trigger_swap()) {
    return;
  }
  while (auto device_address_swap_in = mem_swap_manager_->UpdateSwapQueue(SwapKind::kHostToDevice, mock)) {
    device_address_swap_in->set_status(DeviceAddressStatus::kInDevice);
  }
  auto status = device_address->status();
  switch (status) {
    case DeviceAddressStatus::kInDevice:
      break;
    case DeviceAddressStatus::kInDeviceToHost: {
      device_address->set_status(DeviceAddressStatus::kInDevice);
      break;
    }
    case DeviceAddressStatus::kInHostToDevice: {
      while (device_address->status() != DeviceAddressStatus::kInDevice) {
        while (auto device_address_swap_in = mem_swap_manager_->UpdateSwapQueue(SwapKind::kHostToDevice, mock)) {
          device_address_swap_in->set_status(DeviceAddressStatus::kInDevice);
        }
      }
      break;
    }
    case DeviceAddressStatus::kInHost:
      MS_LOG(WARNING) << "Unexpected device address status: " << status;
      break;
    default:
      MS_LOG(EXCEPTION) << "Invalid device address status: " << status;
  }
}
void GPUKernelRuntime::UpdateHostSwapOutQueue(bool mock) {
  MS_EXCEPTION_IF_NULL(mem_swap_manager_);
  if (!mem_swap_manager_->trigger_swap()) {
    return;
  }
  while (auto device_address_swap_out = mem_swap_manager_->UpdateSwapQueue(SwapKind::kDeviceToHost, mock)) {
    if (device_address_swap_out->status() == DeviceAddressStatus::kInDeviceToHost && device_address_swap_out->ptr_) {
      device_address_swap_out->set_status(DeviceAddressStatus::kInHost);
      mem_manager_->FreeMemFromMemPool(device_address_swap_out);
    }
  }
}

void GPUKernelRuntime::ClearSwapInfo(bool mock) {
  MS_EXCEPTION_IF_NULL(mem_swap_manager_);
  if (!mem_swap_manager_->trigger_swap()) {
    return;
  }
  mem_swap_manager_->ClearSwapQueue(mock);
  mem_swap_manager_->ResetHostAddrIsDirty();
}

bool GPUKernelRuntime::AttemptMallocMem(const DeviceAddressPtr &device_address, size_t size, bool mock) {
  MS_EXCEPTION_IF_NULL(mem_manager_);
  MS_EXCEPTION_IF_NULL(mem_swap_manager_);
  auto ret = mem_manager_->MallocMemFromMemPool(device_address, size);
  if (!ret) {
    if (!mem_swap_manager_->trigger_swap()) {
      return false;
    }
    if (!mock) {
      mem_swap_manager_->SyncMemCopyStream(SwapKind::kDeviceToHost);
    }
    UpdateHostSwapOutQueue(mock);
    ret = mem_manager_->MallocMemFromMemPool(device_address, size);
    if (!ret) {
      return false;
    }
  }
  return true;
}
bool GPUKernelRuntime::AllocKernelDynamicRes(const mindspore::kernel::KernelMod &kernel_mod,
                                             const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_inputs,
                                             AddressPtrList *kernel_workspaces, AddressPtrList *kernel_outputs,
                                             bool mock) {
  if (!AllocKernelInputDynamicRes(kernel, kernel_inputs, mock)) {
    return false;
  }
  if (!AllocKernelOutputDynamicRes(kernel_mod, kernel, kernel_outputs, mock)) {
    return false;
  }
  if (!AllocKernelWorkspaceDynamicRes(kernel_mod, kernel, kernel_workspaces, mock)) {
    return false;
  }
  return true;
}

bool GPUKernelRuntime::AllocKernelInputDynamicRes(const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_inputs,
                                                  bool mock) {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(kernel_inputs);
  MS_EXCEPTION_IF_NULL(mem_reuse_util_);
  for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) {
    DeviceAddressPtr device_address;
    if (mem_reuse_util_->is_all_nop_node()) {
      // Graph may be all nop nodes and not remove nop node, so this can not skip nop node.
      device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, false);
    } else {
      // Graph may be "nop node + depend + node", the input of node is the depend, so this case need skip nop node.
      device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, true);
    }
    MS_EXCEPTION_IF_NULL(device_address);
    UpdateHostSwapInQueue(device_address, mock);
    MS_EXCEPTION_IF_NULL(device_address->ptr_);
    kernel::AddressPtr input = std::make_shared<kernel::Address>();
    MS_EXCEPTION_IF_NULL(input);
    input->addr = device_address->ptr_;
    input->size = device_address->size_;
    kernel_inputs->emplace_back(input);
  }
  return true;
}

bool GPUKernelRuntime::AllocKernelOutputDynamicRes(const mindspore::kernel::KernelMod &kernel_mod,
                                                   const mindspore::AnfNodePtr &kernel,
                                                   AddressPtrList *kernel_outputs, bool mock) {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(kernel_outputs);
  UpdateHostSwapOutQueue(mock);
  auto output_sizes = kernel_mod.GetOutputSizeList();
  for (size_t i = 0; i < output_sizes.size(); ++i) {
    auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i, false);
    MS_EXCEPTION_IF_NULL(device_address);
    if (device_address->ptr_ == nullptr && !AttemptMallocMem(device_address, output_sizes[i], mock)) {
      return false;
    }
    kernel::AddressPtr output = std::make_shared<kernel::Address>();
    MS_EXCEPTION_IF_NULL(output);
    output->addr = device_address->ptr_;
    output->size = output_sizes[i];
    kernel_outputs->emplace_back(output);
  }
  return true;
}

bool GPUKernelRuntime::AllocKernelWorkspaceDynamicRes(const mindspore::kernel::KernelMod &kernel_mod,
                                                      const mindspore::AnfNodePtr &kernel,
                                                      AddressPtrList *kernel_workspaces, bool mock) {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(kernel_workspaces);
  auto workspace_sizes = kernel_mod.GetWorkspaceSizeList();
  for (size_t i = 0; i < workspace_sizes.size(); ++i) {
    if (workspace_sizes[i] == 0) {
      kernel_workspaces->emplace_back(nullptr);
      continue;
    }
    auto device_address = AnfAlgo::GetMutableWorkspaceAddr(kernel, i);
    if (device_address->ptr_ == nullptr && !AttemptMallocMem(device_address, workspace_sizes[i], mock)) {
      return false;
    }
    kernel::AddressPtr workspace = std::make_shared<kernel::Address>();
    MS_EXCEPTION_IF_NULL(workspace);
    workspace->addr = device_address->ptr_;
    workspace->size = workspace_sizes[i];
    kernel_workspaces->emplace_back(workspace);
  }
  return true;
}
void GPUKernelRuntime::AllocCommunicationOpDynamicRes(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  auto &kernels = graph->execution_order();
  for (auto &kernel : kernels) {
    MS_EXCEPTION_IF_NULL(kernel);
    if (AnfAlgo::IsCommunicationOp(kernel)) {
      AllocCommunicationOpInputDynamicRes(kernel);
      AllocCommunicationOpOutputDynamicRes(kernel);
    }
  }
}
void GPUKernelRuntime::AllocCommunicationOpInputDynamicRes(const mindspore::AnfNodePtr &kernel) {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(mem_reuse_util_);
  bool is_need_alloc_memory = false;
  bool is_need_free_memory = false;
  size_t total_size = 0;
  std::vector<size_t> size_list;
  DeviceAddressPtrList addr_list;
  auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
  MS_EXCEPTION_IF_NULL(kernel_mod);
  auto input_sizes = kernel_mod->GetInputSizeList();
  for (size_t i = 0; i < input_sizes.size(); ++i) {
    DeviceAddressPtr device_address;
    if (mem_reuse_util_->is_all_nop_node()) {
      // Graph may be all nop nodes and not remove nop node, so this can not skip nop node.
      device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, false);
    } else {
      // Graph may be "nop node + depend + node", the input of node is the depend, so this case need skip nop node.
      device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, true);
    }
    MS_EXCEPTION_IF_NULL(device_address);
    if (device_address->ptr_ == nullptr) {
      is_need_alloc_memory = true;
    } else {
      is_need_free_memory = true;
    }
    total_size += input_sizes[i];
    size_list.emplace_back(input_sizes[i]);
    addr_list.emplace_back(device_address);
  }
  AllocCommunicationOpMemory(is_need_alloc_memory, is_need_free_memory, addr_list, total_size, size_list);
}
void GPUKernelRuntime::AllocCommunicationOpOutputDynamicRes(const mindspore::AnfNodePtr &kernel) {
  MS_EXCEPTION_IF_NULL(kernel);
  bool is_need_alloc_memory = false;
  bool is_need_free_memory = false;
  size_t total_size = 0;
  std::vector<size_t> size_list;
  DeviceAddressPtrList addr_list;
  auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
  MS_EXCEPTION_IF_NULL(kernel_mod);
  auto output_sizes = kernel_mod->GetOutputSizeList();
  for (size_t i = 0; i < output_sizes.size(); ++i) {
    auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i, false);
    MS_EXCEPTION_IF_NULL(device_address);
    if (device_address->ptr_ == nullptr) {
      is_need_alloc_memory = true;
    } else {
      is_need_free_memory = true;
    }
    total_size += output_sizes[i];
    size_list.emplace_back(output_sizes[i]);
    addr_list.emplace_back(device_address);
  }
  AllocCommunicationOpMemory(is_need_alloc_memory, is_need_free_memory, addr_list, total_size, size_list);
}

void GPUKernelRuntime::AllocCommunicationOpMemory(bool is_need_alloc_memory, bool is_need_free_memory,
                                                  const DeviceAddressPtrList addr_list, size_t total_size,
                                                  std::vector<size_t> size_list) {
  MS_EXCEPTION_IF_NULL(mem_manager_);
  if (!is_need_alloc_memory) {
    return;
  }
  if (is_need_free_memory) {
    for (const auto &iter : addr_list) {
      MS_EXCEPTION_IF_NULL(iter);
      // Free the inputs/outputs of communication kernel which are not released.
      if (iter->ptr_ != nullptr) {
        mem_manager_->FreeMemFromMemPool(iter);
      }
    }
  }
  auto ret = mem_manager_->MallocContinuousMemFromMemPool(addr_list, total_size, size_list);
  if (!ret) {
    MS_LOG(EXCEPTION) << "Malloc device memory failed.";
  }
}
void GPUKernelRuntime::FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel) {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(mem_manager_);
  MS_EXCEPTION_IF_NULL(mem_reuse_util_);
  auto cnode = kernel->cast<CNodePtr>();
  MS_EXCEPTION_IF_NULL(cnode);
  if (AnfAlgo::IsCommunicationOp(kernel)) {
    return;
  }
  // Free the input of kernel by reference count.
  for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) {
    auto kernel_ref_count_ptr = mem_reuse_util_->GetKernelInputRef(cnode, i);
    if (kernel_ref_count_ptr == nullptr) {
      continue;
    }
    kernel_ref_count_ptr->ref_count_dynamic_use_--;
    if (kernel_ref_count_ptr->ref_count_dynamic_use_ < 0) {
      MS_LOG(EXCEPTION) << "Check dynamic reference count failed.";
    }
    if (kernel_ref_count_ptr->ref_count_dynamic_use_ == 0) {
      DeviceAddressPtr device_address;
      if (mem_reuse_util_->is_all_nop_node()) {
        // Graph may be all nop nodes and not remove nop node, so this can not skip nop node.
        device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, false);
      } else {
        // Graph may be "nop node + depend + node", the input of node is the depend, so this case need skip nop node.
        device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, true);
      }
      mem_manager_->FreeMemFromMemPool(device_address);
      device_address->set_status(DeviceAddressStatus::kInDevice);
    }
  }
  // Free the output of kernel, if output has no reference.
  for (size_t i = 0; i < AnfAlgo::GetOutputTensorNum(kernel); ++i) {
    auto kernel_ref_count_ptr = mem_reuse_util_->GetRef(cnode, i);
    if (kernel_ref_count_ptr == nullptr) {
      continue;
    }
    if (kernel_ref_count_ptr->ref_count_dynamic_use_ == 0) {
      auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i, false);
      mem_manager_->FreeMemFromMemPool(device_address);
      device_address->set_status(DeviceAddressStatus::kInDevice);
    }
  }
  // Free the workspace of kernel.
  auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
  MS_EXCEPTION_IF_NULL(kernel_mod);
  for (size_t i = 0; i < kernel_mod->GetWorkspaceSizeList().size(); ++i) {
    auto device_address = AnfAlgo::GetMutableWorkspaceAddr(kernel, i);
    MS_EXCEPTION_IF_NULL(device_address);
    if (device_address->ptr_) {
      mem_manager_->FreeMemFromMemPool(device_address);
    }
  }
}
}  // namespace gpu
}  // namespace device
}  // namespace mindspore
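Taken together, the flow in this file is easier to see from the caller's side. The following is a minimal sketch, not code from this commit: it assumes an already-compiled session::KernelGraph, and it fetches the registered GPU runtime through KernelRuntimeManager (the GetKernelRuntime accessor and its exact signature are assumed here); error handling is reduced to booleans.

#include "runtime/device/kernel_runtime_manager.h"

// Sketch: drive GPUKernelRuntime through the unified KernelRuntime interface.
bool RunGpuGraphOnce(mindspore::session::KernelGraph *graph, uint32_t device_id) {
  auto runtime =
    mindspore::device::KernelRuntimeManager::Instance().GetKernelRuntime(mindspore::kGPUDevice, device_id);
  if (runtime == nullptr || !runtime->Init()) {  // Init mallocs the device pool and inits NCCL when available
    return false;
  }
  runtime->AssignMemory(graph);  // static memory first, then dynamic-pool bookkeeping per graph
  // The GPU runtime does not use task sink, so is_task_sink is passed as false.
  return runtime->Run(graph, false);
}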
mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.h (view file @ c0070d3d)
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_KERNEL_RUNTIME_H_
#define MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_KERNEL_RUNTIME_H_
#include <string>
#include <memory>
#include <vector>
#include <set>
#include <utility>
#include <unordered_map>
#include <unordered_set>
#include "runtime/device/kernel_runtime.h"
#include "runtime/device/kernel_runtime_manager.h"
#include "backend/optimizer/mem_reuse/mem_swap_manager.h"
namespace mindspore {
namespace device {
namespace gpu {
using mindspore::device::memswap::MemSwapManagerPtr;
class GPUKernelRuntime : public KernelRuntime {
 public:
  GPUKernelRuntime() = default;
  ~GPUKernelRuntime() override = default;
  bool Init() override;
  void ReleaseDeviceRes() override;
  void ClearGraphRuntimeResource(uint32_t graph_id, const std::vector<AnfNodePtr> &inputs,
                                 const std::unordered_set<ValueNodePtr> &value_nodes,
                                 const std::vector<CNodePtr> &execution_order) override;
  void AssignMemory(session::KernelGraph *graph) override;
  bool Run(session::KernelGraph *graph, Debugger *debugger = nullptr) override;
#ifdef ENABLE_DUMP_E2E
  bool DumpData(session::KernelGraph *graph, Debugger *debugger = nullptr) override;
#endif

 protected:
  DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
                                       TypeId type_id) override;
  bool SyncStream() override;

 private:
  GPUKernelRuntime(const GPUKernelRuntime &);
  GPUKernelRuntime &operator=(const GPUKernelRuntime &);
  bool InitDevice();
  bool device_init_{false};
  // The related functions and members for using dynamic memory pool.
  void InitKernelRefCount(const session::KernelGraph *graph);
  void InitKernelOutputAddress(const session::KernelGraph *graph);
  void InitKernelWorkspaceAddress(const session::KernelGraph *graph);
  void InitMemorySwapInfo(const session::KernelGraph *graph);
  void SaveGraphOutputNode(const session::KernelGraph *graph);
  bool IsGraphOutput(const session::KernelGraph *graph, const mindspore::AnfNodePtr &kernel) const;
  void ClearKernelOutputAddress(const session::KernelGraph *graph);
  void ClearKernelWorkspaceAddress(const session::KernelGraph *graph);
  void ClearKernelOldOutputAndWorkspace(const session::KernelGraph *graph);
  bool RunOneStep(const session::KernelGraph *graph, Debugger *debugger = nullptr);
  bool SearchMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger = nullptr);
  bool RefineMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger = nullptr);
  bool LaunchKernelDynamic(const session::KernelGraph *graph, Debugger *debugger = nullptr, bool mock = false,
                           bool profiling = false);
  void LaunchKernelWithTimeProfiling(const AnfNodePtr &kernel, const AddressPtrList &inputs,
                                     const AddressPtrList &workspace, const AddressPtrList &outputs);
  bool AttemptMallocMem(const DeviceAddressPtr &device_address, size_t size, bool mock);
  bool AllocKernelDynamicRes(const mindspore::kernel::KernelMod &kernel_mod, const mindspore::AnfNodePtr &kernel,
                             AddressPtrList *kernel_inputs, AddressPtrList *kernel_workspaces,
                             AddressPtrList *kernel_outputs, bool mock);
  bool AllocKernelInputDynamicRes(const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_inputs, bool mock);
  bool AllocKernelOutputDynamicRes(const mindspore::kernel::KernelMod &kernel_mod, const mindspore::AnfNodePtr &kernel,
                                   AddressPtrList *kernel_outputs, bool mock);
  bool AllocKernelWorkspaceDynamicRes(const mindspore::kernel::KernelMod &kernel_mod,
                                      const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_workspaces,
                                      bool mock);
  void AllocCommunicationOpDynamicRes(const session::KernelGraph *graph);
  void AllocCommunicationOpInputDynamicRes(const mindspore::AnfNodePtr &kernel);
  void AllocCommunicationOpOutputDynamicRes(const mindspore::AnfNodePtr &kernel);
  void AllocCommunicationOpMemory(bool is_need_alloc_memory, bool is_need_free_memory,
                                  const DeviceAddressPtrList addr_list, size_t total_size,
                                  std::vector<size_t> size_list);
  void FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel);
  bool UpdateMemorySwapTask(const AnfNodePtr &kernel, bool mock, bool profiling);
  bool AddMemorySwapTask(const AnfNodePtr &kernel, bool mock, bool profiling);
  void UpdateHostSwapInQueue(const DeviceAddressPtr device_address, bool mock);
  void UpdateHostSwapOutQueue(bool mock);
  void ClearSwapInfo(bool mock);
  std::unordered_map<uint32_t, MemReuseUtilPtr> mem_reuse_util_map_;
  std::unordered_map<uint32_t, MemSwapManagerPtr> mem_swap_map_;
  std::unordered_map<uint32_t, bool> is_first_step_map_;
  std::unordered_map<uint32_t, std::set<AnfNodePtr>> graph_output_map_;
  MemReuseUtilPtr mem_reuse_util_{nullptr};
  MemSwapManagerPtr mem_swap_manager_{nullptr};
};
MS_REG_KERNEL_RUNTIME(kGPUDevice, GPUKernelRuntime);
}  // namespace gpu
}  // namespace device
}  // namespace mindspore
#endif  // MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_KERNEL_RUNTIME_H_
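The header above is the pre-commit version; the copy that follows is the updated one. The only interface difference between them is that Run gains the is_task_sink parameter, so an existing call site migrates by stating the mode explicitly. A sketch, with runtime, graph, and debugger assumed from context:

// Before this commit:
//   ok = runtime->Run(graph, debugger);
// After this commit (the GPU runtime does not use task sink, hence false):
ok = runtime->Run(graph, false, debugger);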
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_KERNEL_RUNTIME_H_
#define MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_KERNEL_RUNTIME_H_
#include <string>
#include <memory>
#include <vector>
#include <set>
#include <utility>
#include <unordered_map>
#include <unordered_set>
#include "runtime/device/kernel_runtime.h"
#include "runtime/device/kernel_runtime_manager.h"
#include "backend/optimizer/mem_reuse/mem_swap_manager.h"
namespace mindspore {
namespace device {
namespace gpu {
using mindspore::device::memswap::MemSwapManagerPtr;
class GPUKernelRuntime : public KernelRuntime {
 public:
  GPUKernelRuntime() = default;
  ~GPUKernelRuntime() override = default;
  bool Init() override;
  void ReleaseDeviceRes() override;
  void ClearGraphRuntimeResource(uint32_t graph_id, const std::vector<AnfNodePtr> &inputs,
                                 const std::unordered_set<ValueNodePtr> &value_nodes,
                                 const std::vector<CNodePtr> &execution_order) override;
  void AssignMemory(session::KernelGraph *graph) override;
  bool Run(session::KernelGraph *graph, bool is_task_sink, Debugger *debugger = nullptr) override;
#ifdef ENABLE_DUMP_E2E
  bool DumpData(session::KernelGraph *graph, Debugger *debugger = nullptr) override;
#endif

 protected:
  DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
                                       TypeId type_id) override;
  bool SyncStream() override;

 private:
  GPUKernelRuntime(const GPUKernelRuntime &);
  GPUKernelRuntime &operator=(const GPUKernelRuntime &);
  bool InitDevice();
  bool device_init_{false};
  // The related functions and members for using dynamic memory pool.
  void InitKernelRefCount(const session::KernelGraph *graph);
  void InitKernelOutputAddress(const session::KernelGraph *graph);
  void InitKernelWorkspaceAddress(const session::KernelGraph *graph);
  void InitMemorySwapInfo(const session::KernelGraph *graph);
  void SaveGraphOutputNode(const session::KernelGraph *graph);
  bool IsGraphOutput(const session::KernelGraph *graph, const mindspore::AnfNodePtr &kernel) const;
  void ClearKernelOutputAddress(const session::KernelGraph *graph);
  void ClearKernelWorkspaceAddress(const session::KernelGraph *graph);
  void ClearKernelOldOutputAndWorkspace(const session::KernelGraph *graph);
  bool RunOneStep(const session::KernelGraph *graph, Debugger *debugger = nullptr);
  bool SearchMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger = nullptr);
  bool RefineMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger = nullptr);
  bool LaunchKernelDynamic(const session::KernelGraph *graph, Debugger *debugger = nullptr, bool mock = false,
                           bool profiling = false);
  void LaunchKernelWithTimeProfiling(const AnfNodePtr &kernel, const AddressPtrList &inputs,
                                     const AddressPtrList &workspace, const AddressPtrList &outputs);
  bool AttemptMallocMem(const DeviceAddressPtr &device_address, size_t size, bool mock);
  bool AllocKernelDynamicRes(const mindspore::kernel::KernelMod &kernel_mod, const mindspore::AnfNodePtr &kernel,
                             AddressPtrList *kernel_inputs, AddressPtrList *kernel_workspaces,
                             AddressPtrList *kernel_outputs, bool mock);
  bool AllocKernelInputDynamicRes(const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_inputs, bool mock);
  bool AllocKernelOutputDynamicRes(const mindspore::kernel::KernelMod &kernel_mod, const mindspore::AnfNodePtr &kernel,
                                   AddressPtrList *kernel_outputs, bool mock);
  bool AllocKernelWorkspaceDynamicRes(const mindspore::kernel::KernelMod &kernel_mod,
                                      const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_workspaces,
                                      bool mock);
  void AllocCommunicationOpDynamicRes(const session::KernelGraph *graph);
  void AllocCommunicationOpInputDynamicRes(const mindspore::AnfNodePtr &kernel);
  void AllocCommunicationOpOutputDynamicRes(const mindspore::AnfNodePtr &kernel);
  void AllocCommunicationOpMemory(bool is_need_alloc_memory, bool is_need_free_memory,
                                  const DeviceAddressPtrList addr_list, size_t total_size,
                                  std::vector<size_t> size_list);
  void FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel);
  bool UpdateMemorySwapTask(const AnfNodePtr &kernel, bool mock, bool profiling);
  bool AddMemorySwapTask(const AnfNodePtr &kernel, bool mock, bool profiling);
  void UpdateHostSwapInQueue(const DeviceAddressPtr device_address, bool mock);
  void UpdateHostSwapOutQueue(bool mock);
  void ClearSwapInfo(bool mock);
  std::unordered_map<uint32_t, MemReuseUtilPtr> mem_reuse_util_map_;
  std::unordered_map<uint32_t, MemSwapManagerPtr> mem_swap_map_;
  std::unordered_map<uint32_t, bool> is_first_step_map_;
  std::unordered_map<uint32_t, std::set<AnfNodePtr>> graph_output_map_;
  MemReuseUtilPtr mem_reuse_util_{nullptr};
  MemSwapManagerPtr mem_swap_manager_{nullptr};
};
MS_REG_KERNEL_RUNTIME(kGPUDevice, GPUKernelRuntime);
}  // namespace gpu
}  // namespace device
}  // namespace mindspore
#endif  // MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_KERNEL_RUNTIME_H_
mindspore/ccsrc/runtime/device/kernel_runtime.cc (view file @ c0070d3d)
...
@@ -40,7 +40,7 @@ KernelRuntime::~KernelRuntime() {
 #endif
 }
-bool KernelRuntime::Load(session::KernelGraph *graph) { return true; }
+bool KernelRuntime::Load(session::KernelGraph *graph, bool is_task_sink) { return true; }
 bool KernelRuntime::DumpData(mindspore::session::KernelGraph *graph, Debugger *debugger) {
   if (graph != nullptr) {
...
mindspore/ccsrc/runtime/device/kernel_runtime.h (view file @ c0070d3d)
...
@@ -59,8 +59,8 @@ class KernelRuntime {
   bool DumpDataEnabled();
   bool DumpDataEnabledIteration();
   virtual bool DumpData(session::KernelGraph *graph, Debugger *debugger = nullptr);
-  virtual bool Load(session::KernelGraph *graph);
-  virtual bool Run(session::KernelGraph *graph, Debugger *debugger = nullptr) = 0;
+  virtual bool Load(session::KernelGraph *graph, bool is_task_sink);
+  virtual bool Run(session::KernelGraph *graph, bool is_task_sink, Debugger *debugger = nullptr) = 0;
   bool LaunchKernel(const session::KernelGraph *graph);
   bool LaunchTaskBasedOnSingleKernel(kernel::KernelModPtr kernel_mod_ptr, const AddressPtrList &kernel_inputs,
                                      const AddressPtrList &kernel_outputs,
...
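To make the updated base-class contract concrete, here is a schematic fragment of a backend conforming to it. The class name DemoKernelRuntime and the RunTask helper are hypothetical, and the other overrides a real runtime must provide are elided:

class DemoKernelRuntime : public mindspore::device::KernelRuntime {
 public:
  // Load is now told whether the graph will run in task-sink mode.
  bool Load(mindspore::session::KernelGraph *graph, bool is_task_sink) override { return true; }
  // Run carries the same flag, letting the session call every backend uniformly.
  bool Run(mindspore::session::KernelGraph *graph, bool is_task_sink, Debugger *debugger = nullptr) override {
    return is_task_sink ? RunTask(graph) : LaunchKernel(graph);
  }

 private:
  bool RunTask(mindspore::session::KernelGraph *graph) { return true; }  // hypothetical task-sink path
};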