Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
兔爷不爱我
mindspore
提交
664f2628
M
mindspore
项目概览
兔爷不爱我
/
mindspore
与 Fork 源项目一致
Fork自
MindSpore / mindspore
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
M
mindspore
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
前往新版Gitcode,体验更适合开发者的 AI 搜索 >>
提交
664f2628
编写于
4月 26, 2020
作者:
L
limingqi107
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Optimize GPU AllReduce memory-allocation performance
上级
7c7d95ac
变更
4
隐藏空白更改
内联
并排
Showing
4 changed file
with
36 addition
and
20 deletion
+36
-20
mindspore/ccsrc/device/gpu/gpu_kernel_runtime.cc
mindspore/ccsrc/device/gpu/gpu_kernel_runtime.cc
+32
-11
mindspore/ccsrc/device/gpu/gpu_kernel_runtime.h
mindspore/ccsrc/device/gpu/gpu_kernel_runtime.h
+3
-0
mindspore/ccsrc/device/memory_manager.cc
mindspore/ccsrc/device/memory_manager.cc
+1
-1
mindspore/ccsrc/session/anf_runtime_algorithm.cc
mindspore/ccsrc/session/anf_runtime_algorithm.cc
+0
-8
未找到文件。
mindspore/ccsrc/device/gpu/gpu_kernel_runtime.cc
浏览文件 @
664f2628
...
...
@@ -261,8 +261,7 @@ void GPUKernelRuntime::AllocCommunicationOpDynamicRes(const session::KernelGraph
auto
&
kernels
=
graph
->
execution_order
();
for
(
auto
&
kernel
:
kernels
)
{
MS_EXCEPTION_IF_NULL
(
kernel
);
auto
kernel_name
=
AnfAlgo
::
GetCNodeName
(
kernel
);
if
(
kernel_name
==
kAllReduceOpName
)
{
if
(
AnfAlgo
::
IsCommunicationOp
(
kernel
))
{
AllocCommunicationOpInputDynamicRes
(
kernel
);
AllocCommunicationOpOutputDynamicRes
(
kernel
);
}
...
...
@@ -272,27 +271,31 @@ void GPUKernelRuntime::AllocCommunicationOpDynamicRes(const session::KernelGraph
// Collects the input device addresses of a communication kernel (e.g. AllReduce)
// and allocates one contiguous memory region covering all of them, so the
// collective can operate on a single fused buffer.
//
// Optimization over the previous version: instead of unconditionally freeing and
// re-allocating every input, this scans all inputs first and records whether any
// allocation (some input has no memory yet) or any free (some input already holds
// memory) is actually required; AllocCommunicationOpMemory then skips the whole
// realloc when nothing needs allocating.
//
// @param kernel  the communication kernel node; must be non-null.
void GPUKernelRuntime::AllocCommunicationOpInputDynamicRes(const mindspore::AnfNodePtr &kernel) {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(mem_manager_);
  bool is_need_alloc_memory = false;
  bool is_need_free_memory = false;
  size_t total_size = 0;
  std::vector<size_t> size_list;
  DeviceAddressPtrList addr_list;
  for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) {
    auto device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i);
    MS_EXCEPTION_IF_NULL(device_address);
    if (device_address->ptr_ == nullptr) {
      // At least one input has no device memory yet, so the fused buffer
      // must be (re)allocated.
      is_need_alloc_memory = true;
    } else {
      // An input still holds memory (not released); it must be freed before
      // the contiguous re-allocation.
      is_need_free_memory = true;
    }
    total_size += device_address->size_;
    size_list.emplace_back(device_address->size_);
    addr_list.emplace_back(device_address);
  }
  AllocCommunicationOpMemory(is_need_alloc_memory, is_need_free_memory, addr_list, total_size, size_list);
}
void
GPUKernelRuntime
::
AllocCommunicationOpOutputDynamicRes
(
const
mindspore
::
AnfNodePtr
&
kernel
)
{
MS_EXCEPTION_IF_NULL
(
kernel
);
MS_EXCEPTION_IF_NULL
(
mem_manager_
);
bool
is_need_alloc_memory
=
false
;
bool
is_need_free_memory
=
false
;
size_t
total_size
=
0
;
std
::
vector
<
size_t
>
size_list
;
DeviceAddressPtrList
addr_list
;
...
...
@@ -302,15 +305,33 @@ void GPUKernelRuntime::AllocCommunicationOpOutputDynamicRes(const mindspore::Anf
for
(
size_t
i
=
0
;
i
<
output_sizes
.
size
();
++
i
)
{
auto
device_address
=
AnfAlgo
::
GetMutableOutputAddr
(
kernel
,
i
);
MS_EXCEPTION_IF_NULL
(
device_address
);
// The outputs of communication kernel are not released.
if
(
device_address
->
ptr_
!=
nullptr
)
{
MS_LOG
(
INFO
)
<<
"The outputs of communication kernel are not released."
;
mem_manager_
->
FreeMemFromMemPool
(
device_address
)
;
if
(
device_address
->
ptr_
==
nullptr
)
{
is_need_alloc_memory
=
true
;
}
else
{
is_need_free_memory
=
true
;
}
total_size
+=
output_sizes
[
i
];
size_list
.
emplace_back
(
output_sizes
[
i
]);
addr_list
.
emplace_back
(
device_address
);
}
AllocCommunicationOpMemory
(
is_need_alloc_memory
,
is_need_free_memory
,
addr_list
,
total_size
,
size_list
);
}
// Performs the deferred free + contiguous allocation for a communication op.
//
// @param is_need_alloc_memory  true if at least one address in addr_list has no
//                              device memory; when false this is a no-op (all
//                              buffers already exist, nothing to do).
// @param is_need_free_memory   true if at least one address still holds memory
//                              that must be released before re-allocating.
// @param addr_list             device addresses to back with one contiguous block.
//                              NOTE(review): passed by value (a copy) to match the
//                              header declaration; consider const& in both — confirm.
// @param total_size            sum of all entry sizes; size of the fused block.
// @param size_list             per-entry sizes used to carve up the fused block.
void GPUKernelRuntime::AllocCommunicationOpMemory(bool is_need_alloc_memory, bool is_need_free_memory,
                                                  const DeviceAddressPtrList addr_list, size_t total_size,
                                                  std::vector<size_t> size_list) {
  if (!is_need_alloc_memory) {
    return;
  }
  if (is_need_free_memory) {
    for (const auto &iter : addr_list) {
      MS_EXCEPTION_IF_NULL(iter);
      // Free the inputs/outputs of communication kernel which are not released.
      if (iter->ptr_ != nullptr) {
        mem_manager_->FreeMemFromMemPool(iter);
      }
    }
  }
  mem_manager_->MallocContinuousMemFromMemPool(addr_list, total_size, size_list);
}
...
...
mindspore/ccsrc/device/gpu/gpu_kernel_runtime.h
浏览文件 @
664f2628
...
...
@@ -58,6 +58,9 @@ class GPUKernelRuntime : public KernelRuntime {
// Dynamic-resource allocation for communication ops (e.g. AllReduce): inputs and
// outputs of a communication kernel are backed by one contiguous device block
// (see MallocContinuousMemFromMemPool in the .cc).
void AllocCommunicationOpDynamicRes(const session::KernelGraph *graph);
void AllocCommunicationOpInputDynamicRes(const mindspore::AnfNodePtr &kernel);
void AllocCommunicationOpOutputDynamicRes(const mindspore::AnfNodePtr &kernel);
// Frees stale buffers (when is_need_free_memory) and allocates the fused
// contiguous block (when is_need_alloc_memory); no-op otherwise.
void AllocCommunicationOpMemory(bool is_need_alloc_memory, bool is_need_free_memory,
                                const DeviceAddressPtrList addr_list, size_t total_size,
                                std::vector<size_t> size_list);
// Releases a kernel's dynamic resources (workspaces etc.) for the given graph.
void FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel, const AddressPtrList &kernel_workspaces,
                          uint32_t graph_id);
// Per-graph memory-reuse helpers, keyed by graph id.
std::unordered_map<uint32_t, MemReuseUtilPtr> mem_reuse_util_map_;
...
...
mindspore/ccsrc/device/memory_manager.cc
浏览文件 @
664f2628
...
...
@@ -172,7 +172,7 @@ void MemoryManager::MallocContinuousMemFromMemPool(const DeviceAddressPtrList ad
std
::
vector
<
size_t
>
size_list
)
{
auto
device_ptr_list
=
MallocContinuousMemFromMemPool
(
total_size
,
size_list
);
if
(
addr_list
.
size
()
!=
device_ptr_list
.
size
())
{
MS_LOG
(
EXCEPTION
)
<<
"The size of device list is not equal
to the size of address list."
;
MS_LOG
(
EXCEPTION
)
<<
"The size of device list is not equal to the size of address list."
;
}
for
(
size_t
i
=
0
;
i
<
addr_list
.
size
();
i
++
)
{
MS_EXCEPTION_IF_NULL
(
device_ptr_list
[
i
]);
...
...
mindspore/ccsrc/session/anf_runtime_algorithm.cc
浏览文件 @
664f2628
...
...
@@ -514,10 +514,6 @@ const DeviceAddress *AnfRuntimeAlgorithm::GetOutputAddr(const AnfNodePtr &node,
MS_LOG
(
EXCEPTION
)
<<
node
->
DebugString
()
<<
"Invalid nop node"
;
}
}
if
(
output_idx
>
GetOutputTensorNum
(
node
))
{
MS_LOG
(
EXCEPTION
)
<<
"The index ["
<<
output_idx
<<
"] is out of range of the node's output size [ "
<<
GetOutputTensorNum
(
node
)
<<
"#node:[ "
<<
node
->
DebugString
()
<<
"]"
;
}
auto
kernel_info
=
node
->
kernel_info
();
MS_EXCEPTION_IF_NULL
(
kernel_info
);
auto
addr
=
kernel_info
->
GetOutputAddr
(
output_idx
);
...
...
@@ -539,10 +535,6 @@ DeviceAddressPtr AnfRuntimeAlgorithm::GetMutableOutputAddr(const AnfNodePtr &nod
MS_LOG
(
EXCEPTION
)
<<
node
->
DebugString
()
<<
"Invalid nop node."
;
}
}
if
(
output_idx
>
GetOutputTensorNum
(
node
))
{
MS_LOG
(
EXCEPTION
)
<<
"The index ["
<<
output_idx
<<
"] is out of range of the node's output size [ "
<<
GetOutputTensorNum
(
node
)
<<
"#node:[ "
<<
node
->
DebugString
()
<<
"]"
;
}
auto
kernel_info
=
node
->
kernel_info
();
MS_EXCEPTION_IF_NULL
(
kernel_info
);
auto
addr
=
kernel_info
->
GetMutableOutputAddr
(
output_idx
);
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录