Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
正统之独孤求败
mindspore
提交
fb343bd6
M
mindspore
项目概览
正统之独孤求败
/
mindspore
与 Fork 源项目一致
Fork自
MindSpore / mindspore
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
M
mindspore
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
fb343bd6
编写于
4月 08, 2020
作者:
K
kswang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add mem manager
上级
dd9a5a38
变更
18
隐藏空白更改
内联
并排
Showing
18 changed file
with
562 addition
and
378 deletion
+562
-378
mindspore/ccsrc/CMakeLists.txt
mindspore/ccsrc/CMakeLists.txt
+1
-0
mindspore/ccsrc/device/ascend/ascend_kernel_runtime.cc
mindspore/ccsrc/device/ascend/ascend_kernel_runtime.cc
+6
-51
mindspore/ccsrc/device/ascend/ascend_kernel_runtime.h
mindspore/ccsrc/device/ascend/ascend_kernel_runtime.h
+1
-4
mindspore/ccsrc/device/ascend/ascend_memory_manager.cc
mindspore/ccsrc/device/ascend/ascend_memory_manager.cc
+65
-0
mindspore/ccsrc/device/ascend/ascend_memory_manager.h
mindspore/ccsrc/device/ascend/ascend_memory_manager.h
+35
-0
mindspore/ccsrc/device/device_address.h
mindspore/ccsrc/device/device_address.h
+5
-0
mindspore/ccsrc/device/gpu/gpu_kernel_runtime.cc
mindspore/ccsrc/device/gpu/gpu_kernel_runtime.cc
+21
-87
mindspore/ccsrc/device/gpu/gpu_kernel_runtime.h
mindspore/ccsrc/device/gpu/gpu_kernel_runtime.h
+1
-8
mindspore/ccsrc/device/gpu/gpu_memory_manager.cc
mindspore/ccsrc/device/gpu/gpu_memory_manager.cc
+88
-0
mindspore/ccsrc/device/gpu/gpu_memory_manager.h
mindspore/ccsrc/device/gpu/gpu_memory_manager.h
+40
-0
mindspore/ccsrc/device/kernel_runtime.cc
mindspore/ccsrc/device/kernel_runtime.cc
+40
-191
mindspore/ccsrc/device/kernel_runtime.h
mindspore/ccsrc/device/kernel_runtime.h
+5
-33
mindspore/ccsrc/device/memory_manager.cc
mindspore/ccsrc/device/memory_manager.cc
+170
-0
mindspore/ccsrc/device/memory_manager.h
mindspore/ccsrc/device/memory_manager.h
+71
-0
mindspore/ccsrc/session/anf_runtime_algorithm.cc
mindspore/ccsrc/session/anf_runtime_algorithm.cc
+10
-0
mindspore/ccsrc/session/anf_runtime_algorithm.h
mindspore/ccsrc/session/anf_runtime_algorithm.h
+1
-0
mindspore/ccsrc/session/gpu_session.cc
mindspore/ccsrc/session/gpu_session.cc
+0
-4
tests/ut/cpp/CMakeLists.txt
tests/ut/cpp/CMakeLists.txt
+2
-0
未找到文件。
mindspore/ccsrc/CMakeLists.txt
浏览文件 @
fb343bd6
...
...
@@ -132,6 +132,7 @@ file(GLOB_RECURSE MINDSPORE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
"kernel/kash/*.cc"
"device/kernel_info.cc"
"device/kernel_runtime.cc"
"device/memory_manager.cc"
"device/kernel_runtime_manager.cc"
"device/convert_tensor_utils.cc"
"pre_activate/common/*.cc"
...
...
mindspore/ccsrc/device/ascend/ascend_kernel_runtime.cc
浏览文件 @
fb343bd6
...
...
@@ -37,6 +37,7 @@
#include "kernel/tbe/tbe_utils.h"
#include "kernel/tbe/tbe_python_funcs.h"
#include "pre_activate/mem_reuse/mem_reuse_checker.h"
#include "device/ascend/ascend_memory_manager.h"
using
mindspore
::
device
::
ascend
::
ProfilingManager
;
using
mindspore
::
device
::
ascend
::
ProfilingUtils
;
...
...
@@ -47,8 +48,6 @@ using std::vector;
namespace
mindspore
{
namespace
device
{
namespace
ascend
{
static
const
uint64_t
ASCEND_MEM_SIZE
=
20
;
static
const
uint64_t
ASCEND_MEM_SIZE_BYTE
=
(
ASCEND_MEM_SIZE
<<
30
);
static
const
size_t
PRAMATER_OUTPUT_INDEX
=
0
;
AscendKernelRuntime
::~
AscendKernelRuntime
()
{
graph_model_map_
.
clear
();
}
...
...
@@ -86,7 +85,8 @@ void AscendKernelRuntime::ReleaseDeviceRes() {
MS_EXCEPTION
(
DeviceProcessError
)
<<
"rtSetDevice, ret["
<<
static_cast
<
int
>
(
ret
)
<<
"]"
;
}
FreeDeviceMemory
();
MS_EXCEPTION_IF_NULL
(
mem_manager_
);
mem_manager_
->
FreeDeviceMemory
();
(
void
)
DestroyHccl
();
(
void
)
ResetDevice
();
(
void
)
ProfilingManager
::
GetInstance
().
StopProfiling
();
...
...
@@ -109,11 +109,9 @@ bool AscendKernelRuntime::Init() {
if
(
!
ret
)
{
return
ret
;
}
ret
=
MallocDeviceMemory
();
if
(
!
ret
)
{
return
ret
;
}
mem_manager_
=
std
::
make_shared
<
AscendMemoryManager
>
();
MS_EXCEPTION_IF_NULL
(
mem_manager_
);
mem_manager_
->
MallocDeviceMemory
();
ret
=
ProfilingManager
::
GetInstance
().
StartupProfiling
(
device_id_
);
if
(
!
ret
)
{
...
...
@@ -239,13 +237,6 @@ DeviceAddressPtr AscendKernelRuntime::CreateDeviceAddress(void *device_ptr, size
return
std
::
make_shared
<
AscendDeviceAddress
>
(
device_ptr
,
device_size
,
format
,
type_id
);
}
void
AscendKernelRuntime
::
MallocOpMemory
(
const
DeviceAddressPtr
address
,
size_t
size
,
int
)
{
auto
device_ptr
=
AscendMemoryAllocator
::
GetInstance
().
AllocTensorMem
(
size
);
MS_EXCEPTION_IF_NULL
(
device_ptr
);
address
->
ptr_
=
device_ptr
;
address
->
mem_dynamic_alloc_
=
true
;
}
bool
AscendKernelRuntime
::
GenTask
(
const
session
::
KernelGraph
*
graph
)
{
auto
context_ptr
=
MsContext
::
GetInstance
();
MS_EXCEPTION_IF_NULL
(
context_ptr
);
...
...
@@ -474,42 +465,6 @@ bool AscendKernelRuntime::DestroyHccl() {
context_ptr
->
set_enable_hccl
(
false
);
return
true
;
}
bool
AscendKernelRuntime
::
MallocDeviceMemory
()
{
device_mem_size_
=
ASCEND_MEM_SIZE_BYTE
;
static_mem_offset_
=
FloatToSize
(
device_mem_size_
*
GRAPH_INIT_ASCEND_MEM_RATIO
);
auto
ret
=
rtMalloc
(
reinterpret_cast
<
void
**>
(
&
device_mem_base_
),
static_mem_offset_
,
RT_MEMORY_HBM
);
if
(
ret
!=
RT_ERROR_NONE
)
{
MS_EXCEPTION
(
DeviceProcessError
)
<<
"rtMalloc mem size["
<<
static_mem_offset_
<<
"] fail, ret["
<<
ret
<<
"]"
;
}
device_mem_pool_size_
=
FloatToSize
(
device_mem_size_
*
(
1
-
GRAPH_INIT_ASCEND_MEM_RATIO
));
ret
=
rtMalloc
(
reinterpret_cast
<
void
**>
(
&
device_mem_pool_base_
),
device_mem_pool_size_
,
RT_MEMORY_HBM
);
if
(
ret
!=
RT_ERROR_NONE
)
{
MS_EXCEPTION
(
DeviceProcessError
)
<<
"rtMalloc mem size["
<<
device_mem_pool_size_
<<
"] fail, ret["
<<
ret
<<
"]"
;
}
AscendMemoryAllocator
::
GetInstance
().
set_device_mem_pool_base
(
device_mem_pool_base_
);
AscendMemoryAllocator
::
GetInstance
().
set_device_mem_pool_size
(
device_mem_pool_size_
);
return
true
;
}
void
AscendKernelRuntime
::
FreeDeviceMemory
()
{
if
(
device_mem_base_
!=
nullptr
)
{
auto
ret
=
rtFree
(
device_mem_base_
);
if
(
ret
!=
RT_ERROR_NONE
)
{
MS_LOG
(
ERROR
)
<<
"rtFree mem size["
<<
device_mem_size_
<<
"] fail, ret["
<<
ret
<<
"]"
;
}
device_mem_base_
=
nullptr
;
}
if
(
device_mem_pool_base_
!=
nullptr
)
{
auto
ret
=
rtFree
(
device_mem_pool_base_
);
if
(
ret
!=
RT_ERROR_NONE
)
{
MS_LOG
(
ERROR
)
<<
"rtFree mem size["
<<
device_mem_pool_size_
<<
"] fail, ret["
<<
ret
<<
"]"
;
}
device_mem_pool_base_
=
nullptr
;
}
}
void
AscendKernelRuntime
::
FreeHostMemory
()
{
dynamic_mem_offset_
=
0
;
}
}
// namespace ascend
}
// namespace device
}
// namespace mindspore
mindspore/ccsrc/device/ascend/ascend_kernel_runtime.h
浏览文件 @
fb343bd6
...
...
@@ -39,13 +39,11 @@ class AscendKernelRuntime : public KernelRuntime {
bool
GenTask
(
const
session
::
KernelGraph
*
graph
)
override
;
bool
RunTask
(
const
session
::
KernelGraph
*
graph
)
override
;
bool
LoadTask
(
const
session
::
KernelGraph
*
graph
)
override
;
void
FreeHostMemory
()
override
;
protected:
DeviceAddressPtr
CreateDeviceAddress
(
void
*
device_ptr
,
size_t
device_size
,
const
string
&
format
,
TypeId
type_id
)
override
;
bool
SyncStream
()
override
;
void
MallocOpMemory
(
const
DeviceAddressPtr
address
,
size_t
size
,
int
flag
)
override
;
private:
bool
InitDevice
();
...
...
@@ -53,8 +51,7 @@ class AscendKernelRuntime : public KernelRuntime {
bool
HcclInit
();
bool
NeedDestroyHccl
();
bool
DestroyHccl
();
bool
MallocDeviceMemory
();
void
FreeDeviceMemory
();
void
ClearGraphModelMap
();
void
ReleaseDeviceRes
()
override
;
uint32_t
GetGraphModelId
(
const
session
::
KernelGraph
*
kernel_graph
);
...
...
mindspore/ccsrc/device/ascend/ascend_memory_manager.cc
0 → 100644
浏览文件 @
fb343bd6
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "device/ascend/ascend_memory_manager.h"
#include "device/ascend/ascend_memory_allocator.h"
#include "utils/context/ms_context.h"
#include "runtime/mem.h"
namespace
mindspore
{
namespace
device
{
namespace
ascend
{
// Amount of Ascend device memory handled by this manager, counted in GiB.
static constexpr uint64_t ASCEND_MEM_SIZE = 20;
// The same amount in bytes (GiB count shifted left by 30 bits); MallocDeviceMemory
// stores this value in device_mem_size_.
static constexpr uint64_t ASCEND_MEM_SIZE_BYTE = ASCEND_MEM_SIZE << 30;
// Reserves the Ascend device memory with two separate rtMalloc calls and hands
// the second region to AscendMemoryAllocator.
//
// The total size is ASCEND_MEM_SIZE_BYTE. A GRAPH_INIT_ASCEND_MEM_RATIO share of
// it is allocated at device_mem_base_ (that size is also stored in
// static_mem_offset_), and the remaining share is allocated at
// device_mem_pool_base_. A failing rtMalloc is reported through
// MS_EXCEPTION(DeviceProcessError).
void AscendMemoryManager::MallocDeviceMemory() {
  device_mem_size_ = ASCEND_MEM_SIZE_BYTE;

  // Region 1: the graph-init share of the total size.
  static_mem_offset_ = FloatToSize(device_mem_size_ * GRAPH_INIT_ASCEND_MEM_RATIO);
  auto base_status = rtMalloc(reinterpret_cast<void **>(&device_mem_base_), static_mem_offset_, RT_MEMORY_HBM);
  if (base_status != RT_ERROR_NONE) {
    MS_EXCEPTION(DeviceProcessError) << "rtMalloc mem size[" << static_mem_offset_ << "] fail, ret[" << base_status
                                     << "]";
  }

  // Region 2: whatever is left of the total size, used as the memory pool.
  device_mem_pool_size_ = FloatToSize(device_mem_size_ * (1 - GRAPH_INIT_ASCEND_MEM_RATIO));
  auto pool_status = rtMalloc(reinterpret_cast<void **>(&device_mem_pool_base_), device_mem_pool_size_, RT_MEMORY_HBM);
  if (pool_status != RT_ERROR_NONE) {
    MS_EXCEPTION(DeviceProcessError) << "rtMalloc mem size[" << device_mem_pool_size_ << "] fail, ret[" << pool_status
                                     << "]";
  }

  // Publish the pool region to the allocator singleton.
  AscendMemoryAllocator::GetInstance().set_device_mem_pool_base(device_mem_pool_base_);
  AscendMemoryAllocator::GetInstance().set_device_mem_pool_size(device_mem_pool_size_);
}
void
AscendMemoryManager
::
FreeDeviceMemory
()
{
if
(
device_mem_base_
!=
nullptr
)
{
auto
ret
=
rtFree
(
device_mem_base_
);
if
(
ret
!=
RT_ERROR_NONE
)
{
MS_LOG
(
ERROR
)
<<
"rtFree mem size["
<<
device_mem_size_
<<
"] fail, ret["
<<
ret
<<
"]"
;
}
device_mem_base_
=
nullptr
;
}
if
(
device_mem_pool_base_
!=
nullptr
)
{
auto
ret
=
rtFree
(
device_mem_pool_base_
);
if
(
ret
!=
RT_ERROR_NONE
)
{
MS_LOG
(
ERROR
)
<<
"rtFree mem size["
<<
device_mem_pool_size_
<<
"] fail, ret["
<<
ret
<<
"]"
;
}
device_mem_pool_base_
=
nullptr
;
}
}
// Forwards a dynamic tensor allocation of `size` to the AscendMemoryAllocator
// singleton and returns whatever pointer it yields.
void *AscendMemoryManager::AllocTensorMemDynamic(size_t size) {
  void *tensor_mem = AscendMemoryAllocator::GetInstance().AllocTensorMem(size);
  return tensor_mem;
}
}
// namespace ascend
}
// namespace device
}
// namespace mindspore
mindspore/ccsrc/device/ascend/ascend_memory_manager.h
0 → 100644
浏览文件 @
fb343bd6
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_MINDSPORE_CCSRC_DEVICE_ASCEND_ASCEND_MEMORY_MANAGER_H_
#define MINDSPORE_MINDSPORE_CCSRC_DEVICE_ASCEND_ASCEND_MEMORY_MANAGER_H_
#include "device/memory_manager.h"
namespace
mindspore
{
namespace
device
{
namespace
ascend
{
// MemoryManager specialization for Ascend devices. The definitions live in
// ascend_memory_manager.cc.
class AscendMemoryManager : public MemoryManager {
 public:
  AscendMemoryManager() = default;
  virtual ~AscendMemoryManager() = default;

  // Serves a dynamic tensor allocation from AscendMemoryAllocator.
  void *AllocTensorMemDynamic(size_t size) override;

  // Allocates the device memory regions via rtMalloc.
  void MallocDeviceMemory() override;
  // Returns the regions obtained by MallocDeviceMemory via rtFree.
  void FreeDeviceMemory() override;
};
}
// namespace ascend
}
// namespace device
}
// namespace mindspore
#endif // MINDSPORE_MINDSPORE_CCSRC_DEVICE_ASCEND_ASCEND_MEMORY_MANAGER_H_
mindspore/ccsrc/device/device_address.h
浏览文件 @
fb343bd6
...
...
@@ -33,12 +33,14 @@ class CPUKernelRuntime;
}
// namespace cpu
namespace
ascend
{
class
AscendKernelRuntime
;
class
AscendMemoryManager
;
namespace
tasksink
{
class
TaskGenerator
;
}
// namespace tasksink
}
// namespace ascend
namespace
gpu
{
class
GPUKernelRuntime
;
class
GPUMemoryManager
;
}
// namespace gpu
}
// namespace device
}
// namespace mindspore
...
...
@@ -70,12 +72,15 @@ class DeviceAddress {
TypeId
type_id_
{
kNumberTypeFloat16
};
bool
mem_dynamic_alloc_
{
false
};
friend
class
KernelRuntime
;
friend
class
MemoryManager
;
friend
class
mindspore
::
device
::
ascend
::
tasksink
::
TaskGenerator
;
friend
class
mindspore
::
device
::
cpu
::
CPUSimpleMemPlan
;
friend
class
mindspore
::
device
::
cpu
::
CPUResourceManager
;
friend
class
mindspore
::
device
::
cpu
::
CPUKernelRuntime
;
friend
class
mindspore
::
device
::
gpu
::
GPUKernelRuntime
;
friend
class
mindspore
::
device
::
gpu
::
GPUMemoryManager
;
friend
class
mindspore
::
device
::
ascend
::
AscendKernelRuntime
;
friend
class
mindspore
::
device
::
ascend
::
AscendMemoryManager
;
};
using
DeviceAddressPtr
=
std
::
shared_ptr
<
DeviceAddress
>
;
...
...
mindspore/ccsrc/device/gpu/gpu_kernel_runtime.cc
浏览文件 @
fb343bd6
...
...
@@ -26,6 +26,7 @@
#include "device/kernel_runtime_manager.h"
#include "device/gpu/gpu_common.h"
#include "common/utils.h"
#include "device/gpu/gpu_memory_manager.h"
namespace
mindspore
{
namespace
device
{
...
...
@@ -36,26 +37,14 @@ bool GPUKernelRuntime::Init() {
if
(
device_init_
==
true
)
{
return
true
;
}
auto
ret
=
InitDevice
();
if
(
!
ret
)
{
MS_LOG
(
ERROR
)
<<
"InitDevice error."
;
return
ret
;
}
auto
context_ptr
=
MsContext
::
GetInstance
();
MS_EXCEPTION_IF_NULL
(
context_ptr
);
// If use the dynamic memory pool, then alloc the first memory block to init.
if
(
context_ptr
->
enable_dynamic_mem_pool
())
{
auto
device_addr
=
AllocTensorMemDynamic
(
1
);
if
(
!
device_addr
)
{
MS_LOG
(
ERROR
)
<<
"Dynamic memory pool init error."
;
return
false
;
}
}
else
{
MallocDeviceMemory
();
}
mem_manager_
=
std
::
make_shared
<
GPUMemoryManager
>
();
MS_EXCEPTION_IF_NULL
(
mem_manager_
);
mem_manager_
->
MallocDeviceMemory
();
const
void
*
collective_handle_
=
CollectiveInitializer
::
instance
().
collective_handle
();
bool
collective_inited
=
CollectiveInitializer
::
instance
().
collective_inited
();
if
(
collective_inited
&&
collective_handle_
!=
nullptr
)
{
...
...
@@ -101,16 +90,6 @@ bool GPUKernelRuntime::InitDevice() {
return
true
;
}
void
GPUKernelRuntime
::
MallocDeviceMemory
()
{
// Need to reserve 20% space for dynamic memory
const
float
init_gpu_mem_ratio
=
0.8
;
size_t
mem_size
=
FloatToSize
(
GPUMemoryAllocator
::
GetInstance
().
free_mem_size
()
*
init_gpu_mem_ratio
);
auto
alloc_size
=
GPUMemoryAllocator
::
GetInstance
().
AllocDeviceMem
(
mem_size
,
reinterpret_cast
<
void
**>
(
&
device_mem_base_
));
device_mem_size_
=
alloc_size
;
static_mem_offset_
=
device_mem_size_
;
}
void
GPUKernelRuntime
::
ReleaseDeviceRes
()
{
// For dataset mode.
if
(
GpuBufferMgr
::
GetInstance
().
IsInit
())
{
...
...
@@ -122,39 +101,22 @@ void GPUKernelRuntime::ReleaseDeviceRes() {
CHECK_OP_RET_WITH_EXCEPT
(
GpuBufferMgr
::
GetInstance
().
Destroy
(),
"Could not destroy gpu data queue."
);
}
GPUDeviceManager
::
GetInstance
().
ReleaseDevice
();
if
(
device_mem_base_
!=
nullptr
)
{
if
(
!
GPUMemoryAllocator
::
GetInstance
().
FreeDeviceMem
(
device_mem_base_
))
{
MS_LOG
(
EXCEPTION
)
<<
"Could not free gpu device memory."
;
}
}
GPUMemoryAllocator
::
GetInstance
().
ReleaseDeviceRes
();
}
void
GPUKernelRuntime
::
FreeHostMemory
()
{
dynamic_mem_offset_
=
0
;
}
void
*
GPUKernelRuntime
::
AllocTensorMemDynamic
(
size_t
size
)
{
return
GPUMemoryAllocator
::
GetInstance
().
AllocTensorMem
(
size
);
}
void
GPUKernelRuntime
::
FreeTensorMemDynamic
(
void
*
device_ptr
)
{
GPUMemoryAllocator
::
GetInstance
().
FreeTensorMem
(
device_ptr
);
MS_EXCEPTION_IF_NULL
(
mem_manager_
);
mem_manager_
->
FreeDeviceMemory
();
}
void
GPUKernelRuntime
::
AssignMemory
(
session
::
KernelGraph
*
graph
)
{
auto
context_ptr
=
MsContext
::
GetInstance
();
MS_EXCEPTION_IF_NULL
(
context_ptr
);
MS_EXCEPTION_IF_NULL
(
mem_manager_
);
mem_manager_
->
ResetDynamicMemory
();
AssignStaticMemory
(
graph
);
bool
is_enable_mem_reuse
=
context_ptr
->
enable_mem_reuse
();
bool
is_enable_dynamic_mem
=
context_ptr
->
enable_dynamic_mem_pool
();
if
(
is_enable_dynamic_mem
)
{
// Use the dynamic memory pool.
InitKernelRefCount
(
graph
);
InitKernelOutputAddress
(
graph
);
}
else
if
(
is_enable_mem_reuse
)
{
// Use the memory reuse.
ReuseAssignDynamicMemory
(
graph
);
}
else
{
// Normal way.
AssignDynamicMemory
(
graph
);
}
}
...
...
@@ -179,32 +141,6 @@ bool GPUKernelRuntime::Run(session::KernelGraph *graph) {
return
ret
;
}
uint8_t
*
GPUKernelRuntime
::
MallocStaticMem
(
size_t
size
,
bool
)
{
auto
context_ptr
=
MsContext
::
GetInstance
();
MS_EXCEPTION_IF_NULL
(
context_ptr
);
if
(
context_ptr
->
enable_dynamic_mem_pool
())
{
auto
device_ptr
=
AllocTensorMemDynamic
(
size
);
MS_EXCEPTION_IF_NULL
(
device_ptr
);
return
AddressOffset
(
device_ptr
,
0
);
}
auto
align_size
=
GetCommonAlignSize
(
size
);
if
(
static_mem_offset_
<
align_size
)
{
MS_LOG
(
EXCEPTION
)
<<
"Out of memory!!! total["
<<
device_mem_size_
<<
"](dynamic["
<<
total_dynamic_size_
<<
"] static["
<<
total_static_size_
<<
"])"
<<
" malloc ["
<<
align_size
<<
"] failed!"
;
}
auto
offset
=
static_mem_offset_
-
align_size
;
if
(
dynamic_mem_offset_
>
offset
)
{
MS_LOG
(
EXCEPTION
)
<<
"Out of memory!!! total["
<<
device_mem_size_
<<
"](dynamic["
<<
total_dynamic_size_
<<
"] static["
<<
total_static_size_
<<
"])"
<<
" malloc ["
<<
align_size
<<
"] failed!"
;
}
total_static_size_
+=
align_size
;
static_mem_offset_
=
offset
;
return
device_mem_base_
+
offset
;
}
void
GPUKernelRuntime
::
InitKernelRefCount
(
const
session
::
KernelGraph
*
graph
)
{
MS_EXCEPTION_IF_NULL
(
graph
);
MemReuseUtilPtr
mem_reuse_util_ptr
=
std
::
make_shared
<
memreuse
::
MemReuseUtil
>
();
...
...
@@ -273,6 +209,7 @@ void GPUKernelRuntime::AllocKernelDynamicRes(const mindspore::kernel::KernelMod
MS_EXCEPTION_IF_NULL
(
kernel_inputs
);
MS_EXCEPTION_IF_NULL
(
kernel_workspaces
);
MS_EXCEPTION_IF_NULL
(
kernel_outputs
);
MS_EXCEPTION_IF_NULL
(
mem_manager_
);
for
(
size_t
i
=
0
;
i
<
AnfAlgo
::
GetInputTensorNum
(
kernel
);
++
i
)
{
auto
device_address
=
AnfAlgo
::
GetPrevNodeOutputAddr
(
kernel
,
i
);
MS_EXCEPTION_IF_NULL
(
device_address
);
...
...
@@ -290,7 +227,7 @@ void GPUKernelRuntime::AllocKernelDynamicRes(const mindspore::kernel::KernelMod
MS_EXCEPTION_IF_NULL
(
device_address
);
auto
device_ptr
=
device_address
->
ptr_
;
if
(
device_ptr
==
nullptr
)
{
device_ptr
=
AllocTensorMemDynamic
(
output_sizes
[
i
]);
device_ptr
=
mem_manager_
->
AllocTensorMemDynamic
(
output_sizes
[
i
]);
MS_EXCEPTION_IF_NULL
(
device_ptr
);
device_address
->
ptr_
=
device_ptr
;
}
...
...
@@ -307,7 +244,7 @@ void GPUKernelRuntime::AllocKernelDynamicRes(const mindspore::kernel::KernelMod
kernel_workspaces
->
emplace_back
(
nullptr
);
continue
;
}
auto
device_ptr
=
AllocTensorMemDynamic
(
workspace_sizes
[
i
]);
auto
device_ptr
=
mem_manager_
->
AllocTensorMemDynamic
(
workspace_sizes
[
i
]);
MS_EXCEPTION_IF_NULL
(
device_ptr
);
kernel
::
AddressPtr
workspace
=
std
::
make_shared
<
kernel
::
Address
>
();
MS_EXCEPTION_IF_NULL
(
workspace
);
...
...
@@ -333,6 +270,7 @@ void GPUKernelRuntime::AllocCommunicationOpDynamicRes(const session::KernelGraph
void
GPUKernelRuntime
::
AllocCommunicationOpInputDynamicRes
(
const
mindspore
::
AnfNodePtr
&
kernel
)
{
MS_EXCEPTION_IF_NULL
(
kernel
);
MS_EXCEPTION_IF_NULL
(
mem_manager_
);
// The reference count of communication kernel input is not 0.
if
(
communication_op_input_ref_count_
!=
0
)
{
MS_LOG
(
ERROR
)
<<
"The reference count of communication kernel input is not 0."
;
...
...
@@ -354,7 +292,7 @@ void GPUKernelRuntime::AllocCommunicationOpInputDynamicRes(const mindspore::AnfN
addr_size
.
emplace_back
(
device_address
.
get
(),
output_size
);
}
auto
device_mem_ptr
=
AllocTensorMemDynamic
(
total
);
auto
device_mem_ptr
=
mem_manager_
->
AllocTensorMemDynamic
(
total
);
MS_EXCEPTION_IF_NULL
(
device_mem_ptr
);
for
(
const
auto
&
iter
:
addr_size
)
{
MS_EXCEPTION_IF_NULL
(
iter
.
first
);
...
...
@@ -366,6 +304,7 @@ void GPUKernelRuntime::AllocCommunicationOpInputDynamicRes(const mindspore::AnfN
void
GPUKernelRuntime
::
AllocCommunicationOpOutputDynamicRes
(
const
mindspore
::
AnfNodePtr
&
kernel
)
{
MS_EXCEPTION_IF_NULL
(
kernel
);
MS_EXCEPTION_IF_NULL
(
mem_manager_
);
// The reference count of communication kernel output is not 0.
if
(
communication_op_output_ref_count_
!=
0
)
{
MS_LOG
(
ERROR
)
<<
"The reference count of communication kernel output is not 0."
;
...
...
@@ -389,7 +328,7 @@ void GPUKernelRuntime::AllocCommunicationOpOutputDynamicRes(const mindspore::Anf
addr_size
.
emplace_back
(
device_address
.
get
(),
output_sizes
[
i
]);
}
auto
device_mem_ptr
=
AllocTensorMemDynamic
(
total
);
auto
device_mem_ptr
=
mem_manager_
->
AllocTensorMemDynamic
(
total
);
MS_EXCEPTION_IF_NULL
(
device_mem_ptr
);
for
(
const
auto
&
iter
:
addr_size
)
{
MS_EXCEPTION_IF_NULL
(
iter
.
first
);
...
...
@@ -402,6 +341,7 @@ void GPUKernelRuntime::AllocCommunicationOpOutputDynamicRes(const mindspore::Anf
void
GPUKernelRuntime
::
FreeKernelDynamicRes
(
const
mindspore
::
AnfNodePtr
&
kernel
,
const
AddressPtrList
&
kernel_workspaces
)
{
MS_EXCEPTION_IF_NULL
(
kernel
);
MS_EXCEPTION_IF_NULL
(
mem_manager_
);
auto
cnode
=
kernel
->
cast
<
CNodePtr
>
();
MS_EXCEPTION_IF_NULL
(
cnode
);
// Free the input of kernel by reference count.
...
...
@@ -421,7 +361,7 @@ void GPUKernelRuntime::FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel,
auto
device_address
=
AnfAlgo
::
GetPrevNodeMutableOutputAddr
(
kernel
,
i
);
MS_EXCEPTION_IF_NULL
(
device_address
);
MS_EXCEPTION_IF_NULL
(
device_address
->
ptr_
);
FreeTensorMemDynamic
(
device_address
->
ptr_
);
mem_manager_
->
FreeTensorMemDynamic
(
device_address
->
ptr_
);
device_address
->
ptr_
=
nullptr
;
}
}
...
...
@@ -432,7 +372,7 @@ void GPUKernelRuntime::FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel,
auto
workspace
=
kernel_workspaces
[
i
];
if
(
workspace
!=
nullptr
)
{
MS_EXCEPTION_IF_NULL
(
workspace
->
addr
);
FreeTensorMemDynamic
(
workspace
->
addr
);
mem_manager_
->
FreeTensorMemDynamic
(
workspace
->
addr
);
workspace
->
addr
=
nullptr
;
}
}
...
...
@@ -441,6 +381,7 @@ void GPUKernelRuntime::FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel,
void
GPUKernelRuntime
::
FreeCommunicationOpDynamicRes
(
const
mindspore
::
AnfNodePtr
&
kernel
,
size_t
input_idx
,
bool
*
is_communication_op
)
{
MS_EXCEPTION_IF_NULL
(
kernel
);
MS_EXCEPTION_IF_NULL
(
mem_manager_
);
// The inputs memory of communication kernel is one piece memory, need release together.
if
(
AnfAlgo
::
GetCNodeName
(
kernel
)
==
kAllReduceOpName
)
{
communication_op_input_ref_count_
--
;
...
...
@@ -448,7 +389,7 @@ void GPUKernelRuntime::FreeCommunicationOpDynamicRes(const mindspore::AnfNodePtr
auto
device_address
=
AnfAlgo
::
GetPrevNodeMutableOutputAddr
(
kernel
,
0
);
MS_EXCEPTION_IF_NULL
(
device_address
);
MS_EXCEPTION_IF_NULL
(
device_address
->
ptr_
);
FreeTensorMemDynamic
(
device_address
->
ptr_
);
mem_manager_
->
FreeTensorMemDynamic
(
device_address
->
ptr_
);
device_address
->
ptr_
=
nullptr
;
}
*
is_communication_op
=
true
;
...
...
@@ -470,19 +411,12 @@ void GPUKernelRuntime::FreeCommunicationOpDynamicRes(const mindspore::AnfNodePtr
auto
device_address
=
AnfAlgo
::
GetMutableOutputAddr
(
kernel_input
.
first
,
0
);
MS_EXCEPTION_IF_NULL
(
device_address
);
MS_EXCEPTION_IF_NULL
(
device_address
->
ptr_
);
FreeTensorMemDynamic
(
device_address
->
ptr_
);
mem_manager_
->
FreeTensorMemDynamic
(
device_address
->
ptr_
);
device_address
->
ptr_
=
nullptr
;
}
*
is_communication_op
=
true
;
}
}
void
GPUKernelRuntime
::
MallocOpMemory
(
const
DeviceAddressPtr
address
,
size_t
size
,
int
)
{
auto
device_ptr
=
AllocTensorMemDynamic
(
size
);
MS_EXCEPTION_IF_NULL
(
device_ptr
);
address
->
ptr_
=
device_ptr
;
address
->
mem_dynamic_alloc_
=
true
;
}
}
// namespace gpu
}
// namespace device
}
// namespace mindspore
mindspore/ccsrc/device/gpu/gpu_kernel_runtime.h
浏览文件 @
fb343bd6
...
...
@@ -33,7 +33,6 @@ class GPUKernelRuntime : public KernelRuntime {
~
GPUKernelRuntime
()
override
=
default
;
bool
Init
()
override
;
void
ReleaseDeviceRes
()
override
;
void
FreeHostMemory
()
override
;
void
AssignMemory
(
session
::
KernelGraph
*
graph
)
override
;
bool
Run
(
session
::
KernelGraph
*
graph
)
override
;
...
...
@@ -41,18 +40,11 @@ class GPUKernelRuntime : public KernelRuntime {
DeviceAddressPtr
CreateDeviceAddress
(
void
*
device_ptr
,
size_t
device_size
,
const
string
&
format
,
TypeId
type_id
)
override
;
bool
SyncStream
()
override
;
// Alloc memory use the dynamic memory pool.
void
*
AllocTensorMemDynamic
(
size_t
size
)
override
;
// Free memory use the dynamic memory pool.
void
FreeTensorMemDynamic
(
void
*
device_ptr
)
override
;
void
MallocOpMemory
(
const
DeviceAddressPtr
address
,
size_t
size
,
int
flag
)
override
;
uint8_t
*
MallocStaticMem
(
size_t
size
,
bool
communication_mem
)
override
;
private:
GPUKernelRuntime
(
const
GPUKernelRuntime
&
);
GPUKernelRuntime
&
operator
=
(
const
GPUKernelRuntime
&
);
bool
InitDevice
();
void
MallocDeviceMemory
();
bool
device_init_
{
false
};
// The related functions and members for using dynamic memory pool.
...
...
@@ -69,6 +61,7 @@ class GPUKernelRuntime : public KernelRuntime {
void
FreeCommunicationOpDynamicRes
(
const
mindspore
::
AnfNodePtr
&
kernel
,
size_t
input_idx
,
bool
*
is_communication_op
);
size_t
communication_op_input_ref_count_
{
0
};
size_t
communication_op_output_ref_count_
{
0
};
MemReuseUtilPtr
mem_reuse_util_ptr_
{
nullptr
};
};
MS_REG_KERNEL_RUNTIME
(
kGPUDevice
,
GPUKernelRuntime
);
}
// namespace gpu
...
...
mindspore/ccsrc/device/gpu/gpu_memory_manager.cc
0 → 100644
浏览文件 @
fb343bd6
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "device/gpu/gpu_memory_manager.h"
#include "device/gpu/gpu_memory_allocator.h"
#include "utils/context/ms_context.h"
#include "utils/convert_utils.h"
namespace
mindspore
{
namespace
device
{
namespace
gpu
{
// Forwards a dynamic tensor allocation of `size` to the GPUMemoryAllocator
// singleton and returns whatever pointer it yields.
void *GPUMemoryManager::AllocTensorMemDynamic(size_t size) {
  void *tensor_mem = GPUMemoryAllocator::GetInstance().AllocTensorMem(size);
  return tensor_mem;
}
// Hands `device_ptr` back to the GPUMemoryAllocator singleton via FreeTensorMem.
void GPUMemoryManager::FreeTensorMemDynamic(void *device_ptr) {
  GPUMemoryAllocator::GetInstance().FreeTensorMem(device_ptr);
}
void
GPUMemoryManager
::
MallocDeviceMemory
()
{
auto
context_ptr
=
MsContext
::
GetInstance
();
MS_EXCEPTION_IF_NULL
(
context_ptr
);
// If use the dynamic memory pool, then alloc the first memory block to init.
if
(
context_ptr
->
enable_dynamic_mem_pool
())
{
auto
device_addr
=
AllocTensorMemDynamic
(
1
);
if
(
!
device_addr
)
{
MS_LOG
(
ERROR
)
<<
"Dynamic memory pool init error."
;
}
}
else
{
// Need to reserve 20% space for dynamic memory
const
float
init_gpu_mem_ratio
=
0.8
;
size_t
mem_size
=
FloatToSize
(
GPUMemoryAllocator
::
GetInstance
().
free_mem_size
()
*
init_gpu_mem_ratio
);
auto
alloc_size
=
GPUMemoryAllocator
::
GetInstance
().
AllocDeviceMem
(
mem_size
,
reinterpret_cast
<
void
**>
(
&
device_mem_base_
));
device_mem_size_
=
alloc_size
;
static_mem_offset_
=
device_mem_size_
;
}
}
// Releases the block obtained by MallocDeviceMemory (if any) and then asks the
// GPUMemoryAllocator singleton to release its own device resources.
//
// device_mem_base_ is cleared after a successful free so that calling this
// function again does not hand the same pointer to FreeDeviceMem a second time.
// A failed free is reported through MS_LOG(EXCEPTION).
void GPUMemoryManager::FreeDeviceMemory() {
  if (device_mem_base_ != nullptr) {
    if (!GPUMemoryAllocator::GetInstance().FreeDeviceMem(device_mem_base_)) {
      MS_LOG(EXCEPTION) << "Could not free gpu device memory.";
    }
    device_mem_base_ = nullptr;
  }
  GPUMemoryAllocator::GetInstance().ReleaseDeviceRes();
}
// Allocates `size` bytes of static memory; the second (bool) parameter is unused.
//
// With the dynamic memory pool enabled the request is served by
// AllocTensorMemDynamic. Otherwise the aligned size is carved off the top of
// the pre-allocated block: static_mem_offset_ moves down by the aligned size
// and the address at the new offset is returned. The request is rejected with
// MS_LOG(EXCEPTION) when the aligned size exceeds the remaining static offset
// or when the new offset would fall below dynamic_mem_offset_.
uint8_t *GPUMemoryManager::MallocStaticMem(size_t size, bool) {
  auto context_ptr = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context_ptr);
  if (context_ptr->enable_dynamic_mem_pool()) {
    auto device_ptr = AllocTensorMemDynamic(size);
    MS_EXCEPTION_IF_NULL(device_ptr);
    return AddressOffset(device_ptr, 0);
  }

  const auto aligned = GetCommonAlignSize(size);
  // The first test short-circuits, so the subtraction is only evaluated when it cannot wrap around.
  if (static_mem_offset_ < aligned || dynamic_mem_offset_ > static_mem_offset_ - aligned) {
    MS_LOG(EXCEPTION) << "Out of memory!!! total[" << device_mem_size_ << "](dynamic[" << total_dynamic_size_
                      << "] static[" << total_static_size_ << "])"
                      << " malloc [" << aligned << "] failed!";
  }

  const auto lowered_offset = static_mem_offset_ - aligned;
  total_static_size_ += aligned;
  static_mem_offset_ = lowered_offset;
  return device_mem_base_ + lowered_offset;
}
}
// namespace gpu
}
// namespace device
}
// namespace mindspore
mindspore/ccsrc/device/gpu/gpu_memory_manager.h
0 → 100644
浏览文件 @
fb343bd6
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_MINDSPORE_CCSRC_DEVICE_GPU_GPU_MEMORY_MANAGER_H_
#define MINDSPORE_MINDSPORE_CCSRC_DEVICE_GPU_GPU_MEMORY_MANAGER_H_
#include "device/memory_manager.h"
namespace
mindspore
{
namespace
device
{
namespace
gpu
{
class
GPUMemoryManager
:
public
MemoryManager
{
public:
GPUMemoryManager
()
=
default
;
virtual
~
GPUMemoryManager
()
=
default
;
void
MallocDeviceMemory
()
override
;
void
FreeDeviceMemory
()
override
;
void
*
AllocTensorMemDynamic
(
size_t
size
)
override
;
void
FreeTensorMemDynamic
(
void
*
device_ptr
)
override
;
protected:
uint8_t
*
MallocStaticMem
(
size_t
size
,
bool
communication_mem
);
};
}
// namespace gpu
}
// namespace device
}
// namespace mindspore
#endif // MINDSPORE_MINDSPORE_CCSRC_DEVICE_GPU_GPU_MEMORY_MANAGER_H_
mindspore/ccsrc/device/kernel_runtime.cc
浏览文件 @
fb343bd6
...
...
@@ -31,18 +31,13 @@
#include "ir/value.h"
using
mindspore
::
kernel
::
Address
;
using
mindspore
::
kernel
::
AddressPtr
;
using
mindspore
::
memreuse
::
BestFitMemReuse
;
using
mindspore
::
memreuse
::
MemReuseUtilPtr
;
namespace
mindspore
{
namespace
device
{
KernelRuntime
::~
KernelRuntime
()
{
device_mem_base_
=
nullptr
;
device_mem_pool_base_
=
nullptr
;
#ifdef ENABLE_DUMP_E2E
dump_conf_ptr_
=
nullptr
;
#endif
mem_reuse_util_ptr_
=
nullptr
;
}
bool
KernelRuntime
::
Run
(
session
::
KernelGraph
*
graph
)
{
...
...
@@ -88,11 +83,6 @@ bool KernelRuntime::LoadTask(const session::KernelGraph *graph) {
return
false
;
}
void
KernelRuntime
::
FreeHostMemory
()
{
dynamic_mem_offset_
=
0
;
static_mem_offset_
=
0
;
}
// for D to impl
bool
KernelRuntime
::
RunTask
(
const
session
::
KernelGraph
*
graph
)
{
if
(
graph
!=
nullptr
)
{
...
...
@@ -126,13 +116,11 @@ size_t KernelRuntime::CountNodeDeviceMemorySize(const mindspore::AnfNodePtr &nod
void
KernelRuntime
::
AssignMemory
(
session
::
KernelGraph
*
graph
)
{
auto
context_ptr
=
MsContext
::
GetInstance
();
MS_EXCEPTION_IF_NULL
(
context_ptr
);
MS_EXCEPTION_IF_NULL
(
mem_manager_
);
mem_manager_
->
ResetDynamicMemory
();
AssignStaticMemory
(
graph
);
bool
is_enable_mem_reuse
=
context_ptr
->
enable_mem_reuse
();
if
(
is_enable_mem_reuse
)
{
ReuseAssignDynamicMemory
(
graph
);
}
else
{
AssignDynamicMemory
(
graph
);
}
AssignDynamicMemory
(
graph
);
UpdateRefNodeOutputMem
(
graph
);
}
...
...
@@ -159,6 +147,7 @@ void KernelRuntime::AssignStaticMemory(session::KernelGraph *graph) {
void
KernelRuntime
::
RunOpAssignInputMemory
(
const
std
::
vector
<
tensor
::
TensorPtr
>
&
input_tensors
,
const
session
::
KernelGraph
*
graph
)
{
MS_EXCEPTION_IF_NULL
(
graph
);
MS_EXCEPTION_IF_NULL
(
mem_manager_
);
for
(
size_t
input_index
=
0
;
input_index
<
graph
->
inputs
().
size
();
++
input_index
)
{
auto
item
=
graph
->
inputs
()[
input_index
];
MS_EXCEPTION_IF_NULL
(
item
);
...
...
@@ -180,7 +169,7 @@ void KernelRuntime::RunOpAssignInputMemory(const std::vector<tensor::TensorPtr>
auto
device_address
=
CreateDeviceAddress
(
nullptr
,
tensor_size
,
AnfAlgo
::
GetOutputFormat
(
item
,
index
),
output_type_id
);
MS_EXCEPTION_IF_NULL
(
device_address
);
MallocOpMemory
(
device_address
,
tensor_size
,
kStaticMem
);
mem_manager_
->
MallocOpMemory
(
device_address
,
tensor_size
);
AnfAlgo
::
SetOutputAddr
(
device_address
,
index
,
item
.
get
());
}
}
...
...
@@ -188,6 +177,7 @@ void KernelRuntime::RunOpAssignInputMemory(const std::vector<tensor::TensorPtr>
void
KernelRuntime
::
RunOpAssignOutputMemory
(
const
AnfNodePtr
&
kernel
)
{
MS_EXCEPTION_IF_NULL
(
kernel
);
MS_EXCEPTION_IF_NULL
(
mem_manager_
);
auto
kernel_mod
=
AnfAlgo
::
GetKernelMod
(
kernel
);
MS_EXCEPTION_IF_NULL
(
kernel_mod
);
auto
output_sizes
=
kernel_mod
->
GetOutputSizeList
();
...
...
@@ -208,13 +198,14 @@ void KernelRuntime::RunOpAssignOutputMemory(const AnfNodePtr &kernel) {
auto
output_type
=
AnfAlgo
::
GetOutputDeviceDataType
(
kernel
,
i
);
auto
device_address
=
CreateDeviceAddress
(
nullptr
,
output_sizes
[
i
],
output_format
,
output_type
);
MS_EXCEPTION_IF_NULL
(
device_address
);
MallocOpMemory
(
device_address
,
output_sizes
[
i
],
kDynamicMem
);
mem_manager_
->
MallocOpMemory
(
device_address
,
output_sizes
[
i
]
);
AnfAlgo
::
SetOutputAddr
(
device_address
,
i
,
kernel
.
get
());
}
}
void
KernelRuntime
::
RunOpAssignWorkSpaceMemory
(
const
AnfNodePtr
&
kernel
)
{
MS_EXCEPTION_IF_NULL
(
kernel
);
MS_EXCEPTION_IF_NULL
(
mem_manager_
);
if
(
kernel
->
isa
<
CNode
>
())
{
auto
kernel_mod
=
AnfAlgo
::
GetKernelMod
(
kernel
);
MS_EXCEPTION_IF_NULL
(
kernel_mod
);
...
...
@@ -222,7 +213,7 @@ void KernelRuntime::RunOpAssignWorkSpaceMemory(const AnfNodePtr &kernel) {
for
(
size_t
i
=
0
;
i
<
workspace_lists
.
size
();
++
i
)
{
auto
device_address
=
CreateDeviceAddress
(
nullptr
,
workspace_lists
[
i
],
""
,
kTypeUnknown
);
MS_EXCEPTION_IF_NULL
(
device_address
);
MallocOpMemory
(
device_address
,
workspace_lists
[
i
],
kDynamicMem
);
mem_manager_
->
MallocOpMemory
(
device_address
,
workspace_lists
[
i
]
);
AnfAlgo
::
SetWorkspaceAddr
(
device_address
,
i
,
kernel
.
get
());
}
}
...
...
@@ -230,6 +221,7 @@ void KernelRuntime::RunOpAssignWorkSpaceMemory(const AnfNodePtr &kernel) {
void
KernelRuntime
::
AssignStaticMemoryInput
(
const
session
::
KernelGraph
*
graph
)
{
MS_EXCEPTION_IF_NULL
(
graph
);
MS_EXCEPTION_IF_NULL
(
mem_manager_
);
for
(
auto
&
item
:
graph
->
inputs
())
{
MS_EXCEPTION_IF_NULL
(
item
);
if
(
!
item
->
isa
<
Parameter
>
())
{
...
...
@@ -247,7 +239,7 @@ void KernelRuntime::AssignStaticMemoryInput(const session::KernelGraph *graph) {
output_type_id
=
AnfAlgo
::
GetOutputInferDataType
(
item
,
index
);
}
auto
tensor_size
=
CountNodeDeviceMemorySize
(
item
,
index
);
auto
ptr
=
MallocStaticMem
(
tensor_size
,
fals
e
);
auto
ptr
=
mem_manager_
->
MallocMem
(
kStaticMem
,
tensor_siz
e
);
auto
address
=
CreateDeviceAddress
(
ptr
,
tensor_size
,
AnfAlgo
::
GetOutputFormat
(
item
,
index
),
output_type_id
);
AnfAlgo
::
SetOutputAddr
(
address
,
index
,
item
.
get
());
}
...
...
@@ -301,6 +293,7 @@ void KernelRuntime::UpdateRefNodeOutputMem(const session::KernelGraph *graph) {
void
KernelRuntime
::
AssignCommunicationNodeOutputMem
(
int
flag
,
const
AnfNodePtr
&
node
)
{
MS_EXCEPTION_IF_NULL
(
node
);
MS_EXCEPTION_IF_NULL
(
mem_manager_
);
auto
kernel_mod
=
AnfAlgo
::
GetKernelMod
(
node
);
MS_EXCEPTION_IF_NULL
(
kernel_mod
);
auto
output_sizes
=
kernel_mod
->
GetOutputSizeList
();
...
...
@@ -314,12 +307,12 @@ void KernelRuntime::AssignCommunicationNodeOutputMem(int flag, const AnfNodePtr
std
::
vector
<
size_t
>
align_size_list
;
for
(
uint64_t
mem_size
:
output_sizes
)
{
if
(
context_ptr
->
enable_hccl
())
{
mem_size
=
GetCommonAlignSize
(
mem_size
);
mem_size
=
mem_manager_
->
GetCommonAlignSize
(
mem_size
);
}
total_size
+=
mem_size
;
align_size_list
.
emplace_back
(
mem_size
);
}
uint8_t
*
output_ptr
=
CalDeviceMem
(
node
,
total_size
,
flag
,
0
);
uint8_t
*
output_ptr
=
mem_manager_
->
MallocOutputMem
(
node
,
0
,
flag
,
total_size
);
for
(
size_t
j
=
0
;
j
<
align_size_list
.
size
();
++
j
)
{
std
::
string
output_format
=
AnfAlgo
::
GetOutputFormat
(
node
,
j
);
auto
output_type
=
AnfAlgo
::
GetOutputDeviceDataType
(
node
,
j
);
...
...
@@ -333,6 +326,7 @@ void KernelRuntime::UpdateCommunicationOpInputMem(const AnfNodePtr &node) {
auto
context_ptr
=
MsContext
::
GetInstance
();
MS_EXCEPTION_IF_NULL
(
context_ptr
);
MS_EXCEPTION_IF_NULL
(
node
);
MS_EXCEPTION_IF_NULL
(
mem_manager_
);
size_t
total_size
=
0
;
std
::
vector
<
std
::
pair
<
mindspore
::
device
::
DeviceAddress
*
,
size_t
>>
addr_size
;
for
(
size_t
i
=
0
;
i
<
AnfAlgo
::
GetInputTensorNum
(
node
);
++
i
)
{
...
...
@@ -340,12 +334,12 @@ void KernelRuntime::UpdateCommunicationOpInputMem(const AnfNodePtr &node) {
MS_EXCEPTION_IF_NULL
(
address
);
auto
mem_size
=
address
->
size
();
if
(
context_ptr
->
enable_hccl
())
{
mem_size
=
GetCommonAlignSize
(
mem_size
);
mem_size
=
mem_manager_
->
GetCommonAlignSize
(
mem_size
);
}
total_size
+=
mem_size
;
addr_size
.
emplace_back
(
address
.
get
(),
mem_size
);
}
uint8_t
*
input_ptr
=
CalDeviceMem
(
node
,
total_size
,
kDynamicMem
,
0
);
uint8_t
*
input_ptr
=
mem_manager_
->
MallocOutputMem
(
node
,
0
,
kDynamicMem
,
total_size
);
for
(
const
auto
&
iter
:
addr_size
)
{
MS_EXCEPTION_IF_NULL
(
iter
.
first
);
iter
.
first
->
set_ptr
(
input_ptr
);
...
...
@@ -355,7 +349,8 @@ void KernelRuntime::UpdateCommunicationOpInputMem(const AnfNodePtr &node) {
void
KernelRuntime
::
AssignNodeOutputMem
(
int
flag
,
const
AnfNodePtr
&
node
,
int
index
)
{
MS_EXCEPTION_IF_NULL
(
node
);
if
(
IsCommunicationOp
(
node
))
{
MS_EXCEPTION_IF_NULL
(
mem_manager_
);
if
(
AnfAlgo
::
IsCommunicationOp
(
node
))
{
UpdateCommunicationOpInputMem
(
node
);
AssignCommunicationNodeOutputMem
(
flag
,
node
);
return
;
...
...
@@ -375,7 +370,7 @@ void KernelRuntime::AssignNodeOutputMem(int flag, const AnfNodePtr &node, int in
MS_LOG
(
INFO
)
<<
"Already malloc index:"
<<
i
;
continue
;
}
auto
ptr
=
CalDeviceMem
(
node
,
output_sizes
[
i
],
flag
,
i
);
auto
ptr
=
mem_manager_
->
MallocOutputMem
(
node
,
i
,
flag
,
output_sizes
[
i
]
);
if
(
ptr
==
nullptr
)
{
// reused ptr, no need alloc, continue;
continue
;
...
...
@@ -390,6 +385,7 @@ void KernelRuntime::AssignValueNodeTensor(const ValueNodePtr &value_node, const
size_t
output_idx
)
{
MS_EXCEPTION_IF_NULL
(
value_node
);
MS_EXCEPTION_IF_NULL
(
node_value
);
MS_EXCEPTION_IF_NULL
(
mem_manager_
);
auto
tensor
=
node_value
->
cast
<
TensorPtr
>
();
if
(
tensor
==
nullptr
)
{
MS_LOG
(
WARNING
)
<<
"Tensor is null"
;
...
...
@@ -397,7 +393,7 @@ void KernelRuntime::AssignValueNodeTensor(const ValueNodePtr &value_node, const
}
size_t
tensor_size
=
tensor
->
data
().
nbytes
();
auto
node_size
=
CountNodeDeviceMemorySize
(
value_node
,
output_idx
);
auto
ptr
=
MallocStaticMem
(
node_size
,
fals
e
);
auto
ptr
=
mem_manager_
->
MallocMem
(
kStaticMem
,
node_siz
e
);
TypeId
output_type_id
=
AnfAlgo
::
GetOutputDeviceDataType
(
value_node
,
output_idx
);
if
(
output_type_id
==
kTypeUnknown
)
{
output_type_id
=
AnfAlgo
::
GetOutputInferDataType
(
value_node
,
output_idx
);
...
...
@@ -414,6 +410,7 @@ void KernelRuntime::AssignValueNodeTensor(const ValueNodePtr &value_node, const
void
KernelRuntime
::
AssignStaticMemoryValueNode
(
session
::
KernelGraph
*
graph
)
{
MS_EXCEPTION_IF_NULL
(
graph
);
MS_EXCEPTION_IF_NULL
(
mem_manager_
);
for
(
auto
&
value_node
:
graph
->
graph_value_nodes
())
{
MS_EXCEPTION_IF_NULL
(
value_node
);
if
(
AnfAlgo
::
OutputAddrExist
(
value_node
,
0
))
{
...
...
@@ -440,7 +437,7 @@ void KernelRuntime::AssignStaticMemoryValueNode(session::KernelGraph *graph) {
}
else
if
(
node_value
->
isa
<
StringImm
>
())
{
auto
value
=
GetValue
<
std
::
string
>
(
node_value
);
size_t
tensor_size
=
value
.
size
();
auto
ptr
=
MallocStaticMem
(
tensor_size
,
fals
e
);
auto
ptr
=
mem_manager_
->
MallocMem
(
kStaticMem
,
tensor_siz
e
);
auto
address
=
CreateDeviceAddress
(
ptr
,
tensor_size
,
kOpFormat_DEFAULT
,
kNumberTypeUInt8
);
MS_EXCEPTION_IF_NULL
(
address
);
AnfAlgo
::
SetOutputAddr
(
address
,
0
,
value_node
.
get
());
...
...
@@ -452,103 +449,37 @@ void KernelRuntime::AssignStaticMemoryValueNode(session::KernelGraph *graph) {
}
}
void
KernelRuntime
::
AssignDynamicMemory
(
const
session
::
KernelGraph
*
graph
)
{
void
KernelRuntime
::
AssignDynamicMemory
(
session
::
KernelGraph
*
graph
)
{
MS_EXCEPTION_IF_NULL
(
graph
);
// reset dynamic mem offset
dynamic_mem_offset_
=
0
;
auto
&
kernels
=
graph
->
execution_order
();
for
(
auto
&
kernel
:
kernels
)
{
AssignNodeOutputMem
(
kDynamicMem
,
kernel
,
kGetAllOuts
);
AssignWorkSpaceMem
(
kernel
);
MS_EXCEPTION_IF_NULL
(
mem_manager_
);
auto
context_ptr
=
MsContext
::
GetInstance
();
MS_EXCEPTION_IF_NULL
(
context_ptr
);
bool
is_enable_mem_reuse
=
context_ptr
->
enable_mem_reuse
();
auto
mem_flag
=
kDynamicMem
;
if
(
is_enable_mem_reuse
)
{
mem_manager_
->
InitReuseDynamicMemory
(
graph
);
mem_flag
=
kReuseDynamicMem
;
}
}
void
KernelRuntime
::
ReuseAssignDynamicMemory
(
session
::
KernelGraph
*
graph
)
{
MS_EXCEPTION_IF_NULL
(
graph
);
dynamic_mem_offset_
=
0
;
MemReuseUtilPtr
mem_reuse_util_ptr
=
std
::
make_shared
<
memreuse
::
MemReuseUtil
>
();
MS_EXCEPTION_IF_NULL
(
mem_reuse_util_ptr
);
// set all infos
mem_reuse_util_ptr
->
SetAllInfo
(
graph
);
auto
bestfit_mem_reuse
=
std
::
make_shared
<
BestFitMemReuse
>
();
MS_EXCEPTION_IF_NULL
(
bestfit_mem_reuse
);
bestfit_mem_reuse
->
Reuse
(
mem_reuse_util_ptr
.
get
());
size_t
total_allocated_size
=
bestfit_mem_reuse
->
GetAllocatedSize
();
MS_LOG
(
INFO
)
<<
"TotalReuseDynamicSize ["
<<
total_allocated_size
<<
"]"
;
mem_reuse_util_ptr_
=
mem_reuse_util_ptr
;
auto
base_ptr
=
MallocDynamicMem
(
total_allocated_size
,
false
);
mem_reuse_util_ptr_
->
set_mem_base
(
base_ptr
);
auto
&
kernels
=
graph
->
execution_order
();
for
(
auto
&
kernel
:
kernels
)
{
AssignNodeOutputMem
(
kReuseDynamicMem
,
kernel
,
kGetAllOuts
);
Assign
ReuseWorkSpaceMem
(
kernel
);
AssignNodeOutputMem
(
mem_flag
,
kernel
,
kGetAllOuts
);
Assign
WorkSpaceMem
(
mem_flag
,
kernel
);
}
}
void
KernelRuntime
::
Assign
ReuseWorkSpaceMem
(
const
AnfNodePtr
&
node
)
{
void
KernelRuntime
::
Assign
WorkSpaceMem
(
int
flag
,
const
AnfNodePtr
&
node
)
{
MS_EXCEPTION_IF_NULL
(
node
);
MS_EXCEPTION_IF_NULL
(
mem_manager_
);
auto
kernel_mod
=
AnfAlgo
::
GetKernelMod
(
node
);
MS_EXCEPTION_IF_NULL
(
kernel_mod
);
size_t
index
=
0
;
for
(
auto
&
size
:
kernel_mod
->
GetWorkspaceSizeList
())
{
auto
wk_ptr
=
mem_reuse_util_ptr_
->
GetNodeWorkSpacePtr
(
node
,
index
);
AnfAlgo
::
SetWorkspaceAddr
(
CreateDeviceAddress
(
wk_
ptr
,
size
,
""
,
kTypeUnknown
),
index
,
node
.
get
());
auto
ptr
=
mem_manager_
->
MallocWorkSpaceMem
(
node
,
flag
,
index
,
size
);
AnfAlgo
::
SetWorkspaceAddr
(
CreateDeviceAddress
(
ptr
,
size
,
""
,
kTypeUnknown
),
index
,
node
.
get
());
index
++
;
}
}
void
KernelRuntime
::
AssignWorkSpaceMem
(
const
AnfNodePtr
&
node
)
{
MS_EXCEPTION_IF_NULL
(
node
);
if
(
node
->
isa
<
CNode
>
())
{
auto
kernel_mod
=
AnfAlgo
::
GetKernelMod
(
node
);
MS_EXCEPTION_IF_NULL
(
kernel_mod
);
size_t
index
=
0
;
for
(
auto
&
size
:
kernel_mod
->
GetWorkspaceSizeList
())
{
auto
ptr
=
MallocDynamicMem
(
size
,
false
);
AnfAlgo
::
SetWorkspaceAddr
(
CreateDeviceAddress
(
ptr
,
size
,
""
,
kTypeUnknown
),
index
,
node
.
get
());
index
++
;
}
}
}
bool
KernelRuntime
::
IsCommunicationOp
(
const
AnfNodePtr
&
node
)
{
MS_EXCEPTION_IF_NULL
(
node
);
auto
kernel_name
=
AnfAlgo
::
GetCNodeName
(
node
);
auto
kernel_type
=
AnfAlgo
::
GetKernelType
(
node
);
if
(
kernel_name
==
kAllReduceOpName
||
kernel_type
==
HCCL_KERNEL
)
{
return
true
;
}
return
false
;
}
uint8_t
*
KernelRuntime
::
CalDeviceMem
(
const
AnfNodePtr
&
node
,
size_t
size
,
int
flag
,
size_t
index
)
{
MS_EXCEPTION_IF_NULL
(
node
);
auto
context_ptr
=
MsContext
::
GetInstance
();
MS_EXCEPTION_IF_NULL
(
context_ptr
);
uint8_t
*
ptr
=
nullptr
;
if
(
IsCommunicationOp
(
node
))
{
bool
communication_mem
=
false
;
if
(
context_ptr
->
enable_hccl
())
{
communication_mem
=
true
;
}
if
(
flag
==
kStaticMem
)
{
ptr
=
MallocStaticMem
(
size
,
communication_mem
);
}
else
{
ptr
=
MallocDynamicMem
(
size
,
communication_mem
);
}
return
ptr
;
}
if
(
flag
==
kStaticMem
)
{
ptr
=
MallocStaticMem
(
size
,
false
);
}
else
if
(
flag
==
kDynamicMem
)
{
ptr
=
MallocDynamicMem
(
size
,
false
);
}
else
if
(
flag
==
kReuseDynamicMem
)
{
ptr
=
mem_reuse_util_ptr_
->
GetNodeOutputPtr
(
node
,
index
);
}
return
ptr
;
}
void
KernelRuntime
::
GenLaunchArgs
(
const
mindspore
::
kernel
::
KernelMod
&
kernel_mod
,
const
mindspore
::
AnfNodePtr
&
kernel
,
AddressPtrList
*
kernel_inputs
,
AddressPtrList
*
const
kernel_workspaces
,
AddressPtrList
*
kernel_outputs
)
{
...
...
@@ -659,65 +590,6 @@ bool KernelRuntime::LaunchKernelMod(const session::KernelGraph &graph) {
return
true
;
}
size_t
KernelRuntime
::
GetCommonAlignSize
(
size_t
input_size
)
const
{
return
(
input_size
+
mem_align_size_
+
31
)
/
mem_align_size_
*
mem_align_size_
;
}
size_t
KernelRuntime
::
GetCommunicationAlignSize
(
size_t
input_size
)
const
{
return
(
input_size
+
mem_align_size_
-
1
)
/
mem_align_size_
*
mem_align_size_
+
2
*
mem_align_size_
;
}
uint8_t
*
KernelRuntime
::
MallocStaticMem
(
size_t
size
,
bool
communication_mem
)
{
size_t
align_size
=
0
;
if
(
communication_mem
)
{
align_size
=
GetCommunicationAlignSize
(
size
);
}
else
{
align_size
=
GetCommonAlignSize
(
size
);
}
if
(
static_mem_offset_
<
align_size
)
{
MS_LOG
(
EXCEPTION
)
<<
"Out of memory!!! total["
<<
device_mem_size_
<<
"](dynamic["
<<
total_dynamic_size_
<<
"] static["
<<
total_static_size_
<<
"])"
<<
" malloc ["
<<
align_size
<<
"] failed!"
;
}
total_static_size_
+=
align_size
;
auto
offset
=
static_mem_offset_
-
align_size
;
if
(
dynamic_mem_offset_
>
offset
)
{
MS_LOG
(
EXCEPTION
)
<<
"Out of memory!!! total["
<<
device_mem_size_
<<
"](dynamic["
<<
total_dynamic_size_
<<
"] static["
<<
total_static_size_
<<
"])"
<<
" malloc ["
<<
align_size
<<
"] failed!"
;
}
static_mem_offset_
=
offset
;
if
(
communication_mem
)
{
return
device_mem_base_
+
offset
+
mem_align_size_
;
}
else
{
return
device_mem_base_
+
offset
;
}
}
uint8_t
*
KernelRuntime
::
MallocDynamicMem
(
size_t
size
,
bool
communication_mem
)
{
size_t
align_size
=
0
;
if
(
communication_mem
)
{
align_size
=
GetCommunicationAlignSize
(
size
);
}
else
{
align_size
=
GetCommonAlignSize
(
size
);
}
uint64_t
offset
=
dynamic_mem_offset_
;
auto
new_offset
=
dynamic_mem_offset_
+
align_size
;
if
(
new_offset
>
static_mem_offset_
)
{
MS_LOG
(
EXCEPTION
)
<<
"Out of memory!!! total["
<<
device_mem_size_
<<
"](dynamic["
<<
total_dynamic_size_
<<
"] static["
<<
total_static_size_
<<
"])"
<<
" malloc ["
<<
align_size
<<
"] failed!"
;
}
total_dynamic_size_
+=
align_size
;
dynamic_mem_offset_
=
new_offset
;
if
(
communication_mem
)
{
return
device_mem_base_
+
offset
+
mem_align_size_
;
}
else
{
return
device_mem_base_
+
offset
;
}
}
bool
KernelRuntime
::
LaunchKernel
(
const
session
::
KernelGraph
*
graph
)
{
MS_EXCEPTION_IF_NULL
(
graph
);
if
(
!
LaunchKernelMod
(
*
graph
))
{
...
...
@@ -731,29 +603,6 @@ bool KernelRuntime::LaunchKernel(const session::KernelGraph *graph) {
return
true
;
}
void
KernelRuntime
::
MallocOpMemory
(
const
DeviceAddressPtr
address
,
size_t
size
,
int
flag
)
{
if
(
flag
==
kStaticMem
)
{
address
->
ptr_
=
MallocStaticMem
(
size
,
false
);
}
else
if
(
flag
==
kDynamicMem
)
{
address
->
ptr_
=
MallocDynamicMem
(
size
,
false
);
}
else
{
MS_LOG
(
EXCEPTION
)
<<
"Unknown memory type!"
;
}
}
void
*
KernelRuntime
::
AllocTensorMemDynamic
(
size_t
size
)
{
if
(
size
==
0
)
{
MS_LOG
(
ERROR
)
<<
"AllocTensorMemDynamic size is 0."
;
}
return
nullptr
;
}
void
KernelRuntime
::
FreeTensorMemDynamic
(
void
*
device_ptr
)
{
if
(
device_ptr
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"FreeTensorMemDynamic device_ptr is null."
;
}
}
#ifdef ENABLE_DUMP_E2E
bool
KernelRuntime
::
SetDumpConf
()
{
dump_conf_ptr_
=
std
::
make_shared
<
Dump
>
();
...
...
mindspore/ccsrc/device/kernel_runtime.h
浏览文件 @
fb343bd6
...
...
@@ -20,8 +20,7 @@
#include <memory>
#include <string>
#include <map>
#include "pre_activate/mem_reuse/mem_reuse.h"
#include "pre_activate/mem_reuse/mem_reuse_allocator.h"
#include "device/device_address.h"
#include "ir/meta_tensor.h"
#include "predict/generator/utils/ir_model_util.h"
...
...
@@ -32,21 +31,16 @@
#include "session/anf_runtime_algorithm.h"
#include "kernel/kernel.h"
#include "utils/context/ms_context.h"
#include "device/memory_manager.h"
// using mindspore::session::KernelGraph;
using
mindspore
::
tensor
::
Tensor
;
using
TensorPtr
=
std
::
shared_ptr
<
Tensor
>
;
using
MemReuseUtilPtr
=
mindspore
::
memreuse
::
MemReuseUtilPtr
;
using
mindspore
::
kernel
::
AddressPtr
;
using
AddressPtrList
=
std
::
vector
<
mindspore
::
kernel
::
AddressPtr
>
;
namespace
mindspore
{
namespace
device
{
const
int
kStaticMem
=
0
;
const
int
kDynamicMem
=
1
;
const
int
kReuseDynamicMem
=
2
;
const
int
kGetAllOuts
=
-
1
;
class
KernelRuntime
{
public:
KernelRuntime
()
=
default
;
...
...
@@ -65,7 +59,6 @@ class KernelRuntime {
DumpConfPtr
GetDumpConf
();
#endif
virtual
bool
LoadTask
(
const
session
::
KernelGraph
*
graph
);
virtual
void
FreeHostMemory
();
// for GPU and D to impl
virtual
void
ReleaseDeviceRes
()
{}
void
set_device_id
(
uint32_t
device_id
)
{
device_id_
=
device_id
;
}
...
...
@@ -75,29 +68,17 @@ class KernelRuntime {
TypeId
type_id
)
=
0
;
virtual
bool
SyncStream
()
=
0
;
void
AssignStaticMemory
(
session
::
KernelGraph
*
graph
);
void
AssignDynamicMemory
(
const
session
::
KernelGraph
*
graph
);
void
AssignDynamicMemory
(
session
::
KernelGraph
*
graph
);
void
ReuseAssignDynamicMemory
(
session
::
KernelGraph
*
graph
);
void
AssignNodeOutputMem
(
int
flag
,
const
AnfNodePtr
&
node
,
int
index
);
void
AssignWorkSpaceMem
(
const
AnfNodePtr
&
node
);
void
AssignWorkSpaceMem
(
int
flag
,
const
AnfNodePtr
&
node
);
void
AssignReuseWorkSpaceMem
(
const
AnfNodePtr
&
node
);
void
AssignCommunicationNodeOutputMem
(
int
flag
,
const
AnfNodePtr
&
node
);
void
UpdateRefNodeOutputMem
(
const
session
::
KernelGraph
*
graph
);
void
UpdateCommunicationOpInputMem
(
const
AnfNodePtr
&
node
);
bool
IsCommunicationOp
(
const
AnfNodePtr
&
node
);
size_t
GetCommonAlignSize
(
size_t
input_size
)
const
;
size_t
GetCommunicationAlignSize
(
size_t
input_size
)
const
;
uint8_t
*
CalDeviceMem
(
const
AnfNodePtr
&
node
,
size_t
size
,
int
flag
,
size_t
index
);
virtual
uint8_t
*
MallocStaticMem
(
size_t
size
,
bool
communication_mem
);
uint8_t
*
MallocDynamicMem
(
size_t
size
,
bool
communication_mem
);
#ifdef ENABLE_DUMP_E2E
bool
SetDumpConf
();
#endif
// Alloc memory use the dynamic memory pool.
virtual
void
*
AllocTensorMemDynamic
(
size_t
size
);
// Free memory use the dynamic memory pool.
virtual
void
FreeTensorMemDynamic
(
void
*
device_ptr
);
virtual
void
MallocOpMemory
(
const
DeviceAddressPtr
address
,
size_t
size
,
int
flag
);
private:
void
AssignStaticMemoryOutput
(
const
session
::
KernelGraph
*
graph
);
...
...
@@ -114,20 +95,11 @@ class KernelRuntime {
protected:
uint32_t
device_id_
{
0
};
uint8_t
*
device_mem_base_
{
nullptr
};
uint8_t
*
device_mem_pool_base_
{
nullptr
};
uint64_t
device_mem_size_
{
0
};
uint64_t
device_mem_pool_size_
{
0
};
uint64_t
dynamic_mem_offset_
{
0
};
uint64_t
static_mem_offset_
{
0
};
const
uint64_t
mem_align_size_
=
512
;
#ifdef ENABLE_DUMP_E2E
DumpConfPtr
dump_conf_ptr_
;
#endif
void
*
stream_
=
nullptr
;
size_t
total_static_size_
=
0
;
size_t
total_dynamic_size_
=
0
;
MemReuseUtilPtr
mem_reuse_util_ptr_
{
nullptr
};
std
::
shared_ptr
<
MemoryManager
>
mem_manager_
{
nullptr
};
};
using
KernelRuntimePtr
=
std
::
shared_ptr
<
KernelRuntime
>
;
}
// namespace device
...
...
mindspore/ccsrc/device/memory_manager.cc
0 → 100644
浏览文件 @
fb343bd6
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "device/memory_manager.h"
#include "session/anf_runtime_algorithm.h"
#include "utils/context/ms_context.h"
using
mindspore
::
memreuse
::
BestFitMemReuse
;
using
mindspore
::
memreuse
::
MemReuseUtilPtr
;
namespace
mindspore
{
namespace
device
{
// Clears the manager's references when it is destroyed.
// This body only resets the members; it does not itself release device memory
// (FreeDeviceMemory() is a separate virtual entry point).
MemoryManager::~MemoryManager() {
  mem_reuse_util_ptr_ = nullptr;
  device_mem_pool_base_ = nullptr;
  device_mem_base_ = nullptr;
}
// Size reserved for a regular (non-communication) allocation of `input_size` bytes.
// The request is padded by kMemAlignSize + 31 bytes and then truncated to a whole
// number of kMemAlignSize units, so the result is always a multiple of kMemAlignSize.
// NOTE(review): the extra 31/32 bytes of slack look deliberate (reserved tail room) — confirm.
size_t MemoryManager::GetCommonAlignSize(size_t input_size) const {
  const auto padded_size = input_size + kMemAlignSize + 31;
  const auto unit_count = padded_size / kMemAlignSize;
  return unit_count * kMemAlignSize;
}
// Size reserved for a communication allocation of `input_size` bytes:
// the request rounded up to a multiple of kMemAlignSize, plus two extra kMemAlignSize units.
// MallocStaticMem/MallocDynamicMem hand out a pointer that skips the first of those units.
size_t MemoryManager::GetCommunicationAlignSize(size_t input_size) const {
  const auto rounded_up = (input_size + kMemAlignSize - 1) / kMemAlignSize * kMemAlignSize;
  return rounded_up + 2 * kMemAlignSize;
}
// Prepares memory reuse for `graph`:
//   1. builds a MemReuseUtil from the graph,
//   2. runs the best-fit reuse planner over it,
//   3. reserves one dynamic block of the planned total size and records it as the reuse base.
// After this call mem_reuse_util_ptr_ is non-null and is used to answer kReuseDynamicMem requests.
void MemoryManager::InitReuseDynamicMemory(session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  MemReuseUtilPtr reuse_util = std::make_shared<memreuse::MemReuseUtil>();
  MS_EXCEPTION_IF_NULL(reuse_util);
  // set all infos
  reuse_util->SetAllInfo(graph);

  auto best_fit_planner = std::make_shared<BestFitMemReuse>();
  MS_EXCEPTION_IF_NULL(best_fit_planner);
  best_fit_planner->Reuse(reuse_util.get());
  size_t planned_total = best_fit_planner->GetAllocatedSize();
  MS_LOG(INFO) << "TotalReuseDynamicSize [" << planned_total << "]";

  // Publish the helper before allocating, exactly as the allocation below expects.
  mem_reuse_util_ptr_ = reuse_util;
  auto reuse_base = MallocDynamicMem(planned_total, false);
  mem_reuse_util_ptr_->set_mem_base(reuse_base);
}
// Returns device memory for output `index` of `node`.
//
// node  : kernel node that owns the output; must not be null.
// index : output index, only consulted for kReuseDynamicMem.
// flag  : kStaticMem, kDynamicMem or kReuseDynamicMem.
// size  : requested size in bytes (aligned internally by the Malloc* helpers).
//
// Communication ops are served from the static region when flag == kStaticMem and from the
// dynamic region for every other flag; they use communication alignment only when HCCL is
// enabled in the context.
// For other nodes an unrecognised flag yields nullptr (callers treat nullptr as
// "nothing to assign"), so no exception is raised for that case.
// Raises (via MS_EXCEPTION_IF_NULL) when kReuseDynamicMem is requested before
// InitReuseDynamicMemory() has installed the reuse helper.
uint8_t *MemoryManager::MallocOutputMem(const AnfNodePtr &node, size_t index, int flag, size_t size) {
  MS_EXCEPTION_IF_NULL(node);
  auto context_ptr = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context_ptr);

  if (AnfAlgo::IsCommunicationOp(node)) {
    const bool communication_mem = context_ptr->enable_hccl();
    if (flag == kStaticMem) {
      return MallocStaticMem(size, communication_mem);
    }
    return MallocDynamicMem(size, communication_mem);
  }

  if (flag == kStaticMem) {
    return MallocStaticMem(size, false);
  }
  if (flag == kDynamicMem) {
    return MallocDynamicMem(size, false);
  }
  if (flag == kReuseDynamicMem) {
    // The helper is nullptr until InitReuseDynamicMemory() runs; fail loudly instead of dereferencing it.
    MS_EXCEPTION_IF_NULL(mem_reuse_util_ptr_);
    return mem_reuse_util_ptr_->GetNodeOutputPtr(node, index);
  }
  return nullptr;
}
uint8_t
*
MemoryManager
::
MallocWorkSpaceMem
(
const
AnfNodePtr
&
node
,
size_t
index
,
int
flag
,
size_t
size
)
{
if
(
flag
==
kReuseDynamicMem
)
{
return
mem_reuse_util_ptr_
->
GetNodeWorkSpacePtr
(
node
,
index
);
}
return
MallocDynamicMem
(
size
,
false
);
}
// Allocates `size` bytes with common alignment from the region selected by `flag`.
// kStaticMem -> static region, kDynamicMem -> dynamic region; any other flag returns nullptr.
uint8_t *MemoryManager::MallocMem(int flag, size_t size) {
  switch (flag) {
    case kStaticMem:
      return MallocStaticMem(size, false);
    case kDynamicMem:
      return MallocDynamicMem(size, false);
    default:
      return nullptr;
  }
}
// Carves an aligned block out of the static region and returns its device address.
//
// size              : requested size in bytes.
// communication_mem : when true the block uses communication alignment and the returned
//                     pointer skips the first kMemAlignSize bytes of the block.
//
// The static region grows downward: static_mem_offset_ is lowered by the aligned size on
// every successful call. The request fails (MS_LOG(EXCEPTION)) when the aligned size exceeds
// the remaining static offset or when the new offset would cross dynamic_mem_offset_.
// Bookkeeping (total_static_size_, static_mem_offset_) is only updated once the request is
// known to fit, so a failed request leaves the manager's state and the reported totals untouched.
uint8_t *MemoryManager::MallocStaticMem(size_t size, bool communication_mem) {
  const size_t align_size = communication_mem ? GetCommunicationAlignSize(size) : GetCommonAlignSize(size);

  // The first test guards the subtraction in the second one against unsigned underflow.
  if (static_mem_offset_ < align_size || dynamic_mem_offset_ > static_mem_offset_ - align_size) {
    MS_LOG(EXCEPTION) << "Out of memory!!! total[" << device_mem_size_ << "](dynamic[" << total_dynamic_size_
                      << "] static[" << total_static_size_ << "])"
                      << " malloc [" << align_size << "] failed!";
  }

  const auto offset = static_mem_offset_ - align_size;
  total_static_size_ += align_size;
  static_mem_offset_ = offset;

  if (communication_mem) {
    return device_mem_base_ + offset + kMemAlignSize;
  }
  return device_mem_base_ + offset;
}
// Carves an aligned block out of the dynamic region and returns its device address.
//
// The dynamic region grows upward from dynamic_mem_offset_; the request fails
// (MS_LOG(EXCEPTION)) when the block would extend past static_mem_offset_.
// For communication memory the returned pointer skips the first kMemAlignSize bytes of the block.
uint8_t *MemoryManager::MallocDynamicMem(size_t size, bool communication_mem) {
  size_t align_size = communication_mem ? GetCommunicationAlignSize(size) : GetCommonAlignSize(size);

  const uint64_t block_begin = dynamic_mem_offset_;
  const auto block_end = dynamic_mem_offset_ + align_size;
  if (block_end > static_mem_offset_) {
    MS_LOG(EXCEPTION) << "Out of memory!!! total[" << device_mem_size_ << "](dynamic[" << total_dynamic_size_
                      << "] static[" << total_static_size_ << "])"
                      << " malloc [" << align_size << "] failed!";
  }

  total_dynamic_size_ += align_size;
  dynamic_mem_offset_ = block_end;

  if (!communication_mem) {
    return device_mem_base_ + block_begin;
  }
  return device_mem_base_ + block_begin + kMemAlignSize;
}
// Allocates `size` bytes through AllocTensorMemDynamic() and binds the result to `address`.
//
// address : device address object to fill in; must not be null.
// size    : requested size in bytes.
//
// Raises (via MS_EXCEPTION_IF_NULL) when `address` is null or when the allocator returns null.
// `address` is validated before allocating so an invalid call cannot strand a fresh allocation.
// On success the address is marked as dynamically allocated (mem_dynamic_alloc_ = true).
void MemoryManager::MallocOpMemory(const DeviceAddressPtr address, size_t size) {
  MS_EXCEPTION_IF_NULL(address);
  auto device_ptr = AllocTensorMemDynamic(size);
  MS_EXCEPTION_IF_NULL(device_ptr);
  address->ptr_ = device_ptr;
  address->mem_dynamic_alloc_ = true;
}
void
*
MemoryManager
::
AllocTensorMemDynamic
(
size_t
size
)
{
if
(
size
==
0
)
{
MS_LOG
(
ERROR
)
<<
"AllocTensorMemDynamic size is 0."
;
}
return
nullptr
;
}
void
MemoryManager
::
FreeTensorMemDynamic
(
void
*
device_ptr
)
{
if
(
device_ptr
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"FreeTensorMemDynamic device_ptr is null."
;
}
}
}
// namespace device
}
// namespace mindspore
mindspore/ccsrc/device/memory_manager.h
0 → 100644
浏览文件 @
fb343bd6
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_MINDSPORE_CCSRC_DEVICE_MEMORY_MANAGER_H_
#define MINDSPORE_MINDSPORE_CCSRC_DEVICE_MEMORY_MANAGER_H_
#include <memory>
#include "pre_activate/mem_reuse/mem_reuse.h"
#include "pre_activate/mem_reuse/mem_reuse_allocator.h"
namespace
mindspore
{
namespace
device
{
// Memory-region selectors passed as the `flag` argument of the MemoryManager Malloc* entry points.
constexpr int kStaticMem = 0;        // allocate from the static region
constexpr int kDynamicMem = 1;       // allocate from the dynamic region
constexpr int kReuseDynamicMem = 2;  // take the address from the memory-reuse plan
// Output-index value used by callers to request all outputs of a node.
constexpr int kGetAllOuts = -1;
// Alignment unit, in bytes, used by the size-alignment helpers.
constexpr uint64_t kMemAlignSize = 512;
using
MemReuseUtilPtr
=
mindspore
::
memreuse
::
MemReuseUtilPtr
;
// Base class for device memory management.
//
// The manager tracks one device buffer (device_mem_base_) that is split into a dynamic region
// growing upward from dynamic_mem_offset_ and a static region growing downward from
// static_mem_offset_. Device-specific subclasses provide the actual buffer by implementing
// MallocDeviceMemory()/FreeDeviceMemory() and may override the virtual allocation hooks.
class MemoryManager {
 public:
  MemoryManager() = default;
  virtual ~MemoryManager();

  // Acquire / release the underlying device memory; implemented by subclasses.
  virtual void MallocDeviceMemory() = 0;
  virtual void FreeDeviceMemory() = 0;

  // Forgets every dynamic allocation: both the dynamic offset and its running total go back to zero.
  void ResetDynamicMemory() {
    dynamic_mem_offset_ = 0;
    total_dynamic_size_ = 0;
  }

  // Builds the reuse plan for `graph` and reserves its backing dynamic block.
  void InitReuseDynamicMemory(session::KernelGraph *graph);
  // Memory for output `index` of `node`; `flag` is one of kStaticMem/kDynamicMem/kReuseDynamicMem.
  uint8_t *MallocOutputMem(const AnfNodePtr &node, size_t index, int flag, size_t size);
  // Memory for workspace `index` of `node`; `flag` selects reuse or plain dynamic allocation.
  uint8_t *MallocWorkSpaceMem(const AnfNodePtr &node, size_t index, int flag, size_t size);
  // Memory from the region selected by `flag` (kStaticMem or kDynamicMem).
  virtual uint8_t *MallocMem(int flag, size_t size);

  // Alloc memory use the dynamic memory pool.
  virtual void *AllocTensorMemDynamic(size_t size);
  // Free memory use the dynamic memory pool.
  virtual void FreeTensorMemDynamic(void *device_ptr);
  // Allocates through AllocTensorMemDynamic() and binds the result to `address`.
  virtual void MallocOpMemory(const DeviceAddressPtr address, size_t size);

  // Aligned sizes reserved for regular and communication allocations respectively.
  size_t GetCommonAlignSize(size_t input_size) const;
  size_t GetCommunicationAlignSize(size_t input_size) const;

 protected:
  // Low-level region allocators; `communication_mem` selects communication alignment.
  virtual uint8_t *MallocStaticMem(size_t size, bool communication_mem);
  virtual uint8_t *MallocDynamicMem(size_t size, bool communication_mem);

  uint8_t *device_mem_base_{nullptr};       // base address used by the static/dynamic regions
  uint8_t *device_mem_pool_base_{nullptr};  // base address of the pool buffer (set by subclasses)
  uint64_t device_mem_size_{0};             // size reported as "total" in out-of-memory messages
  uint64_t device_mem_pool_size_{0};        // size of the pool buffer (set by subclasses)
  uint64_t dynamic_mem_offset_{0};          // next free offset of the dynamic region
  uint64_t static_mem_offset_{0};           // lowest offset already taken by the static region
  size_t total_static_size_{0};             // running total of aligned static allocations
  size_t total_dynamic_size_{0};            // running total of aligned dynamic allocations
  MemReuseUtilPtr mem_reuse_util_ptr_{nullptr};  // reuse plan; null until InitReuseDynamicMemory()
};
}
// namespace device
}
// namespace mindspore
#endif // MINDSPORE_MINDSPORE_CCSRC_DEVICE_MEMORY_MANAGER_H_
mindspore/ccsrc/session/anf_runtime_algorithm.cc
浏览文件 @
fb343bd6
...
...
@@ -857,5 +857,15 @@ void AnfRuntimeAlgorithm::SetNodeInput(const CNodePtr &node, const AnfNodePtr &i
MS_EXCEPTION_IF_NULL
(
input_node
);
node
->
set_input
(
index
+
1
,
input_node
);
}
// Tells whether `node` is a communication kernel:
// true when its op name is kAllReduceOpName or its kernel type is HCCL_KERNEL.
// Raises (via MS_EXCEPTION_IF_NULL) when `node` is null.
bool AnfRuntimeAlgorithm::IsCommunicationOp(const AnfNodePtr &node) {
  MS_EXCEPTION_IF_NULL(node);
  const auto op_name = AnfAlgo::GetCNodeName(node);
  const auto op_kernel_type = AnfAlgo::GetKernelType(node);
  return op_name == kAllReduceOpName || op_kernel_type == HCCL_KERNEL;
}
}
// namespace session
}
// namespace mindspore
mindspore/ccsrc/session/anf_runtime_algorithm.h
浏览文件 @
fb343bd6
...
...
@@ -166,6 +166,7 @@ class AnfRuntimeAlgorithm {
static
bool
IsFeatureMapInput
(
const
AnfNodePtr
&
node
,
size_t
input_index
);
// get real input index for some tbe ops which input order is different between me and tbe impl
static
size_t
GetRealInputIndex
(
const
AnfNodePtr
&
anf_node
,
const
size_t
cur_index
);
static
bool
IsCommunicationOp
(
const
AnfNodePtr
&
node
);
};
}
// namespace session
using
AnfAlgo
=
session
::
AnfRuntimeAlgorithm
;
...
...
mindspore/ccsrc/session/gpu_session.cc
浏览文件 @
fb343bd6
...
...
@@ -102,10 +102,6 @@ GraphId GPUSession::CompileGraph(const AnfNodePtrList &lst, const AnfNodePtrList
graph
->
set_execution_order
(
execution_order
);
// Alloc memory, including static memory and dynamic memory
AllocateMemory
(
graph
.
get
());
// Reset memory resource
auto
runtime_instance
=
device
::
KernelRuntimeManager
::
Instance
().
GetSingleKernelRuntime
(
kGPUDevice
,
device_id_
);
MS_EXCEPTION_IF_NULL
(
runtime_instance
);
runtime_instance
->
FreeHostMemory
();
return
graph_id
;
}
...
...
tests/ut/cpp/CMakeLists.txt
浏览文件 @
fb343bd6
...
...
@@ -85,6 +85,7 @@ file(GLOB_RECURSE MINDSPORE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
"../../../mindspore/ccsrc/kernel/oplib/*.cc"
"../../../mindspore/ccsrc/kernel/tbe/*.cc"
"../../../mindspore/ccsrc/device/kernel_runtime.cc"
"../../../mindspore/ccsrc/device/memory_manager.cc"
"../../../mindspore/ccsrc/device/kernel_runtime_manager.cc"
"../../../mindspore/ccsrc/device/kernel_info.cc"
"../../../mindspore/ccsrc/device/ascend/profiling/*.cc"
...
...
@@ -92,6 +93,7 @@ file(GLOB_RECURSE MINDSPORE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
"../../../mindspore/ccsrc/device/convert_tensor_utils.cc"
"../../../mindspore/ccsrc/device/ascend/kernel_build_ascend.cc"
"../../../mindspore/ccsrc/device/ascend/ascend_kernel_runtime.cc"
"../../../mindspore/ccsrc/device/ascend/ascend_memory_manager.cc"
"../../../mindspore/ccsrc/device/ascend/ascend_device_address.cc"
"../../../mindspore/ccsrc/device/ascend/ascend_memory_allocator.cc"
"../../../mindspore/ccsrc/predict/generator/utils/ir_model_util.cc"
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录