diff --git a/mindspore/ccsrc/CMakeLists.txt b/mindspore/ccsrc/CMakeLists.txt index 9f559a51eb1c93947d4221725377f682181f059a..1d104148c323e747b71cc824a63f04137bba49ce 100644 --- a/mindspore/ccsrc/CMakeLists.txt +++ b/mindspore/ccsrc/CMakeLists.txt @@ -132,6 +132,7 @@ file(GLOB_RECURSE MINDSPORE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "kernel/kash/*.cc" "device/kernel_info.cc" "device/kernel_runtime.cc" + "device/memory_manager.cc" "device/kernel_runtime_manager.cc" "device/convert_tensor_utils.cc" "pre_activate/common/*.cc" diff --git a/mindspore/ccsrc/device/ascend/ascend_kernel_runtime.cc b/mindspore/ccsrc/device/ascend/ascend_kernel_runtime.cc index dc7eb5449b47758c985d11d79bfc491c0a7ad30a..0c2a97a5a6ff924a81419b7fd4ded6e041394116 100644 --- a/mindspore/ccsrc/device/ascend/ascend_kernel_runtime.cc +++ b/mindspore/ccsrc/device/ascend/ascend_kernel_runtime.cc @@ -37,6 +37,7 @@ #include "kernel/tbe/tbe_utils.h" #include "kernel/tbe/tbe_python_funcs.h" #include "pre_activate/mem_reuse/mem_reuse_checker.h" +#include "device/ascend/ascend_memory_manager.h" using mindspore::device::ascend::ProfilingManager; using mindspore::device::ascend::ProfilingUtils; @@ -47,8 +48,6 @@ using std::vector; namespace mindspore { namespace device { namespace ascend { -static const uint64_t ASCEND_MEM_SIZE = 20; -static const uint64_t ASCEND_MEM_SIZE_BYTE = (ASCEND_MEM_SIZE << 30); static const size_t PRAMATER_OUTPUT_INDEX = 0; AscendKernelRuntime::~AscendKernelRuntime() { graph_model_map_.clear(); } @@ -86,7 +85,8 @@ void AscendKernelRuntime::ReleaseDeviceRes() { MS_EXCEPTION(DeviceProcessError) << "rtSetDevice, ret[" << static_cast(ret) << "]"; } - FreeDeviceMemory(); + MS_EXCEPTION_IF_NULL(mem_manager_); + mem_manager_->FreeDeviceMemory(); (void)DestroyHccl(); (void)ResetDevice(); (void)ProfilingManager::GetInstance().StopProfiling(); @@ -109,11 +109,9 @@ bool AscendKernelRuntime::Init() { if (!ret) { return ret; } - - ret = MallocDeviceMemory(); - if (!ret) { - return ret; - } + mem_manager_ = std::make_shared(); + MS_EXCEPTION_IF_NULL(mem_manager_); + mem_manager_->MallocDeviceMemory(); ret = ProfilingManager::GetInstance().StartupProfiling(device_id_); if (!ret) { @@ -239,13 +237,6 @@ DeviceAddressPtr AscendKernelRuntime::CreateDeviceAddress(void *device_ptr, size return std::make_shared(device_ptr, device_size, format, type_id); } -void AscendKernelRuntime::MallocOpMemory(const DeviceAddressPtr address, size_t size, int) { - auto device_ptr = AscendMemoryAllocator::GetInstance().AllocTensorMem(size); - MS_EXCEPTION_IF_NULL(device_ptr); - address->ptr_ = device_ptr; - address->mem_dynamic_alloc_ = true; -} - bool AscendKernelRuntime::GenTask(const session::KernelGraph *graph) { auto context_ptr = MsContext::GetInstance(); MS_EXCEPTION_IF_NULL(context_ptr); @@ -474,42 +465,6 @@ bool AscendKernelRuntime::DestroyHccl() { context_ptr->set_enable_hccl(false); return true; } - -bool AscendKernelRuntime::MallocDeviceMemory() { - device_mem_size_ = ASCEND_MEM_SIZE_BYTE; - static_mem_offset_ = FloatToSize(device_mem_size_ * GRAPH_INIT_ASCEND_MEM_RATIO); - auto ret = rtMalloc(reinterpret_cast(&device_mem_base_), static_mem_offset_, RT_MEMORY_HBM); - if (ret != RT_ERROR_NONE) { - MS_EXCEPTION(DeviceProcessError) << "rtMalloc mem size[" << static_mem_offset_ << "] fail, ret[" << ret << "]"; - } - device_mem_pool_size_ = FloatToSize(device_mem_size_ * (1 - GRAPH_INIT_ASCEND_MEM_RATIO)); - ret = rtMalloc(reinterpret_cast(&device_mem_pool_base_), device_mem_pool_size_, RT_MEMORY_HBM); - if (ret != RT_ERROR_NONE) { - MS_EXCEPTION(DeviceProcessError) << "rtMalloc mem size[" << device_mem_pool_size_ << "] fail, ret[" << ret << "]"; - } - AscendMemoryAllocator::GetInstance().set_device_mem_pool_base(device_mem_pool_base_); - AscendMemoryAllocator::GetInstance().set_device_mem_pool_size(device_mem_pool_size_); - return true; -} - -void AscendKernelRuntime::FreeDeviceMemory() { - if (device_mem_base_ != nullptr) { - auto ret = rtFree(device_mem_base_); - if (ret != RT_ERROR_NONE) { - MS_LOG(ERROR) << "rtFree mem size[" << device_mem_size_ << "] fail, ret[" << ret << "]"; - } - device_mem_base_ = nullptr; - } - if (device_mem_pool_base_ != nullptr) { - auto ret = rtFree(device_mem_pool_base_); - if (ret != RT_ERROR_NONE) { - MS_LOG(ERROR) << "rtFree mem size[" << device_mem_pool_size_ << "] fail, ret[" << ret << "]"; - } - device_mem_pool_base_ = nullptr; - } -} - -void AscendKernelRuntime::FreeHostMemory() { dynamic_mem_offset_ = 0; } } // namespace ascend } // namespace device } // namespace mindspore diff --git a/mindspore/ccsrc/device/ascend/ascend_kernel_runtime.h b/mindspore/ccsrc/device/ascend/ascend_kernel_runtime.h index dbd1460d24fd01c252e969a28eeda53c41a6c6a1..0eedad3d2b5e27364cc5f2a3ab6f9e8d5de7471d 100644 --- a/mindspore/ccsrc/device/ascend/ascend_kernel_runtime.h +++ b/mindspore/ccsrc/device/ascend/ascend_kernel_runtime.h @@ -39,13 +39,11 @@ class AscendKernelRuntime : public KernelRuntime { bool GenTask(const session::KernelGraph *graph) override; bool RunTask(const session::KernelGraph *graph) override; bool LoadTask(const session::KernelGraph *graph) override; - void FreeHostMemory() override; protected: DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format, TypeId type_id) override; bool SyncStream() override; - void MallocOpMemory(const DeviceAddressPtr address, size_t size, int flag) override; private: bool InitDevice(); @@ -53,8 +51,7 @@ class AscendKernelRuntime : public KernelRuntime { bool HcclInit(); bool NeedDestroyHccl(); bool DestroyHccl(); - bool MallocDeviceMemory(); - void FreeDeviceMemory(); + void ClearGraphModelMap(); void ReleaseDeviceRes() override; uint32_t GetGraphModelId(const session::KernelGraph *kernel_graph); diff --git a/mindspore/ccsrc/device/ascend/ascend_memory_manager.cc b/mindspore/ccsrc/device/ascend/ascend_memory_manager.cc new file mode 100644 index 0000000000000000000000000000000000000000..f033d81d826ad16ccab83b249697b327b9c7ae02 --- /dev/null +++ b/mindspore/ccsrc/device/ascend/ascend_memory_manager.cc @@ -0,0 +1,65 @@ +/** + * Copyright 2019 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "device/ascend/ascend_memory_manager.h" +#include "device/ascend/ascend_memory_allocator.h" +#include "utils/context/ms_context.h" +#include "runtime/mem.h" +namespace mindspore { +namespace device { +namespace ascend { +static const uint64_t ASCEND_MEM_SIZE = 20; +static const uint64_t ASCEND_MEM_SIZE_BYTE = (ASCEND_MEM_SIZE << 30); + +void AscendMemoryManager::MallocDeviceMemory() { + device_mem_size_ = ASCEND_MEM_SIZE_BYTE; + static_mem_offset_ = FloatToSize(device_mem_size_ * GRAPH_INIT_ASCEND_MEM_RATIO); + auto ret = rtMalloc(reinterpret_cast(&device_mem_base_), static_mem_offset_, RT_MEMORY_HBM); + if (ret != RT_ERROR_NONE) { + MS_EXCEPTION(DeviceProcessError) << "rtMalloc mem size[" << static_mem_offset_ << "] fail, ret[" << ret << "]"; + } + device_mem_pool_size_ = FloatToSize(device_mem_size_ * (1 - GRAPH_INIT_ASCEND_MEM_RATIO)); + ret = rtMalloc(reinterpret_cast(&device_mem_pool_base_), device_mem_pool_size_, RT_MEMORY_HBM); + if (ret != RT_ERROR_NONE) { + MS_EXCEPTION(DeviceProcessError) << "rtMalloc mem size[" << device_mem_pool_size_ << "] fail, ret[" << ret << "]"; + } + AscendMemoryAllocator::GetInstance().set_device_mem_pool_base(device_mem_pool_base_); + AscendMemoryAllocator::GetInstance().set_device_mem_pool_size(device_mem_pool_size_); +} + +void AscendMemoryManager::FreeDeviceMemory() { + if (device_mem_base_ != nullptr) { + auto ret = rtFree(device_mem_base_); + if (ret != RT_ERROR_NONE) { + MS_LOG(ERROR) << "rtFree mem size[" << device_mem_size_ << "] fail, ret[" << ret << "]"; + } + device_mem_base_ = nullptr; + } + if (device_mem_pool_base_ != nullptr) { + auto ret = rtFree(device_mem_pool_base_); + if (ret != RT_ERROR_NONE) { + MS_LOG(ERROR) << "rtFree mem size[" << device_mem_pool_size_ << "] fail, ret[" << ret << "]"; + } + device_mem_pool_base_ = nullptr; + } +} + +void *AscendMemoryManager::AllocTensorMemDynamic(size_t size) { + return AscendMemoryAllocator::GetInstance().AllocTensorMem(size); +} +} // namespace ascend +} // namespace device +} // namespace mindspore diff --git a/mindspore/ccsrc/device/ascend/ascend_memory_manager.h b/mindspore/ccsrc/device/ascend/ascend_memory_manager.h new file mode 100644 index 0000000000000000000000000000000000000000..8639fb5c7278bb8101313af88187c864c66915c4 --- /dev/null +++ b/mindspore/ccsrc/device/ascend/ascend_memory_manager.h @@ -0,0 +1,35 @@ +/** + * Copyright 2019 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_MINDSPORE_CCSRC_DEVICE_ASCEND_ASCEND_MEMORY_MANAGER_H_ +#define MINDSPORE_MINDSPORE_CCSRC_DEVICE_ASCEND_ASCEND_MEMORY_MANAGER_H_ +#include "device/memory_manager.h" +namespace mindspore { +namespace device { +namespace ascend { +class AscendMemoryManager : public MemoryManager { + public: + AscendMemoryManager() = default; + virtual ~AscendMemoryManager() = default; + + void MallocDeviceMemory() override; + void FreeDeviceMemory() override; + void *AllocTensorMemDynamic(size_t size) override; +}; +} // namespace ascend +} // namespace device +} // namespace mindspore +#endif // MINDSPORE_MINDSPORE_CCSRC_DEVICE_ASCEND_ASCEND_MEMORY_MANAGER_H_ diff --git a/mindspore/ccsrc/device/device_address.h b/mindspore/ccsrc/device/device_address.h index 1610d4337245cd5ce11204924bfa4cf5b81f98ef..cb022427e34fe5e1cc6a90e16e20ef5a0778933b 100644 --- a/mindspore/ccsrc/device/device_address.h +++ b/mindspore/ccsrc/device/device_address.h @@ -33,12 +33,14 @@ class CPUKernelRuntime; } // namespace cpu namespace ascend { class AscendKernelRuntime; +class AscendMemoryManager; namespace tasksink { class TaskGenerator; } // namespace tasksink } // namespace ascend namespace gpu { class GPUKernelRuntime; +class GPUMemoryManager; } // namespace gpu } // namespace device } // namespace mindspore @@ -70,12 +72,15 @@ class DeviceAddress { TypeId type_id_{kNumberTypeFloat16}; bool mem_dynamic_alloc_{false}; friend class KernelRuntime; + friend class MemoryManager; friend class mindspore::device::ascend::tasksink::TaskGenerator; friend class mindspore::device::cpu::CPUSimpleMemPlan; friend class mindspore::device::cpu::CPUResourceManager; friend class mindspore::device::cpu::CPUKernelRuntime; friend class mindspore::device::gpu::GPUKernelRuntime; + friend class mindspore::device::gpu::GPUMemoryManager; friend class mindspore::device::ascend::AscendKernelRuntime; + friend class mindspore::device::ascend::AscendMemoryManager; }; using DeviceAddressPtr = std::shared_ptr; diff --git a/mindspore/ccsrc/device/gpu/gpu_kernel_runtime.cc b/mindspore/ccsrc/device/gpu/gpu_kernel_runtime.cc index 9eeb1062f747cb37b99db9b91391b46e1e443d9c..597e188e9dcac8d5175804b46fd7b4b41bd817a1 100644 --- a/mindspore/ccsrc/device/gpu/gpu_kernel_runtime.cc +++ b/mindspore/ccsrc/device/gpu/gpu_kernel_runtime.cc @@ -26,6 +26,7 @@ #include "device/kernel_runtime_manager.h" #include "device/gpu/gpu_common.h" #include "common/utils.h" +#include "device/gpu/gpu_memory_manager.h" namespace mindspore { namespace device { @@ -36,26 +37,14 @@ bool GPUKernelRuntime::Init() { if (device_init_ == true) { return true; } - auto ret = InitDevice(); if (!ret) { MS_LOG(ERROR) << "InitDevice error."; return ret; } - - auto context_ptr = MsContext::GetInstance(); - MS_EXCEPTION_IF_NULL(context_ptr); - // If use the dynamic memory pool, then alloc the first memory block to init. - if (context_ptr->enable_dynamic_mem_pool()) { - auto device_addr = AllocTensorMemDynamic(1); - if (!device_addr) { - MS_LOG(ERROR) << "Dynamic memory pool init error."; - return false; - } - } else { - MallocDeviceMemory(); - } - + mem_manager_ = std::make_shared(); + MS_EXCEPTION_IF_NULL(mem_manager_); + mem_manager_->MallocDeviceMemory(); const void *collective_handle_ = CollectiveInitializer::instance().collective_handle(); bool collective_inited = CollectiveInitializer::instance().collective_inited(); if (collective_inited && collective_handle_ != nullptr) { @@ -101,16 +90,6 @@ bool GPUKernelRuntime::InitDevice() { return true; } -void GPUKernelRuntime::MallocDeviceMemory() { - // Need to reserve 20% space for dynamic memory - const float init_gpu_mem_ratio = 0.8; - size_t mem_size = FloatToSize(GPUMemoryAllocator::GetInstance().free_mem_size() * init_gpu_mem_ratio); - auto alloc_size = - GPUMemoryAllocator::GetInstance().AllocDeviceMem(mem_size, reinterpret_cast(&device_mem_base_)); - device_mem_size_ = alloc_size; - static_mem_offset_ = device_mem_size_; -} - void GPUKernelRuntime::ReleaseDeviceRes() { // For dataset mode. if (GpuBufferMgr::GetInstance().IsInit()) { @@ -122,39 +101,22 @@ void GPUKernelRuntime::ReleaseDeviceRes() { CHECK_OP_RET_WITH_EXCEPT(GpuBufferMgr::GetInstance().Destroy(), "Could not destroy gpu data queue."); } GPUDeviceManager::GetInstance().ReleaseDevice(); - if (device_mem_base_ != nullptr) { - if (!GPUMemoryAllocator::GetInstance().FreeDeviceMem(device_mem_base_)) { - MS_LOG(EXCEPTION) << "Could not free gpu device memory."; - } - } - GPUMemoryAllocator::GetInstance().ReleaseDeviceRes(); -} - -void GPUKernelRuntime::FreeHostMemory() { dynamic_mem_offset_ = 0; } - -void *GPUKernelRuntime::AllocTensorMemDynamic(size_t size) { - return GPUMemoryAllocator::GetInstance().AllocTensorMem(size); -} - -void GPUKernelRuntime::FreeTensorMemDynamic(void *device_ptr) { - GPUMemoryAllocator::GetInstance().FreeTensorMem(device_ptr); + MS_EXCEPTION_IF_NULL(mem_manager_); + mem_manager_->FreeDeviceMemory(); } void GPUKernelRuntime::AssignMemory(session::KernelGraph *graph) { auto context_ptr = MsContext::GetInstance(); MS_EXCEPTION_IF_NULL(context_ptr); + MS_EXCEPTION_IF_NULL(mem_manager_); + mem_manager_->ResetDynamicMemory(); AssignStaticMemory(graph); - bool is_enable_mem_reuse = context_ptr->enable_mem_reuse(); bool is_enable_dynamic_mem = context_ptr->enable_dynamic_mem_pool(); if (is_enable_dynamic_mem) { // Use the dynamic memory pool. InitKernelRefCount(graph); InitKernelOutputAddress(graph); - } else if (is_enable_mem_reuse) { - // Use the memory reuse. - ReuseAssignDynamicMemory(graph); } else { - // Normal way. AssignDynamicMemory(graph); } } @@ -179,32 +141,6 @@ bool GPUKernelRuntime::Run(session::KernelGraph *graph) { return ret; } -uint8_t *GPUKernelRuntime::MallocStaticMem(size_t size, bool) { - auto context_ptr = MsContext::GetInstance(); - MS_EXCEPTION_IF_NULL(context_ptr); - if (context_ptr->enable_dynamic_mem_pool()) { - auto device_ptr = AllocTensorMemDynamic(size); - MS_EXCEPTION_IF_NULL(device_ptr); - return AddressOffset(device_ptr, 0); - } - - auto align_size = GetCommonAlignSize(size); - if (static_mem_offset_ < align_size) { - MS_LOG(EXCEPTION) << "Out of memory!!! total[" << device_mem_size_ << "](dynamic[" << total_dynamic_size_ - << "] static[" << total_static_size_ << "])" - << " malloc [" << align_size << "] failed!"; - } - auto offset = static_mem_offset_ - align_size; - if (dynamic_mem_offset_ > offset) { - MS_LOG(EXCEPTION) << "Out of memory!!! total[" << device_mem_size_ << "](dynamic[" << total_dynamic_size_ - << "] static[" << total_static_size_ << "])" - << " malloc [" << align_size << "] failed!"; - } - total_static_size_ += align_size; - static_mem_offset_ = offset; - return device_mem_base_ + offset; -} - void GPUKernelRuntime::InitKernelRefCount(const session::KernelGraph *graph) { MS_EXCEPTION_IF_NULL(graph); MemReuseUtilPtr mem_reuse_util_ptr = std::make_shared(); @@ -273,6 +209,7 @@ void GPUKernelRuntime::AllocKernelDynamicRes(const mindspore::kernel::KernelMod MS_EXCEPTION_IF_NULL(kernel_inputs); MS_EXCEPTION_IF_NULL(kernel_workspaces); MS_EXCEPTION_IF_NULL(kernel_outputs); + MS_EXCEPTION_IF_NULL(mem_manager_); for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) { auto device_address = AnfAlgo::GetPrevNodeOutputAddr(kernel, i); MS_EXCEPTION_IF_NULL(device_address); @@ -290,7 +227,7 @@ void GPUKernelRuntime::AllocKernelDynamicRes(const mindspore::kernel::KernelMod MS_EXCEPTION_IF_NULL(device_address); auto device_ptr = device_address->ptr_; if (device_ptr == nullptr) { - device_ptr = AllocTensorMemDynamic(output_sizes[i]); + device_ptr = mem_manager_->AllocTensorMemDynamic(output_sizes[i]); MS_EXCEPTION_IF_NULL(device_ptr); device_address->ptr_ = device_ptr; } @@ -307,7 +244,7 @@ void GPUKernelRuntime::AllocKernelDynamicRes(const mindspore::kernel::KernelMod kernel_workspaces->emplace_back(nullptr); continue; } - auto device_ptr = AllocTensorMemDynamic(workspace_sizes[i]); + auto device_ptr = mem_manager_->AllocTensorMemDynamic(workspace_sizes[i]); MS_EXCEPTION_IF_NULL(device_ptr); kernel::AddressPtr workspace = std::make_shared(); MS_EXCEPTION_IF_NULL(workspace); @@ -333,6 +270,7 @@ void GPUKernelRuntime::AllocCommunicationOpDynamicRes(const session::KernelGraph void GPUKernelRuntime::AllocCommunicationOpInputDynamicRes(const mindspore::AnfNodePtr &kernel) { MS_EXCEPTION_IF_NULL(kernel); + MS_EXCEPTION_IF_NULL(mem_manager_); // The reference count of communication kernel input is not 0. if (communication_op_input_ref_count_ != 0) { MS_LOG(ERROR) << "The reference count of communication kernel input is not 0."; @@ -354,7 +292,7 @@ void GPUKernelRuntime::AllocCommunicationOpInputDynamicRes(const mindspore::AnfN addr_size.emplace_back(device_address.get(), output_size); } - auto device_mem_ptr = AllocTensorMemDynamic(total); + auto device_mem_ptr = mem_manager_->AllocTensorMemDynamic(total); MS_EXCEPTION_IF_NULL(device_mem_ptr); for (const auto &iter : addr_size) { MS_EXCEPTION_IF_NULL(iter.first); @@ -366,6 +304,7 @@ void GPUKernelRuntime::AllocCommunicationOpInputDynamicRes(const mindspore::AnfN void GPUKernelRuntime::AllocCommunicationOpOutputDynamicRes(const mindspore::AnfNodePtr &kernel) { MS_EXCEPTION_IF_NULL(kernel); + MS_EXCEPTION_IF_NULL(mem_manager_); // The reference count of communication kernel output is not 0. if (communication_op_output_ref_count_ != 0) { MS_LOG(ERROR) << "The reference count of communication kernel output is not 0."; @@ -389,7 +328,7 @@ void GPUKernelRuntime::AllocCommunicationOpOutputDynamicRes(const mindspore::Anf addr_size.emplace_back(device_address.get(), output_sizes[i]); } - auto device_mem_ptr = AllocTensorMemDynamic(total); + auto device_mem_ptr = mem_manager_->AllocTensorMemDynamic(total); MS_EXCEPTION_IF_NULL(device_mem_ptr); for (const auto &iter : addr_size) { MS_EXCEPTION_IF_NULL(iter.first); @@ -402,6 +341,7 @@ void GPUKernelRuntime::AllocCommunicationOpOutputDynamicRes(const mindspore::Anf void GPUKernelRuntime::FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel, const AddressPtrList &kernel_workspaces) { MS_EXCEPTION_IF_NULL(kernel); + MS_EXCEPTION_IF_NULL(mem_manager_); auto cnode = kernel->cast(); MS_EXCEPTION_IF_NULL(cnode); // Free the input of kernel by reference count. @@ -421,7 +361,7 @@ void GPUKernelRuntime::FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel, auto device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i); MS_EXCEPTION_IF_NULL(device_address); MS_EXCEPTION_IF_NULL(device_address->ptr_); - FreeTensorMemDynamic(device_address->ptr_); + mem_manager_->FreeTensorMemDynamic(device_address->ptr_); device_address->ptr_ = nullptr; } } @@ -432,7 +372,7 @@ void GPUKernelRuntime::FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel, auto workspace = kernel_workspaces[i]; if (workspace != nullptr) { MS_EXCEPTION_IF_NULL(workspace->addr); - FreeTensorMemDynamic(workspace->addr); + mem_manager_->FreeTensorMemDynamic(workspace->addr); workspace->addr = nullptr; } } @@ -441,6 +381,7 @@ void GPUKernelRuntime::FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel, void GPUKernelRuntime::FreeCommunicationOpDynamicRes(const mindspore::AnfNodePtr &kernel, size_t input_idx, bool *is_communication_op) { MS_EXCEPTION_IF_NULL(kernel); + MS_EXCEPTION_IF_NULL(mem_manager_); // The inputs memory of communication kernel is one piece memory, need release together. if (AnfAlgo::GetCNodeName(kernel) == kAllReduceOpName) { communication_op_input_ref_count_--; @@ -448,7 +389,7 @@ void GPUKernelRuntime::FreeCommunicationOpDynamicRes(const mindspore::AnfNodePtr auto device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, 0); MS_EXCEPTION_IF_NULL(device_address); MS_EXCEPTION_IF_NULL(device_address->ptr_); - FreeTensorMemDynamic(device_address->ptr_); + mem_manager_->FreeTensorMemDynamic(device_address->ptr_); device_address->ptr_ = nullptr; } *is_communication_op = true; @@ -470,19 +411,12 @@ void GPUKernelRuntime::FreeCommunicationOpDynamicRes(const mindspore::AnfNodePtr auto device_address = AnfAlgo::GetMutableOutputAddr(kernel_input.first, 0); MS_EXCEPTION_IF_NULL(device_address); MS_EXCEPTION_IF_NULL(device_address->ptr_); - FreeTensorMemDynamic(device_address->ptr_); + mem_manager_->FreeTensorMemDynamic(device_address->ptr_); device_address->ptr_ = nullptr; } *is_communication_op = true; } } - -void GPUKernelRuntime::MallocOpMemory(const DeviceAddressPtr address, size_t size, int) { - auto device_ptr = AllocTensorMemDynamic(size); - MS_EXCEPTION_IF_NULL(device_ptr); - address->ptr_ = device_ptr; - address->mem_dynamic_alloc_ = true; -} } // namespace gpu } // namespace device } // namespace mindspore diff --git a/mindspore/ccsrc/device/gpu/gpu_kernel_runtime.h b/mindspore/ccsrc/device/gpu/gpu_kernel_runtime.h index f3fdb5fa98f82e3d31b17e5b43683630d13eeeb0..6f761342d36e2b611d6edc0be1a04dde25ef5c57 100644 --- a/mindspore/ccsrc/device/gpu/gpu_kernel_runtime.h +++ b/mindspore/ccsrc/device/gpu/gpu_kernel_runtime.h @@ -33,7 +33,6 @@ class GPUKernelRuntime : public KernelRuntime { ~GPUKernelRuntime() override = default; bool Init() override; void ReleaseDeviceRes() override; - void FreeHostMemory() override; void AssignMemory(session::KernelGraph *graph) override; bool Run(session::KernelGraph *graph) override; @@ -41,18 +40,11 @@ class GPUKernelRuntime : public KernelRuntime { DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format, TypeId type_id) override; bool SyncStream() override; - // Alloc memory use the dynamic memory pool. - void *AllocTensorMemDynamic(size_t size) override; - // Free memory use the dynamic memory pool. - void FreeTensorMemDynamic(void *device_ptr) override; - void MallocOpMemory(const DeviceAddressPtr address, size_t size, int flag) override; - uint8_t *MallocStaticMem(size_t size, bool communication_mem) override; private: GPUKernelRuntime(const GPUKernelRuntime &); GPUKernelRuntime &operator=(const GPUKernelRuntime &); bool InitDevice(); - void MallocDeviceMemory(); bool device_init_{false}; // The related functions and members for using dynamic memory pool. @@ -69,6 +61,7 @@ class GPUKernelRuntime : public KernelRuntime { void FreeCommunicationOpDynamicRes(const mindspore::AnfNodePtr &kernel, size_t input_idx, bool *is_communication_op); size_t communication_op_input_ref_count_{0}; size_t communication_op_output_ref_count_{0}; + MemReuseUtilPtr mem_reuse_util_ptr_{nullptr}; }; MS_REG_KERNEL_RUNTIME(kGPUDevice, GPUKernelRuntime); } // namespace gpu diff --git a/mindspore/ccsrc/device/gpu/gpu_memory_manager.cc b/mindspore/ccsrc/device/gpu/gpu_memory_manager.cc new file mode 100644 index 0000000000000000000000000000000000000000..3944b504e411acd42dd669a624eeb8b64fc07670 --- /dev/null +++ b/mindspore/ccsrc/device/gpu/gpu_memory_manager.cc @@ -0,0 +1,88 @@ +/** + * Copyright 2019 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "device/gpu/gpu_memory_manager.h" +#include "device/gpu/gpu_memory_allocator.h" +#include "utils/context/ms_context.h" +#include "utils/convert_utils.h" +namespace mindspore { +namespace device { +namespace gpu { +void *GPUMemoryManager::AllocTensorMemDynamic(size_t size) { + return GPUMemoryAllocator::GetInstance().AllocTensorMem(size); +} + +void GPUMemoryManager::FreeTensorMemDynamic(void *device_ptr) { + GPUMemoryAllocator::GetInstance().FreeTensorMem(device_ptr); +} + +void GPUMemoryManager::MallocDeviceMemory() { + auto context_ptr = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(context_ptr); + // If use the dynamic memory pool, then alloc the first memory block to init. + if (context_ptr->enable_dynamic_mem_pool()) { + auto device_addr = AllocTensorMemDynamic(1); + if (!device_addr) { + MS_LOG(ERROR) << "Dynamic memory pool init error."; + } + } else { + // Need to reserve 20% space for dynamic memory + const float init_gpu_mem_ratio = 0.8; + size_t mem_size = FloatToSize(GPUMemoryAllocator::GetInstance().free_mem_size() * init_gpu_mem_ratio); + auto alloc_size = + GPUMemoryAllocator::GetInstance().AllocDeviceMem(mem_size, reinterpret_cast(&device_mem_base_)); + device_mem_size_ = alloc_size; + static_mem_offset_ = device_mem_size_; + } +} + +void GPUMemoryManager::FreeDeviceMemory() { + if (device_mem_base_ != nullptr) { + if (!GPUMemoryAllocator::GetInstance().FreeDeviceMem(device_mem_base_)) { + MS_LOG(EXCEPTION) << "Could not free gpu device memory."; + } + } + GPUMemoryAllocator::GetInstance().ReleaseDeviceRes(); +} + +uint8_t *GPUMemoryManager::MallocStaticMem(size_t size, bool) { + auto context_ptr = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(context_ptr); + if (context_ptr->enable_dynamic_mem_pool()) { + auto device_ptr = AllocTensorMemDynamic(size); + MS_EXCEPTION_IF_NULL(device_ptr); + return AddressOffset(device_ptr, 0); + } + + auto align_size = GetCommonAlignSize(size); + if (static_mem_offset_ < align_size) { + MS_LOG(EXCEPTION) << "Out of memory!!! total[" << device_mem_size_ << "](dynamic[" << total_dynamic_size_ + << "] static[" << total_static_size_ << "])" + << " malloc [" << align_size << "] failed!"; + } + auto offset = static_mem_offset_ - align_size; + if (dynamic_mem_offset_ > offset) { + MS_LOG(EXCEPTION) << "Out of memory!!! total[" << device_mem_size_ << "](dynamic[" << total_dynamic_size_ + << "] static[" << total_static_size_ << "])" + << " malloc [" << align_size << "] failed!"; + } + total_static_size_ += align_size; + static_mem_offset_ = offset; + return device_mem_base_ + offset; +} +} // namespace gpu +} // namespace device +} // namespace mindspore diff --git a/mindspore/ccsrc/device/gpu/gpu_memory_manager.h b/mindspore/ccsrc/device/gpu/gpu_memory_manager.h new file mode 100644 index 0000000000000000000000000000000000000000..a18226bdf3f3ff37d2e0979131c2b0887df6e2c2 --- /dev/null +++ b/mindspore/ccsrc/device/gpu/gpu_memory_manager.h @@ -0,0 +1,40 @@ +/** + * Copyright 2019 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_MINDSPORE_CCSRC_DEVICE_GPU_GPU_MEMORY_MANAGER_H_ +#define MINDSPORE_MINDSPORE_CCSRC_DEVICE_GPU_GPU_MEMORY_MANAGER_H_ +#include "device/memory_manager.h" +namespace mindspore { +namespace device { +namespace gpu { +class GPUMemoryManager : public MemoryManager { + public: + GPUMemoryManager() = default; + virtual ~GPUMemoryManager() = default; + + void MallocDeviceMemory() override; + void FreeDeviceMemory() override; + + void *AllocTensorMemDynamic(size_t size) override; + void FreeTensorMemDynamic(void *device_ptr) override; + + protected: + uint8_t *MallocStaticMem(size_t size, bool communication_mem); +}; +} // namespace gpu +} // namespace device +} // namespace mindspore +#endif // MINDSPORE_MINDSPORE_CCSRC_DEVICE_GPU_GPU_MEMORY_MANAGER_H_ diff --git a/mindspore/ccsrc/device/kernel_runtime.cc b/mindspore/ccsrc/device/kernel_runtime.cc index 0a9be35fb50073e4455b05ee623fc8b093936cbe..16025ed8a4e6681b993121d67af91c446ab9e338 100644 --- a/mindspore/ccsrc/device/kernel_runtime.cc +++ b/mindspore/ccsrc/device/kernel_runtime.cc @@ -31,18 +31,13 @@ #include "ir/value.h" using mindspore::kernel::Address; using mindspore::kernel::AddressPtr; -using mindspore::memreuse::BestFitMemReuse; -using mindspore::memreuse::MemReuseUtilPtr; namespace mindspore { namespace device { KernelRuntime::~KernelRuntime() { - device_mem_base_ = nullptr; - device_mem_pool_base_ = nullptr; #ifdef ENABLE_DUMP_E2E dump_conf_ptr_ = nullptr; #endif - mem_reuse_util_ptr_ = nullptr; } bool KernelRuntime::Run(session::KernelGraph *graph) { @@ -88,11 +83,6 @@ bool KernelRuntime::LoadTask(const session::KernelGraph *graph) { return false; } -void KernelRuntime::FreeHostMemory() { - dynamic_mem_offset_ = 0; - static_mem_offset_ = 0; -} - // for D to impl bool KernelRuntime::RunTask(const session::KernelGraph *graph) { if (graph != nullptr) { @@ -126,13 +116,11 @@ size_t KernelRuntime::CountNodeDeviceMemorySize(const mindspore::AnfNodePtr &nod void KernelRuntime::AssignMemory(session::KernelGraph *graph) { auto context_ptr = MsContext::GetInstance(); MS_EXCEPTION_IF_NULL(context_ptr); + MS_EXCEPTION_IF_NULL(mem_manager_); + mem_manager_->ResetDynamicMemory(); AssignStaticMemory(graph); - bool is_enable_mem_reuse = context_ptr->enable_mem_reuse(); - if (is_enable_mem_reuse) { - ReuseAssignDynamicMemory(graph); - } else { - AssignDynamicMemory(graph); - } + AssignDynamicMemory(graph); + UpdateRefNodeOutputMem(graph); } @@ -159,6 +147,7 @@ void KernelRuntime::AssignStaticMemory(session::KernelGraph *graph) { void KernelRuntime::RunOpAssignInputMemory(const std::vector &input_tensors, const session::KernelGraph *graph) { MS_EXCEPTION_IF_NULL(graph); + MS_EXCEPTION_IF_NULL(mem_manager_); for (size_t input_index = 0; input_index < graph->inputs().size(); ++input_index) { auto item = graph->inputs()[input_index]; MS_EXCEPTION_IF_NULL(item); @@ -180,7 +169,7 @@ void KernelRuntime::RunOpAssignInputMemory(const std::vector auto device_address = CreateDeviceAddress(nullptr, tensor_size, AnfAlgo::GetOutputFormat(item, index), output_type_id); MS_EXCEPTION_IF_NULL(device_address); - MallocOpMemory(device_address, tensor_size, kStaticMem); + mem_manager_->MallocOpMemory(device_address, tensor_size); AnfAlgo::SetOutputAddr(device_address, index, item.get()); } } @@ -188,6 +177,7 @@ void KernelRuntime::RunOpAssignInputMemory(const std::vector void KernelRuntime::RunOpAssignOutputMemory(const AnfNodePtr &kernel) { MS_EXCEPTION_IF_NULL(kernel); + MS_EXCEPTION_IF_NULL(mem_manager_); auto kernel_mod = AnfAlgo::GetKernelMod(kernel); MS_EXCEPTION_IF_NULL(kernel_mod); auto output_sizes = kernel_mod->GetOutputSizeList(); @@ -208,13 +198,14 @@ void KernelRuntime::RunOpAssignOutputMemory(const AnfNodePtr &kernel) { auto output_type = AnfAlgo::GetOutputDeviceDataType(kernel, i); auto device_address = CreateDeviceAddress(nullptr, output_sizes[i], output_format, output_type); MS_EXCEPTION_IF_NULL(device_address); - MallocOpMemory(device_address, output_sizes[i], kDynamicMem); + mem_manager_->MallocOpMemory(device_address, output_sizes[i]); AnfAlgo::SetOutputAddr(device_address, i, kernel.get()); } } void KernelRuntime::RunOpAssignWorkSpaceMemory(const AnfNodePtr &kernel) { MS_EXCEPTION_IF_NULL(kernel); + MS_EXCEPTION_IF_NULL(mem_manager_); if (kernel->isa()) { auto kernel_mod = AnfAlgo::GetKernelMod(kernel); MS_EXCEPTION_IF_NULL(kernel_mod); @@ -222,7 +213,7 @@ void KernelRuntime::RunOpAssignWorkSpaceMemory(const AnfNodePtr &kernel) { for (size_t i = 0; i < workspace_lists.size(); ++i) { auto device_address = CreateDeviceAddress(nullptr, workspace_lists[i], "", kTypeUnknown); MS_EXCEPTION_IF_NULL(device_address); - MallocOpMemory(device_address, workspace_lists[i], kDynamicMem); + mem_manager_->MallocOpMemory(device_address, workspace_lists[i]); AnfAlgo::SetWorkspaceAddr(device_address, i, kernel.get()); } } @@ -230,6 +221,7 @@ void KernelRuntime::RunOpAssignWorkSpaceMemory(const AnfNodePtr &kernel) { void KernelRuntime::AssignStaticMemoryInput(const session::KernelGraph *graph) { MS_EXCEPTION_IF_NULL(graph); + MS_EXCEPTION_IF_NULL(mem_manager_); for (auto &item : graph->inputs()) { MS_EXCEPTION_IF_NULL(item); if (!item->isa()) { @@ -247,7 +239,7 @@ void KernelRuntime::AssignStaticMemoryInput(const session::KernelGraph *graph) { output_type_id = AnfAlgo::GetOutputInferDataType(item, index); } auto tensor_size = CountNodeDeviceMemorySize(item, index); - auto ptr = MallocStaticMem(tensor_size, false); + auto ptr = mem_manager_->MallocMem(kStaticMem, tensor_size); auto address = CreateDeviceAddress(ptr, tensor_size, AnfAlgo::GetOutputFormat(item, index), output_type_id); AnfAlgo::SetOutputAddr(address, index, item.get()); } @@ -301,6 +293,7 @@ void KernelRuntime::UpdateRefNodeOutputMem(const session::KernelGraph *graph) { void KernelRuntime::AssignCommunicationNodeOutputMem(int flag, const AnfNodePtr &node) { MS_EXCEPTION_IF_NULL(node); + MS_EXCEPTION_IF_NULL(mem_manager_); auto kernel_mod = AnfAlgo::GetKernelMod(node); MS_EXCEPTION_IF_NULL(kernel_mod); auto output_sizes = kernel_mod->GetOutputSizeList(); @@ -314,12 +307,12 @@ void KernelRuntime::AssignCommunicationNodeOutputMem(int flag, const AnfNodePtr std::vector align_size_list; for (uint64_t mem_size : output_sizes) { if (context_ptr->enable_hccl()) { - mem_size = GetCommonAlignSize(mem_size); + mem_size = mem_manager_->GetCommonAlignSize(mem_size); } total_size += mem_size; align_size_list.emplace_back(mem_size); } - uint8_t *output_ptr = CalDeviceMem(node, total_size, flag, 0); + uint8_t *output_ptr = mem_manager_->MallocOutputMem(node, 0, flag, total_size); for (size_t j = 0; j < align_size_list.size(); ++j) { std::string output_format = AnfAlgo::GetOutputFormat(node, j); auto output_type = AnfAlgo::GetOutputDeviceDataType(node, j); @@ -333,6 +326,7 @@ void KernelRuntime::UpdateCommunicationOpInputMem(const AnfNodePtr &node) { auto context_ptr = MsContext::GetInstance(); MS_EXCEPTION_IF_NULL(context_ptr); MS_EXCEPTION_IF_NULL(node); + MS_EXCEPTION_IF_NULL(mem_manager_); size_t total_size = 0; std::vector> addr_size; for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(node); ++i) { @@ -340,12 +334,12 @@ void KernelRuntime::UpdateCommunicationOpInputMem(const AnfNodePtr &node) { MS_EXCEPTION_IF_NULL(address); auto mem_size = address->size(); if (context_ptr->enable_hccl()) { - mem_size = GetCommonAlignSize(mem_size); + mem_size = mem_manager_->GetCommonAlignSize(mem_size); } total_size += mem_size; addr_size.emplace_back(address.get(), mem_size); } - uint8_t *input_ptr = CalDeviceMem(node, total_size, kDynamicMem, 0); + uint8_t *input_ptr = mem_manager_->MallocOutputMem(node, 0, kDynamicMem, total_size); for (const auto &iter : addr_size) { MS_EXCEPTION_IF_NULL(iter.first); iter.first->set_ptr(input_ptr); @@ -355,7 +349,8 @@ void KernelRuntime::UpdateCommunicationOpInputMem(const AnfNodePtr &node) { void KernelRuntime::AssignNodeOutputMem(int flag, const AnfNodePtr &node, int index) { MS_EXCEPTION_IF_NULL(node); - if (IsCommunicationOp(node)) { + MS_EXCEPTION_IF_NULL(mem_manager_); + if (AnfAlgo::IsCommunicationOp(node)) { UpdateCommunicationOpInputMem(node); AssignCommunicationNodeOutputMem(flag, node); return; @@ -375,7 +370,7 @@ void KernelRuntime::AssignNodeOutputMem(int flag, const AnfNodePtr &node, int in MS_LOG(INFO) << "Already malloc index:" << i; continue; } - auto ptr = CalDeviceMem(node, output_sizes[i], flag, i); + auto ptr = mem_manager_->MallocOutputMem(node, i, flag, output_sizes[i]); if (ptr == nullptr) { // reused ptr, no need alloc, continue; continue; @@ -390,6 +385,7 @@ void KernelRuntime::AssignValueNodeTensor(const ValueNodePtr &value_node, const size_t output_idx) { MS_EXCEPTION_IF_NULL(value_node); MS_EXCEPTION_IF_NULL(node_value); + MS_EXCEPTION_IF_NULL(mem_manager_); auto tensor = node_value->cast(); if (tensor == nullptr) { MS_LOG(WARNING) << "Tensor is null"; @@ -397,7 +393,7 @@ void KernelRuntime::AssignValueNodeTensor(const ValueNodePtr &value_node, const } size_t tensor_size = tensor->data().nbytes(); auto node_size = CountNodeDeviceMemorySize(value_node, output_idx); - auto ptr = MallocStaticMem(node_size, false); + auto ptr = mem_manager_->MallocMem(kStaticMem, node_size); TypeId output_type_id = AnfAlgo::GetOutputDeviceDataType(value_node, output_idx); if (output_type_id == kTypeUnknown) { output_type_id = AnfAlgo::GetOutputInferDataType(value_node, output_idx); @@ -414,6 +410,7 @@ void KernelRuntime::AssignValueNodeTensor(const ValueNodePtr &value_node, const void KernelRuntime::AssignStaticMemoryValueNode(session::KernelGraph *graph) { MS_EXCEPTION_IF_NULL(graph); + MS_EXCEPTION_IF_NULL(mem_manager_); for (auto &value_node : graph->graph_value_nodes()) { MS_EXCEPTION_IF_NULL(value_node); if (AnfAlgo::OutputAddrExist(value_node, 0)) { @@ -440,7 +437,7 @@ void KernelRuntime::AssignStaticMemoryValueNode(session::KernelGraph *graph) { } else if (node_value->isa()) { auto value = GetValue(node_value); size_t tensor_size = value.size(); - auto ptr = MallocStaticMem(tensor_size, false); + auto ptr = mem_manager_->MallocMem(kStaticMem, tensor_size); auto address = CreateDeviceAddress(ptr, tensor_size, kOpFormat_DEFAULT, kNumberTypeUInt8); MS_EXCEPTION_IF_NULL(address); AnfAlgo::SetOutputAddr(address, 0, value_node.get()); @@ -452,103 +449,37 @@ void KernelRuntime::AssignStaticMemoryValueNode(session::KernelGraph *graph) { } } -void KernelRuntime::AssignDynamicMemory(const session::KernelGraph *graph) { +void KernelRuntime::AssignDynamicMemory(session::KernelGraph *graph) { MS_EXCEPTION_IF_NULL(graph); - // reset dynamic mem offset - dynamic_mem_offset_ = 0; - auto &kernels = graph->execution_order(); - for (auto &kernel : kernels) { - AssignNodeOutputMem(kDynamicMem, kernel, kGetAllOuts); - AssignWorkSpaceMem(kernel); + MS_EXCEPTION_IF_NULL(mem_manager_); + auto context_ptr = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(context_ptr); + bool is_enable_mem_reuse = context_ptr->enable_mem_reuse(); + auto mem_flag = kDynamicMem; + if (is_enable_mem_reuse) { + mem_manager_->InitReuseDynamicMemory(graph); + mem_flag = kReuseDynamicMem; } -} - -void KernelRuntime::ReuseAssignDynamicMemory(session::KernelGraph *graph) { - MS_EXCEPTION_IF_NULL(graph); - dynamic_mem_offset_ = 0; - MemReuseUtilPtr mem_reuse_util_ptr = std::make_shared(); - MS_EXCEPTION_IF_NULL(mem_reuse_util_ptr); - // set all infos - mem_reuse_util_ptr->SetAllInfo(graph); - auto bestfit_mem_reuse = std::make_shared(); - MS_EXCEPTION_IF_NULL(bestfit_mem_reuse); - bestfit_mem_reuse->Reuse(mem_reuse_util_ptr.get()); - size_t total_allocated_size = bestfit_mem_reuse->GetAllocatedSize(); - MS_LOG(INFO) << "TotalReuseDynamicSize [" << total_allocated_size << "]"; - mem_reuse_util_ptr_ = mem_reuse_util_ptr; - auto base_ptr = MallocDynamicMem(total_allocated_size, false); - mem_reuse_util_ptr_->set_mem_base(base_ptr); auto &kernels = graph->execution_order(); for (auto &kernel : kernels) { - AssignNodeOutputMem(kReuseDynamicMem, kernel, kGetAllOuts); - AssignReuseWorkSpaceMem(kernel); + AssignNodeOutputMem(mem_flag, kernel, kGetAllOuts); + AssignWorkSpaceMem(mem_flag, kernel); } } -void KernelRuntime::AssignReuseWorkSpaceMem(const AnfNodePtr &node) { +void KernelRuntime::AssignWorkSpaceMem(int flag, const AnfNodePtr &node) { MS_EXCEPTION_IF_NULL(node); + MS_EXCEPTION_IF_NULL(mem_manager_); auto kernel_mod = AnfAlgo::GetKernelMod(node); MS_EXCEPTION_IF_NULL(kernel_mod); size_t index = 0; for (auto &size : kernel_mod->GetWorkspaceSizeList()) { - auto wk_ptr = mem_reuse_util_ptr_->GetNodeWorkSpacePtr(node, index); - AnfAlgo::SetWorkspaceAddr(CreateDeviceAddress(wk_ptr, size, "", kTypeUnknown), index, node.get()); + auto ptr = mem_manager_->MallocWorkSpaceMem(node, flag, index, size); + AnfAlgo::SetWorkspaceAddr(CreateDeviceAddress(ptr, size, "", kTypeUnknown), index, node.get()); index++; } } -void KernelRuntime::AssignWorkSpaceMem(const AnfNodePtr &node) { - MS_EXCEPTION_IF_NULL(node); - if (node->isa()) { - auto kernel_mod = AnfAlgo::GetKernelMod(node); - MS_EXCEPTION_IF_NULL(kernel_mod); - size_t index = 0; - for (auto &size : kernel_mod->GetWorkspaceSizeList()) { - auto ptr = MallocDynamicMem(size, false); - AnfAlgo::SetWorkspaceAddr(CreateDeviceAddress(ptr, size, "", kTypeUnknown), index, node.get()); - index++; - } - } -} - -bool KernelRuntime::IsCommunicationOp(const AnfNodePtr &node) { - MS_EXCEPTION_IF_NULL(node); - auto kernel_name = AnfAlgo::GetCNodeName(node); - auto kernel_type = AnfAlgo::GetKernelType(node); - if (kernel_name == kAllReduceOpName || kernel_type == HCCL_KERNEL) { - return true; - } - return false; -} - -uint8_t *KernelRuntime::CalDeviceMem(const AnfNodePtr &node, size_t size, int flag, size_t index) { - MS_EXCEPTION_IF_NULL(node); - auto context_ptr = MsContext::GetInstance(); - MS_EXCEPTION_IF_NULL(context_ptr); - uint8_t *ptr = nullptr; - if (IsCommunicationOp(node)) { - bool communication_mem = false; - if (context_ptr->enable_hccl()) { - communication_mem = true; - } - if (flag == kStaticMem) { - ptr = MallocStaticMem(size, communication_mem); - } else { - ptr = MallocDynamicMem(size, communication_mem); - } - return ptr; - } - - if (flag == kStaticMem) { - ptr = MallocStaticMem(size, false); - } else if (flag == kDynamicMem) { - ptr = MallocDynamicMem(size, false); - } else if (flag == kReuseDynamicMem) { - ptr = mem_reuse_util_ptr_->GetNodeOutputPtr(node, index); - } - return ptr; -} - void KernelRuntime::GenLaunchArgs(const mindspore::kernel::KernelMod &kernel_mod, const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_inputs, AddressPtrList *const kernel_workspaces, AddressPtrList *kernel_outputs) { @@ -659,65 +590,6 @@ bool KernelRuntime::LaunchKernelMod(const session::KernelGraph &graph) { return true; } -size_t KernelRuntime::GetCommonAlignSize(size_t input_size) const { - return (input_size + mem_align_size_ + 31) / mem_align_size_ * mem_align_size_; -} - -size_t KernelRuntime::GetCommunicationAlignSize(size_t input_size) const { - return (input_size + mem_align_size_ - 1) / mem_align_size_ * mem_align_size_ + 2 * mem_align_size_; -} - -uint8_t *KernelRuntime::MallocStaticMem(size_t size, bool communication_mem) { - size_t align_size = 0; - if (communication_mem) { - align_size = GetCommunicationAlignSize(size); - } else { - align_size = GetCommonAlignSize(size); - } - if (static_mem_offset_ < align_size) { - MS_LOG(EXCEPTION) << "Out of memory!!! total[" << device_mem_size_ << "](dynamic[" << total_dynamic_size_ - << "] static[" << total_static_size_ << "])" - << " malloc [" << align_size << "] failed!"; - } - total_static_size_ += align_size; - auto offset = static_mem_offset_ - align_size; - if (dynamic_mem_offset_ > offset) { - MS_LOG(EXCEPTION) << "Out of memory!!! total[" << device_mem_size_ << "](dynamic[" << total_dynamic_size_ - << "] static[" << total_static_size_ << "])" - << " malloc [" << align_size << "] failed!"; - } - static_mem_offset_ = offset; - if (communication_mem) { - return device_mem_base_ + offset + mem_align_size_; - } else { - return device_mem_base_ + offset; - } -} - -uint8_t *KernelRuntime::MallocDynamicMem(size_t size, bool communication_mem) { - size_t align_size = 0; - if (communication_mem) { - align_size = GetCommunicationAlignSize(size); - } else { - align_size = GetCommonAlignSize(size); - } - uint64_t offset = dynamic_mem_offset_; - auto new_offset = dynamic_mem_offset_ + align_size; - if (new_offset > static_mem_offset_) { - MS_LOG(EXCEPTION) << "Out of memory!!! total[" << device_mem_size_ << "](dynamic[" << total_dynamic_size_ - << "] static[" << total_static_size_ << "])" - << " malloc [" << align_size << "] failed!"; - } - total_dynamic_size_ += align_size; - dynamic_mem_offset_ = new_offset; - - if (communication_mem) { - return device_mem_base_ + offset + mem_align_size_; - } else { - return device_mem_base_ + offset; - } -} - bool KernelRuntime::LaunchKernel(const session::KernelGraph *graph) { MS_EXCEPTION_IF_NULL(graph); if (!LaunchKernelMod(*graph)) { @@ -731,29 +603,6 @@ bool KernelRuntime::LaunchKernel(const session::KernelGraph *graph) { return true; } -void KernelRuntime::MallocOpMemory(const DeviceAddressPtr address, size_t size, int flag) { - if (flag == kStaticMem) { - address->ptr_ = MallocStaticMem(size, false); - } else if (flag == kDynamicMem) { - address->ptr_ = MallocDynamicMem(size, false); - } else { - MS_LOG(EXCEPTION) << "Unknown memory type!"; - } -} - -void *KernelRuntime::AllocTensorMemDynamic(size_t size) { - if (size == 0) { - MS_LOG(ERROR) << "AllocTensorMemDynamic size is 0."; - } - return nullptr; -} - -void KernelRuntime::FreeTensorMemDynamic(void *device_ptr) { - if (device_ptr == nullptr) { - MS_LOG(ERROR) << "FreeTensorMemDynamic device_ptr is null."; - } -} - #ifdef ENABLE_DUMP_E2E bool KernelRuntime::SetDumpConf() { dump_conf_ptr_ = std::make_shared(); diff --git a/mindspore/ccsrc/device/kernel_runtime.h b/mindspore/ccsrc/device/kernel_runtime.h index ac9a56ed4d8ff91521cbb70ba87c4990005942f4..1224bf14ebdc6e6dd80864f911f2b9179b6f38d6 100644 --- a/mindspore/ccsrc/device/kernel_runtime.h +++ b/mindspore/ccsrc/device/kernel_runtime.h @@ -20,8 +20,7 @@ #include #include #include -#include "pre_activate/mem_reuse/mem_reuse.h" -#include "pre_activate/mem_reuse/mem_reuse_allocator.h" + #include "device/device_address.h" #include "ir/meta_tensor.h" #include "predict/generator/utils/ir_model_util.h" @@ -32,21 +31,16 @@ #include "session/anf_runtime_algorithm.h" #include "kernel/kernel.h" #include "utils/context/ms_context.h" +#include "device/memory_manager.h" // using mindspore::session::KernelGraph; using mindspore::tensor::Tensor; using TensorPtr = std::shared_ptr; -using MemReuseUtilPtr = mindspore::memreuse::MemReuseUtilPtr; using mindspore::kernel::AddressPtr; using AddressPtrList = std::vector; namespace mindspore { namespace device { -const int kStaticMem = 0; -const int kDynamicMem = 1; -const int kReuseDynamicMem = 2; -const int kGetAllOuts = -1; - class KernelRuntime { public: KernelRuntime() = default; @@ -65,7 +59,6 @@ class KernelRuntime { DumpConfPtr GetDumpConf(); #endif virtual bool LoadTask(const session::KernelGraph *graph); - virtual void FreeHostMemory(); // for GPU and D to impl virtual void ReleaseDeviceRes() {} void set_device_id(uint32_t device_id) { device_id_ = device_id; } @@ -75,29 +68,17 @@ class KernelRuntime { TypeId type_id) = 0; virtual bool SyncStream() = 0; void AssignStaticMemory(session::KernelGraph *graph); - void AssignDynamicMemory(const session::KernelGraph *graph); + void AssignDynamicMemory(session::KernelGraph *graph); void ReuseAssignDynamicMemory(session::KernelGraph *graph); void AssignNodeOutputMem(int flag, const AnfNodePtr &node, int index); - void AssignWorkSpaceMem(const AnfNodePtr &node); + void AssignWorkSpaceMem(int flag, const AnfNodePtr &node); void AssignReuseWorkSpaceMem(const AnfNodePtr &node); void AssignCommunicationNodeOutputMem(int flag, const AnfNodePtr &node); void UpdateRefNodeOutputMem(const session::KernelGraph *graph); void UpdateCommunicationOpInputMem(const AnfNodePtr &node); - bool IsCommunicationOp(const AnfNodePtr &node); - size_t GetCommonAlignSize(size_t input_size) const; - size_t GetCommunicationAlignSize(size_t input_size) const; - - uint8_t *CalDeviceMem(const AnfNodePtr &node, size_t size, int flag, size_t index); - virtual uint8_t *MallocStaticMem(size_t size, bool communication_mem); - uint8_t *MallocDynamicMem(size_t size, bool communication_mem); #ifdef ENABLE_DUMP_E2E bool SetDumpConf(); #endif - // Alloc memory use the dynamic memory pool. - virtual void *AllocTensorMemDynamic(size_t size); - // Free memory use the dynamic memory pool. - virtual void FreeTensorMemDynamic(void *device_ptr); - virtual void MallocOpMemory(const DeviceAddressPtr address, size_t size, int flag); private: void AssignStaticMemoryOutput(const session::KernelGraph *graph); @@ -114,20 +95,11 @@ class KernelRuntime { protected: uint32_t device_id_{0}; - uint8_t *device_mem_base_{nullptr}; - uint8_t *device_mem_pool_base_{nullptr}; - uint64_t device_mem_size_{0}; - uint64_t device_mem_pool_size_{0}; - uint64_t dynamic_mem_offset_{0}; - uint64_t static_mem_offset_{0}; - const uint64_t mem_align_size_ = 512; #ifdef ENABLE_DUMP_E2E DumpConfPtr dump_conf_ptr_; #endif void *stream_ = nullptr; - size_t total_static_size_ = 0; - size_t total_dynamic_size_ = 0; - MemReuseUtilPtr mem_reuse_util_ptr_{nullptr}; + std::shared_ptr mem_manager_{nullptr}; }; using KernelRuntimePtr = std::shared_ptr; } // namespace device diff --git a/mindspore/ccsrc/device/memory_manager.cc b/mindspore/ccsrc/device/memory_manager.cc new file mode 100644 index 0000000000000000000000000000000000000000..3c1ddee6bc5a654cb6b9d616e916eab53327bd9d --- /dev/null +++ b/mindspore/ccsrc/device/memory_manager.cc @@ -0,0 +1,170 @@ +/** + * Copyright 2019 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "device/memory_manager.h" +#include "session/anf_runtime_algorithm.h" +#include "utils/context/ms_context.h" +using mindspore::memreuse::BestFitMemReuse; +using mindspore::memreuse::MemReuseUtilPtr; +namespace mindspore { +namespace device { +MemoryManager::~MemoryManager() { + device_mem_base_ = nullptr; + device_mem_pool_base_ = nullptr; + mem_reuse_util_ptr_ = nullptr; +} + +size_t MemoryManager::GetCommonAlignSize(size_t input_size) const { + return (input_size + kMemAlignSize + 31) / kMemAlignSize * kMemAlignSize; +} + +size_t MemoryManager::GetCommunicationAlignSize(size_t input_size) const { + return (input_size + kMemAlignSize - 1) / kMemAlignSize * kMemAlignSize + 2 * kMemAlignSize; +} + +void MemoryManager::InitReuseDynamicMemory(session::KernelGraph *graph) { + MS_EXCEPTION_IF_NULL(graph); + MemReuseUtilPtr mem_reuse_util_ptr = std::make_shared(); + MS_EXCEPTION_IF_NULL(mem_reuse_util_ptr); + // set all infos + mem_reuse_util_ptr->SetAllInfo(graph); + auto bestfit_mem_reuse = std::make_shared(); + MS_EXCEPTION_IF_NULL(bestfit_mem_reuse); + bestfit_mem_reuse->Reuse(mem_reuse_util_ptr.get()); + size_t total_allocated_size = bestfit_mem_reuse->GetAllocatedSize(); + MS_LOG(INFO) << "TotalReuseDynamicSize [" << total_allocated_size << "]"; + mem_reuse_util_ptr_ = mem_reuse_util_ptr; + auto base_ptr = MallocDynamicMem(total_allocated_size, false); + mem_reuse_util_ptr_->set_mem_base(base_ptr); +} + +uint8_t *MemoryManager::MallocOutputMem(const AnfNodePtr &node, size_t index, int flag, size_t size) { + MS_EXCEPTION_IF_NULL(node); + auto context_ptr = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(context_ptr); + uint8_t *ptr = nullptr; + if (AnfAlgo::IsCommunicationOp(node)) { + bool communication_mem = false; + if (context_ptr->enable_hccl()) { + communication_mem = true; + } + if (flag == kStaticMem) { + ptr = MallocStaticMem(size, communication_mem); + } else { + ptr = MallocDynamicMem(size, communication_mem); + } + return ptr; + } + + if (flag == kStaticMem) { + ptr = MallocStaticMem(size, false); + } else if (flag == kDynamicMem) { + ptr = MallocDynamicMem(size, false); + } else if (flag == kReuseDynamicMem) { + ptr = mem_reuse_util_ptr_->GetNodeOutputPtr(node, index); + } + return ptr; +} + +uint8_t *MemoryManager::MallocWorkSpaceMem(const AnfNodePtr &node, size_t index, int flag, size_t size) { + if (flag == kReuseDynamicMem) { + return mem_reuse_util_ptr_->GetNodeWorkSpacePtr(node, index); + } + return MallocDynamicMem(size, false); +} + +uint8_t *MemoryManager::MallocMem(int flag, size_t size) { + uint8_t *ptr = nullptr; + if (flag == kStaticMem) { + ptr = MallocStaticMem(size, false); + } else if (flag == kDynamicMem) { + ptr = MallocDynamicMem(size, false); + } + return ptr; +} + +uint8_t *MemoryManager::MallocStaticMem(size_t size, bool communication_mem) { + size_t align_size = 0; + if (communication_mem) { + align_size = GetCommunicationAlignSize(size); + } else { + align_size = GetCommonAlignSize(size); + } + if (static_mem_offset_ < align_size) { + MS_LOG(EXCEPTION) << "Out of memory!!! total[" << device_mem_size_ << "](dynamic[" << total_dynamic_size_ + << "] static[" << total_static_size_ << "])" + << " malloc [" << align_size << "] failed!"; + } + total_static_size_ += align_size; + auto offset = static_mem_offset_ - align_size; + if (dynamic_mem_offset_ > offset) { + MS_LOG(EXCEPTION) << "Out of memory!!! total[" << device_mem_size_ << "](dynamic[" << total_dynamic_size_ + << "] static[" << total_static_size_ << "])" + << " malloc [" << align_size << "] failed!"; + } + static_mem_offset_ = offset; + if (communication_mem) { + return device_mem_base_ + offset + kMemAlignSize; + } else { + return device_mem_base_ + offset; + } +} + +uint8_t *MemoryManager::MallocDynamicMem(size_t size, bool communication_mem) { + size_t align_size = 0; + if (communication_mem) { + align_size = GetCommunicationAlignSize(size); + } else { + align_size = GetCommonAlignSize(size); + } + uint64_t offset = dynamic_mem_offset_; + auto new_offset = dynamic_mem_offset_ + align_size; + if (new_offset > static_mem_offset_) { + MS_LOG(EXCEPTION) << "Out of memory!!! total[" << device_mem_size_ << "](dynamic[" << total_dynamic_size_ + << "] static[" << total_static_size_ << "])" + << " malloc [" << align_size << "] failed!"; + } + total_dynamic_size_ += align_size; + dynamic_mem_offset_ = new_offset; + + if (communication_mem) { + return device_mem_base_ + offset + kMemAlignSize; + } else { + return device_mem_base_ + offset; + } +} + +void MemoryManager::MallocOpMemory(const DeviceAddressPtr address, size_t size) { + auto device_ptr = AllocTensorMemDynamic(size); + MS_EXCEPTION_IF_NULL(device_ptr); + address->ptr_ = device_ptr; + address->mem_dynamic_alloc_ = true; +} + +void *MemoryManager::AllocTensorMemDynamic(size_t size) { + if (size == 0) { + MS_LOG(ERROR) << "AllocTensorMemDynamic size is 0."; + } + return nullptr; +} + +void MemoryManager::FreeTensorMemDynamic(void *device_ptr) { + if (device_ptr == nullptr) { + MS_LOG(ERROR) << "FreeTensorMemDynamic device_ptr is null."; + } +} +} // namespace device +} // namespace mindspore diff --git a/mindspore/ccsrc/device/memory_manager.h b/mindspore/ccsrc/device/memory_manager.h new file mode 100644 index 0000000000000000000000000000000000000000..2e47237defa747b80c5e971289ca92d4f6ae88a4 --- /dev/null +++ b/mindspore/ccsrc/device/memory_manager.h @@ -0,0 +1,71 @@ +/** + * Copyright 2019 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_MINDSPORE_CCSRC_DEVICE_MEMORY_MANAGER_H_ +#define MINDSPORE_MINDSPORE_CCSRC_DEVICE_MEMORY_MANAGER_H_ +#include +#include "pre_activate/mem_reuse/mem_reuse.h" +#include "pre_activate/mem_reuse/mem_reuse_allocator.h" +namespace mindspore { +namespace device { +const int kStaticMem = 0; +const int kDynamicMem = 1; +const int kReuseDynamicMem = 2; +const int kGetAllOuts = -1; +const uint64_t kMemAlignSize = 512; +using MemReuseUtilPtr = mindspore::memreuse::MemReuseUtilPtr; + +class MemoryManager { + public: + MemoryManager() = default; + virtual ~MemoryManager(); + + virtual void MallocDeviceMemory() = 0; + virtual void FreeDeviceMemory() = 0; + void ResetDynamicMemory() { + total_dynamic_size_ = 0; + dynamic_mem_offset_ = 0; + } + + void InitReuseDynamicMemory(session::KernelGraph *graph); + uint8_t *MallocOutputMem(const AnfNodePtr &node, size_t index, int flag, size_t size); + uint8_t *MallocWorkSpaceMem(const AnfNodePtr &node, size_t index, int flag, size_t size); + virtual uint8_t *MallocMem(int flag, size_t size); + + // Alloc memory use the dynamic memory pool. + virtual void *AllocTensorMemDynamic(size_t size); + // Free memory use the dynamic memory pool. + virtual void FreeTensorMemDynamic(void *device_ptr); + virtual void MallocOpMemory(const DeviceAddressPtr address, size_t size); + size_t GetCommonAlignSize(size_t input_size) const; + size_t GetCommunicationAlignSize(size_t input_size) const; + + protected: + virtual uint8_t *MallocStaticMem(size_t size, bool communication_mem); + virtual uint8_t *MallocDynamicMem(size_t size, bool communication_mem); + uint8_t *device_mem_base_{nullptr}; + uint8_t *device_mem_pool_base_{nullptr}; + uint64_t device_mem_size_{0}; + uint64_t device_mem_pool_size_{0}; + uint64_t dynamic_mem_offset_{0}; + uint64_t static_mem_offset_{0}; + size_t total_static_size_ = 0; + size_t total_dynamic_size_ = 0; + MemReuseUtilPtr mem_reuse_util_ptr_{nullptr}; +}; +} // namespace device +} // namespace mindspore +#endif // MINDSPORE_MINDSPORE_CCSRC_DEVICE_MEMORY_MANAGER_H_ diff --git a/mindspore/ccsrc/session/anf_runtime_algorithm.cc b/mindspore/ccsrc/session/anf_runtime_algorithm.cc index cc23dbbdd2f336eadc6cae2659bd75b36ec0b811..78922448afa09f0fa792cbf438b508f040406cdd 100644 --- a/mindspore/ccsrc/session/anf_runtime_algorithm.cc +++ b/mindspore/ccsrc/session/anf_runtime_algorithm.cc @@ -857,5 +857,15 @@ void AnfRuntimeAlgorithm::SetNodeInput(const CNodePtr &node, const AnfNodePtr &i MS_EXCEPTION_IF_NULL(input_node); node->set_input(index + 1, input_node); } + +bool AnfRuntimeAlgorithm::IsCommunicationOp(const AnfNodePtr &node) { + MS_EXCEPTION_IF_NULL(node); + auto kernel_name = AnfAlgo::GetCNodeName(node); + auto kernel_type = AnfAlgo::GetKernelType(node); + if (kernel_name == kAllReduceOpName || kernel_type == HCCL_KERNEL) { + return true; + } + return false; +} } // namespace session } // namespace mindspore diff --git a/mindspore/ccsrc/session/anf_runtime_algorithm.h b/mindspore/ccsrc/session/anf_runtime_algorithm.h index 2de68f0098939f8c02b2fc6c3ab096ac52363094..55650ac31e312a040878b46a7b6d71758e88bb6a 100644 --- a/mindspore/ccsrc/session/anf_runtime_algorithm.h +++ b/mindspore/ccsrc/session/anf_runtime_algorithm.h @@ -166,6 +166,7 @@ class AnfRuntimeAlgorithm { static bool IsFeatureMapInput(const AnfNodePtr &node, size_t input_index); // get real input index for some tbe ops which input order is different between me and tbe impl static size_t GetRealInputIndex(const AnfNodePtr &anf_node, const size_t cur_index); + static bool IsCommunicationOp(const AnfNodePtr &node); }; } // namespace session using AnfAlgo = session::AnfRuntimeAlgorithm; diff --git a/mindspore/ccsrc/session/gpu_session.cc b/mindspore/ccsrc/session/gpu_session.cc index 29330fb19384c58611eb6ace7767f6d90109fdb7..bbcf2228cceb1d72a96db0cccef34f703d2cb616 100644 --- a/mindspore/ccsrc/session/gpu_session.cc +++ b/mindspore/ccsrc/session/gpu_session.cc @@ -102,10 +102,6 @@ GraphId GPUSession::CompileGraph(const AnfNodePtrList &lst, const AnfNodePtrList graph->set_execution_order(execution_order); // Alloc memory, including static memory and dynamic memory AllocateMemory(graph.get()); - // Reset memory resource - auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_); - MS_EXCEPTION_IF_NULL(runtime_instance); - runtime_instance->FreeHostMemory(); return graph_id; } diff --git a/tests/ut/cpp/CMakeLists.txt b/tests/ut/cpp/CMakeLists.txt index 8d3f8a8138d94a8beb8c71ceb3958bde041b4e33..3c1351a85732106a8c7a2a48616b7079fc640166 100644 --- a/tests/ut/cpp/CMakeLists.txt +++ b/tests/ut/cpp/CMakeLists.txt @@ -85,6 +85,7 @@ file(GLOB_RECURSE MINDSPORE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "../../../mindspore/ccsrc/kernel/oplib/*.cc" "../../../mindspore/ccsrc/kernel/tbe/*.cc" "../../../mindspore/ccsrc/device/kernel_runtime.cc" + "../../../mindspore/ccsrc/device/memory_manager.cc" "../../../mindspore/ccsrc/device/kernel_runtime_manager.cc" "../../../mindspore/ccsrc/device/kernel_info.cc" "../../../mindspore/ccsrc/device/ascend/profiling/*.cc" @@ -92,6 +93,7 @@ file(GLOB_RECURSE MINDSPORE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "../../../mindspore/ccsrc/device/convert_tensor_utils.cc" "../../../mindspore/ccsrc/device/ascend/kernel_build_ascend.cc" "../../../mindspore/ccsrc/device/ascend/ascend_kernel_runtime.cc" + "../../../mindspore/ccsrc/device/ascend/ascend_memory_manager.cc" "../../../mindspore/ccsrc/device/ascend/ascend_device_address.cc" "../../../mindspore/ccsrc/device/ascend/ascend_memory_allocator.cc" "../../../mindspore/ccsrc/predict/generator/utils/ir_model_util.cc"