diff --git a/mindspore/ccsrc/device/ascend/ascend_device_address.cc b/mindspore/ccsrc/device/ascend/ascend_device_address.cc index c4b8717fa505bf95685fa873d76f32d409d49e8a..89f2263abb74e3f4ad23210107c219ffcf870389 100644 --- a/mindspore/ccsrc/device/ascend/ascend_device_address.cc +++ b/mindspore/ccsrc/device/ascend/ascend_device_address.cc @@ -303,12 +303,22 @@ bool AscendDeviceAddress::ConvertFormatAndSyncHostToDevice(const std::vector(ptr_) - kMemAlignSize; +} + AscendDeviceAddress::~AscendDeviceAddress() { if (ptr_ == nullptr) { return; } if (from_mem_pool_) { - AscendMemoryPool::GetInstance().FreeTensorMem(ptr_); + if (communication_ptr_ != nullptr) { + AscendMemoryPool::GetInstance().FreeTensorMem(communication_ptr_); + communication_ptr_ = nullptr; + } else { + AscendMemoryPool::GetInstance().FreeTensorMem(ptr_); + } ptr_ = nullptr; } } diff --git a/mindspore/ccsrc/device/ascend/ascend_device_address.h b/mindspore/ccsrc/device/ascend/ascend_device_address.h index 16b9f7817a95a237c7dd6ff0d7e4a93587bee404..4e560e30f48a497fef41740391637aa453251482 100644 --- a/mindspore/ccsrc/device/ascend/ascend_device_address.h +++ b/mindspore/ccsrc/device/ascend/ascend_device_address.h @@ -39,6 +39,7 @@ class AscendDeviceAddress : public DeviceAddress { bool SyncDeviceToHost(const std::vector &shape, size_t size, TypeId type, void *host_ptr) const override; bool SyncHostToDevice(const std::vector &shape, size_t size, TypeId type, const void *host_ptr) const override; DeviceAddressType DeviceType() const override { return DeviceAddressType::kAscend; } + void UpdateCommunicationAddress() override; #ifdef ENABLE_DUMP_E2E bool DumpMemToFile(bool dump_mode, const std::string &filepath, const std::string &host_fmt, const std::vector &host_shape, TypeId host_type) const; @@ -53,6 +54,7 @@ class AscendDeviceAddress : public DeviceAddress { bool ConvertFormatAndSyncHostToDevice(const std::vector &shape, size_t size, TypeId type, const void *host_ptr) const; void SyncStream() const; + uint8_t *communication_ptr_{nullptr}; }; using AscendDeviceAddressPtr = std::shared_ptr; } // namespace ascend diff --git a/mindspore/ccsrc/device/ascend/ascend_memory_manager.cc b/mindspore/ccsrc/device/ascend/ascend_memory_manager.cc index 42c611c3af0b54cf3f097c68858da675701604f9..a664232a2845c36bece59049d474aea33f115567 100644 --- a/mindspore/ccsrc/device/ascend/ascend_memory_manager.cc +++ b/mindspore/ccsrc/device/ascend/ascend_memory_manager.cc @@ -21,31 +21,22 @@ namespace mindspore { namespace device { namespace ascend { -constexpr uint64_t kAscendDeviceMemGB = 26; -constexpr uint64_t kAscendMemPoolGB = 4; +constexpr uint64_t kAscendDeviceMemGB = 30; constexpr uint64_t kMemSizeGB = 30; -constexpr uint64_t kMaxMemSizeGB = 30; constexpr uint64_t kAscendDeviceMemSize = (kAscendDeviceMemGB << kMemSizeGB); -constexpr uint64_t kAscendMemPoolSize = (kAscendMemPoolGB << kMemSizeGB); void AscendMemoryManager::MallocDeviceMemory() { auto context_mem = GetDeviceMemSizeFromContext(); device_mem_size_ = context_mem == 0 ? kAscendDeviceMemSize : context_mem; - static_mem_offset_ = device_mem_size_; - auto ret = rtMalloc(reinterpret_cast(&device_mem_base_), static_mem_offset_, RT_MEMORY_HBM); + dynamic_mem_offset_ = device_mem_size_; + auto ret = rtMalloc(reinterpret_cast(&device_mem_base_), dynamic_mem_offset_, RT_MEMORY_HBM); + if (ret != RT_ERROR_NONE) { - MS_EXCEPTION(DeviceProcessError) << "rtMalloc mem size[" << static_mem_offset_ << "] fail, ret[" << ret << "]"; + MS_EXCEPTION(DeviceProcessError) << "rtMalloc mem size[" << dynamic_mem_offset_ << "] fail, ret[" << ret << "]"; } - if (context_mem == 0) { - device_mem_pool_size_ = kAscendMemPoolSize; - ret = rtMalloc(reinterpret_cast(&device_mem_pool_base_), device_mem_pool_size_, RT_MEMORY_HBM); - if (ret != RT_ERROR_NONE) { - MS_EXCEPTION(DeviceProcessError) << "rtMalloc mem size[" << device_mem_pool_size_ << "] fail, ret[" << ret << "]"; - } - AscendMemoryPool::GetInstance().set_device_mem_pool_base(device_mem_pool_base_); - AscendMemoryPool::GetInstance().set_device_mem_pool_size(device_mem_pool_size_); - } + AscendMemoryPool::GetInstance().set_device_mem_pool_base(device_mem_base_); + AscendMemoryPool::GetInstance().set_graph_dynamic_mem_offset(dynamic_mem_offset_); } uint64_t AscendMemoryManager::GetDeviceMemSizeFromContext() { @@ -63,7 +54,7 @@ uint64_t AscendMemoryManager::GetDeviceMemSizeFromContext() { auto gb_str = variable_memory_max_size.substr(0, pos); auto gb_var = std::stoull(gb_str); MS_LOG(INFO) << "variable_memory_max_size(GB):" << gb_var; - if (gb_var > kMaxMemSizeGB || gb_var == 0) { + if (gb_var > kAscendDeviceMemGB || gb_var == 0) { MS_LOG(EXCEPTION) << "Invalid allocate memory size:" << gb_var << " which should be in (0-30]GB"; } return gb_var << kMemSizeGB; @@ -86,8 +77,60 @@ void AscendMemoryManager::FreeDeviceMemory() { } } +void AscendMemoryManager::ResetDynamicMemory() { + total_dynamic_size_ = 0; + dynamic_mem_offset_ = device_mem_size_; + AscendMemoryPool::GetInstance().set_graph_dynamic_mem_offset(dynamic_mem_offset_); +} + void *AscendMemoryManager::MallocMemFromMemPool(size_t size) { - return AscendMemoryPool::GetInstance().AllocTensorMem(size); + auto align_size = GetCommonAlignSize(size); + return AscendMemoryPool::GetInstance().AllocTensorMem(align_size); +} + +uint8_t *AscendMemoryManager::MallocStaticMem(size_t size, bool communication_mem) { + size_t align_size = 0; + if (communication_mem) { + align_size = GetCommunicationAlignSize(size); + } else { + align_size = GetCommonAlignSize(size); + } + if (communication_mem) { + // create protect area [kMemAlignSize -- data -- kMemAlignSize] + uint8_t *alloc_address = reinterpret_cast(AscendMemoryPool::GetInstance().AllocTensorMem(align_size)); + return alloc_address + kMemAlignSize; + } else { + return reinterpret_cast(AscendMemoryPool::GetInstance().AllocTensorMem(align_size)); + } +} + +uint8_t *AscendMemoryManager::MallocDynamicMem(size_t size, bool communication_mem) { + size_t align_size = 0; + if (communication_mem) { + align_size = GetCommunicationAlignSize(size); + } else { + align_size = GetCommonAlignSize(size); + } + if (dynamic_mem_offset_ < align_size) { + MS_LOG(EXCEPTION) << "Out of memory!!! total[" << device_mem_size_ << "] (dynamic[" << total_dynamic_size_ + << "]) malloc [" << align_size << "] failed!"; + } + auto new_offset = dynamic_mem_offset_ - align_size; + auto device_mem_pool_offset = AscendMemoryPool::GetInstance().device_mem_pool_offset(); + if (new_offset <= device_mem_pool_offset) { + MS_LOG(EXCEPTION) << "Out of memory!!! total[" << device_mem_size_ << "] (dynamic[" << total_dynamic_size_ + << "] memory pool[" << device_mem_pool_offset << "])" + << " malloc [" << align_size << "] failed!"; + } + total_dynamic_size_ += align_size; + dynamic_mem_offset_ = new_offset; + AscendMemoryPool::GetInstance().set_graph_dynamic_mem_offset(dynamic_mem_offset_); + if (communication_mem) { + // create protect area [kMemAlignSize -- data -- kMemAlignSize] + return device_mem_base_ + new_offset + kMemAlignSize; + } else { + return device_mem_base_ + new_offset; + } } } // namespace ascend } // namespace device diff --git a/mindspore/ccsrc/device/ascend/ascend_memory_manager.h b/mindspore/ccsrc/device/ascend/ascend_memory_manager.h index 7fdd8f553e5668e08fc766b0f3d53e15a3611d46..5b52412d781b3064474c6018a1445076c06cc44c 100644 --- a/mindspore/ccsrc/device/ascend/ascend_memory_manager.h +++ b/mindspore/ccsrc/device/ascend/ascend_memory_manager.h @@ -27,8 +27,13 @@ class AscendMemoryManager : public MemoryManager { void MallocDeviceMemory() override; void FreeDeviceMemory() override; + void ResetDynamicMemory() override; void *MallocMemFromMemPool(size_t size) override; + protected: + uint8_t *MallocStaticMem(size_t size, bool communication_mem) override; + uint8_t *MallocDynamicMem(size_t size, bool communication_mem) override; + private: uint8_t *device_mem_pool_base_{nullptr}; uint64_t device_mem_pool_size_{0}; diff --git a/mindspore/ccsrc/device/ascend/ascend_memory_pool.cc b/mindspore/ccsrc/device/ascend/ascend_memory_pool.cc index 69c6dca57601d91969e6d8daad5e81de962e5c96..f325046486aaf8d02f8bb3c97fda3ba2b52732ff 100644 --- a/mindspore/ccsrc/device/ascend/ascend_memory_pool.cc +++ b/mindspore/ccsrc/device/ascend/ascend_memory_pool.cc @@ -22,45 +22,54 @@ namespace mindspore { namespace device { namespace ascend { size_t AscendMemoryPool::AllocDeviceMem(size_t size, DeviceMemPtr *addr) { - if (has_malloc_) { - MS_LOG(EXCEPTION) << "Has alloc memory pool memory !"; + if (size == 0) { + MS_LOG(EXCEPTION) << "Can not alloc memory size(0) in memory pool !"; } - if (size == 0 || size > free_mem_size_) { - MS_LOG(EXCEPTION) << "Failed to alloc memory pool memory !"; + if (device_mem_pool_offset_ + size >= graph_dynamic_mem_offset_) { + MS_LOG(EXCEPTION) << "Failed to alloc memory pool memory, the current device_mem_pool_offset_ [" + << device_mem_pool_offset_ << "], current graph_dynamic_mem_offset_ " << graph_dynamic_mem_offset_ + << "], need memory size [" << size << "]"; } - *addr = device_mem_pool_base_; + *addr = device_mem_pool_base_ + device_mem_pool_offset_; + device_mem_pool_offset_ += size; if (*addr == nullptr) { - MS_LOG(EXCEPTION) << "Device memory pool base is nullptr, failed to alloc memory pool memory!"; + MS_LOG(EXCEPTION) << "Alloc device address is nullptr, failed to alloc memory pool memory!"; } - has_malloc_ = true; - free_mem_size_ -= size; return size; } bool AscendMemoryPool::FreeDeviceMem(const DeviceMemPtr &addr) { MS_EXCEPTION_IF_NULL(addr); - has_malloc_ = false; - free_mem_size_ = total_mem_size_; return true; } size_t AscendMemoryPool::AlignMemorySize(size_t size) const { if (size == 0) { - return DYNAMIC_MEM_ALIGN_SIZE; + MS_LOG(EXCEPTION) << "The align memory size is a zero !"; } - return ((size + DYNAMIC_MEM_ALIGN_SIZE + 31) / DYNAMIC_MEM_ALIGN_SIZE) * DYNAMIC_MEM_ALIGN_SIZE; + return size; } -size_t AscendMemoryPool::mem_alloc_unit_size() const { return free_mem_size_ - 512; } - void AscendMemoryPool::set_device_mem_pool_base(uint8_t *device_mem_pool_base) { MS_EXCEPTION_IF_NULL(device_mem_pool_base); device_mem_pool_base_ = device_mem_pool_base; } -size_t AscendMemoryPool::free_mem_size() { return free_mem_size_; } +void AscendMemoryPool::set_graph_dynamic_mem_offset(uint64_t graph_dynamic_mem_offset) { + graph_dynamic_mem_offset_ = graph_dynamic_mem_offset; +} + +uint64_t AscendMemoryPool::device_mem_pool_offset() const { return device_mem_pool_offset_; } + +size_t AscendMemoryPool::free_mem_size() { + if (graph_dynamic_mem_offset_ < device_mem_pool_offset_) { + MS_LOG(EXCEPTION) << "graph dynamic mem offset [" << graph_dynamic_mem_offset_ + << "] less than device mem pool offset [" << device_mem_pool_offset_ << "]!"; + } + return graph_dynamic_mem_offset_ - device_mem_pool_offset_; +} -size_t AscendMemoryPool::total_mem_size() { return total_mem_size_; } +size_t AscendMemoryPool::total_mem_size() { return graph_dynamic_mem_offset_ == 0 ? 0 : graph_dynamic_mem_offset_ - 1; } } // namespace ascend } // namespace device } // namespace mindspore diff --git a/mindspore/ccsrc/device/ascend/ascend_memory_pool.h b/mindspore/ccsrc/device/ascend/ascend_memory_pool.h index 7fa3ebc23e8996ec2b5a52a4fc68926e5f7c59c9..ef02f21cde4284248206969907f27d9138c63a4d 100644 --- a/mindspore/ccsrc/device/ascend/ascend_memory_pool.h +++ b/mindspore/ccsrc/device/ascend/ascend_memory_pool.h @@ -32,11 +32,9 @@ class AscendMemoryPool : public DynamicMemPoolBestFit { size_t AllocDeviceMem(size_t size, DeviceMemPtr *addr) override; bool FreeDeviceMem(const DeviceMemPtr &addr) override; void set_device_mem_pool_base(uint8_t *device_mem_pool_base); - void set_device_mem_pool_size(uint64_t device_mem_pool_size) { - device_mem_pool_size_ = device_mem_pool_size; - free_mem_size_ = device_mem_pool_size_; - total_mem_size_ = free_mem_size_; - } + void set_graph_dynamic_mem_offset(uint64_t graph_dynamic_mem_offset); + + uint64_t device_mem_pool_offset() const; size_t free_mem_size() override; size_t total_mem_size() override; @@ -48,16 +46,12 @@ class AscendMemoryPool : public DynamicMemPoolBestFit { protected: // The real size by memory alloc aligned. size_t AlignMemorySize(size_t size) const override; - // Get the minimum memory unit size using for dynamic extend. - size_t mem_alloc_unit_size() const override; private: AscendMemoryPool() = default; - bool has_malloc_{false}; uint8_t *device_mem_pool_base_{nullptr}; - uint64_t device_mem_pool_size_{0}; - size_t free_mem_size_{0}; - size_t total_mem_size_{0}; + uint64_t device_mem_pool_offset_{0}; + uint64_t graph_dynamic_mem_offset_{0}; }; } // namespace ascend } // namespace device diff --git a/mindspore/ccsrc/device/device_address.h b/mindspore/ccsrc/device/device_address.h index 0447cc253977cb0d1cf76e3b6ecf9999393b35cb..f4597f6f460273aeea708357bb292b6175d053d3 100644 --- a/mindspore/ccsrc/device/device_address.h +++ b/mindspore/ccsrc/device/device_address.h @@ -64,6 +64,7 @@ class DeviceAddress { std::string format() const { return format_; } TypeId type_id() const { return type_id_; } void set_host_shape(const std::vector &shape) { host_shape_ = shape; } + virtual void UpdateCommunicationAddress() {} virtual void set_status(DeviceAddressStatus status) {} virtual DeviceAddressStatus status() const { return DeviceAddressStatus::kInDevice; } virtual DeviceAddressType DeviceType() const { return DeviceAddressType::kUnknown; } diff --git a/mindspore/ccsrc/device/kernel_runtime.cc b/mindspore/ccsrc/device/kernel_runtime.cc index 27cf1dfc923f37de4d198fbfc74a20f6e9b241c4..7efb4702e08b3b9057bd3ea4cc7c712556a1ff1d 100644 --- a/mindspore/ccsrc/device/kernel_runtime.cc +++ b/mindspore/ccsrc/device/kernel_runtime.cc @@ -431,6 +431,10 @@ void KernelRuntime::AssignCommunicationNodeOutputMem(int flag, const AnfNodePtr std::string output_format = AnfAlgo::GetOutputFormat(node, j); auto output_type = AnfAlgo::GetOutputDeviceDataType(node, j); auto address = CreateDeviceAddress(output_ptr, output_sizes[j], output_format, output_type); + MS_EXCEPTION_IF_NULL(address); + if (AnfAlgo::IsCommunicationOp(node) && context_ptr->enable_hccl()) { + address->UpdateCommunicationAddress(); + } AnfAlgo::SetOutputAddr(address, j, node.get()); output_ptr += align_size_list[j]; } @@ -480,6 +484,8 @@ void KernelRuntime::AssignCommunicationNodeInputMem(const AnfNodePtr &node) { } void KernelRuntime::AssignNodeOutputMem(int flag, const AnfNodePtr &node, int index) { + auto context_ptr = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(context_ptr); MS_EXCEPTION_IF_NULL(node); MS_EXCEPTION_IF_NULL(mem_manager_); if (AnfAlgo::IsGetNext(NOT_NULL(node)) && flag == kReuseDynamicMem) { @@ -509,7 +515,11 @@ void KernelRuntime::AssignNodeOutputMem(int flag, const AnfNodePtr &node, int in std::string output_format = AnfAlgo::GetOutputFormat(node, i); auto output_type = AnfAlgo::GetOutputDeviceDataType(node, i); auto device_address = CreateDeviceAddress(ptr, output_sizes[i], output_format, output_type); + MS_EXCEPTION_IF_NULL(device_address); device_address->set_host_shape(trans::GetRuntimePaddingShape(node, i)); + if (AnfAlgo::IsCommunicationOp(node) && context_ptr->enable_hccl()) { + device_address->UpdateCommunicationAddress(); + } AnfAlgo::SetOutputAddr(device_address, i, node.get()); } } diff --git a/mindspore/ccsrc/device/memory_manager.h b/mindspore/ccsrc/device/memory_manager.h index be250e0f3f3f395e8beea9dbccabaeb80925dc3a..fb9c539adb17767219bcf16354670d83df10008b 100644 --- a/mindspore/ccsrc/device/memory_manager.h +++ b/mindspore/ccsrc/device/memory_manager.h @@ -36,7 +36,7 @@ class MemoryManager { virtual void MallocDeviceMemory() = 0; virtual void FreeDeviceMemory() = 0; - void ResetDynamicMemory() { + virtual void ResetDynamicMemory() { total_dynamic_size_ = 0; dynamic_mem_offset_ = 0; } diff --git a/mindspore/ccsrc/pre_activate/mem_reuse/mem_dynamic_allocator.cc b/mindspore/ccsrc/pre_activate/mem_reuse/mem_dynamic_allocator.cc index 095f8f6495a3261138e1418d85677365a49425f0..7c5e87b128ffdea2592bf9db5602cec5bb88582c 100644 --- a/mindspore/ccsrc/pre_activate/mem_reuse/mem_dynamic_allocator.cc +++ b/mindspore/ccsrc/pre_activate/mem_reuse/mem_dynamic_allocator.cc @@ -184,14 +184,16 @@ DynamicMemBlockPtr DynamicMemPoolBestFit::FindMemBlock(const DeviceMemPtr device if (iter != global_mem_block_list_.begin()) { return *(--iter); } - MS_LOG(ERROR) << "Can't find the mem_block of the device address[" << device_addr << "]."; return nullptr; } void DynamicMemPoolBestFit::FreeTensorMem(const DeviceMemPtr device_addr) { MS_EXCEPTION_IF_NULL(device_addr); auto mem_block = FindMemBlock(device_addr); - MS_EXCEPTION_IF_NULL(mem_block); + if (mem_block == nullptr) { + MS_LOG(WARNING) << "Can't find the mem_block of the device address[" << device_addr << "]."; + return; + } CombineMemBuf(mem_block, device_addr); }