提交 5e2f440e 编写于 作者: M mindspore-ci-bot 提交者: Gitee

!1271 refactor of memreuse allocator to adapt the control stream

Merge pull request !1271 from yangjie159/refactor_memreuse_allocator
...@@ -47,7 +47,7 @@ std::vector<int> KernelDef::GetOutputRefIndexs() const { ...@@ -47,7 +47,7 @@ std::vector<int> KernelDef::GetOutputRefIndexs() const {
return output_ref_indexs; return output_ref_indexs;
} }
std::vector<int> KernelDef::GetWkRefIndexs() const { std::vector<int> KernelDef::GetWorkspaceRefIndexs() const {
std::vector<int> wk_ref_indexs; std::vector<int> wk_ref_indexs;
if (wk_space_.empty()) { if (wk_space_.empty()) {
return wk_ref_indexs; return wk_ref_indexs;
......
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
#include <map> #include <map>
#include <string> #include <string>
#include <memory> #include <memory>
#include <set>
namespace mindspore { namespace mindspore {
namespace memreuse { namespace memreuse {
...@@ -73,13 +74,15 @@ class KernelDef { ...@@ -73,13 +74,15 @@ class KernelDef {
KernelRefCountPtrList output_refs() const { return output_refs_; } KernelRefCountPtrList output_refs() const { return output_refs_; }
std::vector<int> GetInputRefIndexs() const; std::vector<int> GetInputRefIndexs() const;
std::vector<int> GetOutputRefIndexs() const; std::vector<int> GetOutputRefIndexs() const;
std::vector<int> GetWkRefIndexs() const; std::vector<int> GetWorkspaceRefIndexs() const;
void set_stream_id(uint32_t stream_id) { stream_id_ = stream_id; } void set_stream_id(uint32_t stream_id) { stream_id_ = stream_id; }
uint32_t stream_id() const { return stream_id_; } uint32_t stream_id() const { return stream_id_; }
void set_kernel_name(const std::string &kernel_name) { kernel_name_ = kernel_name; } void set_kernel_name(const std::string &kernel_name) { kernel_name_ = kernel_name; }
std::string kernel_name() const { return kernel_name_; } std::string kernel_name() const { return kernel_name_; }
void set_scope_full_name(const std::string &scop_name) { scop_full_name_ = scop_name; } void set_scope_full_name(const std::string &scop_name) { scop_full_name_ = scop_name; }
std::string scope_full_name() const { return scop_full_name_; } std::string scope_full_name() const { return scop_full_name_; }
void InsertInputKernel(const std::shared_ptr<KernelDef> &input_kernel) { input_kernels_.insert(input_kernel); }
const std::set<std::shared_ptr<KernelDef>> &input_kernels() { return input_kernels_; }
private: private:
std::string scop_full_name_; std::string scop_full_name_;
...@@ -87,6 +90,7 @@ class KernelDef { ...@@ -87,6 +90,7 @@ class KernelDef {
uint32_t stream_id_{0}; uint32_t stream_id_{0};
KernelRefCountPtrList input_refs_; KernelRefCountPtrList input_refs_;
KernelRefCountPtrList output_refs_; KernelRefCountPtrList output_refs_;
std::set<std::shared_ptr<KernelDef>> input_kernels_;
}; };
using KernelDefPtr = std::shared_ptr<KernelDef>; using KernelDefPtr = std::shared_ptr<KernelDef>;
} // namespace memreuse } // namespace memreuse
......
...@@ -245,6 +245,34 @@ void MemReuseUtil::SetKernelDefMap() { ...@@ -245,6 +245,34 @@ void MemReuseUtil::SetKernelDefMap() {
kernel_def_ptr->set_input_refs(kernel_def_ptr->inputs_[key]); kernel_def_ptr->set_input_refs(kernel_def_ptr->inputs_[key]);
kernel_def_ptr->set_output_refs(kernel_def_ptr->outputs_[key]); kernel_def_ptr->set_output_refs(kernel_def_ptr->outputs_[key]);
kernel_def_ptr_list_.push_back(kernel_def_ptr); kernel_def_ptr_list_.push_back(kernel_def_ptr);
kernel_map_[key] = kernel_def_ptr;
}
SetKernelDefInputs();
}
void MemReuseUtil::SetKernelDefInputs() {
for (const auto &kernel : graph_->execution_order()) {
auto key = kernel.get();
// find kernel_def according to cnode addr
auto iter = kernel_map_.find(key);
if (iter == kernel_map_.end()) {
MS_LOG(EXCEPTION) << "kernel [" << kernel->fullname_with_scope() << "] is not init.";
}
auto kernel_def = iter->second;
for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) {
auto ref_ptr = GetKernelInputRef(kernel, i);
if (ref_ptr != nullptr) {
// set the inputs of this kernel_def
auto input_node = AnfAlgo::GetInputNode(kernel, i);
auto input = AnfAlgo::VisitKernel(input_node, 0);
auto input_key = (input.first).get();
auto input_iter = kernel_map_.find(input_key);
if (input_iter == kernel_map_.end()) {
MS_LOG(EXCEPTION) << "kernel [" << (input.first)->fullname_with_scope() << "] is not init.";
}
kernel_def->InsertInputKernel(input_iter->second);
}
}
} }
} }
......
...@@ -61,6 +61,7 @@ class MemReuseUtil { ...@@ -61,6 +61,7 @@ class MemReuseUtil {
void SetInputMap(const CNodePtr &kernel, KernelDef *kernel_def_ptr); void SetInputMap(const CNodePtr &kernel, KernelDef *kernel_def_ptr);
void SetOutputMap(const CNodePtr &kernel, KernelDef *kernel_def_ptr); void SetOutputMap(const CNodePtr &kernel, KernelDef *kernel_def_ptr);
void SetWkMap(const CNodePtr &kernel, KernelDef *kernel_def_ptr); void SetWkMap(const CNodePtr &kernel, KernelDef *kernel_def_ptr);
void SetKernelDefInputs();
void SetReuseRefCount(); void SetReuseRefCount();
// Set the reference count of graph output specially. // Set the reference count of graph output specially.
void SetGraphOutputRefCount(); void SetGraphOutputRefCount();
...@@ -94,6 +95,8 @@ class MemReuseUtil { ...@@ -94,6 +95,8 @@ class MemReuseUtil {
size_t total_workspace_size_ = 0; size_t total_workspace_size_ = 0;
size_t total_reuseworkspace_size_ = 0; size_t total_reuseworkspace_size_ = 0;
uint8_t *mem_base_{nullptr}; uint8_t *mem_base_{nullptr};
// kernel_map_: key is the AnfNodePtr addr, value is the KernelDef
std::map<KernelKey, KernelDefPtr> kernel_map_;
}; };
using MemReuseUtilPtr = std::shared_ptr<MemReuseUtil>; using MemReuseUtilPtr = std::shared_ptr<MemReuseUtil>;
} // namespace memreuse } // namespace memreuse
......
...@@ -29,31 +29,30 @@ ...@@ -29,31 +29,30 @@
#include <unordered_map> #include <unordered_map>
#include <unordered_set> #include <unordered_set>
#include <set> #include <set>
#include <queue>
#include "pre_activate/mem_reuse/kernel_refcount.h" #include "pre_activate/mem_reuse/kernel_refcount.h"
#include "pre_activate/mem_reuse/mem_reuse.h" #include "pre_activate/mem_reuse/mem_reuse.h"
#include "pre_activate/mem_reuse/stream_reuse.h"
namespace mindspore { namespace mindspore {
namespace memreuse { namespace memreuse {
static constexpr int kWkIndexFactor = -1000; static constexpr int kWorkspaceIndexFactor = -1000;
static constexpr int kDyFac = -1; static constexpr int kDynamicMem = -1;
static constexpr int kWkFac = 1; static constexpr int kWorkspaceMem = 1;
static constexpr size_t kTotalSize = 0; static constexpr size_t kTotalSize = 0;
enum Status { kUnused, kReused }; enum Status { kUnused, kReused };
class Membuf { class Membuf {
public: public:
Membuf() = default; Membuf() = default;
Membuf(uint32_t stream_id, Status status, size_t size, size_t offset, int index) Membuf(Status status, size_t size, size_t offset, int index, const KernelDefPtr &used_kernel)
: stream_id_(stream_id), status_(status), size_(size), offset_(offset), index_(index) {} : status_(status), size_(size), offset_(offset), index_(index), used_kernel_(used_kernel) {}
~Membuf() = default; ~Membuf() = default;
// Memory block status flags // Memory block status flags
std::set<uint32_t> called_stream_ids_;
uint32_t stream_id_{0};
Status status_ = kUnused; Status status_ = kUnused;
size_t size_{0}; size_t size_{0};
size_t offset_{0}; size_t offset_{0};
// Store the tensor index stored in this memory block at a certain moment // Store the tensor index stored in this memory block at a certain moment
int index_{0}; int index_{0};
KernelDefPtr used_kernel_;
}; };
using MembufPtr = std::shared_ptr<Membuf>; using MembufPtr = std::shared_ptr<Membuf>;
...@@ -61,24 +60,45 @@ class BestFitMemReuse { ...@@ -61,24 +60,45 @@ class BestFitMemReuse {
public: public:
BestFitMemReuse() = default; BestFitMemReuse() = default;
~BestFitMemReuse() { membuf_ptr_list_.clear(); } ~BestFitMemReuse() { membuf_ptr_list_.clear(); }
// Init all information need by memory reuse /**
* Init all information need by memory reuse
* @param mem_reuse_util_ptr, initialize in the memreuse.cc
*/
void InitMemReuseInfo(const MemReuseUtil *mem_reuse_util_ptr); void InitMemReuseInfo(const MemReuseUtil *mem_reuse_util_ptr);
bool CheckMembufIndx(const std::vector<MembufPtr> &membuf_ptr_list, size_t check_idx) const; void CheckMembufIndx(size_t check_idx) const;
bool IsMembufListEmpty(const std::vector<MembufPtr> &membuf_ptr_list) const; void AssignNodeWorkspaceOffset();
void AssignNodeWkOffset(const KernelDef *kernel_def_ptr); void ReleasePreNodeWorkspace(const KernelDef *kernel_def_ptr);
void ReleasePreNodeWkSpace(const KernelDef *kernel_def_ptr); /**
// void assign node output tensor memory offset * Assign output tensor memory offset of current kernel
void AssignNodeOutputOffset(const KernelDef *kernel_def_ptr); */
void ReleaseParallStream(); void AssignNodeOutputOffset();
// update node input tensor refcount, and membuf list status /**
void UpdateNodeInputAndMembuf(const KernelDef *kernel_def_ptr); * Update input tensor's status of current kernel, and the status of membuf used by current kernel
// check node output tensor which refcount is equal to zero */
void ReleaseNodeUnusedOutput(const KernelDef *kernel_def_ptr); void UpdateNodeInputAndMembuf();
// If there are memory blocks that can be reused /**
* Check whether to release the kernel output tensor which refcount is equal to zero
*/
void ReleaseNodeUnusedOutput();
/**
* Reuse the exist membuf if possible
* @param tensor_desc, the output tensor of current kernel
* @param membuf_index, the index of membuf to be reused
* @param flag
*/
void ReuseExistMembuf(KernelRefCount *tensor_desc, size_t membuf_index, int flag); void ReuseExistMembuf(KernelRefCount *tensor_desc, size_t membuf_index, int flag);
// Save memory blocks that can be reused to the map /**
* Get the membuf that can be reused
* @param tensor_size, the size of the tensor ready to assign memory offset
* @return membuf map, key: the membuf size, value: the membuf index
*/
std::map<size_t, size_t> GetReusableMembufMap(size_t tensor_size); std::map<size_t, size_t> GetReusableMembufMap(size_t tensor_size);
// Update the status of the reused memory block /**
* Update the status of the reused memory block
* @param tensor_desc, the tensor ready to assign memory
* @param membuf, the membuf to be reused
* @param flag, distinguish dynamic memory and workspace
*/
void UpdateMembufInfo(KernelRefCount *tensor_desc, Membuf *membuf, int flag); void UpdateMembufInfo(KernelRefCount *tensor_desc, Membuf *membuf, int flag);
// If the size of the memory block is greater than the size of the tensor, split the extra memory // If the size of the memory block is greater than the size of the tensor, split the extra memory
void SplitMembuf(const KernelRefCount *tensor_desc, size_t membuf_index); void SplitMembuf(const KernelRefCount *tensor_desc, size_t membuf_index);
...@@ -88,30 +108,39 @@ class BestFitMemReuse { ...@@ -88,30 +108,39 @@ class BestFitMemReuse {
void AddNewMembufPtr(KernelRefCount *tensor_desc, int flag); void AddNewMembufPtr(KernelRefCount *tensor_desc, int flag);
// Merge unused membuf // Merge unused membuf
void ReleaseMembuf(size_t tensor_index, int flag); void ReleaseMembuf(size_t tensor_index, int flag);
bool HasParallelId(const std::set<uint32_t> &called_ids, uint32_t curr_id);
void MergeCalledIds(const Membuf *membuf_target, Membuf *membuf);
// Memory address alignment 512 // Memory address alignment 512
size_t AlignMemorySize(size_t size) const; size_t AlignMemorySize(size_t size) const;
int GetFacIdx(size_t real_idx, int flag = kDyFac) const; int GetRealIndex(size_t index, int flag = kDynamicMem) const;
int GetRealIdx(int fac_idx, int flag = kDyFac) const; size_t GetTensorIndex(int index) const;
size_t FindIndx(const std::vector<MembufPtr> &membuf_ptr_list, int fac_idx) const; size_t GetWorkspaceIndex(int index) const;
void CheckTensorIndex(int tensor_index) const;
// Memory reuse main program entry // Memory reuse main program entry
void Reuse(const MemReuseUtil *mem_reuse_util_ptr); void Reuse(const MemReuseUtil *mem_reuse_util_ptr);
// Get the total memory that needs to be applied eventually // Get the total memory that needs to be applied eventually
size_t GetAllocatedSize(); size_t GetAllocatedSize();
// If the target stream can be reused by current stream
bool IsReusableStream(uint32_t curr_stream_id, uint32_t target_stream_id);
// return false, when the node output cannot be released // return false, when the node output cannot be released
bool IsRelease(const std::string &kernel_name); bool IsRelease();
/**
* determine if the kernel_curr can reuse the output tensor add of kernel_prev
* @param kernel_curr, current kernel
* @param kernel_prev, the membuf used by this kernel
* @return bool
*/
bool IsUsable(const KernelDefPtr &kernel_curr, const KernelDefPtr &kernel_prev);
/**
* init the dependence of all kernels in the graph
*/
void InitKernelDependence();
// set tensor_def and op_def // set tensor_def and op_def
void set_tensor_ptr_list(const std::vector<KernelRefCountPtr> &tensor_ptr_list) { void set_tensor_ptr_list(const std::vector<KernelRefCountPtr> &tensor_ptr_list) {
tensor_ptr_list_ = tensor_ptr_list; tensor_ptr_list_ = tensor_ptr_list;
} }
void set_workspace_ptr_list(const std::vector<KernelRefCountPtr> &workspace_ptr_list) {
wk_tensor_list_ = workspace_ptr_list;
}
void set_op_ptr_list(const std::vector<KernelDefPtr> &op_ptr_list) { op_ptr_list_ = op_ptr_list; } void set_op_ptr_list(const std::vector<KernelDefPtr> &op_ptr_list) { op_ptr_list_ = op_ptr_list; }
private: private:
uint32_t current_stream_id_{0}; KernelDefPtr current_kernel_;
// Save all tensor information // Save all tensor information
std::vector<KernelRefCountPtr> tensor_ptr_list_; std::vector<KernelRefCountPtr> tensor_ptr_list_;
std::vector<KernelRefCountPtr> wk_tensor_list_; std::vector<KernelRefCountPtr> wk_tensor_list_;
...@@ -119,7 +148,8 @@ class BestFitMemReuse { ...@@ -119,7 +148,8 @@ class BestFitMemReuse {
std::vector<KernelDefPtr> op_ptr_list_; std::vector<KernelDefPtr> op_ptr_list_;
// Memory block information sequence, temporary variables // Memory block information sequence, temporary variables
std::vector<MembufPtr> membuf_ptr_list_; std::vector<MembufPtr> membuf_ptr_list_;
std::unordered_map<uint32_t, std::unordered_set<uint32_t>> parallel_streams_map_; // kernel_front_map_, key: the kernel_def, value: kernels before this kernel_def
std::map<KernelDefPtr, std::set<KernelDefPtr>> kernel_front_map_;
}; };
} // namespace memreuse } // namespace memreuse
} // namespace mindspore } // namespace mindspore
......
...@@ -19,8 +19,6 @@ ...@@ -19,8 +19,6 @@
#include <vector> #include <vector>
#include <utility> #include <utility>
#include <string> #include <string>
#include <unordered_map>
#include <unordered_set>
namespace mindspore { namespace mindspore {
namespace memreuse { namespace memreuse {
...@@ -188,6 +186,27 @@ void MemReuseChecker::CheckMemReuseIR(const KernelRefCountPtrList &total_refs_li ...@@ -188,6 +186,27 @@ void MemReuseChecker::CheckMemReuseIR(const KernelRefCountPtrList &total_refs_li
ofs.close(); ofs.close();
} }
void MemReuseChecker::ExportKernelDependence() {
std::string filename = "./memreuse_dependence.ir";
std::ofstream ofs(filename);
if (!ofs.is_open()) {
MS_LOG(ERROR) << "Open file [" << filename << "] failed!";
return;
}
size_t i = 0;
for (const auto &kernel_front : kernel_front_map_) {
auto kernel = kernel_front.first;
auto front = kernel_front.second;
ofs << "[" << i++ << "] " << kernel->scope_full_name() << "\n";
for (const auto &node : front) {
ofs << node->scope_full_name() << "\n";
}
ofs << "\n\n";
}
ofs.close();
}
bool MemReuseChecker::CheckGraphOutputAssigned(const session::KernelGraph *graph) { bool MemReuseChecker::CheckGraphOutputAssigned(const session::KernelGraph *graph) {
// set real graph output node to be special who's refcount equal kMaxRefCount // set real graph output node to be special who's refcount equal kMaxRefCount
for (const auto &output : graph->outputs()) { for (const auto &output : graph->outputs()) {
...@@ -393,7 +412,7 @@ void MemReuseChecker::CheckNormalIR(const session::KernelGraph *graph) { ...@@ -393,7 +412,7 @@ void MemReuseChecker::CheckNormalIR(const session::KernelGraph *graph) {
void MemReuseChecker::SetMembuInfos(const KernelDef *op_def, const std::vector<MembufPtr> &membuf_ptr_list) { void MemReuseChecker::SetMembuInfos(const KernelDef *op_def, const std::vector<MembufPtr> &membuf_ptr_list) {
std::vector<MembufPtr> curr_mem_infos; std::vector<MembufPtr> curr_mem_infos;
for (const auto &mem : membuf_ptr_list) { for (const auto &mem : membuf_ptr_list) {
auto mem_checker = std::make_shared<Membuf>(mem->stream_id_, mem->status_, mem->size_, mem->offset_, mem->index_); auto mem_checker = std::make_shared<Membuf>(mem->status_, mem->size_, mem->offset_, mem->index_, mem->used_kernel_);
curr_mem_infos.push_back(mem_checker); curr_mem_infos.push_back(mem_checker);
} }
membuf_all_infos_.push_back(curr_mem_infos); membuf_all_infos_.push_back(curr_mem_infos);
...@@ -407,7 +426,7 @@ void MemReuseChecker::SetAddNewMembuInfos(const KernelDef *op_def, const std::ve ...@@ -407,7 +426,7 @@ void MemReuseChecker::SetAddNewMembuInfos(const KernelDef *op_def, const std::ve
std::vector<MembufPtr> add_new_curr_mem; std::vector<MembufPtr> add_new_curr_mem;
for (const auto &mem : membuf_ptr_list) { for (const auto &mem : membuf_ptr_list) {
auto mem_checker = std::make_shared<Membuf>(mem->stream_id_, mem->status_, mem->size_, mem->offset_, mem->index_); auto mem_checker = std::make_shared<Membuf>(mem->status_, mem->size_, mem->offset_, mem->index_, mem->used_kernel_);
add_new_curr_mem.push_back(mem_checker); add_new_curr_mem.push_back(mem_checker);
} }
add_new_mem_infos_.push_back(add_new_curr_mem); add_new_mem_infos_.push_back(add_new_curr_mem);
...@@ -424,11 +443,11 @@ void MemReuseChecker::ExportMembufInfoIR() { ...@@ -424,11 +443,11 @@ void MemReuseChecker::ExportMembufInfoIR() {
if (!ofs.is_open()) { if (!ofs.is_open()) {
MS_LOG(ERROR) << "Open file [" << ir_file_name << "] failed!"; MS_LOG(ERROR) << "Open file [" << ir_file_name << "] failed!";
} }
ofs << "total_ori_static_size:" << total_ori_static_size_ << "\n"; ofs << "Total static size:\t" << total_ori_static_size_ << "\n";
ofs << "total_ori_weight_size:" << total_ori_input_size_ << "\n"; ofs << "Graph inputs size:\t" << total_ori_input_size_ << "\n";
ofs << "total_ori_constant_size:" << total_ori_value_size_ << "\n"; ofs << "Value nodes size:\t" << total_ori_value_size_ << "\n";
ofs << "total_ori_dy_size:" << total_ori_dy_size_ << "\n"; ofs << "Total dynamic size:\t" << total_ori_dy_size_ << "\n";
ofs << "total_ori_wkspace_size:" << total_ori_wkspace_size_ << "\n"; ofs << "Total workspace size:\t" << total_ori_wkspace_size_ << "\n";
// get last membuf_list // get last membuf_list
if (membuf_all_infos_.empty()) { if (membuf_all_infos_.empty()) {
return; return;
...@@ -438,8 +457,10 @@ void MemReuseChecker::ExportMembufInfoIR() { ...@@ -438,8 +457,10 @@ void MemReuseChecker::ExportMembufInfoIR() {
auto checker_size = SizeToLong(membuf->size_); auto checker_size = SizeToLong(membuf->size_);
total_reuse_size += checker_size; total_reuse_size += checker_size;
} }
ofs << "total_reuse_size:" << total_reuse_size << "\n"; ofs << "After reuse size:\t" << total_reuse_size << "\n\n";
size_t i = 0; size_t i = 0;
std::vector<size_t> each_node_used_size;
std::vector<size_t> each_node_allocated_size;
for (const auto &curr_membuf_list : membuf_all_infos_) { for (const auto &curr_membuf_list : membuf_all_infos_) {
ofs << all_split_names_.at(i) << "\n"; ofs << all_split_names_.at(i) << "\n";
++i; ++i;
...@@ -449,17 +470,42 @@ void MemReuseChecker::ExportMembufInfoIR() { ...@@ -449,17 +470,42 @@ void MemReuseChecker::ExportMembufInfoIR() {
<< "tensor_idex\t" << "tensor_idex\t"
<< "mem_size\t" << "mem_size\t"
<< "mem_head\t" << "mem_head\t"
<< "mem_tail\n"; << "mem_tail\t"
<< "used_kernel\n";
size_t curr_used = 0;
size_t curr_allocated = 0;
for (size_t j = 0; j < curr_membuf_list.size(); ++j) { for (size_t j = 0; j < curr_membuf_list.size(); ++j) {
auto membuf = curr_membuf_list.at(j); auto membuf = curr_membuf_list.at(j);
auto used_kernel = membuf->used_kernel_->scope_full_name();
ofs << "&" << j << "\t" ofs << "&" << j << "\t"
<< "streamID[@" << membuf->stream_id_ << "]" << "streamID[@" << membuf->used_kernel_->stream_id() << "]"
<< "\t" << "\t"
<< "#" << static_cast<int>(membuf->status_) << "\t%" << membuf->index_ << "T" << "#" << static_cast<int>(membuf->status_) << "\t%" << membuf->index_ << "T"
<< "\t" << membuf->size_ << "\t" << membuf->offset_ << "\t" << membuf->offset_ + membuf->size_ << "\n"; << "\t" << membuf->size_ << "\t" << membuf->offset_ << "\t" << membuf->offset_ + membuf->size_ << "\t"
<< GetSplitName(used_kernel) << "\n";
if (membuf->status_ == kReused) {
curr_used += membuf->size_;
}
}
if (!curr_membuf_list.empty()) {
curr_allocated = curr_membuf_list.back()->offset_ + curr_membuf_list.back()->size_;
} }
each_node_used_size.push_back(curr_used);
each_node_allocated_size.push_back(curr_allocated);
ofs << "curr real used size: \t" << curr_used << "\n";
ofs << "curr allocated size: \t" << curr_allocated << "\n";
ofs << "\n\n"; ofs << "\n\n";
} }
ofs << "each node used size: \n";
for (auto size : each_node_used_size) {
ofs << size << "\t";
}
ofs << "\n\n";
ofs << "each node allocated size: \n";
for (auto size : each_node_allocated_size) {
ofs << size << "\t";
}
ofs << "\n\n";
ofs.close(); ofs.close();
} }
...@@ -479,7 +525,6 @@ void MemReuseChecker::ExportAddNewMmebufIR() { ...@@ -479,7 +525,6 @@ void MemReuseChecker::ExportAddNewMmebufIR() {
<< "\n"; << "\n";
i++; i++;
ofs << "mem_num\t" ofs << "mem_num\t"
<< "stream_id\t"
<< "status\t" << "status\t"
<< "tensor_idex\t" << "tensor_idex\t"
<< "mem_size\t" << "mem_size\t"
...@@ -490,7 +535,6 @@ void MemReuseChecker::ExportAddNewMmebufIR() { ...@@ -490,7 +535,6 @@ void MemReuseChecker::ExportAddNewMmebufIR() {
for (size_t j = 0; j < curr_membuf_list.size(); ++j) { for (size_t j = 0; j < curr_membuf_list.size(); ++j) {
auto membuf = curr_membuf_list.at(j); auto membuf = curr_membuf_list.at(j);
ofs << "&" << j << "\t" ofs << "&" << j << "\t"
<< "streamID[@" << membuf->stream_id_ << "]"
<< "\t" << "\t"
<< "#" << static_cast<int>(membuf->status_) << "\t%" << membuf->index_ << "T" << "#" << static_cast<int>(membuf->status_) << "\t%" << membuf->index_ << "T"
<< "\t" << membuf->size_ << "\t" << membuf->offset_ << "\t" << membuf->offset_ + membuf->size_ << "\t"; << "\t" << membuf->size_ << "\t" << membuf->offset_ << "\t" << membuf->offset_ + membuf->size_ << "\t";
......
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_MEM_REUSE_MEM_REUSE_CHECKER_H_ #ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_MEM_REUSE_MEM_REUSE_CHECKER_H_
#define MINDSPORE_CCSRC_PRE_ACTIVATE_MEM_REUSE_MEM_REUSE_CHECKER_H_ #define MINDSPORE_CCSRC_PRE_ACTIVATE_MEM_REUSE_MEM_REUSE_CHECKER_H_
#include <map> #include <map>
#include <set>
#include <vector> #include <vector>
#include <string> #include <string>
#include <memory> #include <memory>
...@@ -59,10 +60,14 @@ class MemReuseChecker { ...@@ -59,10 +60,14 @@ class MemReuseChecker {
void ExportMembufInfoIR(); void ExportMembufInfoIR();
void SetAddNewMembuInfos(const KernelDef *op_def, const std::vector<MembufPtr> &membuf_ptr_list, size_t op_idx); void SetAddNewMembuInfos(const KernelDef *op_def, const std::vector<MembufPtr> &membuf_ptr_list, size_t op_idx);
void ExportAddNewMmebufIR(); void ExportAddNewMmebufIR();
void set_kernel_front_map(const std::map<KernelDefPtr, std::set<KernelDefPtr>> &kernel_front_map) {
kernel_front_map_ = kernel_front_map;
}
void ExportKernelDependence();
private: private:
MemReuseChecker() = default; MemReuseChecker() = default;
~MemReuseChecker() { MS_LOG(INFO) << "Total reused workspace size: " << total_re_wkspe_size_checker_; } ~MemReuseChecker() {}
size_t total_re_wkspe_size_checker_{0}; size_t total_re_wkspe_size_checker_{0};
std::vector<std::vector<MembufPtr>> membuf_all_infos_; std::vector<std::vector<MembufPtr>> membuf_all_infos_;
std::vector<const void *> nor_output_tensors_; std::vector<const void *> nor_output_tensors_;
...@@ -79,6 +84,7 @@ class MemReuseChecker { ...@@ -79,6 +84,7 @@ class MemReuseChecker {
std::vector<std::string> all_split_names_; std::vector<std::string> all_split_names_;
std::map<int, std::vector<string>> tensor_from_; std::map<int, std::vector<string>> tensor_from_;
std::map<int, std::vector<string>> tensor_to_; std::map<int, std::vector<string>> tensor_to_;
std::map<KernelDefPtr, std::set<KernelDefPtr>> kernel_front_map_;
int64_t total_ori_static_size_ = 0; int64_t total_ori_static_size_ = 0;
int64_t total_ori_input_size_ = 0; int64_t total_ori_input_size_ = 0;
int64_t total_ori_value_size_ = 0; int64_t total_ori_value_size_ = 0;
......
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "pre_activate/mem_reuse/stream_reuse.h"
namespace mindspore {
namespace memreuse {
// Collect the stream tables needed for stream-aware memory reuse.
// With ENABLE_D, the logic->physic and logic->independent stream maps are read from AscendStreamAssign;
// when both are non-empty they are stored and the parallel-stream map is built from them.
// Without ENABLE_D there is nothing to collect, which is only logged.
void StreamReuse::SetStreamReuseResource() {
#ifdef ENABLE_D
  auto logic_physic_map = device::ascend::AscendStreamAssign::GetInstance().logic_to_physic_map();
  auto logic_independent_map = device::ascend::AscendStreamAssign::GetInstance().logic_to_independent_map();
  MS_LOG(INFO) << "stream mem reuse for Davici";
  if (!logic_independent_map.empty() && !logic_physic_map.empty()) {
    set_logic_physic_map(logic_physic_map);
    set_logic_independent_map(logic_independent_map);
    InitReusableStreamMap();
  } else {
    MS_LOG(INFO) << "Non task sink or No Parallel stream exists";
  }
#else
  // Only report "no need" when the resources really were not set; previously this message was also
  // emitted in ENABLE_D builds, directly after the resources had been initialised.
  MS_LOG(INFO) << "no need to set stream mem reuse resource";
#endif
}
std::vector<std::pair<uint32_t, uint32_t>> StreamReuse::SortLogicPhysicMapToList() {
std::vector<std::pair<uint32_t, uint32_t>> logic_physic_list;
(void)std::transform(logic_physic_map_.begin(), logic_physic_map_.end(), std::back_inserter(logic_physic_list),
[](std::pair<uint32_t, uint32_t> log_phy) { return log_phy; });
std::sort(
logic_physic_list.begin(), logic_physic_list.end(),
[](const std::pair<uint32_t, uint32_t> &logic_phyic_pair1, const std::pair<uint32_t, uint32_t> &logic_phyic_pair2) {
return logic_phyic_pair1.second < logic_phyic_pair2.second;
});
return logic_physic_list;
}
std::unordered_map<int, std::set<uint32_t>> StreamReuse::GetLogicPhysicsStreamMap() {
auto logic_physic_list = SortLogicPhysicMapToList();
std::unordered_map<int, std::set<uint32_t>> logic_phyics_map;
for (size_t i = 0; i < logic_physic_list.size() - IntToSize(1); ++i) {
auto curr_logic_physic = logic_physic_list.at(i);
auto next_logic_physic = logic_physic_list.at(i + 1);
for (auto j = curr_logic_physic.second; j < next_logic_physic.second; ++j) {
(void)logic_phyics_map[curr_logic_physic.first].insert(j);
}
}
// sort the logic independ map by value
std::map<uint32_t, uint32_t> temp_map;
for (const auto &logic_independ : logic_independent_map_) {
(void)temp_map.insert(std::make_pair(logic_independ.second, logic_independ.first));
}
auto first_independent_stream_id = (*temp_map.begin()).first;
auto last_physic_logic_stream_id = (*logic_physic_list.rbegin()).second;
for (auto i = last_physic_logic_stream_id; i < first_independent_stream_id; ++i) {
(void)logic_phyics_map[(*logic_physic_list.rbegin()).first].insert(i);
}
return logic_phyics_map;
}
// Fill parallel_streams_map_ (key: a stream id; value: the stream ids recorded as parallel to it).
// For every logic stream id present in both the logic->physic-set map and logic_independent_map_, each
// physic stream of that logic stream and the matching independent stream are registered as parallel to
// each other, in both directions.
void StreamReuse::InitReusableStreamMap() {
  // key: logic stream id; value: physic stream ids included in that logic stream
  const auto logic_to_physics = GetLogicPhysicsStreamMap();

  // direction 1: physic stream -> independent stream
  for (const auto &logic_entry : logic_to_physics) {
    const auto independent_iter = logic_independent_map_.find(logic_entry.first);
    if (independent_iter == logic_independent_map_.end()) {
      continue;  // no independent stream is attached to this logic stream
    }
    const auto independent_stream_id = independent_iter->second;
    for (const auto &physic_id : logic_entry.second) {
      (void)parallel_streams_map_[physic_id].insert(independent_stream_id);
    }
  }

  // direction 2: independent stream -> physic streams
  for (const auto &independent_entry : logic_independent_map_) {
    const auto physics_iter = logic_to_physics.find(independent_entry.first);
    if (physics_iter == logic_to_physics.end()) {
      continue;  // this logic stream owns no physic streams
    }
    for (const auto &physic_id : physics_iter->second) {
      (void)parallel_streams_map_[independent_entry.second].insert(physic_id);
    }
  }
}
} // namespace memreuse
} // namespace mindspore
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_MEM_REUSE_STREAM_REUSE_H_
#define MINDSPORE_CCSRC_PRE_ACTIVATE_MEM_REUSE_STREAM_REUSE_H_
#include <cmath>
#include <map>
#include <set>
#include <list>
#include <memory>
#include <vector>
#include <numeric>
#include <algorithm>
#include <utility>
#include <fstream>
#include <unordered_set>
#include <unordered_map>
#include "session/anf_runtime_algorithm.h"
#include "pre_activate/mem_reuse/kernel_refcount.h"
#ifdef ENABLE_D
#include "device/ascend/ascend_stream_assign.h"
#endif
namespace mindspore {
namespace memreuse {
// Holds the stream tables used for stream-aware memory reuse: the logic->physic stream map, the
// logic->independent stream map, and the parallel-stream map that InitReusableStreamMap() fills.
class StreamReuse {
 public:
  StreamReuse() = default;
  ~StreamReuse() = default;
  // Read the stream tables and build the parallel-stream map (see the definition for details).
  void SetStreamReuseResource();
  // Fill parallel_streams_map_ from logic_physic_map_ and logic_independent_map_.
  void InitReusableStreamMap();
  // Entries of logic_physic_map_ as (logic id, physic id) pairs, ordered by ascending physic id.
  std::vector<std::pair<uint32_t, uint32_t>> SortLogicPhysicMapToList();
  // key: logic stream id; value: the physic stream ids covered by that logic stream.
  std::unordered_map<int, std::set<uint32_t>> GetLogicPhysicsStreamMap();
  void set_logic_physic_map(const std::unordered_map<uint32_t, uint32_t> &logic_physic_map) {
    logic_physic_map_ = logic_physic_map;
  }
  void set_logic_independent_map(const std::unordered_map<uint32_t, uint32_t> &logic_independent_map) {
    logic_independent_map_ = logic_independent_map;
  }
  // Returns a copy of the parallel-stream map. Const-qualified so it can also be read through a const
  // reference or pointer to StreamReuse.
  std::unordered_map<uint32_t, std::unordered_set<uint32_t>> parallel_streams_map() const {
    return parallel_streams_map_;
  }

 private:
  // key: a stream id; value: the stream ids recorded as parallel to it
  std::unordered_map<uint32_t, std::unordered_set<uint32_t>> parallel_streams_map_;
  // key: logic stream id; value: physic stream id
  std::unordered_map<uint32_t, uint32_t> logic_physic_map_;
  // key: logic stream id; value: independent stream id
  std::unordered_map<uint32_t, uint32_t> logic_independent_map_;
};
} // namespace memreuse
} // namespace mindspore
#endif // MINDSPORE_CCSRC_PRE_ACTIVATE_MEM_REUSE_STREAM_REUSE_H_
...@@ -117,16 +117,13 @@ TEST_F(TestMemReuseAllocator, mem_reuse_allocator) { ...@@ -117,16 +117,13 @@ TEST_F(TestMemReuseAllocator, mem_reuse_allocator) {
MS_LOG(INFO) << "run mem reuse success"; MS_LOG(INFO) << "run mem reuse success";
size_t total_allocated_size = best_fit_mem_reuse->GetAllocatedSize(); size_t total_allocated_size = best_fit_mem_reuse->GetAllocatedSize();
ASSERT_NE(total_allocated_size, 0); ASSERT_NE(total_allocated_size, 0);
auto is_reusable_stream = best_fit_mem_reuse->IsReusableStream(1, 3);
ASSERT_EQ(is_reusable_stream, true);
} }
TEST_F(TestMemReuseAllocator, mem_reuse_allocator_add_membuf) { TEST_F(TestMemReuseAllocator, mem_reuse_allocator_add_membuf) {
auto best_fit_mem_reuse = std::make_shared<BestFitMemReuse>(); auto best_fit_mem_reuse = std::make_shared<BestFitMemReuse>();
auto tensor_desc = std::make_shared<KernelRefCount>(); auto tensor_desc = std::make_shared<KernelRefCount>();
tensor_desc->SetKernelRefCountInfo(0, 1024, kDynamicRefCount); tensor_desc->SetKernelRefCountInfo(0, 1024, kDynamicRefCount);
best_fit_mem_reuse->AddNewMembufPtr(tensor_desc.get(), kDyFac); best_fit_mem_reuse->AddNewMembufPtr(tensor_desc.get(), kDynamicMem);
auto allocated_size = best_fit_mem_reuse->GetAllocatedSize(); auto allocated_size = best_fit_mem_reuse->GetAllocatedSize();
ASSERT_EQ(allocated_size, 1024); ASSERT_EQ(allocated_size, 1024);
} }
...@@ -135,7 +132,7 @@ TEST_F(TestMemReuseAllocator, mem_reuse_allocator_split_membuf) { ...@@ -135,7 +132,7 @@ TEST_F(TestMemReuseAllocator, mem_reuse_allocator_split_membuf) {
auto best_fit_mem_reuse = std::make_shared<BestFitMemReuse>(); auto best_fit_mem_reuse = std::make_shared<BestFitMemReuse>();
auto tensor_0 = std::make_shared<KernelRefCount>(); auto tensor_0 = std::make_shared<KernelRefCount>();
tensor_0->SetKernelRefCountInfo(0, 2048, kDynamicRefCount); tensor_0->SetKernelRefCountInfo(0, 2048, kDynamicRefCount);
best_fit_mem_reuse->AddNewMembufPtr(tensor_0.get(), kDyFac); best_fit_mem_reuse->AddNewMembufPtr(tensor_0.get(), kDynamicMem);
auto tensor_1 = std::make_shared<KernelRefCount>(); auto tensor_1 = std::make_shared<KernelRefCount>();
tensor_1->SetKernelRefCountInfo(1, 800, kDynamicRefCount); tensor_1->SetKernelRefCountInfo(1, 800, kDynamicRefCount);
......
...@@ -228,12 +228,6 @@ TEST_F(TestMemReuseWithPy, KernelRef) { ...@@ -228,12 +228,6 @@ TEST_F(TestMemReuseWithPy, KernelRef) {
ASSERT_EQ(kernel_def_ptr->dirty, false); ASSERT_EQ(kernel_def_ptr->dirty, false);
MembufPtr membuf_ptr = std::make_shared<Membuf>(); MembufPtr membuf_ptr = std::make_shared<Membuf>();
ASSERT_NE(membuf_ptr, nullptr); ASSERT_NE(membuf_ptr, nullptr);
MembufPtr membuf_ptr_x = std::make_shared<Membuf>(0, memreuse::kUnused, 512, 128, 2);
ASSERT_EQ(membuf_ptr_x->status_, memreuse::kUnused);
ASSERT_EQ(membuf_ptr_x->size_, 512);
ASSERT_EQ(membuf_ptr_x->offset_, 128);
ASSERT_EQ(membuf_ptr_x->index_, 2);
ASSERT_EQ(membuf_ptr_x->stream_id_, 0);
} }
TEST_F(TestMemReuseWithPy, ReuseAssignDynamicMemory) { TEST_F(TestMemReuseWithPy, ReuseAssignDynamicMemory) {
......
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <memory>
#include <vector>
#include <string>
#include "operator/ops.h"
#include "pre_activate/mem_reuse/stream_reuse.h"
#include "common/common_test.h"
#include "common/py_func_graph_fetcher.h"
using mindspore::memreuse::StreamReuse;
namespace mindspore {
class TestStreamMemReuse : public UT::Common {
public:
TestStreamMemReuse() : getPyFun_("gtest_input.mem_reuse.TestMemReuseAllocator", true) {}
void SetUp() {}
public:
UT::PyFuncGraphFetcher getPyFun_;
};
// Loads hand-written logic->physic and logic->independent stream maps into a StreamReuse,
// logs the result of GetLogicPhysicsStreamMap(), runs InitReusableStreamMap(), logs the
// resulting parallel-streams table and checks the entry for stream id 7.
TEST_F(TestStreamMemReuse, init_reusable_stream_map_test) {
  const std::unordered_map<uint32_t, uint32_t> physic_by_logic{{1, 0}, {2, 8}, {3, 3}};
  const std::unordered_map<uint32_t, uint32_t> independent_by_logic{{3, 10}, {2, 11}};

  StreamReuse reuse;
  reuse.set_logic_physic_map(physic_by_logic);
  reuse.set_logic_independent_map(independent_by_logic);

  // Dump every logic id together with the physic ids reported for it.
  auto physics_by_logic = reuse.GetLogicPhysicsStreamMap();
  for (auto entry = physics_by_logic.begin(); entry != physics_by_logic.end(); ++entry) {
    MS_LOG(INFO) << "[logic_id: " << entry->first << "]";
    for (auto physic_it = entry->second.begin(); physic_it != entry->second.end(); ++physic_it) {
      MS_LOG(INFO) << "physic: " << *physic_it;
    }
  }

  MS_LOG(INFO) << "===========UT logic_physic_map size: " << physics_size_placeholder_unused(physic_by_logic) << "========";
  ASSERT_EQ(physic_by_logic.size(), 3);

  reuse.InitReusableStreamMap();

  // The getter hands back a copy, so operator[] below cannot alter the object under test.
  auto parallel_table = reuse.parallel_streams_map();
  for (auto entry = parallel_table.begin(); entry != parallel_table.end(); ++entry) {
    MS_LOG(INFO) << "[stream id: " << entry->first << "]";
    for (auto stream_it = entry->second.begin(); stream_it != entry->second.end(); ++stream_it) {
      MS_LOG(INFO) << "parallel stream id: " << *stream_it;
    }
  }
  ASSERT_EQ(parallel_table[7].size(), 1);
}
} // namespace mindspore
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册