diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc
index 556a167e74f226bc838b020809ada790f464bfdd..c31e0661140daf1c99c82f4465efcb9769577178 100644
--- a/paddle/fluid/framework/naive_executor.cc
+++ b/paddle/fluid/framework/naive_executor.cc
@@ -66,13 +66,6 @@ void NaiveExecutor::Run() {
                                 platform::NvtxRangeColor::Green);
 #endif
 
-    // According to reuse table, we share the out tensor's holder.
-    if (reuse_cache_.count(op.get())) {
-      for (auto &it : reuse_cache_[op.get()]) {
-        it.first->ShareBufferWith(*cluster_buffer_[it.second], true);
-      }
-    }
-
     op->Run(*scope_, place_);
 
     // Update the shared_holder so that only records the max one.
@@ -81,6 +74,22 @@ void NaiveExecutor::Run() {
         if (it.first->memory_size() >
             cluster_buffer_[it.second]->memory_size()) {
           cluster_buffer_[it.second] = it.first;
+          int updated_cluster_id = it.second;
+
+          // cluster_buffer_[it.second] now points to a new phi::DenseTensor*,
+          // so every phi::DenseTensor in this cluster must update its
+          // shared_holder to the new buffer. The nested loops below look
+          // expensive, but once memory usage reaches its peak the cluster
+          // stops being updated, so in steady state this path is rarely
+          // taken and the cost is acceptable.
+          for (auto &op_map : reuse_cache_) {
+            // op_map.second maps phi::DenseTensor* to its cluster id.
+            for (auto &it2 : op_map.second) {
+              if (it2.second == updated_cluster_id) {
+                it2.first->ShareBufferWith(*cluster_buffer_[it2.second], true);
+              }
+            }
+          }
         }
       }
     }
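
For readers skimming the diff, here is a minimal standalone sketch of the cluster re-share pattern the added double loop implements. `Tensor`, `reuse_cache`, and `cluster_buffer` below are simplified stand-ins for `phi::DenseTensor`, `reuse_cache_`, and `cluster_buffer_`; none of this is Paddle's real API, just a toy model of the bookkeeping under the assumption that a "holder" is a shared allocation.

```cpp
// Toy model of the cluster re-share pattern in the diff above.
// Tensor is a stand-in for phi::DenseTensor: it only tracks a shared
// byte buffer (the "holder") and its logical size.
#include <cstdio>
#include <memory>
#include <unordered_map>
#include <vector>

struct Tensor {
  size_t size = 0;               // logical memory size in bytes
  std::shared_ptr<char> holder;  // shared allocation

  size_t memory_size() const { return size; }

  // Point this tensor at another tensor's allocation (no copy),
  // a simplified analogue of phi::DenseTensor::ShareBufferWith.
  void ShareBufferWith(const Tensor &other) { holder = other.holder; }
};

int main() {
  // Three tensors assigned to the same reuse cluster (id 0).
  Tensor a, b, c;
  a.size = 16;
  b.size = 32;
  c.size = 64;
  a.holder =
      std::shared_ptr<char>(new char[a.size], std::default_delete<char[]>());

  // reuse_cache: op -> (tensor -> cluster id). One fake "op" keyed by 0.
  std::unordered_map<int, std::unordered_map<Tensor *, int>> reuse_cache;
  reuse_cache[0] = {{&a, 0}, {&b, 0}, {&c, 0}};

  // cluster_buffer[i] records the largest tensor seen so far in cluster i.
  std::vector<Tensor *> cluster_buffer{&a};

  // After an op runs, walk its outputs; if one outgrew the cluster's
  // current max, promote it and re-share every tensor in that cluster.
  for (auto &it : reuse_cache[0]) {
    if (it.first->memory_size() > cluster_buffer[it.second]->memory_size()) {
      cluster_buffer[it.second] = it.first;
      // Simulate the op having allocated this output's own buffer.
      it.first->holder = std::shared_ptr<char>(
          new char[it.first->size], std::default_delete<char[]>());
      int updated_cluster_id = it.second;
      // Re-point every tensor in the updated cluster at the new max
      // buffer: the same double loop the diff adds.
      for (auto &op_map : reuse_cache) {
        for (auto &it2 : op_map.second) {
          if (it2.second == updated_cluster_id) {
            it2.first->ShareBufferWith(*cluster_buffer[it2.second]);
          }
        }
      }
    }
  }

  std::printf("all tensors share one holder: %s\n",
              (a.holder == b.holder && b.holder == c.holder) ? "yes" : "no");
  return 0;
}
```

Whatever order the map is walked in, every tensor in cluster 0 ends up sharing the holder of the largest tensor seen so far, which is the invariant the diff's double loop restores each time `cluster_buffer_` is updated.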