Unverified · Commit 488719ba authored by dzhwinter, committed by GitHub

Enhance/memory optimize (#15634)

* add skip send.recv test=develop

* enhanced print message. test=develop

* rerun ci. test=develop
@@ -403,18 +403,20 @@ void GraphView::Build(ir::Graph* g) {
   // 2. track the nodes which used by parameter server.
   // these node can not be inplaced, otherwise trainer
   // pserver can not find each other name.
-  for (auto& node : g->Nodes()) {
-    if (!node->IsOp()) continue;
-    if (node->Name() == "send") {
-      for (auto& in : node->inputs) {
-        dup_nodes_.emplace(in->Name());
-      }
-    }
-    if (node->Name() == "recv") {
-      for (auto& out : node->outputs) {
-        dup_nodes_.emplace(out->Name());
-      }
-    }
+  auto update_skip_set = [&](ir::Node* node) {
+    for (auto& in : node->inputs) {
+      if (in->IsVar() && in->Var() != nullptr) dup_nodes_.emplace(in->Name());
+    }
+    for (auto& out : node->outputs) {
+      if (out->IsVar() && out->Var() != nullptr)
+        dup_nodes_.emplace(out->Name());
+    }
+  };
+  for (auto& node : g->Nodes()) {
+    if (!node->IsOp()) continue;
+    if (node->Name() == "send") update_skip_set(node);
+    if (node->Name() == "recv") update_skip_set(node);
+    if (node->Name() == "prefetch") update_skip_set(node);
   }
 }
......
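The lambda above collects every input/output variable of send/recv/prefetch ops into dup_nodes_, so those names are never renamed and trainer and pserver can still find each other. As a rough illustration of how such a skip set is consumed (a sketch only; the helper name and surrounding structure are hypothetical, not code from this commit):

// Sketch: how an inplace pass could consult a skip set like dup_nodes_
// before reusing an output variable's memory for an input.
#include <string>
#include <unordered_set>

bool CanInplace(const std::unordered_set<std::string>& skip_set,
                const std::string& in_name, const std::string& out_name) {
  // Variables touched by send/recv/prefetch keep their original names.
  if (skip_set.count(in_name) || skip_set.count(out_name)) return false;
  return true;  // other liveness/shape checks omitted in this sketch
}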
@@ -51,8 +51,7 @@ static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) {
 std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
     std::unique_ptr<ir::Graph> graph) const {
   auto nodes = graph->Nodes();
-  auto subblock_vars = GetSubBlockVars(nodes);
-  skip_set_.insert(subblock_vars.begin(), subblock_vars.end());
+  CollectSkipVarsSet(nodes);
   cfg_.reset(new details::ControlFlowGraph(*graph));
   cfg_->LiveVariableAnalysis();
@@ -224,20 +223,27 @@ void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const {
   }
 }
 
-std::unordered_set<std::string> MemoryOptimizePass::GetSubBlockVars(
+void MemoryOptimizePass::CollectSkipVarsSet(
     const std::unordered_set<ir::Node*>& nodes) const {
-  std::unordered_set<std::string> vars;
+  auto update_skip_set = [&](OpDesc* op_desc) {
+    auto inputs = op_desc->InputArgumentNames();
+    auto outputs = op_desc->OutputArgumentNames();
+    skip_set_.insert(inputs.begin(), inputs.end());
+    skip_set_.insert(outputs.begin(), outputs.end());
+  };
   for (auto& op : nodes) {
     if (!op->IsOp() || op->Op() == nullptr) continue;
     auto* op_desc = op->Op();
-    if (OpHasSubBlock(op_desc)) {
-      auto inputs = op_desc->InputArgumentNames();
-      auto outputs = op_desc->OutputArgumentNames();
-      vars.insert(inputs.begin(), inputs.end());
-      vars.insert(outputs.begin(), outputs.end());
-    }
+    // NOTE(dzhwinter):
+    // the current block can not reuse variables from a nested sub-block.
+    if (OpHasSubBlock(op_desc)) update_skip_set(op_desc);
+    // NOTE(dzhwinter):
+    // distributed ops' input/output names need to
+    // stay the same between trainer and pserver.
+    if (op_desc->Type() == "send") update_skip_set(op_desc);
+    if (op_desc->Type() == "recv") update_skip_set(op_desc);
+    if (op_desc->Type() == "prefetch") update_skip_set(op_desc);
   }
-  return vars;
 }
 
 void MemoryOptimizePass::RenameVarInGraphDesc(const std::string& var,
......
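Both hunks replace a return-a-set helper with a lambda that captures the skip set by reference and inserts into it directly, so sub-block ops and the send/recv/prefetch ops share one code path. A minimal standalone sketch of that capture-by-reference pattern (illustrative names, not repository code):

// Sketch: a lambda capturing a set by reference, so every call inserts
// directly into the shared set instead of returning a temporary one.
#include <string>
#include <unordered_set>
#include <vector>

int main() {
  std::unordered_set<std::string> skip_set;
  auto update_skip_set = [&](const std::vector<std::string>& names) {
    skip_set.insert(names.begin(), names.end());
  };
  update_skip_set({"send_in", "send_out"});
  update_skip_set({"recv_out"});
  return skip_set.size() == 3 ? 0 : 1;
}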
@@ -55,9 +55,10 @@ class MemoryOptimizePass : public ir::Pass {
                          ir::Graph* graph) const;
   void SubGraphOptimize(OpDesc* op_desc) const;
-  // scan subblock and collect the output/input variables.
-  std::unordered_set<std::string> GetSubBlockVars(
-      const std::unordered_set<ir::Node*>&) const;
+  // 1. scan ops with a sub-block (while, while_grad, conditional_block)
+  //    and collect their output/input vars.
+  // 2. scan distributed ops and collect their output/input vars.
+  void CollectSkipVarsSet(const std::unordered_set<ir::Node*>&) const;
 
  private:
   // Reuse Node Pool, Owned.
......
@@ -276,6 +276,7 @@ TEST(InferInplace, MultiGradInplaceInToOut) {
   auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
   auto in_to_outs = infer_inplace(*op, op->Block());
   EXPECT_EQ(in_to_outs.size(), 3ul);
   std::unordered_map<std::string, std::string> expects = {
       {"o0", "a0"}, {"y0", "b0"}, {"z0", "c0"},
......
@@ -22,7 +22,11 @@ limitations under the License. */
 #include "paddle/fluid/framework/threadpool.h"
 #include "paddle/fluid/string/printf.h"
 
-DECLARE_bool(benchmark);
+DEFINE_bool(benchmark, false,
+            "Doing memory benchmark. It will make deleting scope synchronized, "
+            "and add some memory usage logs."
+            "Default cuda is asynchronous device, set to True will"
+            "force op run in synchronous mode.");
 
 DEFINE_bool(
     eager_delete_scope, true,
......
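This hunk turns the former DECLARE_bool(benchmark) in this file into the flag's single DEFINE_bool; a later hunk removes the old definition from the file that used to own it. As a reminder of the gflags convention this relies on (a sketch collapsed into one file; names and usage here are illustrative, not repository code):

// gflags convention: exactly one translation unit defines a flag; every
// other file that reads it only declares it with DECLARE_bool(benchmark);
#include "gflags/gflags.h"

// In the file that owns the flag:
DEFINE_bool(benchmark, false, "Enable extra memory benchmark logging.");

int main(int argc, char** argv) {
  gflags::ParseCommandLineFlags(&argc, &argv, true);
  return FLAGS_benchmark ? 0 : 1;  // e.g. run with --benchmark=true
}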
@@ -35,7 +35,6 @@ DEFINE_bool(init_allocated_mem, false,
             "To find this error in time, we use init_allocated_mem to indicate "
             "that initializing the allocated memory with a small value "
             "during unit testing.");
-DECLARE_bool(benchmark);
 DECLARE_double(fraction_of_gpu_memory_to_use);
 
 namespace paddle {
@@ -188,21 +187,20 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
     platform::SetDeviceId(place.device);
     size_t avail, total;
     platform::GpuMemoryUsage(&avail, &total);
-    LOG(WARNING) << "Cannot allocate " << string::HumanReadableSize(size)
-                 << " in GPU " << place.device << ", available "
-                 << string::HumanReadableSize(avail);
-    LOG(WARNING) << "total " << total;
-    LOG(WARNING) << "GpuMinChunkSize "
-                 << string::HumanReadableSize(
-                        buddy_allocator->GetMinChunkSize());
-    LOG(WARNING) << "GpuMaxChunkSize "
-                 << string::HumanReadableSize(
-                        buddy_allocator->GetMaxChunkSize());
-    LOG(WARNING) << "GPU memory used: "
-                 << string::HumanReadableSize(Used<platform::CUDAPlace>(place));
+    LOG(FATAL) << "Cannot allocate " << string::HumanReadableSize(size)
+               << " in GPU " << place.device << ", available "
+               << string::HumanReadableSize(avail) << "total " << total
+               << "GpuMinChunkSize "
+               << string::HumanReadableSize(buddy_allocator->GetMinChunkSize())
+               << "GpuMaxChunkSize "
+               << string::HumanReadableSize(buddy_allocator->GetMaxChunkSize())
+               << "GPU memory used: "
+               << string::HumanReadableSize(Used<platform::CUDAPlace>(place));
     platform::SetDeviceId(cur_dev);
   } else {
-    if (FLAGS_benchmark) allocation::GPUMemMonitor.Add(place.device, size);
+    if (VLOG_IS_ON(3)) {
+      allocation::GPUMemMonitor.Add(place.device, size);
+    }
     if (FLAGS_init_allocated_mem) {
       cudaMemset(ptr, 0xEF, size);
     }
@@ -218,7 +216,9 @@ void Free<platform::CUDAPlace>(const platform::CUDAPlace &place, void *p,
                                size_t size) {
 #ifdef PADDLE_WITH_CUDA
   GetGPUBuddyAllocator(place.device)->Free(p);
-  if (FLAGS_benchmark) allocation::GPUMemMonitor.Minus(place.device, size);
+  if (VLOG_IS_ON(3)) {
+    allocation::GPUMemMonitor.Minus(place.device, size);
+  }
 #else
   PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
 #endif
......
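The GPU memory monitor is now gated on glog verbosity instead of FLAGS_benchmark: VLOG_IS_ON(3) is true whenever the effective verbosity level is 3 or higher, e.g. when the binary runs with GLOG_v=3 in the environment. A minimal standalone sketch of the same gating pattern (not repository code; names are illustrative):

// Sketch: gating optional bookkeeping on glog verbosity, as the hunks above do.
#include <cstddef>
#include <glog/logging.h>

void RecordAllocation(std::size_t size) {
  if (VLOG_IS_ON(3)) {  // true when effective verbosity >= 3
    VLOG(3) << "allocated " << size << " bytes";
  }
}

int main(int argc, char* argv[]) {
  (void)argc;
  google::InitGoogleLogging(argv[0]);
  RecordAllocation(1024);  // logs only if run with GLOG_v=3 (or higher)
}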
@@ -14,12 +14,6 @@ limitations under the License. */
 
 #include "paddle/fluid/platform/place.h"
 
-DEFINE_bool(benchmark, false,
-            "Doing memory benchmark. It will make deleting scope synchronized, "
-            "and add some memory usage logs."
-            "Default cuda is asynchronous device, set to True will"
-            "force op run in synchronous mode.");
-
 namespace paddle {
 namespace platform {
......