diff --git a/paddle/fluid/framework/details/inplace_op_pass.cc b/paddle/fluid/framework/details/inplace_op_pass.cc
index 64368a5e8737b2484bda9b7dd52451b4d4f760ff..78c5d5b50e606daa963e728355dc1bce83cd5484 100644
--- a/paddle/fluid/framework/details/inplace_op_pass.cc
+++ b/paddle/fluid/framework/details/inplace_op_pass.cc
@@ -403,18 +403,20 @@ void GraphView::Build(ir::Graph* g) {
   // 2. track the nodes which used by parameter server.
   // these node can not be inplaced, otherwise trainer
   // pserver can not find each other name.
-  for (auto& node : g->Nodes()) {
-    if (!node->IsOp()) continue;
-    if (node->Name() == "send") {
-      for (auto& in : node->inputs) {
-        dup_nodes_.emplace(in->Name());
-      }
+  auto update_skip_set = [&](ir::Node* node) {
+    for (auto& in : node->inputs) {
+      if (in->IsVar() && in->Var() != nullptr) dup_nodes_.emplace(in->Name());
     }
-    if (node->Name() == "recv") {
-      for (auto& out : node->outputs) {
+    for (auto& out : node->outputs) {
+      if (out->IsVar() && out->Var() != nullptr)
         dup_nodes_.emplace(out->Name());
-      }
     }
+  };
+  for (auto& node : g->Nodes()) {
+    if (!node->IsOp()) continue;
+    if (node->Name() == "send") update_skip_set(node);
+    if (node->Name() == "recv") update_skip_set(node);
+    if (node->Name() == "prefetch") update_skip_set(node);
   }
 }
 
diff --git a/paddle/fluid/framework/details/memory_optimize_pass.cc b/paddle/fluid/framework/details/memory_optimize_pass.cc
index 33ca45668e86bdbe615b91366db7e286258dd7d6..85de14a60a8fe6958794f0ac25768b9da1943f9d 100644
--- a/paddle/fluid/framework/details/memory_optimize_pass.cc
+++ b/paddle/fluid/framework/details/memory_optimize_pass.cc
@@ -51,8 +51,7 @@ static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) {
 std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
     std::unique_ptr<ir::Graph> graph) const {
   auto nodes = graph->Nodes();
-  auto subblock_vars = GetSubBlockVars(nodes);
-  skip_set_.insert(subblock_vars.begin(), subblock_vars.end());
+  CollectSkipVarsSet(nodes);
 
   cfg_.reset(new details::ControlFlowGraph(*graph));
   cfg_->LiveVariableAnalysis();
@@ -224,20 +223,27 @@ void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const {
   }
 }
 
-std::unordered_set<std::string> MemoryOptimizePass::GetSubBlockVars(
+void MemoryOptimizePass::CollectSkipVarsSet(
     const std::unordered_set<ir::Node*>& nodes) const {
-  std::unordered_set<std::string> vars;
+  auto update_skip_set = [&](OpDesc* op_desc) {
+    auto inputs = op_desc->InputArgumentNames();
+    auto outputs = op_desc->OutputArgumentNames();
+    skip_set_.insert(inputs.begin(), inputs.end());
+    skip_set_.insert(outputs.begin(), outputs.end());
+  };
   for (auto& op : nodes) {
     if (!op->IsOp() || op->Op() == nullptr) continue;
     auto* op_desc = op->Op();
-    if (OpHasSubBlock(op_desc)) {
-      auto inputs = op_desc->InputArgumentNames();
-      auto outputs = op_desc->OutputArgumentNames();
-      vars.insert(inputs.begin(), inputs.end());
-      vars.insert(outputs.begin(), outputs.end());
-    }
+    // NOTE(dzhwinter):
+    // current block can not reuse next level block vars.
+    if (OpHasSubBlock(op_desc)) update_skip_set(op_desc);
+    // NOTE(dzhwinter):
+    // distributed ops' input/output names need to
+    // stay the same between trainer and pserver
+    if (op_desc->Type() == "send") update_skip_set(op_desc);
+    if (op_desc->Type() == "recv") update_skip_set(op_desc);
+    if (op_desc->Type() == "prefetch") update_skip_set(op_desc);
   }
-  return vars;
 }
 
 void MemoryOptimizePass::RenameVarInGraphDesc(const std::string& var,
diff --git a/paddle/fluid/framework/details/memory_optimize_pass.h b/paddle/fluid/framework/details/memory_optimize_pass.h
index b3e026e0bc1e222e82a22b343c86ddc87a967e8f..3d6b1897f3b5106054b8f647f9cf613ebd1d65ff 100644
--- a/paddle/fluid/framework/details/memory_optimize_pass.h
+++ b/paddle/fluid/framework/details/memory_optimize_pass.h
@@ -55,9 +55,10 @@ class MemoryOptimizePass : public ir::Pass {
                             ir::Graph* graph) const;
 
   void SubGraphOptimize(OpDesc* op_desc) const;
-  // scan subblock and collect the output/input variables.
-  std::unordered_set<std::string> GetSubBlockVars(
-      const std::unordered_set<ir::Node*>&) const;
+  // 1. scan ops with a sub-block and collect their input/output vars:
+  //    while, while_grad, conditional_block
+  // 2. scan distributed ops and collect their input/output vars
+  void CollectSkipVarsSet(const std::unordered_set<ir::Node*>&) const;
 
  private:
   // Reuse Node Pool, Owned.
diff --git a/paddle/fluid/framework/inplace_op_inference_test.cc b/paddle/fluid/framework/inplace_op_inference_test.cc
index 121f648a5f04ae65560ae8d04042e40df61aad50..3e4d715c6f089496d1b1f7906e3f10147a073622 100644
--- a/paddle/fluid/framework/inplace_op_inference_test.cc
+++ b/paddle/fluid/framework/inplace_op_inference_test.cc
@@ -276,6 +276,7 @@ TEST(InferInplace, MultiGradInplaceInToOut) {
 
   auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
   auto in_to_outs = infer_inplace(*op, op->Block());
+  EXPECT_EQ(in_to_outs.size(), 3ul);
   std::unordered_map<std::string, std::string> expects = {
       {"o0", "a0"}, {"y0", "b0"}, {"z0", "c0"},
   };
diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc
index 87f0f307d30bc90a43a698c3766b16c975f0635e..953618560913229cd1e47659ad61e621efc10ed1 100644
--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -22,7 +22,11 @@ limitations under the License. */
 #include "paddle/fluid/framework/threadpool.h"
 #include "paddle/fluid/string/printf.h"
 
-DECLARE_bool(benchmark);
+DEFINE_bool(benchmark, false,
+            "Doing memory benchmark. It will make deleting scope synchronized, "
+            "and add some memory usage logs. "
+ "Default cuda is asynchronous device, set to True will" + "force op run in synchronous mode."); DEFINE_bool( eager_delete_scope, true, diff --git a/paddle/fluid/memory/allocation/legacy_allocator.cc b/paddle/fluid/memory/allocation/legacy_allocator.cc index ef62f758e37f28ab826faac84fd1276b14de7980..327adcc4aac1c50b51942c557d66dae6770e24f2 100644 --- a/paddle/fluid/memory/allocation/legacy_allocator.cc +++ b/paddle/fluid/memory/allocation/legacy_allocator.cc @@ -35,7 +35,6 @@ DEFINE_bool(init_allocated_mem, false, "To find this error in time, we use init_allocated_mem to indicate " "that initializing the allocated memory with a small value " "during unit testing."); -DECLARE_bool(benchmark); DECLARE_double(fraction_of_gpu_memory_to_use); namespace paddle { @@ -188,21 +187,20 @@ void *Alloc(const platform::CUDAPlace &place, platform::SetDeviceId(place.device); size_t avail, total; platform::GpuMemoryUsage(&avail, &total); - LOG(WARNING) << "Cannot allocate " << string::HumanReadableSize(size) - << " in GPU " << place.device << ", available " - << string::HumanReadableSize(avail); - LOG(WARNING) << "total " << total; - LOG(WARNING) << "GpuMinChunkSize " - << string::HumanReadableSize( - buddy_allocator->GetMinChunkSize()); - LOG(WARNING) << "GpuMaxChunkSize " - << string::HumanReadableSize( - buddy_allocator->GetMaxChunkSize()); - LOG(WARNING) << "GPU memory used: " - << string::HumanReadableSize(Used(place)); + LOG(FATAL) << "Cannot allocate " << string::HumanReadableSize(size) + << " in GPU " << place.device << ", available " + << string::HumanReadableSize(avail) << "total " << total + << "GpuMinChunkSize " + << string::HumanReadableSize(buddy_allocator->GetMinChunkSize()) + << "GpuMaxChunkSize " + << string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()) + << "GPU memory used: " + << string::HumanReadableSize(Used(place)); platform::SetDeviceId(cur_dev); } else { - if (FLAGS_benchmark) allocation::GPUMemMonitor.Add(place.device, size); + if (VLOG_IS_ON(3)) { + allocation::GPUMemMonitor.Add(place.device, size); + } if (FLAGS_init_allocated_mem) { cudaMemset(ptr, 0xEF, size); } @@ -218,7 +216,9 @@ void Free(const platform::CUDAPlace &place, void *p, size_t size) { #ifdef PADDLE_WITH_CUDA GetGPUBuddyAllocator(place.device)->Free(p); - if (FLAGS_benchmark) allocation::GPUMemMonitor.Minus(place.device, size); + if (VLOG_IS_ON(3)) { + allocation::GPUMemMonitor.Minus(place.device, size); + } #else PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); #endif diff --git a/paddle/fluid/platform/place.cc b/paddle/fluid/platform/place.cc index 60b2d83f15746eab0a4d29c7965c064690b6d46d..655ce8485d4584aa0955315b045da6bf541f7fe2 100644 --- a/paddle/fluid/platform/place.cc +++ b/paddle/fluid/platform/place.cc @@ -14,12 +14,6 @@ limitations under the License. */ #include "paddle/fluid/platform/place.h" -DEFINE_bool(benchmark, false, - "Doing memory benchmark. It will make deleting scope synchronized, " - "and add some memory usage logs." - "Default cuda is asynchronous device, set to True will" - "force op run in synchronous mode."); - namespace paddle { namespace platform {