Unverified commit 488719ba, authored by dzhwinter, committed by GitHub

Enhance/memory optimize (#15634)

* add skip for send/recv. test=develop

* enhanced print message. test=develop

* rerun CI. test=develop
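Taken together, the diffs below make the memory-optimize pass skip every variable that a distributed op (send, recv, prefetch) touches, because the trainer and pserver locate those variables by name. As a rough standalone sketch of that skip-set idea (hypothetical variable names, not PaddlePaddle code):

    // Minimal sketch: variables in the skip set are excluded from
    // in-place reuse, so their names survive the optimization.
    #include <iostream>
    #include <string>
    #include <unordered_set>
    #include <vector>

    int main() {
      std::unordered_set<std::string> skip_set = {"w@grad@send", "w@recv"};
      std::vector<std::string> candidates = {"tmp_0", "w@grad@send", "tmp_1"};
      for (const auto& var : candidates) {
        if (skip_set.count(var)) continue;  // never reuse a skipped variable
        std::cout << var << " is eligible for memory reuse\n";
      }
    }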
@@ -403,18 +403,20 @@ void GraphView::Build(ir::Graph* g) {
   // 2. track the nodes which are used by the parameter server.
   // these nodes can not be inplaced; otherwise the trainer and
   // pserver can not find each other by name.
-  for (auto& node : g->Nodes()) {
-    if (!node->IsOp()) continue;
-    if (node->Name() == "send") {
-      for (auto& in : node->inputs) {
-        dup_nodes_.emplace(in->Name());
-      }
+  auto update_skip_set = [&](ir::Node* node) {
+    for (auto& in : node->inputs) {
+      if (in->IsVar() && in->Var() != nullptr) dup_nodes_.emplace(in->Name());
     }
-    if (node->Name() == "recv") {
-      for (auto& out : node->outputs) {
-        dup_nodes_.emplace(out->Name());
-      }
+    for (auto& out : node->outputs) {
+      if (out->IsVar() && out->Var() != nullptr)
+        dup_nodes_.emplace(out->Name());
     }
+  };
+  for (auto& node : g->Nodes()) {
+    if (!node->IsOp()) continue;
+    if (node->Name() == "send") update_skip_set(node);
+    if (node->Name() == "recv") update_skip_set(node);
+    if (node->Name() == "prefetch") update_skip_set(node);
   }
 }
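The hunk above folds two near-identical collection loops into one local lambda, extends the handling to prefetch, and adds the null-checked IsVar() guard. A compilable sketch of the same shared-lambda pattern, with a hypothetical Node type standing in for ir::Node:

    // Sketch of the shared-lambda pattern (hypothetical Node type).
    #include <iostream>
    #include <string>
    #include <unordered_set>
    #include <vector>

    struct Node {
      std::string name;
      std::vector<std::string> inputs, outputs;
    };

    int main() {
      std::unordered_set<std::string> dup_nodes;
      auto update_skip_set = [&](const Node& node) {
        for (const auto& in : node.inputs) dup_nodes.emplace(in);
        for (const auto& out : node.outputs) dup_nodes.emplace(out);
      };
      std::vector<Node> nodes = {{"send", {"w@grad"}, {}}, {"recv", {}, {"w"}}};
      for (const auto& node : nodes) {
        if (node.name == "send" || node.name == "recv" ||
            node.name == "prefetch")
          update_skip_set(node);
      }
      for (const auto& n : dup_nodes) std::cout << n << '\n';
    }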
......
@@ -51,8 +51,7 @@ static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) {
 std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
     std::unique_ptr<ir::Graph> graph) const {
   auto nodes = graph->Nodes();
-  auto subblock_vars = GetSubBlockVars(nodes);
-  skip_set_.insert(subblock_vars.begin(), subblock_vars.end());
+  CollectSkipVarsSet(nodes);
   cfg_.reset(new details::ControlFlowGraph(*graph));
   cfg_->LiveVariableAnalysis();
@@ -224,20 +223,27 @@ void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const {
   }
 }
 
-std::unordered_set<std::string> MemoryOptimizePass::GetSubBlockVars(
+void MemoryOptimizePass::CollectSkipVarsSet(
     const std::unordered_set<ir::Node*>& nodes) const {
-  std::unordered_set<std::string> vars;
+  auto update_skip_set = [&](OpDesc* op_desc) {
+    auto inputs = op_desc->InputArgumentNames();
+    auto outputs = op_desc->OutputArgumentNames();
+    skip_set_.insert(inputs.begin(), inputs.end());
+    skip_set_.insert(outputs.begin(), outputs.end());
+  };
   for (auto& op : nodes) {
     if (!op->IsOp() || op->Op() == nullptr) continue;
     auto* op_desc = op->Op();
-    if (OpHasSubBlock(op_desc)) {
-      auto inputs = op_desc->InputArgumentNames();
-      auto outputs = op_desc->OutputArgumentNames();
-      vars.insert(inputs.begin(), inputs.end());
-      vars.insert(outputs.begin(), outputs.end());
-    }
+    // NOTE(dzhwinter):
+    // the current block can not reuse the vars of a nested sub-block.
+    if (OpHasSubBlock(op_desc)) update_skip_set(op_desc);
+    // NOTE(dzhwinter):
+    // the input/output names of distributed ops need to
+    // stay the same between trainer and pserver.
+    if (op_desc->Type() == "send") update_skip_set(op_desc);
+    if (op_desc->Type() == "recv") update_skip_set(op_desc);
+    if (op_desc->Type() == "prefetch") update_skip_set(op_desc);
   }
-  return vars;
 }
 
 void MemoryOptimizePass::RenameVarInGraphDesc(const std::string& var,
......
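Compared with GetSubBlockVars, CollectSkipVarsSet writes directly into the pass-owned skip_set_, so ApplyImpl no longer copies a returned set. In isolation, the range insertion that update_skip_set performs looks like this (stand-in argument names):

    // Range-insert into an unordered_set, as update_skip_set does above.
    #include <string>
    #include <unordered_set>
    #include <vector>

    int main() {
      std::unordered_set<std::string> skip_set;
      std::vector<std::string> inputs = {"X", "W"};  // stand-ins for
      std::vector<std::string> outputs = {"Out"};    // op argument names
      skip_set.insert(inputs.begin(), inputs.end());
      skip_set.insert(outputs.begin(), outputs.end());
      return skip_set.size() == 3 ? 0 : 1;
    }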
@@ -55,9 +55,10 @@ class MemoryOptimizePass : public ir::Pass {
                           ir::Graph* graph) const;
   void SubGraphOptimize(OpDesc* op_desc) const;
-  // scan subblock and collect the output/input variables.
-  std::unordered_set<std::string> GetSubBlockVars(
-      const std::unordered_set<ir::Node*>&) const;
+  // 1. scan ops with a sub-block and collect their output/input vars:
+  //    while, while_grad, conditional_block
+  // 2. scan distributed ops and collect their output/input vars
+  void CollectSkipVarsSet(const std::unordered_set<ir::Node*>&) const;
 
  private:
   // Reuse Node Pool, Owned.
......
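One subtlety the header excerpt does not show: CollectSkipVarsSet is a const member function yet fills skip_set_, which compiles only if skip_set_ is declared mutable, presumably in the elided private section. The pattern in miniature:

    // A const method may modify members declared mutable; presumably how
    // skip_set_ is declared, since it is filled from const methods.
    #include <string>
    #include <unordered_set>

    class Pass {
     public:
      void Collect(const std::string& name) const { skip_set_.insert(name); }

     private:
      mutable std::unordered_set<std::string> skip_set_;  // cache-like state
    };

    int main() {
      const Pass pass;
      pass.Collect("w@grad");  // legal despite the const object
    }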
@@ -276,6 +276,7 @@ TEST(InferInplace, MultiGradInplaceInToOut) {
auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
auto in_to_outs = infer_inplace(*op, op->Block());
EXPECT_EQ(in_to_outs.size(), 3ul);
std::unordered_map<std::string, std::string> expects = {
{"o0", "a0"}, {"y0", "b0"}, {"z0", "c0"},
......
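The test hunk asserts the inferred in-place mapping o0→a0, y0→b0, z0→c0. A self-contained gtest sketch of that assertion style (hypothetical suite name and map contents):

    // Self-contained gtest sketch mirroring the assertion style above.
    #include <string>
    #include <unordered_map>
    #include "gtest/gtest.h"

    TEST(InferInplaceSketch, MapMatchesExpectation) {
      std::unordered_map<std::string, std::string> in_to_outs = {
          {"o0", "a0"}, {"y0", "b0"}, {"z0", "c0"}};
      std::unordered_map<std::string, std::string> expects = {
          {"o0", "a0"}, {"y0", "b0"}, {"z0", "c0"}};
      EXPECT_EQ(in_to_outs.size(), 3ul);
      EXPECT_EQ(in_to_outs, expects);
    }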
@@ -22,7 +22,11 @@ limitations under the License. */
 #include "paddle/fluid/framework/threadpool.h"
 #include "paddle/fluid/string/printf.h"
 
-DECLARE_bool(benchmark);
+DEFINE_bool(benchmark, false,
+            "Doing memory benchmark. It will make deleting scope synchronized, "
+            "and add some memory usage logs. "
+            "CUDA devices are asynchronous by default; setting this to true "
+            "will force ops to run in synchronous mode.");
 
 DEFINE_bool(
     eager_delete_scope, true,
......
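This hunk makes scope.cc the flag's single point of definition: the last hunk below deletes the old definition from the platform file, and the allocator file drops its DECLARE_bool in the next hunk because the VLOG guard replaces FLAGS_benchmark there. The gflags define/declare contract in miniature (standalone sketch, not Paddle code):

    // gflags: exactly one DEFINE_bool per flag; any other translation unit
    // that needs it uses DECLARE_bool and reads FLAGS_benchmark.
    #include <iostream>
    #include "gflags/gflags.h"

    DEFINE_bool(benchmark, false, "Enable synchronous, logged execution.");

    int main(int argc, char* argv[]) {
      gflags::ParseCommandLineFlags(&argc, &argv, true);
      std::cout << "benchmark = " << std::boolalpha << FLAGS_benchmark << '\n';
    }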
@@ -35,7 +35,6 @@ DEFINE_bool(init_allocated_mem, false,
             "To find this error in time, we use init_allocated_mem to indicate "
             "that initializing the allocated memory with a small value "
             "during unit testing.");
-DECLARE_bool(benchmark);
 DECLARE_double(fraction_of_gpu_memory_to_use);
 
 namespace paddle {
@@ -188,21 +187,20 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
     platform::SetDeviceId(place.device);
     size_t avail, total;
     platform::GpuMemoryUsage(&avail, &total);
-    LOG(WARNING) << "Cannot allocate " << string::HumanReadableSize(size)
-                 << " in GPU " << place.device << ", available "
-                 << string::HumanReadableSize(avail);
-    LOG(WARNING) << "total " << total;
-    LOG(WARNING) << "GpuMinChunkSize "
-                 << string::HumanReadableSize(
-                        buddy_allocator->GetMinChunkSize());
-    LOG(WARNING) << "GpuMaxChunkSize "
-                 << string::HumanReadableSize(
-                        buddy_allocator->GetMaxChunkSize());
-    LOG(WARNING) << "GPU memory used: "
-                 << string::HumanReadableSize(Used<platform::CUDAPlace>(place));
+    LOG(FATAL) << "Cannot allocate " << string::HumanReadableSize(size)
+               << " in GPU " << place.device << ", available "
+               << string::HumanReadableSize(avail) << ", total " << total
+               << ", GpuMinChunkSize "
+               << string::HumanReadableSize(buddy_allocator->GetMinChunkSize())
+               << ", GpuMaxChunkSize "
+               << string::HumanReadableSize(buddy_allocator->GetMaxChunkSize())
+               << ", GPU memory used: "
+               << string::HumanReadableSize(Used<platform::CUDAPlace>(place));
     platform::SetDeviceId(cur_dev);
   } else {
-    if (FLAGS_benchmark) allocation::GPUMemMonitor.Add(place.device, size);
+    if (VLOG_IS_ON(3)) {
+      allocation::GPUMemMonitor.Add(place.device, size);
+    }
     if (FLAGS_init_allocated_mem) {
       cudaMemset(ptr, 0xEF, size);
     }
@@ -218,7 +216,9 @@ void Free<platform::CUDAPlace>(const platform::CUDAPlace &place, void *p,
                                size_t size) {
 #ifdef PADDLE_WITH_CUDA
   GetGPUBuddyAllocator(place.device)->Free(p);
-  if (FLAGS_benchmark) allocation::GPUMemMonitor.Minus(place.device, size);
+  if (VLOG_IS_ON(3)) {
+    allocation::GPUMemMonitor.Minus(place.device, size);
+  }
 #else
   PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
 #endif
......
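Switching the monitor guards from FLAGS_benchmark to VLOG_IS_ON(3) ties them to glog's verbosity level, so they activate with --v=3 or GLOG_v=3 rather than a dedicated flag. A minimal standalone glog sketch of that gating:

    // glog verbosity gating: the block runs only when verbosity >= 3,
    // e.g. GLOG_v=3 ./a.out (mirrors the VLOG_IS_ON(3) guard above).
    #include "glog/logging.h"

    int main(int argc, char* argv[]) {
      google::InitGoogleLogging(argv[0]);
      if (VLOG_IS_ON(3)) {
        VLOG(3) << "memory monitor updated";  // stands in for GPUMemMonitor
      }
    }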
@@ -14,12 +14,6 @@ limitations under the License. */
 #include "paddle/fluid/platform/place.h"
 
-DEFINE_bool(benchmark, false,
-            "Doing memory benchmark. It will make deleting scope synchronized, "
-            "and add some memory usage logs."
-            "Default cuda is asynchronous device, set to True will"
-            "force op run in synchronous mode.");
-
 namespace paddle {
 namespace platform {
......