Unverified commit 2b557da0, authored by Zeng Jinle, committed by GitHub

expose gc analysis interface (#34092)

Parent 3316409c
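This commit exposes the executor's eager-deletion (GC) analysis to Python as fluid.core._get_eager_deletion_vars. A minimal usage sketch against this patch (the tiny network below is a hypothetical illustration, not part of the change):

import paddle
import paddle.fluid as fluid

paddle.enable_static()

# A toy static-graph program with intermediates the GC can free.
image = fluid.data(name='image', shape=[None, 784], dtype='float32')
hidden = fluid.layers.fc(image, size=100, act='relu')
loss = fluid.layers.reduce_mean(hidden)

# For each block and each op, list the variables that may be freed right
# after that op runs; variables in skip_vars (here, the loss) are kept.
gc_vars = fluid.core._get_eager_deletion_vars(
    fluid.default_main_program().desc, [loss.name])
assert len(gc_vars) == fluid.default_main_program().num_blocks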
@@ -254,7 +254,7 @@ cc_library(variable_helper SRCS variable_helper.cc DEPS lod_tensor)
cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry denormal device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
cc_library(executor_gc_helper SRCS executor_gc_helper.cc DEPS scope proto_desc operator garbage_collector)
cc_library(executor_gc_helper SRCS executor_gc_helper.cc DEPS scope proto_desc operator garbage_collector op_registry while_op_helper recurrent_op_helper conditional_block_op_helper)
if(WITH_DISTRIBUTE)
if(WITH_PSLIB)
cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc
@@ -20,8 +20,12 @@
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/operators/controlflow/conditional_block_op_helper.h"
#include "paddle/fluid/operators/controlflow/recurrent_op_helper.h"
#include "paddle/fluid/operators/controlflow/while_op_helper.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
@@ -185,5 +189,91 @@ void DeleteUnusedTensors(
}
}
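// Instantiate an OperatorBase for every op desc in the block; the GC
// analysis below queries the resulting operator objects (inputs, outputs,
// no-need-buffer vars) rather than the raw descs.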
static std::vector<std::unique_ptr<OperatorBase>> CreateOpsFromBlock(
const BlockDesc &block) {
std::vector<std::unique_ptr<OperatorBase>> ops;
size_t op_num = block.OpSize();
ops.reserve(op_num);
for (size_t i = 0; i < op_num; ++i) {
auto *op_desc = block.Op(i);
ops.push_back(OpRegistry::CreateOp(*op_desc));
}
return ops;
}
std::vector<std::vector<std::vector<std::string>>> GetEagerDeletionCleanVars(
const ProgramDesc &origin_program,
const std::vector<std::string> &skip_vars) {
ProgramDesc program{origin_program};
size_t block_num = program.Size();
PADDLE_ENFORCE_GE(block_num, 1,
platform::errors::PermissionDenied(
"Program should have at least one block"));
// Prepare safe eager deletion on ops that own sub-blocks (conditional_block,
// while, recurrent and their grad ops); this records the skip vars each
// sub-block must keep alive as a skip_eager_deletion_vars attribute.
auto global_block_ops = CreateOpsFromBlock(program.Block(0));
operators::PrepareSafeEagerDeletionOnConditionalOpAndConditionalGradOp(
program, 0, global_block_ops);
operators::PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(program, 0,
global_block_ops);
operators::PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp(
program, 0, global_block_ops);
// Find the skip vars of each block: block 0 takes the caller-provided list,
// and every sub-block takes the skip_eager_deletion_vars attribute recorded
// on the op that owns it.
std::vector<std::vector<std::string>> skip_vars_on_each_block(block_num);
skip_vars_on_each_block[0] = skip_vars;
std::vector<bool> found_skip_vars(block_num, false);
found_skip_vars[0] = true;
const char *kSubBlock = "sub_block";
const char *kSkipEagerDeletionVars = "skip_eager_deletion_vars";
for (size_t i = 0; i < block_num; ++i) {
const auto &block = program.Block(i);
size_t op_num = block.OpSize();
for (size_t j = 0; j < op_num; ++j) {
auto *op = block.Op(j);
if (!op->HasAttr(kSubBlock) || !op->HasAttr(kSkipEagerDeletionVars)) {
continue;
}
auto sub_block_id = op->GetAttrIfExists<BlockDesc *>(kSubBlock)->ID();
PADDLE_ENFORCE_GE(sub_block_id, 0,
platform::errors::PermissionDenied(
"sub_block id must be non-negative number"));
PADDLE_ENFORCE_LT(sub_block_id, block_num,
platform::errors::PermissionDenied(
"sub_block id exceeds max block num"));
PADDLE_ENFORCE_EQ(
found_skip_vars[sub_block_id], false,
platform::errors::PermissionDenied(
"there are 2 ops which refer to the same sub_block %d",
sub_block_id));
found_skip_vars[sub_block_id] = true;
auto sub_block_skip_vars =
op->GetAttrIfExists<std::vector<std::string>>(kSkipEagerDeletionVars);
skip_vars_on_each_block[sub_block_id] = std::move(sub_block_skip_vars);
}
}
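// With every block's skip vars known, run the unused-var analysis block by
// block and record, for each op, the variables that can be freed right after
// that op finishes.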
std::vector<std::vector<std::vector<std::string>>> result;
result.reserve(block_num);
for (size_t i = 0; i < block_num; ++i) {
const auto &block = program.Block(i);
const auto block_ops = CreateOpsFromBlock(block);
const auto &block_skip_vars = skip_vars_on_each_block[i];
auto delete_var_map = GetUnusedVars(block, block_ops, block_skip_vars);
std::vector<std::vector<std::string>> block_result;
block_result.reserve(block_ops.size());
for (const auto &op : block_ops) {
auto &delete_vars = delete_var_map[op.get()];
std::sort(delete_vars.begin(), delete_vars.end());  // sort for a deterministic result
block_result.emplace_back(delete_vars);
}
result.emplace_back(std::move(block_result));
}
return result;
}
} // namespace framework
} // namespace paddle
@@ -43,5 +43,11 @@ void DeleteUnusedTensors(
&delete_vars_map,
GarbageCollector *gc);
// Get the variables that GC cleans after each op runs. This function is
// intended for static analysis.
// The result is in the format: result[block_idx][op_idx][delete_var_idx]
std::vector<std::vector<std::vector<std::string>>> GetEagerDeletionCleanVars(
const ProgramDesc &program, const std::vector<std::string> &skip_vars = {});
} // namespace framework
} // namespace paddle
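As the header comment says, the result is indexed as result[block_idx][op_idx][delete_var_idx]. A sketch of walking it from the Python side (assuming prog is an already-constructed static Program):

gc_vars = fluid.core._get_eager_deletion_vars(prog.desc, [])
for block_idx, per_op in enumerate(gc_vars):
    for op_idx, names in enumerate(per_op):
        # names: variables that are safe to free once this op has executed
        print(block_idx, op_idx, names)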
@@ -31,6 +31,7 @@ limitations under the License. */
#include "paddle/fluid/framework/custom_operator.h"
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/executor_gc_helper.h"
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/garbage_collector.h"
@@ -1849,6 +1850,8 @@ All parameter, weight, gradient are variables in Paddle.
py::return_value_policy::reference)
.def("finalize", &TrainerBase::Finalize);
m.def("_get_eager_deletion_vars", &framework::GetEagerDeletionCleanVars);
py::class_<framework::Executor>(m, "Executor")
.def(py::init<const platform::Place &>())
.def("close", &Executor::Close)
@@ -263,6 +263,7 @@ if avx_supported():
from .core_avx import _get_all_register_op_kernels
from .core_avx import _is_program_version_supported
from .core_avx import _set_eager_deletion_mode
from .core_avx import _get_eager_deletion_vars
from .core_avx import _set_fuse_parameter_group_size
from .core_avx import _set_fuse_parameter_memory_size
from .core_avx import _is_dygraph_debug_enabled
@@ -311,6 +312,7 @@ if load_noavx:
from .core_noavx import _get_all_register_op_kernels
from .core_noavx import _is_program_version_supported
from .core_noavx import _set_eager_deletion_mode
from .core_noavx import _get_eager_deletion_vars
from .core_noavx import _set_fuse_parameter_group_size
from .core_noavx import _set_fuse_parameter_memory_size
from .core_noavx import _is_dygraph_debug_enabled
@@ -21,6 +21,10 @@ import paddle.fluid as fluid
import six
import unittest
import multiprocessing
from functools import reduce
import paddle
paddle.enable_static()
fluid.core._set_eager_deletion_mode(0.0, 1.0, True)
@@ -114,6 +118,12 @@ class TestExecutor(unittest.TestCase):
self.assertEqual(len(outline_p_vars), 0)
self.assertEqual(len(outline_np_vars), 0)
def assert_gc_vars(self, program, skip_vars, non_persistable_vars):
gc_vars = fluid.core._get_eager_deletion_vars(program.desc, skip_vars)
self.assertEqual(len(gc_vars), program.num_blocks)
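# Flatten block 0's per-op deletion lists into a single flat list of names.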
gc_vars = reduce(lambda x, y: x + y, gc_vars[0])
self.assertEqual(set(gc_vars), set(non_persistable_vars))
def executor_main(self):
image, label, loss = simple_fc_net()
loss.persistable = False
@@ -122,6 +132,9 @@ class TestExecutor(unittest.TestCase):
print('Non-persistable var number {}'.format(len(non_persistables)))
print(non_persistables)
self.assert_gc_vars(fluid.default_main_program(), [loss.name],
non_persistables)
exe = fluid.Executor(self.place)
exe.run(fluid.default_startup_program())
@@ -147,6 +160,8 @@ class TestExecutor(unittest.TestCase):
loss.persistable = False
persistables, non_persistables = get_persistables_and_non_persistables(
fluid.default_main_program(), [loss.name])
self.assert_gc_vars(fluid.default_main_program(), [loss.name],
non_persistables)
exe = fluid.Executor(self.place)
exe.run(fluid.default_startup_program())
@@ -26,6 +26,8 @@ from paddle.fluid import ParamAttr
from paddle.fluid.framework import Program, grad_var_name
from paddle.fluid.executor import Executor
from paddle.fluid.backward import append_backward
import paddle
paddle.enable_static()
np.random.seed(123)
os.environ["CPU_NUM"] = "1"
@@ -163,6 +165,9 @@ class EagerDeletionRecurrentOpTest1(unittest.TestCase):
return rnn()
def forward(self):
gc_vars = core._get_eager_deletion_vars(self.main_program.desc,
[self.output.name])
self.assertEqual(len(gc_vars), self.main_program.num_blocks)
self.feed_map = {
x: create_tensor(getattr(self.py_rnn, x), self.place)
for x in self.data_field
@@ -184,6 +189,10 @@
for x in self.data_field
]
gc_vars = core._get_eager_deletion_vars(
self.main_program.desc, [var.name for var in fetch_list])
self.assertEqual(len(gc_vars), self.main_program.num_blocks)
exe = Executor(self.place)
return exe.run(self.main_program,
feed=self.feed_map,
@@ -27,6 +27,8 @@ import paddle.fluid.compiler as compiler
import numpy
import multiprocessing
import paddle
paddle.enable_static()
fluid.core._set_eager_deletion_mode(0.0, 1.0, True)
@@ -125,6 +127,10 @@ class TestEagerDeletionWhileOpBase(unittest.TestCase):
optim = fluid.optimizer.Adam(learning_rate=1e-3)
optim.minimize(loss)
gc_vars = core._get_eager_deletion_vars(
fluid.default_main_program().desc, [loss.name])
self.assertEqual(len(gc_vars), 5)
exe = Executor(self.place)
exe.run(fluid.default_startup_program())