Unverified commit d8d124b6, authored by pangyoki, committed by GitHub

Apply buffer_shared_inplace_pass and inplace_addto_op_pass to the program in Standalone Executor (#45085)

* apply inplace addto in python apply_pass

* fix

* apply inplace pass for program

* skip feed and fetch var

* fix block_desc.move_from

* fix block desc

* alltoall remove inplace

* fix
Parent f36c4da5
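For readers skimming the diffs below: this change teaches the standalone (new) executor to apply the same memory-reuse passes (buffer_shared_inplace_pass and inplace_addto_op_pass) that ParallelExecutor's BuildStrategy controls, directly on the Program before it is cached and run. A minimal, hedged usage sketch in Python; the BuildStrategy/CompiledProgram calls and everything not named in the diff are assumptions about the public static-graph API of that Paddle release, not part of the commit:

    import os
    import numpy as np
    import paddle

    # Experimental switches referenced in the Executor changes below.
    os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] = '1'
    os.environ['FLAGS_CONVERT_GRAPH_TO_PROGRAM'] = '1'

    paddle.enable_static()
    main_prog, startup_prog = paddle.static.Program(), paddle.static.Program()
    with paddle.static.program_guard(main_prog, startup_prog):
        x = paddle.static.data(name='x', shape=[None, 16], dtype='float32')
        loss = paddle.mean(paddle.static.nn.fc(x, size=4))

    # enable_inplace drives buffer_shared_inplace_pass; enable_addto drives
    # inplace_addto_op_pass (the latter only takes effect on CUDA builds).
    build_strategy = paddle.static.BuildStrategy()
    build_strategy.enable_inplace = True
    build_strategy.enable_addto = False
    compiled = paddle.static.CompiledProgram(main_prog, build_strategy=build_strategy)

    exe = paddle.static.Executor(paddle.CPUPlace())
    exe.run(startup_prog)
    out, = exe.run(compiled,
                   feed={'x': np.random.rand(2, 16).astype('float32')},
                   fetch_list=[loss])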
@@ -311,9 +311,28 @@ void BlockDesc::MoveFrom(BlockDesc *block) {
                attr_type == proto::AttrType::VARS) {
       dst_op->UpdateVarAttr(attr_name, attr_value);
     } else if (attr_type == proto::AttrType::BLOCK) {
-      auto block_id = PADDLE_GET_CONST(BlockDesc *, attr_value)->ID();
-      dst_op->SetBlockAttr(attr_name, prog_->MutableBlock(block_id));
-      VLOG(10) << "Set block attr " << attr_name << " id " << block_id;
+      ProgramDesc *program = block->Program();
+      std::vector<framework::BlockDesc *> old_block_desc;
+      for (int i = 0; i < program->Proto()->blocks_size(); ++i) {
+        // record all block desc's ptr from origin block's program
+        old_block_desc.emplace_back(program->MutableBlock(i));
+      }
+      framework::BlockDesc *block_desc =
+          PADDLE_GET_CONST(BlockDesc *, attr_value);
+      if (std::find(old_block_desc.begin(),
+                    old_block_desc.end(),
+                    block_desc) != old_block_desc.end()) {
+        // The block is owned by the origin block's program. Just use id to
+        // get the corresponding block.
+        auto block_id = block_desc->ID();
+        dst_op->SetBlockAttr(attr_name, prog_->MutableBlock(block_id));
+        VLOG(10) << "Set block attr " << attr_name << " id " << block_id;
+      } else {
+        // The block is not owned by the origin block's program. Should copy
+        // the real block desc instead of logical block in the program.
+        dst_op->SetBlockAttr(attr_name, block_desc);
+        VLOG(10) << "Set block attr " << attr_name << " from attr_value";
+      }
     } else if (attr_type == proto::AttrType::BLOCKS) {
       auto old_blocks =
           PADDLE_GET_CONST(std::vector<BlockDesc *>, attr_value);
@@ -31,7 +31,7 @@
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"

 PADDLE_DEFINE_EXPORTED_bool(new_executor_use_inplace,
-                            true,
+                            false,
                             "Use inplace in new executor");
 PADDLE_DEFINE_EXPORTED_bool(new_executor_use_local_scope,
                             true,
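The default of FLAGS_new_executor_use_inplace flips to false here because inplace reuse is now handled by running buffer_shared_inplace_pass over the Program up front (see the Executor changes below) instead of inside the interpreter core. Since PADDLE_DEFINE_EXPORTED_bool exposes the flag, the old interpreter-level behavior can still be switched back on explicitly; a hedged sketch, assuming the flag remains settable from Python:

    import paddle

    # Exported flags can be toggled at runtime or via the environment
    # (e.g. `export FLAGS_new_executor_use_inplace=1` before launching).
    paddle.set_flags({'FLAGS_new_executor_use_inplace': True})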
@@ -75,8 +75,6 @@ class AllToAllOpGradMaker : public framework::SingleGradOpMaker<T> {
   }
 };

-DECLARE_INPLACE_OP_INFERER(AllToAllInplaceInferer, {"X", "Out"});

 }  // namespace operators
 }  // namespace paddle
@@ -87,8 +85,7 @@ REGISTER_OPERATOR(alltoall,
                  ops::AllToAllOp,
                  ops::AllToAllOpMaker,
                  ops::AllToAllOpGradMaker<paddle::framework::OpDesc>,
-                 ops::AllToAllOpGradMaker<paddle::imperative::OpBase>,
-                 ops::AllToAllInplaceInferer)
+                 ops::AllToAllOpGradMaker<paddle::imperative::OpBase>)

 REGISTER_OP_CPU_KERNEL(alltoall,
                        ops::AllToAllOpCPUKernel<float>,
@@ -24,7 +24,7 @@ from .wrapped_decorator import signature_safe_contextmanager
 import six
 from .data_feeder import convert_dtype
 from .framework import Program, default_main_program, Variable, Operator
-from .framework import convert_np_dtype_to_dtype_
+from .framework import convert_np_dtype_to_dtype_, _apply_pass
 from . import core
 from . import unique_name
@@ -1468,6 +1468,23 @@ class Executor(object):
             assert isinstance(program, Program)
             return True

+        def _apply_inplace_addto_pass(program, enable_inplace, enable_addto,
+                                      skip_var_names):
+            use_cuda = True if core.is_compiled_with_cuda() else False
+
+            attrs = {"use_cuda": use_cuda, "mem_opt_skip_vars": skip_var_names}
+            attr_types = {"use_cuda": "bool", "mem_opt_skip_vars": "list[str]"}
+
+            empty_startup_program = Program()
+            if enable_inplace:
+                pass_name = "buffer_shared_inplace_pass"
+                _apply_pass(program, empty_startup_program, pass_name, attrs,
+                            attr_types)
+            if enable_addto and use_cuda:
+                pass_name = "inplace_addto_op_pass"
+                _apply_pass(program, empty_startup_program, pass_name, attrs,
+                            attr_types)
+
         # NOTE: This is an experimental feature. If `export FLAGS_USE_STANDALONE_EXECUTOR=1`,
         # use StandaloneExecutor to run the program.
         if return_merged and self._enable_interpreter_core and _can_use_interpreter_core(
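The nested helper added above is a thin wrapper around framework._apply_pass: both passes take the same two attributes, use_cuda and mem_opt_skip_vars (the variables whose buffers must never be reused). A rough standalone sketch of driving the same pass on a hand-built Program; _apply_pass is an internal API and the skip list here is illustrative, so treat this as an assumption-laden example rather than a supported recipe:

    import paddle
    from paddle.fluid import core
    from paddle.fluid.framework import Program, _apply_pass

    paddle.enable_static()
    main_prog = paddle.static.Program()
    with paddle.static.program_guard(main_prog):
        x = paddle.static.data(name='x', shape=[None, 8], dtype='float32')
        loss = paddle.mean(paddle.nn.functional.relu(x + 1.0))

    # Same attributes the helper builds; feed/fetch vars are skipped so their
    # buffers are never handed to another op.
    attrs = {"use_cuda": core.is_compiled_with_cuda(),
             "mem_opt_skip_vars": ['x', loss.name]}
    attr_types = {"use_cuda": "bool", "mem_opt_skip_vars": "list[str]"}
    _apply_pass(main_prog, Program(), "buffer_shared_inplace_pass", attrs, attr_types)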
@@ -1494,6 +1511,7 @@ class Executor(object):
             if key not in self._executor_cache._cached_executors:
                 # To apply IR pass, compile the Program to IrGraph and convert it back to Program
                 if isinstance(program, compiler.CompiledProgram):
+                    build_strategy = program._build_strategy
                     # print(f"Program before convert:\n {inner_program}", flush=True)
                     program._compile(scope, self.place)
                     ir_graph = framework.IrGraph(program._graph)
@@ -1503,6 +1521,7 @@ class Executor(object):
                         "FLAGS_USE_STANDALONE_EXECUTOR and FLAGS_CONVERT_GRAPH_TO_PROGRAM is set to 1. Graph will be converted to Program and executed using new executor."
                     )
                 else:
+                    build_strategy = None
                     from paddle.incubate.autograd import prim_enabled, prim2orig
                     if prim_enabled() and program == default_main_program():
                         prim2orig()
@@ -1515,6 +1534,17 @@ class Executor(object):
                     fetch_var_name=fetch_var_name,
                     use_fetch_v2=True)

+                # standalone executor will apply buffer_shared_inplace_pass and
+                # inplace_addto_op_pass to program according to build_strategy
+                enable_inplace = True if build_strategy is None or build_strategy.enable_inplace else False
+                enable_addto = True if build_strategy is not None and build_strategy.enable_addto else False
+                if enable_inplace or enable_addto:
+                    # inplace should skip feed and fetch var
+                    skip_var_names = eval(
+                        _get_program_cache_key(feed, fetch_list))
+                    _apply_inplace_addto_pass(program, enable_inplace,
+                                              enable_addto, skip_var_names)
+
                 new_program = program.clone()
                 new_exe = _StandaloneExecutor(self.place, new_program,
                                               scope)
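One subtle line in the block above is skip_var_names = eval(_get_program_cache_key(feed, fetch_list)): the program cache key is the string form of the list of feed and fetch variable names, so eval() turns it straight back into the list that becomes mem_opt_skip_vars. A small illustration of the idea; the variable names and the exact key format are assumptions inferred from that eval() call, not taken from the commit:

    # Conceptually, the skip list is just "feed names + fetch names".
    feed = {'x': None}                 # feed dict keys are the feed variable names
    fetch_names = ['mean_0.tmp_0']     # hypothetical fetch variable name
    cache_key = str(list(feed.keys()) + fetch_names)   # "['x', 'mean_0.tmp_0']"
    skip_var_names = eval(cache_key)                   # ['x', 'mean_0.tmp_0']
    # These variables keep their own buffers, so the inplace/addto passes skip them.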