diff --git a/paddle/fluid/framework/block_desc.cc b/paddle/fluid/framework/block_desc.cc
index e971ebd396fab8fec4f70c59b8c2a8d7425d5d06..f8fe099255df820582975922adfd5f4f1f1a2e6b 100644
--- a/paddle/fluid/framework/block_desc.cc
+++ b/paddle/fluid/framework/block_desc.cc
@@ -311,9 +311,28 @@ void BlockDesc::MoveFrom(BlockDesc *block) {
           attr_type == proto::AttrType::VARS) {
         dst_op->UpdateVarAttr(attr_name, attr_value);
       } else if (attr_type == proto::AttrType::BLOCK) {
-        auto block_id = PADDLE_GET_CONST(BlockDesc *, attr_value)->ID();
-        dst_op->SetBlockAttr(attr_name, prog_->MutableBlock(block_id));
-        VLOG(10) << "Set block attr " << attr_name << " id " << block_id;
+        ProgramDesc *program = block->Program();
+        std::vector<framework::BlockDesc *> old_block_desc;
+        for (int i = 0; i < program->Proto()->blocks_size(); ++i) {
+          // record all block desc's ptr from origin block's program
+          old_block_desc.emplace_back(program->MutableBlock(i));
+        }
+        framework::BlockDesc *block_desc =
+            PADDLE_GET_CONST(BlockDesc *, attr_value);
+        if (std::find(old_block_desc.begin(),
+                      old_block_desc.end(),
+                      block_desc) != old_block_desc.end()) {
+          // The block is owned by the origin block's program. Just use id to
+          // get the corresponding block.
+          auto block_id = block_desc->ID();
+          dst_op->SetBlockAttr(attr_name, prog_->MutableBlock(block_id));
+          VLOG(10) << "Set block attr " << attr_name << " id " << block_id;
+        } else {
+          // The block is not owned by the origin block's program. Should copy
+          // the real block desc instead of logical block in the program.
+          dst_op->SetBlockAttr(attr_name, block_desc);
+          VLOG(10) << "Set block attr " << attr_name << " from attr_value";
+        }
       } else if (attr_type == proto::AttrType::BLOCKS) {
         auto old_blocks = PADDLE_GET_CONST(std::vector<BlockDesc *>, attr_value);
diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc
index 4cd0a2c9e1e5bd1451982eab415969673b64a37d..6b6eb3f8d674b3928b17b625a2c5b0767d6fa0b5 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore.cc
@@ -31,7 +31,7 @@
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 
 PADDLE_DEFINE_EXPORTED_bool(new_executor_use_inplace,
-                            true,
+                            false,
                             "Use inplace in new executor");
 PADDLE_DEFINE_EXPORTED_bool(new_executor_use_local_scope,
                             true,
diff --git a/paddle/fluid/operators/collective/alltoall_op.cc b/paddle/fluid/operators/collective/alltoall_op.cc
index e476d956a459cfa9d67c3ac6b65fa23b27a28a25..fa7476e07cc11535f4477771d027f3fc478eb24f 100644
--- a/paddle/fluid/operators/collective/alltoall_op.cc
+++ b/paddle/fluid/operators/collective/alltoall_op.cc
@@ -75,8 +75,6 @@ class AllToAllOpGradMaker : public framework::SingleGradOpMaker<T> {
   }
 };
 
-DECLARE_INPLACE_OP_INFERER(AllToAllInplaceInferer, {"X", "Out"});
-
 }  // namespace operators
 }  // namespace paddle
 
@@ -87,8 +85,7 @@ REGISTER_OPERATOR(alltoall,
                   ops::AllToAllOp,
                   ops::AllToAllOpMaker,
                   ops::AllToAllOpGradMaker<paddle::framework::OpDesc>,
-                  ops::AllToAllOpGradMaker<paddle::imperative::OpBase>,
-                  ops::AllToAllInplaceInferer)
+                  ops::AllToAllOpGradMaker<paddle::imperative::OpBase>)
 
 REGISTER_OP_CPU_KERNEL(alltoall,
                        ops::AllToAllOpCPUKernel<float>,
diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py
index b8f6e2eceb5c1592c6daa005b818f7be1ae8f861..218ee94a22beaa75f9f062ee515aeaf69657ae62 100755
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -24,7 +24,7 @@ from .wrapped_decorator import signature_safe_contextmanager
 import six
 from .data_feeder import convert_dtype
 from .framework import Program, default_main_program, Variable, Operator
-from .framework import convert_np_dtype_to_dtype_
+from .framework import convert_np_dtype_to_dtype_, _apply_pass
 
 from . import core
 from . import unique_name
@@ -1468,6 +1468,23 @@ class Executor(object):
             assert isinstance(program, Program)
             return True
 
+        def _apply_inplace_addto_pass(program, enable_inplace, enable_addto,
+                                      skip_var_names):
+            use_cuda = True if core.is_compiled_with_cuda() else False
+
+            attrs = {"use_cuda": use_cuda, "mem_opt_skip_vars": skip_var_names}
+            attr_types = {"use_cuda": "bool", "mem_opt_skip_vars": "list[str]"}
+
+            empty_startup_program = Program()
+            if enable_inplace:
+                pass_name = "buffer_shared_inplace_pass"
+                _apply_pass(program, empty_startup_program, pass_name, attrs,
+                            attr_types)
+            if enable_addto and use_cuda:
+                pass_name = "inplace_addto_op_pass"
+                _apply_pass(program, empty_startup_program, pass_name, attrs,
+                            attr_types)
+
         # NOTE: This is an experimental feature. If `export FLAGS_USE_STANDALONE_EXECUTOR=1 `,
         # use StandaloneExecutor to run the program.
         if return_merged and self._enable_interpreter_core and _can_use_interpreter_core(
@@ -1494,6 +1511,7 @@ class Executor(object):
             if key not in self._executor_cache._cached_executors:
                 # To apply IR pass, compile the Program to IrGraph and convert it back to Program
                 if isinstance(program, compiler.CompiledProgram):
+                    build_strategy = program._build_strategy
                     # print(f"Program before convert:\n {inner_program}", flush=True)
                     program._compile(scope, self.place)
                     ir_graph = framework.IrGraph(program._graph)
@@ -1503,6 +1521,7 @@ class Executor(object):
                         "FLAGS_USE_STANDALONE_EXECUTOR and FLAGS_CONVERT_GRAPH_TO_PROGRAM is set to 1. Graph will be converted to Program and executed using new executor."
                     )
                 else:
+                    build_strategy = None
                     from paddle.incubate.autograd import prim_enabled, prim2orig
                     if prim_enabled() and program == default_main_program():
                         prim2orig()
@@ -1515,6 +1534,17 @@ class Executor(object):
                         fetch_var_name=fetch_var_name,
                         use_fetch_v2=True)
 
+                # standalone executor will apply buffer_shared_inplace_pass and
+                # inplace_addto_op_pass to program according to build_strategy
+                enable_inplace = True if build_strategy is None or build_strategy.enable_inplace else False
+                enable_addto = True if build_strategy is not None and build_strategy.enable_addto else False
+                if enable_inplace or enable_addto:
+                    # inplace should skip feed and fetch var
+                    skip_var_names = eval(
+                        _get_program_cache_key(feed, fetch_list))
+                    _apply_inplace_addto_pass(program, enable_inplace,
+                                              enable_addto, skip_var_names)
+
                 new_program = program.clone()
                 new_exe = _StandaloneExecutor(self.place, new_program,
                                               scope)
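
Note: the `skip_var_names = eval(_get_program_cache_key(feed, fetch_list))` line above relies on the cache key being the `str()` of the combined feed/fetch variable-name list, so `eval` turns it back into a Python list. A minimal sketch of that assumed round trip (the exact key format is internal to `executor.py`):

    # Assumed: _get_program_cache_key returns str(feed_var_names + fetch_var_names),
    # so eval() recovers the list of variables the inplace passes must skip.
    feed = {"x": None, "y": None}      # stand-in feed dict
    fetch_var_names = ["out"]          # stand-in fetch-variable names
    cache_key = str(list(feed.keys()) + fetch_var_names)  # "['x', 'y', 'out']"
    skip_var_names = eval(cache_key)
    assert skip_var_names == ["x", "y", "out"]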
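Note: with this change the standalone executor applies `buffer_shared_inplace_pass` and `inplace_addto_op_pass` itself, driven by the `BuildStrategy` of a `CompiledProgram` (inplace is on by default when no strategy is given; addto additionally requires CUDA). A minimal usage sketch, assuming the public `paddle.static` API; the network and variable names are illustrative placeholders:

    import os
    os.environ["FLAGS_USE_STANDALONE_EXECUTOR"] = "1"  # opt in to the new executor

    import numpy as np
    import paddle

    paddle.enable_static()
    main_prog, startup_prog = paddle.static.Program(), paddle.static.Program()
    with paddle.static.program_guard(main_prog, startup_prog):
        x = paddle.static.data(name="x", shape=[None, 8], dtype="float32")
        out = paddle.static.nn.fc(x, size=4)

    build_strategy = paddle.static.BuildStrategy()
    build_strategy.enable_inplace = True   # -> buffer_shared_inplace_pass
    build_strategy.enable_addto = False    # -> inplace_addto_op_pass (CUDA only)
    compiled = paddle.static.CompiledProgram(main_prog, build_strategy=build_strategy)

    exe = paddle.static.Executor(paddle.CPUPlace())
    exe.run(startup_prog)
    res = exe.run(compiled,
                  feed={"x": np.random.rand(2, 8).astype("float32")},
                  fetch_list=[out])

Because the passes now run on the `Program` before it is handed to `_StandaloneExecutor`, the in-executor inplace path is no longer needed, which is why `FLAGS_new_executor_use_inplace` flips to `false` above.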