Unverified commit d8d124b6, authored by pangyoki, committed by GitHub

apply buffer_shared_inplace_pass and inplace_addto_op_pass to the program in Standalone Executor (#45085)

* apply inplace addto in python apply_pass

* fix

* apply inplace pass for program

* skip feed and fetch var

* fix block_desc.move_from

* fix block desc

* alltoall remove inplace

* fix
Parent f36c4da5
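
These two passes only take effect on the StandaloneExecutor code path. Below is a minimal sketch of exercising the feature end to end, based on the `FLAGS_USE_STANDALONE_EXECUTOR=1` NOTE in the diff further down; the toy network and data are illustrative and not part of this commit:

```python
import os
# Opt in to the StandaloneExecutor (experimental at the time of this commit).
os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] = '1'

import numpy as np
import paddle

paddle.enable_static()

main_prog = paddle.static.Program()
startup_prog = paddle.static.Program()
with paddle.static.program_guard(main_prog, startup_prog):
    x = paddle.static.data(name='x', shape=[4, 8], dtype='float32')
    t = paddle.scale(x, scale=2.0)      # intermediate var, eligible for reuse
    y = paddle.nn.functional.relu(t)    # relu declares an X->Out inplace pair
    loss = paddle.mean(y)

exe = paddle.static.Executor(paddle.CPUPlace())
exe.run(startup_prog)
out = exe.run(main_prog,
              feed={'x': np.random.rand(4, 8).astype('float32')},
              fetch_list=[loss])
```

Feed and fetch variables are put on the skip list (see the executor changes below), so only intermediates such as `t` are candidates for buffer sharing.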
@@ -311,9 +311,28 @@ void BlockDesc::MoveFrom(BlockDesc *block) {
                  attr_type == proto::AttrType::VARS) {
         dst_op->UpdateVarAttr(attr_name, attr_value);
       } else if (attr_type == proto::AttrType::BLOCK) {
-        auto block_id = PADDLE_GET_CONST(BlockDesc *, attr_value)->ID();
-        dst_op->SetBlockAttr(attr_name, prog_->MutableBlock(block_id));
-        VLOG(10) << "Set block attr " << attr_name << " id " << block_id;
+        ProgramDesc *program = block->Program();
+        std::vector<framework::BlockDesc *> old_block_desc;
+        for (int i = 0; i < program->Proto()->blocks_size(); ++i) {
+          // record all block descs' pointers from the origin block's program
+          old_block_desc.emplace_back(program->MutableBlock(i));
+        }
+        framework::BlockDesc *block_desc =
+            PADDLE_GET_CONST(BlockDesc *, attr_value);
+        if (std::find(old_block_desc.begin(),
+                      old_block_desc.end(),
+                      block_desc) != old_block_desc.end()) {
+          // The block is owned by the origin block's program. Just use the
+          // id to get the corresponding block.
+          auto block_id = block_desc->ID();
+          dst_op->SetBlockAttr(attr_name, prog_->MutableBlock(block_id));
+          VLOG(10) << "Set block attr " << attr_name << " id " << block_id;
+        } else {
+          // The block is not owned by the origin block's program. Copy the
+          // real block desc instead of the same-id block in this program.
+          dst_op->SetBlockAttr(attr_name, block_desc);
+          VLOG(10) << "Set block attr " << attr_name << " from attr_value";
+        }
       } else if (attr_type == proto::AttrType::BLOCKS) {
         auto old_blocks =
             PADDLE_GET_CONST(std::vector<BlockDesc *>, attr_value);
......
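
The hunk above distinguishes two cases when moving a BLOCK attribute: a block owned by the origin block's program is remapped by id into the destination program, while a block owned by some other program is kept as the real desc. A rough, self-contained Python analogue of that decision (all names here are illustrative; the authoritative logic is the C++ above):

```python
def resolve_block_attr(dst_blocks, origin_blocks, attr_block):
    """dst_blocks / origin_blocks: lists of block objects indexed by block id."""
    # Membership is tested by identity (pointer equality), mirroring std::find
    # over the recorded BlockDesc pointers.
    if any(attr_block is b for b in origin_blocks):
        # Owned by the origin program: remap by id so the moved op points at
        # the destination program's copy of that block.
        return dst_blocks[attr_block.id]
    # Owned by another program: keep the real block object rather than
    # whatever happens to live at the same id in the destination program.
    return attr_block
```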
@@ -31,7 +31,7 @@
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 
 PADDLE_DEFINE_EXPORTED_bool(new_executor_use_inplace,
-                            true,
+                            false,
                            "Use inplace in new executor");
 PADDLE_DEFINE_EXPORTED_bool(new_executor_use_local_scope,
                             true,
......
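
With inplace now applied to the Program itself (see the executor changes below), the interpreter-level default flips to off. Since it remains an exported gflag, it can presumably still be flipped back at runtime; a sketch using the public `paddle.set_flags` entry point:

```python
import paddle

# Re-enable the interpreter's own inplace strategy, which this commit turns
# off by default in favor of the program-level passes.
paddle.set_flags({'FLAGS_new_executor_use_inplace': True})
```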
@@ -75,8 +75,6 @@ class AllToAllOpGradMaker : public framework::SingleGradOpMaker<T> {
   }
 };
 
-DECLARE_INPLACE_OP_INFERER(AllToAllInplaceInferer, {"X", "Out"});
-
 }  // namespace operators
 }  // namespace paddle
@@ -87,8 +85,7 @@ REGISTER_OPERATOR(alltoall,
                   ops::AllToAllOp,
                   ops::AllToAllOpMaker,
                   ops::AllToAllOpGradMaker<paddle::framework::OpDesc>,
-                  ops::AllToAllOpGradMaker<paddle::imperative::OpBase>,
-                  ops::AllToAllInplaceInferer)
+                  ops::AllToAllOpGradMaker<paddle::imperative::OpBase>)
 
 REGISTER_OP_CPU_KERNEL(alltoall,
                        ops::AllToAllOpCPUKernel<float>,
......
@@ -24,7 +24,7 @@ from .wrapped_decorator import signature_safe_contextmanager
 import six
 from .data_feeder import convert_dtype
 from .framework import Program, default_main_program, Variable, Operator
-from .framework import convert_np_dtype_to_dtype_
+from .framework import convert_np_dtype_to_dtype_, _apply_pass
 from . import core
 from . import unique_name
@@ -1468,6 +1468,23 @@ class Executor(object):
             assert isinstance(program, Program)
             return True
 
+        def _apply_inplace_addto_pass(program, enable_inplace, enable_addto,
+                                      skip_var_names):
+            use_cuda = True if core.is_compiled_with_cuda() else False
+
+            attrs = {"use_cuda": use_cuda, "mem_opt_skip_vars": skip_var_names}
+            attr_types = {"use_cuda": "bool", "mem_opt_skip_vars": "list[str]"}
+
+            empty_startup_program = Program()
+            if enable_inplace:
+                pass_name = "buffer_shared_inplace_pass"
+                _apply_pass(program, empty_startup_program, pass_name, attrs,
+                            attr_types)
+            if enable_addto and use_cuda:
+                pass_name = "inplace_addto_op_pass"
+                _apply_pass(program, empty_startup_program, pass_name, attrs,
+                            attr_types)
+
         # NOTE: This is an experimental feature. If `export FLAGS_USE_STANDALONE_EXECUTOR=1`,
         # use StandaloneExecutor to run the program.
         if return_merged and self._enable_interpreter_core and _can_use_interpreter_core(
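
Downstream (next hunks), `enable_inplace` and `enable_addto` are read off the CompiledProgram's BuildStrategy, so this is roughly how user code reaches the two passes; a sketch, with the program construction elided:

```python
import paddle

paddle.enable_static()

build_strategy = paddle.static.BuildStrategy()
build_strategy.enable_inplace = True   # -> buffer_shared_inplace_pass
build_strategy.enable_addto = True     # -> inplace_addto_op_pass (CUDA builds only)

main_prog = paddle.static.default_main_program()
compiled = paddle.static.CompiledProgram(main_prog,
                                         build_strategy=build_strategy)
```

A plain (non-compiled) Program takes the `build_strategy = None` branch below, which leaves inplace on and addto off.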
@@ -1494,6 +1511,7 @@ class Executor(object):
             if key not in self._executor_cache._cached_executors:
                 # To apply IR pass, compile the Program to IrGraph and convert it back to Program
                 if isinstance(program, compiler.CompiledProgram):
+                    build_strategy = program._build_strategy
                     # print(f"Program before convert:\n {inner_program}", flush=True)
                     program._compile(scope, self.place)
                     ir_graph = framework.IrGraph(program._graph)
@@ -1503,6 +1521,7 @@ class Executor(object):
                         "FLAGS_USE_STANDALONE_EXECUTOR and FLAGS_CONVERT_GRAPH_TO_PROGRAM is set to 1. Graph will be converted to Program and executed using new executor."
                     )
                 else:
+                    build_strategy = None
                     from paddle.incubate.autograd import prim_enabled, prim2orig
                     if prim_enabled() and program == default_main_program():
                         prim2orig()
@@ -1515,6 +1534,17 @@ class Executor(object):
                         fetch_var_name=fetch_var_name,
                         use_fetch_v2=True)
 
+                # standalone executor will apply buffer_shared_inplace_pass and
+                # inplace_addto_op_pass to the program according to build_strategy
+                enable_inplace = True if build_strategy is None or build_strategy.enable_inplace else False
+                enable_addto = True if build_strategy is not None and build_strategy.enable_addto else False
+                if enable_inplace or enable_addto:
+                    # the inplace passes should skip feed and fetch vars
+                    skip_var_names = eval(
+                        _get_program_cache_key(feed, fetch_list))
+                    _apply_inplace_addto_pass(program, enable_inplace,
+                                              enable_addto, skip_var_names)
+
                 new_program = program.clone()
                 new_exe = _StandaloneExecutor(self.place, new_program,
                                               scope)
......
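
The skip list handed to the passes as `mem_opt_skip_vars` comes from `eval`-ing the program cache key, which is the stringified list of feed names plus fetch targets; roughly equivalent to the following sketch (not the actual helper):

```python
# Feed vars are written by the user and fetch vars are read back, so neither
# may have its buffer reused in place.
feed = {'x': None, 'label': None}     # placeholder feed dict
fetch_list = ['mean_0.tmp_0']         # placeholder fetch target names
skip_var_names = list(feed.keys()) + list(fetch_list)
# -> ['x', 'label', 'mean_0.tmp_0'], passed as mem_opt_skip_vars
```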