diff --git a/paddle/fluid/framework/block_desc.cc b/paddle/fluid/framework/block_desc.cc
index e971ebd396fab8fec4f70c59b8c2a8d7425d5d06..f8fe099255df820582975922adfd5f4f1f1a2e6b 100644
--- a/paddle/fluid/framework/block_desc.cc
+++ b/paddle/fluid/framework/block_desc.cc
@@ -311,9 +311,28 @@ void BlockDesc::MoveFrom(BlockDesc *block) {
           attr_type == proto::AttrType::VARS) {
         dst_op->UpdateVarAttr(attr_name, attr_value);
       } else if (attr_type == proto::AttrType::BLOCK) {
-        auto block_id = PADDLE_GET_CONST(BlockDesc *, attr_value)->ID();
-        dst_op->SetBlockAttr(attr_name, prog_->MutableBlock(block_id));
-        VLOG(10) << "Set block attr " << attr_name << " id " << block_id;
+        ProgramDesc *program = block->Program();
+        std::vector<framework::BlockDesc *> old_block_desc;
+        for (int i = 0; i < program->Proto()->blocks_size(); ++i) {
+          // record all block desc's ptr from origin block's program
+          old_block_desc.emplace_back(program->MutableBlock(i));
+        }
+        framework::BlockDesc *block_desc =
+            PADDLE_GET_CONST(BlockDesc *, attr_value);
+        if (std::find(old_block_desc.begin(),
+                      old_block_desc.end(),
+                      block_desc) != old_block_desc.end()) {
+          // The block is owned by the origin block's program. Just use id to
+          // get the corresponding block.
+          auto block_id = block_desc->ID();
+          dst_op->SetBlockAttr(attr_name, prog_->MutableBlock(block_id));
+          VLOG(10) << "Set block attr " << attr_name << " id " << block_id;
+        } else {
+          // The block is not owned by the origin block's program. Should copy
+          // the real block desc instead of logical block in the program.
+          dst_op->SetBlockAttr(attr_name, block_desc);
+          VLOG(10) << "Set block attr " << attr_name << " from attr_value";
+        }
       } else if (attr_type == proto::AttrType::BLOCKS) {
         auto old_blocks = PADDLE_GET_CONST(std::vector<BlockDesc *>, attr_value);
diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc
index 4cd0a2c9e1e5bd1451982eab415969673b64a37d..6b6eb3f8d674b3928b17b625a2c5b0767d6fa0b5 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore.cc
@@ -31,7 +31,7 @@
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 
 PADDLE_DEFINE_EXPORTED_bool(new_executor_use_inplace,
-                            true,
+                            false,
                             "Use inplace in new executor");
 PADDLE_DEFINE_EXPORTED_bool(new_executor_use_local_scope,
                             true,
diff --git a/paddle/fluid/operators/collective/alltoall_op.cc b/paddle/fluid/operators/collective/alltoall_op.cc
index e476d956a459cfa9d67c3ac6b65fa23b27a28a25..fa7476e07cc11535f4477771d027f3fc478eb24f 100644
--- a/paddle/fluid/operators/collective/alltoall_op.cc
+++ b/paddle/fluid/operators/collective/alltoall_op.cc
@@ -75,8 +75,6 @@ class AllToAllOpGradMaker : public framework::SingleGradOpMaker<T> {
   }
 };
 
-DECLARE_INPLACE_OP_INFERER(AllToAllInplaceInferer, {"X", "Out"});
-
 }  // namespace operators
 }  // namespace paddle
 
@@ -87,8 +85,7 @@ REGISTER_OPERATOR(alltoall,
                   ops::AllToAllOp,
                   ops::AllToAllOpMaker,
                   ops::AllToAllOpGradMaker<paddle::framework::OpDesc>,
-                  ops::AllToAllOpGradMaker<paddle::imperative::OpBase>,
-                  ops::AllToAllInplaceInferer)
+                  ops::AllToAllOpGradMaker<paddle::imperative::OpBase>)
 
 REGISTER_OP_CPU_KERNEL(alltoall,
                        ops::AllToAllOpCPUKernel<float>,
diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py
index b8f6e2eceb5c1592c6daa005b818f7be1ae8f861..218ee94a22beaa75f9f062ee515aeaf69657ae62 100755
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -24,7 +24,7 @@ from .wrapped_decorator import signature_safe_contextmanager
 import six
 from .data_feeder import convert_dtype
 from .framework import Program, default_main_program, Variable, Operator
-from .framework import convert_np_dtype_to_dtype_
+from .framework import convert_np_dtype_to_dtype_, _apply_pass
 
 from . import core
 from . import unique_name
@@ -1468,6 +1468,23 @@ class Executor(object):
             assert isinstance(program, Program)
             return True
 
+        def _apply_inplace_addto_pass(program, enable_inplace, enable_addto,
+                                      skip_var_names):
+            use_cuda = True if core.is_compiled_with_cuda() else False
+
+            attrs = {"use_cuda": use_cuda, "mem_opt_skip_vars": skip_var_names}
+            attr_types = {"use_cuda": "bool", "mem_opt_skip_vars": "list[str]"}
+
+            empty_startup_program = Program()
+            if enable_inplace:
+                pass_name = "buffer_shared_inplace_pass"
+                _apply_pass(program, empty_startup_program, pass_name, attrs,
+                            attr_types)
+            if enable_addto and use_cuda:
+                pass_name = "inplace_addto_op_pass"
+                _apply_pass(program, empty_startup_program, pass_name, attrs,
+                            attr_types)
+
         # NOTE: This is an experimental feature. If `export FLAGS_USE_STANDALONE_EXECUTOR=1 `,
         # use StandaloneExecutor to run the program.
         if return_merged and self._enable_interpreter_core and _can_use_interpreter_core(
@@ -1494,6 +1511,7 @@ class Executor(object):
             if key not in self._executor_cache._cached_executors:
                 # To apply IR pass, compile the Program to IrGraph and convert it back to Program
                 if isinstance(program, compiler.CompiledProgram):
+                    build_strategy = program._build_strategy
                     # print(f"Program before convert:\n {inner_program}", flush=True)
                     program._compile(scope, self.place)
                     ir_graph = framework.IrGraph(program._graph)
@@ -1503,6 +1521,7 @@ class Executor(object):
                         "FLAGS_USE_STANDALONE_EXECUTOR and FLAGS_CONVERT_GRAPH_TO_PROGRAM is set to 1. Graph will be converted to Program and executed using new executor."
                     )
                 else:
+                    build_strategy = None
                     from paddle.incubate.autograd import prim_enabled, prim2orig
                     if prim_enabled() and program == default_main_program():
                         prim2orig()
@@ -1515,6 +1534,17 @@ class Executor(object):
                         fetch_var_name=fetch_var_name,
                         use_fetch_v2=True)
 
+                # standalone executor will apply buffer_shared_inplace_pass and
+                # inplace_addto_op_pass to program according to build_strategy
+                enable_inplace = True if build_strategy is None or build_strategy.enable_inplace else False
+                enable_addto = True if build_strategy is not None and build_strategy.enable_addto else False
+                if enable_inplace or enable_addto:
+                    # inplace should skip feed and fetch var
+                    skip_var_names = eval(
+                        _get_program_cache_key(feed, fetch_list))
+                    _apply_inplace_addto_pass(program, enable_inplace,
+                                              enable_addto, skip_var_names)
+
                 new_program = program.clone()
                 new_exe = _StandaloneExecutor(self.place, new_program,
                                               scope)
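
Note: the `skip_var_names = eval(_get_program_cache_key(feed, fetch_list))` line above relies on the cache key being the `str()` of the combined feed/fetch variable-name list, so `eval` turns it back into a Python list. A minimal sketch of that assumed round trip (the exact key format is internal to `executor.py`):

    # Assumed: _get_program_cache_key returns str(feed_var_names + fetch_var_names),
    # so eval() recovers the list of variables the inplace passes must skip.
    feed = {"x": None, "y": None}      # stand-in feed dict
    fetch_var_names = ["out"]          # stand-in fetch-variable names
    cache_key = str(list(feed.keys()) + fetch_var_names)  # "['x', 'y', 'out']"
    skip_var_names = eval(cache_key)
    assert skip_var_names == ["x", "y", "out"]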
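Note: with this change the standalone executor applies `buffer_shared_inplace_pass` and `inplace_addto_op_pass` itself, driven by the `BuildStrategy` of a `CompiledProgram` (inplace is on by default when no strategy is given; addto additionally requires CUDA). A minimal usage sketch, assuming the public `paddle.static` API; the network and variable names are illustrative placeholders:

    import os
    os.environ["FLAGS_USE_STANDALONE_EXECUTOR"] = "1"  # opt in to the new executor

    import numpy as np
    import paddle

    paddle.enable_static()
    main_prog, startup_prog = paddle.static.Program(), paddle.static.Program()
    with paddle.static.program_guard(main_prog, startup_prog):
        x = paddle.static.data(name="x", shape=[None, 8], dtype="float32")
        out = paddle.static.nn.fc(x, size=4)

    build_strategy = paddle.static.BuildStrategy()
    build_strategy.enable_inplace = True   # -> buffer_shared_inplace_pass
    build_strategy.enable_addto = False    # -> inplace_addto_op_pass (CUDA only)
    compiled = paddle.static.CompiledProgram(main_prog, build_strategy=build_strategy)

    exe = paddle.static.Executor(paddle.CPUPlace())
    exe.run(startup_prog)
    res = exe.run(compiled,
                  feed={"x": np.random.rand(2, 8).astype("float32")},
                  fetch_list=[out])

Because the passes now run on the `Program` before it is handed to `_StandaloneExecutor`, the in-executor inplace path is no longer needed, which is why `FLAGS_new_executor_use_inplace` flips to `false` above.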