diff --git a/paddle/fluid/framework/new_executor/interpreter/CMakeLists.txt b/paddle/fluid/framework/new_executor/interpreter/CMakeLists.txt index bbd2e93184315922a4f75c06d7eeeaf8fd7f15a8..3885c29c6a909d23ea292e6f9ce1ca7091fdeb78 100644 --- a/paddle/fluid/framework/new_executor/interpreter/CMakeLists.txt +++ b/paddle/fluid/framework/new_executor/interpreter/CMakeLists.txt @@ -1,9 +1,12 @@ set(INTERPRETER_SRCS data_transfer.cc dependency_builder.cc execution_config.cc - interpreter_util.cc stream_analyzer.cc) + interpreter_util.cc static_build.cc stream_analyzer.cc) set(INTERPRETER_DEPS + buffered_reader device_context + global_utils op_registry + phi_tensor_utils scope framework_proto data_feed_proto diff --git a/paddle/fluid/framework/new_executor/interpreter/data_transfer.cc b/paddle/fluid/framework/new_executor/interpreter/data_transfer.cc index 7024a57bb84b12e5210bfe694eee75a734850449..1b208b1967018a93f4b7649a836124f83da81295 100644 --- a/paddle/fluid/framework/new_executor/interpreter/data_transfer.cc +++ b/paddle/fluid/framework/new_executor/interpreter/data_transfer.cc @@ -17,6 +17,7 @@ #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/data_transform.h" #include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h" +#include "paddle/fluid/framework/new_executor/interpreter/static_build.h" #include "paddle/phi/core/kernel_context.h" #include "paddle/phi/core/kernel_factory.h" @@ -37,7 +38,7 @@ bool DataTranferHelper::apply(const phi::KernelKey& kernel_type_for_var, std::vector* op_func_nodes, bool use_local_scope, bool is_fetch_v2, - bool skip_run) { + bool static_build) { bool is_transferred = false; auto* src_var_name = &var_name; @@ -52,7 +53,7 @@ bool DataTranferHelper::apply(const phi::KernelKey& kernel_type_for_var, is_fetch_v2); if (op) { RunAndConstructOpFuncNode( - op, *src_var_name, *new_var_name, op_func_nodes, skip_run); + op, *src_var_name, *new_var_name, op_func_nodes, static_build); } // update src_var_name src_var_name = new_var_name; @@ -70,7 +71,7 @@ bool DataTranferHelper::apply(const phi::KernelKey& kernel_type_for_var, scope_); if (op) { RunAndConstructOpFuncNode( - op, *src_var_name, *new_var_name, op_func_nodes, skip_run); + op, *src_var_name, *new_var_name, op_func_nodes, static_build); } // update src_var_name src_var_name = new_var_name; @@ -87,7 +88,7 @@ bool DataTranferHelper::apply(const phi::KernelKey& kernel_type_for_var, *src_var_name, new_var_name, src_place, dst_place, var_scope_, scope_); if (op) { RunAndConstructOpFuncNode( - op, *src_var_name, *new_var_name, op_func_nodes, skip_run); + op, *src_var_name, *new_var_name, op_func_nodes, static_build); } is_transferred = true; } @@ -98,7 +99,7 @@ void DataTranferHelper::RunAndConstructShareNode( const std::string& src_var_name, const std::string& dst_var_name, std::vector* op_func_nodes, - bool skip_run) { + bool static_build) { VariableNameMap in_name_map = {{"X", {src_var_name}}}; VariableNameMap out_name_map = {{"Out", {dst_var_name}}}; AttributeMap attr_map; @@ -112,7 +113,7 @@ void DataTranferHelper::RunAndConstructShareNode( "Insert %s with %s -> %s.", op_type, src_var_name, dst_var_name); RunAndConstructOpFuncNode( - op, src_var_name, dst_var_name, op_func_nodes, skip_run); + op, src_var_name, dst_var_name, op_func_nodes, static_build); } void DataTranferHelper::RunAndConstructOpFuncNode( @@ -120,15 +121,18 @@ void DataTranferHelper::RunAndConstructOpFuncNode( const std::string& var_name, const std::string& new_var_name, std::vector* 
new_op_func_nodes, - bool skip_run) { + bool static_build) { auto& op_type = op->Type(); // 1. Construct RuntimeContext RuntimeContext runtime_context({}, {}); runtime_context.inputs["X"] = {scope_->FindVar(var_name)}; runtime_context.outputs["Out"] = {scope_->Var(new_var_name)}; - RuntimeInferShapeContext infer_shape_ctx(*op, runtime_context); - op.get()->Info().infer_shape_(&infer_shape_ctx); + + if (!static_build) { + RuntimeInferShapeContext infer_shape_ctx(*op, runtime_context); + op->Info().infer_shape_(&infer_shape_ctx); + } // 2. choose kernel @@ -203,8 +207,9 @@ void DataTranferHelper::RunAndConstructOpFuncNode( } else { new_op_func_node.phi_kernel_ = op_with_kernel->PhiKernel(); - if (skip_run) { + if (static_build) { FakeInitializeOutputsForFunctionKernel( + *op, *(new_op_func_node.phi_kernel_), *(op_with_kernel->PhiKernelSignature()), runtime_context, @@ -449,7 +454,7 @@ void ApplyDataTransform(const OpKernelType& expected_kernel_key, OpFuncNode* op_func_node, std::vector* new_op_func_nodes, bool use_local_scope, - bool skip_run) { + bool static_build) { Scope* local_scope = use_local_scope ? var_scope->GetMutableLocalScope() : var_scope->GetMutableScope(); @@ -546,7 +551,11 @@ void ApplyDataTransform(const OpKernelType& expected_kernel_key, op_base->Type() == "fetch_v2"); if (op) { data_transfer_helper.RunAndConstructOpFuncNode( - op, var_name, new_var_name, new_op_func_nodes, skip_run); + op, + var_name, + new_var_name, + new_op_func_nodes, + static_build); } is_transferred = true; } else { @@ -611,7 +620,7 @@ void ApplyDataTransform(const OpKernelType& expected_kernel_key, new_op_func_nodes, use_local_scope, op_base->Type() == "fetch_v2", - skip_run); + static_build); } if (is_transferred) { @@ -741,7 +750,7 @@ void HandleComplexGradToRealGrad(const OpFuncNode& op_func_node, VariableScope* var_scope, std::vector* op_func_nodes, framework::Scope* local_scope, - bool skip_run) { + bool static_build) { DataTranferHelper data_transfer_helper(place, var_scope, local_scope); for (auto& var_name_item : out_names) { std::vector& vars = out_vars->at(var_name_item.first); @@ -817,9 +826,9 @@ void HandleComplexGradToRealGrad(const OpFuncNode& op_func_node, auto op = TransferDtype( var_name, &new_var_name, src_type, dst_type, var_scope, local_scope); data_transfer_helper.RunAndConstructOpFuncNode( - op, var_name, new_var_name, op_func_nodes, skip_run); + op, var_name, new_var_name, op_func_nodes, static_build); data_transfer_helper.RunAndConstructShareNode( - new_var_name, var_name, op_func_nodes, skip_run); + new_var_name, var_name, op_func_nodes, static_build); } } } diff --git a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc index babac78146f406a7911656dbefc35549313b3a31..6709ad8978b9d5fc061e38e219613a471add9a39 100644 --- a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc +++ b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc @@ -61,10 +61,9 @@ const std::string StringizeDownstreamMap( const std::map>& DependencyBuilder::Build( const std::vector& instructions) { - PADDLE_ENFORCE_EQ( - is_build_, - false, - phi::errors::AlreadyExists("The op dependency has been built")); + if (is_build_) { + return op_downstream_map_; + } instructions_ = &instructions; op_num_ = instructions_->size(); diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc index 
29626988132f929be40dec4b9ea0c4ebe876138b..0a93659cf9c16e49b8601f1810e7120f496c327d 100644 --- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc +++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc @@ -21,6 +21,7 @@ #include "paddle/fluid/framework/executor_gc_helper.h" #include "paddle/fluid/framework/new_executor/interpreter/data_transfer.h" #include "paddle/fluid/framework/new_executor/interpreter/execution_config.h" +#include "paddle/fluid/framework/new_executor/interpreter/static_build.h" #include "paddle/fluid/memory/stats.h" #include "paddle/fluid/operators/controlflow/conditional_block_op_helper.h" #include "paddle/fluid/operators/controlflow/recurrent_op_helper.h" @@ -48,34 +49,6 @@ namespace interpreter { using VariableIdMap = std::map>; -// These Op needs set output dtype when register phi kernel, but they didn't -static std::set OpsNeedSetOutputDtypeWhenRegisterPhiKernel = { - "abs", - "adam", - "adamw", - "any_raw", - "eig_grad", - "eigh", - "lamb", - "layer_norm", - "layer_norm_grad", - "less_equal", - "less_than", - "merged_adam", - "sync_batch_norm_grad", - "unique", - "unique_consecutive_flattened_tensor", - "unique_raw"}; - -// These Ops can use InferMeta to infer the output dtype -static std::set OpsWithAvailablePhiInferMeta = { - "abs", "adam", "adamw", "layer_norm", "layer_norm_grad", "merged_adam"}; - -// Cannot static analysis these Ops' output dtype or backend because their -// kernels have not moved to PHI yet. -static std::set OpsWithFluidKernelNeedMoveToPhi = { - "fused_batch_norm_act", "fused_batch_norm_act_grad"}; - // NOTE(Ruibiao): SingleStreamGuard make some multi-strem op (i.e., // c_allreduce_sum) run in single stream. It is dedicated to BuildOpFuncList // which run kernel without stream synchronization. 
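Note: the static_build.h include added above reflects the two-phase pattern this patch builds on: BuildOpFuncList first constructs every OpFuncNode while only fake-allocating kernel outputs (no kernel executes), and the real kernels run afterwards against outputs whose place, dtype and layout were fixed up front. A minimal standalone sketch of that pattern follows; Op, Output, FakeAlloc and RealRun are illustrative names, not Paddle APIs.

#include <iostream>
#include <string>
#include <vector>

struct Output { std::string dtype = "undefined"; bool allocated = false; };
struct Op { std::string name; Output out; };

// Phase 1: decide output metadata and reserve storage without running kernels.
void FakeAlloc(Op& op) {
  op.out.dtype = "float32";   // dtype decided statically (e.g. via InferMeta)
  op.out.allocated = true;    // zero-size "fake" allocation
}

// Phase 2: execute kernels; outputs already carry the right metadata.
void RealRun(const Op& op) {
  std::cout << op.name << " runs with preallocated " << op.out.dtype << " output\n";
}

int main() {
  std::vector<Op> block = {{"matmul"}, {"relu"}};
  for (auto& op : block) FakeAlloc(op);  // static build pass
  for (auto& op : block) RealRun(op);    // execution pass
}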
@@ -145,48 +118,6 @@ void AsyncWorkQueue::AddTask(const OpFuncType& op_func_type, queue_group_->AddTask(op_func_type == OpFuncType::kGpuAsync, std::move(fn)); } -bool BlockCanBeStaticBuilt(const framework::BlockDesc& block) { - // has_fluid_kernel = (kernelCode >> 3) & 1 - // has_structed_kernel = (kernelCode >> 2) & 1 - // need_move_to_phi = (kernelCode >> 1) & 1 - // need_set_dtype = KernelCode & 1 - using KernelCode = int8_t; - std::set> invalid_ops; - for (auto& op : block.AllOps()) { - auto op_type = op->Type(); - bool has_fluid_kernel = OperatorWithKernel::AllOpKernels().count(op_type); - bool has_structured_kernel = - phi::KernelFactory::Instance().HasStructuredKernel(op_type); - bool need_move_to_phi = (has_fluid_kernel || has_structured_kernel) && - OpsWithFluidKernelNeedMoveToPhi.count(op_type); - bool need_set_dtype = - !has_fluid_kernel && !has_structured_kernel && - OpsNeedSetOutputDtypeWhenRegisterPhiKernel.count(op_type) && - !OpsWithAvailablePhiInferMeta.count(op_type); - - KernelCode kernel_code = (has_fluid_kernel << 3) + - (has_structured_kernel << 2) + - (need_move_to_phi << 1) + need_set_dtype; - if (need_move_to_phi || need_set_dtype) { - invalid_ops.insert(std::make_pair(op_type, kernel_code)); - } - } - - if (!invalid_ops.empty()) { - std::stringstream ss; - ss << "The following OPs are unable to static build:\n"; - for (auto& item : invalid_ops) { - ss << item.first << " [has_fluid_kernel = " << (item.second >> 3 & 1) - << ", has_structed_kerenl = " << (item.second >> 2 & 1) - << ", need_move_to_phi = " << (item.second >> 1 & 1) - << ", need_set_dtype = " << (item.second & 1) << "]\n"; - } - VLOG(0) << ss.str(); - } - - return invalid_ops.empty(); -} - bool IsCommunicationOp(const std::string& op_name) { const std::set special_comm_op_set = { "send", @@ -492,17 +423,25 @@ void ApplyDeviceGuard(const OperatorBase* op_base, } void HandleOperatorBase(const platform::Place& place, - const VariableScope* var_scope, - std::shared_ptr op_base, + std::shared_ptr op, OpFuncNode* op_func_node, - Scope* local_scope) { + Scope* scope, + bool static_build) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto* dev_ctx = pool.Get(place); // input, output is prepared. set the other attributes. - op_func_node->operator_base_ = op_base; + op_func_node->operator_base_ = op; op_func_node->type_ = AnalyseOpFuncType(*op_func_node, place); op_func_node->kernel_func_ = nullptr; - op_base->Run(*local_scope, place); // Run without data transformer. + if (static_build) { + if (OperatorBasesMustRunInStaticBuild.count(op->Type())) { + op->Run(*scope, place); + } + FakeInitializeOutputsForOperatorBase(*op, place, scope); + } else { + op->Run(*scope, place); // Run without data transformer. + } + op_func_node->dev_ctx_ = dev_ctx; } @@ -636,7 +575,7 @@ void BuildOpFuncList(const platform::Place& place, VLOG(4) << "HandleOperatorBase"; // op is not a operatorwithkernel, so direcly run OperatorBase::Run() HandleOperatorBase( - place, var_scope, ops[i], &op_func_node, local_scope); + place, ops[i], &op_func_node, local_scope, static_build); vec_func_list->emplace_back(op_func_node); } else { VLOG(4) << "OP is not null"; @@ -754,15 +693,18 @@ void BuildOpFuncList(const platform::Place& place, use_local_scope, static_build); VLOG(4) << "apply data transform done. "; - // step 4. infershape, see OperatorWithKernel::RunImpl in operator.cc - // for why. 
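Note: the BlockCanBeStaticBuilt removed here (and re-added in static_build.cc with more checks) reports why an op blocks static build by packing boolean flags into a small integer, the KernelCode. A compilable sketch of that encode/decode idea with just two of the flags; the full patch uses an eight-bit layout.

#include <cstdint>
#include <iostream>

int main() {
  bool has_fluid_kernel = true;
  bool need_set_dtype = false;

  // pack: bit 1 = has_fluid_kernel, bit 0 = need_set_dtype
  std::int8_t kernel_code = (has_fluid_kernel << 1) + need_set_dtype;

  // unpack for the diagnostic message, mirroring "(item.second >> k) & 1"
  std::cout << "has_fluid_kernel = " << ((kernel_code >> 1) & 1)
            << ", need_set_dtype = " << (kernel_code & 1) << "\n";
}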
- if (!(op->HasAttr(kAllKernelsMustComputeRuntimeShape) && - op->Attr(kAllKernelsMustComputeRuntimeShape))) { + + // step 4. infershape + if (!static_build) { VLOG(4) << "infer shape"; - RuntimeInferShapeContext infer_shape_ctx(*op, runtime_context); - // TODO(Aurelius84): In case of control flow ops, they are NOT - // inheritted from OperatorWithKernel. - op_with_kernel->Info().infer_shape_(&infer_shape_ctx); + // see kAllKernelsMustComputeRuntimeShape in operator.h for why + if (!(op->HasAttr(kAllKernelsMustComputeRuntimeShape) && + op->Attr(kAllKernelsMustComputeRuntimeShape))) { + RuntimeInferShapeContext infer_shape_ctx(*op, runtime_context); + // TODO(Aurelius84): In case of control flow ops, they are NOT + // inheritted from OperatorWithKernel. + op_with_kernel->Info().infer_shape_(&infer_shape_ctx); + } } // step 5. run kernel @@ -772,6 +714,7 @@ void BuildOpFuncList(const platform::Place& place, VLOG(6) << op_type << " run function kernel"; if (static_build) { FakeInitializeOutputsForFunctionKernel( + *op, *(op_func_node.phi_kernel_), *(op_with_kernel->PhiKernelSignature()), runtime_context, @@ -826,7 +769,27 @@ void BuildOpFuncList(const platform::Place& place, auto* original_tensor = GetMutableLoDTensorOrSelectedRowsValueFromVar( local_scope->FindVar(var_scope->GetNameById(p.second))); - original_tensor->ShareDataWith(*transformed_tensor); + + // avoid overwriting valid data + if (static_build && original_tensor->initialized()) { + const phi::Place& target_place = transformed_tensor->place(); + platform::DeviceContext* dev_ctx_for_copy; + if (target_place.GetType() != AllocationType::CPU) { + dev_ctx_for_copy = pool.Get(target_place); + } else { + dev_ctx_for_copy = pool.Get(original_tensor->place()); + } + + phi::Copy(*dev_ctx_for_copy, + *original_tensor, + target_place, + /*blocking=*/true, + original_tensor); + original_tensor->set_type(transformed_tensor->dtype()); + original_tensor->set_layout(transformed_tensor->layout()); + } else { + original_tensor->ShareDataWith(*transformed_tensor); + } VLOG(4) << "Transfer inplace variable back form " << var_scope->GetNameById(p.first) << " to " << var_scope->GetNameById(p.second); @@ -866,32 +829,35 @@ void BuildOpFuncList(const platform::Place& place, VLOG(4) << "End run " << place << " " << op_func_node.operator_base_->DebugStringEx(local_scope); - // gc--------------------------------------------- - auto iter = unused_var_map.find(op); - if (iter == unused_var_map.end()) { - interpreter::LogDeviceMemoryStats(place); - continue; - } - - auto& delete_vars = iter->second; - std::deque>* garbages = - new std::deque>(); - - for (auto& var_name : delete_vars) { - auto* var = local_scope->FindVar(var_name); - if (var == nullptr || skip_gc_vars.find(var_name) != skip_gc_vars.end()) { + if (!static_build) { + // gc--------------------------------------------- + auto iter = unused_var_map.find(op); + if (iter == unused_var_map.end()) { + interpreter::LogDeviceMemoryStats(place); continue; } - VLOG(6) << "Erase variable " << var_name; - if (var->IsType()) { - garbages->emplace_back( - var->GetMutable()->MoveMemoryHolder()); + auto& delete_vars = iter->second; + std::deque>* garbages = + new std::deque>(); + + for (auto& var_name : delete_vars) { + auto* var = local_scope->FindVar(var_name); + if (var == nullptr || + skip_gc_vars.find(var_name) != skip_gc_vars.end()) { + continue; + } + + VLOG(6) << "Erase variable " << var_name; + if (var->IsType()) { + garbages->emplace_back( + var->GetMutable()->MoveMemoryHolder()); + } } - } - delete 
garbages; // free mem + delete garbages; // free mem - interpreter::LogDeviceMemoryStats(place); + interpreter::LogDeviceMemoryStats(place); + } } } @@ -942,160 +908,6 @@ void BuildVariableScope(const framework::BlockDesc& block, } } -phi::TensorBase* GetTensorFormVar(framework::Variable* var) { - if (var) { - if (var->template IsType()) { - return var->template GetMutable(); - } else if (var->template IsType()) { - return var->template GetMutable(); - } else if (var->template IsType()) { - return var->template GetMutable(); - } else if (var->template IsType()) { - return var->template GetMutable(); - } else if (var->template IsType()) { - return var->template GetMutable(); - } else if (var->template IsType()) { - return var->template GetMutable(); - } else if (!var->IsInitialized()) { - // The following is for RAW type of var - return var->template GetMutable(); - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported `%s` type when get tensor.", - framework::ToTypeName(var->Type()))); - } - } else { - VLOG(4) << "Var is nullptr"; - return nullptr; - } -} - -void FakeInitializeTensor(const platform::DeviceContext& dev_ctx, - const phi::DataType& dtype, - const phi::Place& place, - phi::TensorBase* tensor) { - PADDLE_ENFORCE_NOT_NULL( - tensor, - phi::errors::InvalidArgument( - "The tensor to fake intialize should not be null.")); - if (place == phi::CPUPlace()) { - dev_ctx.HostAlloc(tensor, - dtype, - /*requested_size=*/0, - /*fake_alloc=*/true); - } else { - PADDLE_ENFORCE_EQ( - place, - dev_ctx.GetPlace(), - phi::errors::Unavailable("The place %s for fack alloc is not equal to " - "the place %s of DeviceContext.", - place, - dev_ctx.GetPlace())); - dev_ctx.Alloc(tensor, - dtype, - /*requested_size=*/0, - /*pinned=*/false, - /*fake_alloc=*/true); - } -} - -void FakeInitializeOutputsForFunctionKernel( - const phi::Kernel& phi_kernel, - const phi::KernelSignature& kernel_sig, - const RuntimeContext& ctx, - const platform::DeviceContext& dev_ctx) { - std::string op_name = std::string(kernel_sig.name); - if (OpsNeedSetOutputDtypeWhenRegisterPhiKernel.count(op_name)) { - PADDLE_ENFORCE_GT( - OpsWithAvailablePhiInferMeta.count(op_name), - 0, - phi::errors::Unavailable( - "Cannot static build for op %s because it did not set output dtype " - "in phi kernel register. Please set its output dtype and remove it " - "from OpsNeedSetOutputDtypeWhenRegisterPhiKernel set, or add it to " - " OpsWithAvailablePhiInferMeta set if its InferMeta is available.", - op_name)); - } - - auto output_names = kernel_sig.output_names; - auto output_defs = phi_kernel.args_def().output_defs(); - PADDLE_ENFORCE_EQ(output_names.size(), - output_defs.size(), - platform::errors::InvalidArgument( - "The size of outputs_args names (%d) must be equal to " - "the size of kernel output_defs (%d).", - output_names.size(), - output_defs.size())); - - size_t start_idx = 0; - for (size_t i = 0; i < output_names.size(); ++i) { - auto it = ctx.outputs.find(output_names[i]); - - // Deal with the case that some outputs are not found or be NULL when run - // the kernel. For example : the outputs of matmul_grad are dx and dy, - // sometimes dx or dy may be NULL. 
- if (it == ctx.outputs.end() || it->second.empty()) { - VLOG(4) << "Output " << output_names[i] << " not found"; - ++start_idx; - continue; - } - - auto& outs_vector = it->second; - for (size_t offset = 0; offset < outs_vector.size(); ++offset) { - phi::TensorBase* out_tensor = GetTensorFormVar(outs_vector[offset]); - if (out_tensor && !out_tensor->initialized()) { - phi::TensorArgDef& tensor_arg_def = output_defs[start_idx + offset]; - phi::DataType dtype = tensor_arg_def.dtype; - phi::Place place = tensor_arg_def.backend == phi::Backend::CUSTOM - ? dev_ctx.GetPlace() - : phi::TransToPhiPlace(tensor_arg_def.backend); - - if (dtype == DataType::UNDEFINED || - OpsNeedSetOutputDtypeWhenRegisterPhiKernel.count( - std::string(kernel_sig.name))) { - VLOG(4) << "Get dtype result from InferMeta"; - dtype = out_tensor->dtype(); // dtype from InferMeta - } - - VLOG(4) << output_names[i] << " fake alloc with type " << dtype - << " on place " << place << " " << out_tensor; - - FakeInitializeTensor(dev_ctx, dtype, place, out_tensor); - } - } - start_idx += outs_vector.size(); - } -} - -void FakeInitializeOutputsForStructureKernel( - const framework::OpKernelType& op_kernel_type, - ExecutionContext* execution_context) { - const std::string& op_type = execution_context->Type(); - if (op_type == "fetch_v2") { - return; - } - - const VariableNameMap& outputs = execution_context->GetOp().Outputs(); - for (auto& item : outputs) { - const std::string& parameter_name = item.first; - auto multi_output_var = execution_context->MultiOutputVar(parameter_name); - for (Variable* var : multi_output_var) { - phi::TensorBase* out_tensor = GetTensorFormVar(var); - if (out_tensor && !out_tensor->initialized()) { - phi::DataType dtype = - phi::TransToPhiDataType(op_kernel_type.data_type_); - phi::Place place = execution_context->GetPlace(); - - VLOG(4) << parameter_name << " fake alloc with type " << dtype - << " on place " << place << " " << out_tensor; - - FakeInitializeTensor( - execution_context->device_context(), dtype, place, out_tensor); - } - } - } -} - void LogDeviceMemoryStats(const platform::Place& place) { if (FLAGS_new_executor_log_memory_stats && platform::is_gpu_place(place)) { VLOG(0) << "memory_allocated: " diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.h b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.h index 5a74ab662b88899bd8a63f243a35d297a48d6970..f31dd7f789d3743152ddfc6c505ff458df5affd5 100644 --- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.h +++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.h @@ -65,8 +65,6 @@ class AsyncWorkQueue { std::unique_ptr queue_group_; }; -bool BlockCanBeStaticBuilt(const framework::BlockDesc& block); - bool IsCommunicationOp(const std::string& op_name); bool IsCommunicationOp(const Instruction& instr); @@ -99,16 +97,6 @@ void BuildVariableScope(const framework::BlockDesc& block, const ExecutionConfig& execution_config, VariableScope* var_scope); -void FakeInitializeOutputsForFunctionKernel( - const phi::Kernel& phi_kernel, - const phi::KernelSignature& kernel_sig, - const RuntimeContext& ctx, - const platform::DeviceContext& dev_ctx); - -void FakeInitializeOutputsForStructureKernel( - const framework::OpKernelType& op_kernel_type, - ExecutionContext* execution_context); - void LogDeviceMemoryStats(const platform::Place& place); void SetDeviceCommContext(framework::OperatorBase* operator_base, diff --git a/paddle/fluid/framework/new_executor/interpreter/static_build.cc 
b/paddle/fluid/framework/new_executor/interpreter/static_build.cc new file mode 100644 index 0000000000000000000000000000000000000000..fc63666ba720b649ec40da5b76529508388e38b4 --- /dev/null +++ b/paddle/fluid/framework/new_executor/interpreter/static_build.cc @@ -0,0 +1,533 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/new_executor/interpreter/static_build.h" + +#include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/operators/reader/buffered_reader.h" + +// These Ops is OperatorBase, but we have been handle them in static build +std::set OperatorBasesHandledInStaticBuild = {"read"}; + +std::set OperatorBasesMustRunInStaticBuild = { + "create_double_buffer_reader", "create_py_reader"}; + +std::set OpsCanSkipedFakeAllocInStaticBuild = { + "create_double_buffer_reader", "create_py_reader", "fetch_v2"}; + +// These Op needs set output dtype when register phi kernel, but they didn't +std::set OpsNeedSetOutputDtypeWhenRegisterPhiKernel = { + "eig_grad", + "eigh", + "lamb", + "sync_batch_norm_grad", + "update_loss_scaling", + "unique", + "unique_consecutive_flattened_tensor", + "unique_raw"}; + +// Cannot static analysis these Ops' output dtype or backend because their +// kernels have not moved to PHI yet. 
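Note: these sets drive the rebuilt BlockCanBeStaticBuilt below: an op only prevents static build when it is not in the skip list and falls into an unsupported category (blacklisted, an OperatorBase not explicitly handled, a custom or MKLDNN op, a kernel still in fluid, or a kernel that never sets its output dtype). A simplified standalone version of that decision, with abridged set contents and the hypothetical name OpBlocksStaticBuild:

#include <set>
#include <string>

bool OpBlocksStaticBuild(const std::string& op_type) {
  static const std::set<std::string> can_skip = {"fetch_v2", "create_py_reader"};
  static const std::set<std::string> black_list = {"run_program", "batch_norm"};
  static const std::set<std::string> need_set_dtype = {"unique", "lamb"};

  if (can_skip.count(op_type)) return false;   // never blocks static build
  return black_list.count(op_type) > 0 ||      // unsupported outright
         need_set_dtype.count(op_type) > 0;    // kernel lacks an output dtype
}

int main() { return OpBlocksStaticBuild("run_program") ? 0 : 1; }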
+std::set OpsWithFluidKernelNeedMoveToPhi = { + "cudnn_lstm", + "dequantize", + "distributed_fused_lamb", + "fused_attention", + "fused_attention_grad", + "fused_batch_norm_act", + "fused_batch_norm_act_grad", + "fusion_group", + "pow2_decay_with_linear_warmup", + "sequence_mask", + "sequence_pool", + "stft"}; + +std::set StaticBuildBlackList = { + "batch_norm" /*: to handle reserve_space output*/, + "cinn_instruction_run" /*: to handle subgraph infermeta*/, + "cinn_launch" /*: to handle subgraph infermeta*/, + "run_program" /*: to handle scope output*/, + "sparse_sparse_coo_tensor" /*: to handle sparse output*/}; + +namespace paddle { +namespace framework { +namespace interpreter { + +bool BlockCanBeStaticBuilt(const framework::BlockDesc& block) { + // in_black_list = (kernelCode >> 7) & 1 + // is_operator_base = (kernelCode >> 6) & 1 + // is_custom_op = (kernelCode >> 5) & 1 + // use_mkldnn = (kernelCode >> 4) & 1 + // has_fluid_kernel = (kernelCode >> 3) & 1 + // has_structed_kernel = (kernelCode >> 2) & 1 + // need_move_to_phi = (kernelCode >> 1) & 1 + // need_set_dtype = KernelCode & 1 + using KernelCode = int8_t; + std::set> invalid_ops; + for (auto& op : block.AllOps()) { + auto op_type = op->Type(); + const framework::OpInfo& info = OpInfoMap::Instance().Get(op_type); + auto op_base = + info.Creator()(op_type, op->Inputs(), op->Outputs(), op->GetAttrMap()); + + bool in_black_list = StaticBuildBlackList.count(op_type); + bool is_operator_base = + (dynamic_cast(op_base) == nullptr); + bool is_custom_op = + egr::Controller::Instance().GetOpMetaInfoMap().count(op_type); + bool use_mkldnn = false; + if (op->HasAttr("use_mkldnn")) { + Attribute attr = op->GetAttr("use_mkldnn"); + use_mkldnn = attr.index() == 1 ? PADDLE_GET_CONST(int, attr) + : PADDLE_GET_CONST(bool, attr); + } + bool has_fluid_kernel = OperatorWithKernel::AllOpKernels().count(op_type); + bool has_structured_kernel = + phi::KernelFactory::Instance().HasStructuredKernel(op_type); + bool need_move_to_phi = (has_fluid_kernel || has_structured_kernel) && + OpsWithFluidKernelNeedMoveToPhi.count(op_type); + bool need_set_dtype = + OpsNeedSetOutputDtypeWhenRegisterPhiKernel.count(op_type); + + KernelCode kernel_code = + (in_black_list << 7) + (is_operator_base << 6) + (is_custom_op << 5) + + (use_mkldnn << 4) + (has_fluid_kernel << 3) + + (has_structured_kernel << 2) + (need_move_to_phi << 1) + need_set_dtype; + if (!OpsCanSkipedFakeAllocInStaticBuild.count(op_type)) { + if (in_black_list || + (is_operator_base && + !OperatorBasesHandledInStaticBuild.count(op_type)) || + is_custom_op || use_mkldnn || need_move_to_phi || need_set_dtype) { + invalid_ops.insert(std::make_pair(op_type, kernel_code)); + } + } + } + + if (!invalid_ops.empty()) { + std::stringstream ss; + ss << "The following OPs are unable to static build:\n"; + for (auto& item : invalid_ops) { + ss << item.first << " [in_black_list = " << (item.second >> 7 & 1) + << ", is_operator_base = " << (item.second >> 6 & 1) + << ", is_custom_op = " << (item.second >> 5 & 1) + << ", use_mkldnn = " << (item.second >> 4 & 1) + << ", has_fluid_kernel = " << (item.second >> 3 & 1) + << ", has_structed_kerenl = " << (item.second >> 2 & 1) + << ", need_move_to_phi = " << (item.second >> 1 & 1) + << ", need_set_dtype = " << (item.second & 1) << "]\n"; + } + VLOG(1) << ss.str(); + } + + return invalid_ops.empty(); +} + +inline bool IsExtendedTensor(const phi::TensorBase& tensor) { + return framework::RawTensor::classof(&tensor) || + framework::Strings::classof(&tensor) || + 
framework::Vocab::classof(&tensor); +} + +bool TensorShouldBeFakeInitialized(const OperatorBase& op, + const std::string& parameter_name, + const phi::TensorBase* tensor) { + const std::string& op_type = op.Type(); + if (OpsCanSkipedFakeAllocInStaticBuild.count(op_type)) { + return false; + } + + if (op_type == "adam" || op_type == "adamw" || op_type == "merged_adam") { + if (op.Attr("use_global_beta_pow") && + (parameter_name == "Beta1PowOut" || parameter_name == "Beta2PowOut")) { + VLOG(2) << "Skip fake initialization for: " << parameter_name; + return false; + } + } + + if (op_type == "coalesce_tensor" && parameter_name == "Output") { + VLOG(2) << "Skip fake initialization for: " << parameter_name; + return false; + } + + if (op_type == "dgc" && parameter_name == "k") { + VLOG(2) << "Skip fake initialization for: " << parameter_name; + return false; + } + + if (op_type == "fake_quantize_range_abs_max") { + if (op.Attr("is_test") && + (parameter_name == "OutScale" || parameter_name == "OutScales")) { + VLOG(2) << "Skip fake initialization for: " << parameter_name; + return false; + } + } + + if (op_type == "segment_pool" && parameter_name == "SummedIds") { + return op.Attr("pooltype") == "MEAN" && + dynamic_cast(&op) + ->kernel_type() + ->place_ != phi::CPUPlace(); + } + + return tensor && !IsExtendedTensor(*tensor); +} + +phi::TensorBase* GetTensorFormVar(framework::Variable* var) { + if (var) { + if (var->template IsType()) { + return var->template GetMutable(); + } else if (var->template IsType()) { + return var->template GetMutable(); + } else if (var->template IsType()) { + return var->template GetMutable(); + } else if (var->template IsType()) { + return var->template GetMutable(); + } else if (var->template IsType()) { + return var->template GetMutable(); + } else if (var->template IsType()) { + return var->template GetMutable(); + } else if (!var->IsInitialized()) { + // The following is for RAW type of var + return var->template GetMutable(); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported `%s` type when get tensor.", + framework::ToTypeName(var->Type()))); + } + } else { + VLOG(4) << "Var is nullptr"; + return nullptr; + } +} + +template +void FakeInitializeTensor(const platform::DeviceContext& dev_ctx, + const phi::Place& place, + const phi::DataType& dtype, + const phi::DataLayout& layout, + TensorType* tensor) { + PADDLE_ENFORCE_NE(place.GetType(), + phi::AllocationType::UNDEFINED, + phi::errors::InvalidArgument( + "The place %s to fake intialize is not valid.", place)); + PADDLE_ENFORCE_NE(dtype, + phi::DataType::UNDEFINED, + phi::errors::InvalidArgument( + "The dtype %s to fake intialize is not valid.", dtype)); + PADDLE_ENFORCE_NE( + layout, + phi::DataLayout::UNDEFINED, + phi::errors::InvalidArgument( + "The layout %s to fake intialize is not valid.", layout)); + PADDLE_ENFORCE_NOT_NULL( + tensor, + phi::errors::InvalidArgument( + "The tensor to fake intialize should not be null.")); + + if (tensor->initialized() && place == tensor->place() && + dtype == tensor->dtype() && tensor->layout() == layout) { + return; + } + + // set place + if (tensor->initialized()) { // avoid overwriting valid data + platform::DeviceContext* dev_ctx_for_copy; + if (place.GetType() != AllocationType::CPU) { + dev_ctx_for_copy = platform::DeviceContextPool::Instance().Get(place); + } else { + dev_ctx_for_copy = + platform::DeviceContextPool::Instance().Get(tensor->place()); + } + phi::Copy(*dev_ctx_for_copy, *tensor, place, /*blocking=*/true, tensor); + } else { + if 
(place == phi::CPUPlace()) { + dev_ctx.HostAlloc(tensor, + dtype, + /*requested_size=*/0, + /*fake_alloc=*/true); + } else { + PADDLE_ENFORCE_EQ(place, + dev_ctx.GetPlace(), + phi::errors::Unavailable( + "The place %s for fack alloc is not equal to " + "the place %s of DeviceContext.", + place, + dev_ctx.GetPlace())); + dev_ctx.Alloc(tensor, + dtype, + /*requested_size=*/0, + /*pinned=*/false, + /*fake_alloc=*/true); + } + } + + // set dtype and layout + tensor->set_type(dtype); + tensor->set_layout(layout); + + VLOG(4) << "Tensor " << tensor << " fake alloc with type = " << dtype + << ", place = " << place << ", layout = " << layout; +} + +void FakeInitializeTensorBase(const platform::DeviceContext& dev_ctx, + const phi::Place& place, + const phi::DataType& dtype, + const phi::DataLayout& layout, + phi::TensorBase* tensor) { + if (phi::DenseTensor::classof(tensor)) { + FakeInitializeTensor( + dev_ctx, place, dtype, layout, dynamic_cast(tensor)); + } else if (phi::SelectedRows::classof(tensor)) { + FakeInitializeTensor(dev_ctx, + place, + dtype, + layout, + dynamic_cast(tensor)); + } else if (phi::SparseCooTensor::classof(tensor)) { + FakeInitializeTensor(dev_ctx, + place, + dtype, + layout, + dynamic_cast(tensor)); + } else if (phi::SparseCsrTensor::classof(tensor)) { + FakeInitializeTensor(dev_ctx, + place, + dtype, + layout, + dynamic_cast(tensor)); + } else if (phi::TensorArray::classof(tensor)) { + FakeInitializeTensor( + dev_ctx, place, dtype, layout, dynamic_cast(tensor)); + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "Unsupported `%s` type when fake initialize tensor.", + tensor->type_info().name())); + } +} + +void FakeInitializeOutputsForOperatorBase(const OperatorBase& op, + const phi::Place& place, + Scope* scope) { + const std::string& op_type = op.Type(); + if (OpsCanSkipedFakeAllocInStaticBuild.count(op_type)) { + return; + } + + phi::DeviceContext* dev_ctx = + platform::DeviceContextPool::Instance().Get(place); + + if (op_type == "read") { + const std::string& reader_name = op.Input("Reader"); + framework::ReaderHolder* reader = + GET_DATA_SAFELY(scope->FindVar(reader_name), "Input", "Reader", "Read") + .GetMutable(); + + std::shared_ptr buffered_reader = + std::dynamic_pointer_cast( + reader->Get()); + phi::Place target_place = + buffered_reader ? 
buffered_reader->GetPlace() : phi::CPUPlace(); + + auto& outputs = op.Outputs("Out"); + auto& var_types = reader->VarTypes(); + PADDLE_ENFORCE_EQ( + outputs.size(), + var_types.size(), + phi::errors::Unavailable("The output size of read_op (%d) should equal " + "to the var_types size of ReaderHolder (%d).", + outputs.size(), + var_types.size())); + + for (size_t i = 0; i < outputs.size(); ++i) { + const std::string& parameter_name = outputs[i]; + phi::TensorBase* out_tensor = + GetTensorFormVar(scope->FindVar(parameter_name)); + if (TensorShouldBeFakeInitialized(op, parameter_name, out_tensor)) { + phi::DataType dtype = phi::TransToPhiDataType(var_types[i]); + FakeInitializeTensorBase( + *dev_ctx, target_place, dtype, out_tensor->layout(), out_tensor); + } + } + } else { + PADDLE_THROW( + phi::errors::Unimplemented("Can not static build for op: %s", op_type)); + } +} + +phi::DataType GetInputDType(const RuntimeContext& runtime_ctx, + const std::string parameter_name) { + phi::TensorBase* in_tensor = + GetTensorFormVar(runtime_ctx.inputs.find(parameter_name)->second.at(0)); + return in_tensor->dtype(); +} + +phi::DataType InferDTypeFromAttr(const framework::OperatorBase& op, + const RuntimeContext& runtime_ctx, + const std::string& attr_name) { + int dtype_attr = op.Attr(attr_name); + if (dtype_attr == -1) { // -1 means the dtype is same as intput + return GetInputDType(runtime_ctx, "X"); + } + return phi::TransToPhiDataType(dtype_attr); +} + +phi::DataType InferMPDType(const RuntimeContext& runtime_ctx, + const std::string parameter_name) { + phi::DataType in_dtype = GetInputDType(runtime_ctx, parameter_name); + return (in_dtype == phi::DataType::BFLOAT16 || + in_dtype == phi::DataType::FLOAT16) + ? phi::DataType::FLOAT32 + : in_dtype; +} + +void FakeInitializeOutputsForFunctionKernel( + const framework::OperatorBase& op, + const phi::Kernel& phi_kernel, + const phi::KernelSignature& kernel_sig, + const RuntimeContext& runtime_ctx, + const platform::DeviceContext& dev_ctx) { + std::string op_type = op.Type(); + auto output_names = kernel_sig.output_names; + auto output_defs = phi_kernel.args_def().output_defs(); + PADDLE_ENFORCE_EQ(output_names.size(), + output_defs.size(), + platform::errors::InvalidArgument( + "The size of outputs_args names (%d) must be equal to " + "the size of kernel output_defs (%d).", + output_names.size(), + output_defs.size())); + size_t start_idx = 0; + for (size_t i = 0; i < output_names.size(); ++i) { + const std::string& parameter_name = output_names[i]; + auto it = runtime_ctx.outputs.find(parameter_name); + // Deal with the case that some outputs are not found or be NULL when run + // the kernel. For example : the outputs of matmul_grad are dx and dy, + // sometimes dx or dy may be NULL. 
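Note: when a kernel registers its output dtype as UNDEFINED, the static build pass has to infer it; the helpers added here, InferDTypeFromAttr (a dtype attribute of -1 means "same dtype as input X") and InferMPDType (FP16/BF16 inputs promote to FP32 master outputs), implement the common fallbacks. A standalone sketch of those two rules using a stand-in DataType enum rather than phi types:

#include <cassert>

enum class DataType { FLOAT32, FLOAT16, BFLOAT16, INT64 };

// Rule 1: a dtype attribute of -1 means "same dtype as the input".
DataType InferFromAttr(int dtype_attr, DataType input_dtype) {
  return dtype_attr == -1 ? input_dtype : static_cast<DataType>(dtype_attr);
}

// Rule 2: mixed-precision ops keep FP32 master values for low-precision inputs.
DataType InferMasterDType(DataType input_dtype) {
  return (input_dtype == DataType::FLOAT16 || input_dtype == DataType::BFLOAT16)
             ? DataType::FLOAT32
             : input_dtype;
}

int main() {
  assert(InferFromAttr(-1, DataType::INT64) == DataType::INT64);
  assert(InferMasterDType(DataType::FLOAT16) == DataType::FLOAT32);
}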
+ if (it == runtime_ctx.outputs.end() || it->second.empty()) { + VLOG(4) << "Output " << parameter_name << " not found"; + ++start_idx; + continue; + } + auto& outs_vector = it->second; + for (size_t offset = 0; offset < outs_vector.size(); ++offset) { + phi::TensorBase* out_tensor = GetTensorFormVar(outs_vector[offset]); + if (TensorShouldBeFakeInitialized(op, parameter_name, out_tensor)) { + phi::TensorArgDef& tensor_arg_def = output_defs[i]; + + // analyze place + phi::Backend backend = tensor_arg_def.backend; + if (backend == phi::Backend::UNDEFINED) { + if (op_type == "adam" || op_type == "adamw" || + op_type == "merged_adam") { + phi::TensorBase* beta1_pow = GetTensorFormVar( + runtime_ctx.inputs.find("Beta1Pow")->second.at(0)); + phi::TensorBase* beta2_pow = GetTensorFormVar( + runtime_ctx.inputs.find("Beta2Pow")->second.at(0)); + if (beta1_pow->place() == CPUPlace() && + beta2_pow->place() == CPUPlace()) { + backend = phi::TransToPhiBackend(CPUPlace()); + } else { + backend = phi::TransToPhiBackend(GPUPlace()); + } + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "Unsupported UNDEFINED backend for op: %s, parameter: %s", + op_type, + parameter_name)); + } + } + phi::Place place = backend == phi::Backend::CUSTOM + ? dev_ctx.GetPlace() + : phi::TransToPhiPlace(backend); + + // analyze dtype + phi::DataType dtype = tensor_arg_def.dtype; + if (dtype == DataType::UNDEFINED || + OpsNeedSetOutputDtypeWhenRegisterPhiKernel.count( + std::string(op_type))) { + // Some OP's InferMeta is sensitive to DDim, so we cannot get their + // output dtype from InferMeta + if (op_type == "adam" || op_type == "adamw") { + dtype = InferMPDType(runtime_ctx, "Param"); + } else if (op_type == "arg_min" || op_type == "arg_max" || + op_type == "coalesce_tensor" || op_type == "one_hot_v2") { + dtype = InferDTypeFromAttr(op, runtime_ctx, "dtype"); + } else if (op_type == "bincount" || op_type == "reduce_sum_grad") { + dtype = GetInputDType(runtime_ctx, "X"); + } else if (op_type == "layer_norm") { + dtype = InferMPDType(runtime_ctx, "X"); + } else if (op_type == "reduce_sum") { + int dtype_attr = op.Attr("out_dtype"); + if (dtype_attr != -1) { + dtype = phi::TransToPhiDataType(dtype_attr); + } else { + phi::DataType in_dtype = GetInputDType(runtime_ctx, "X"); + dtype = + (in_dtype == DataType::BOOL || in_dtype == DataType::INT32) + ? 
DataType::INT64 + : in_dtype; + } + } else { + VLOG(4) << "Get dtype result from InferMeta"; + RuntimeInferShapeContext infer_shape_ctx(op, runtime_ctx); + dynamic_cast(&op) + ->Info() + .infer_shape_(&infer_shape_ctx); + dtype = out_tensor->dtype(); // dtype from InferMeta + } + } + + // analyze layout + phi::DataLayout layout = tensor_arg_def.layout; + + FakeInitializeTensorBase(dev_ctx, place, dtype, layout, out_tensor); + } + } + start_idx += outs_vector.size(); + } +} + +void FakeInitializeOutputsForStructureKernel( + const framework::OpKernelType& op_kernel_type, + ExecutionContext* execution_context) { + const framework::OperatorBase& op = execution_context->GetOp(); + if (OpsCanSkipedFakeAllocInStaticBuild.count(op.Type())) { + return; + } + + const VariableNameMap& outputs = op.Outputs(); + for (auto& item : outputs) { + const std::string& parameter_name = item.first; + auto multi_output_var = execution_context->MultiOutputVar(parameter_name); + for (Variable* var : multi_output_var) { + phi::TensorBase* out_tensor = GetTensorFormVar(var); + if (TensorShouldBeFakeInitialized(op, parameter_name, out_tensor)) { + phi::Place place = execution_context->GetPlace(); + phi::DataType dtype = + phi::TransToPhiDataType(op_kernel_type.data_type_); + phi::DataLayout layout = out_tensor->layout(); + FakeInitializeTensorBase(execution_context->device_context(), + place, + dtype, + layout, + out_tensor); + } + } + } +} + +} // namespace interpreter +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/new_executor/interpreter/static_build.h b/paddle/fluid/framework/new_executor/interpreter/static_build.h new file mode 100644 index 0000000000000000000000000000000000000000..e070f66b02549418bc78aeb15d99978480ff6851 --- /dev/null +++ b/paddle/fluid/framework/new_executor/interpreter/static_build.h @@ -0,0 +1,45 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/scope.h" + +extern std::set OperatorBasesMustRunInStaticBuild; + +namespace paddle { +namespace framework { +namespace interpreter { + +bool BlockCanBeStaticBuilt(const framework::BlockDesc& block); + +void FakeInitializeOutputsForOperatorBase(const OperatorBase& op, + const platform::Place& place, + Scope* scope); + +void FakeInitializeOutputsForFunctionKernel( + const framework::OperatorBase& op, + const phi::Kernel& phi_kernel, + const phi::KernelSignature& kernel_sig, + const RuntimeContext& ctx, + const platform::DeviceContext& dev_ctx); + +void FakeInitializeOutputsForStructureKernel( + const framework::OpKernelType& op_kernel_type, + ExecutionContext* execution_context); + +} // namespace interpreter +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index bee8e8ca7b795b7a26d737c35f027c779166a146..046edd45b2a89894f918bdf511884263e38864fb 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -21,6 +21,7 @@ #include "paddle/fluid/framework/details/nan_inf_utils.h" #include "paddle/fluid/framework/details/share_tensor_buffer_functor.h" #include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h" +#include "paddle/fluid/framework/new_executor/interpreter/static_build.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/os_info.h" @@ -112,6 +113,8 @@ InterpreterCore::InterpreterCore(const platform::Place& place, VLOG(4) << "InterpreterCore(): " << this << " on " << place_; static_build_ = FLAGS_new_executor_static_build && + !FLAGS_new_executor_use_cuda_graph && + !execution_config.used_for_control_flow_op && interpreter::BlockCanBeStaticBuilt(block); exception_notifier_ = main_thread_blocker_.RegisterEvent(kExceptionCaught); @@ -281,12 +284,12 @@ paddle::framework::FetchList InterpreterCore::Run( SetFeedVarsInplaceSkip(feed_names); // convert vec func_list to graph Convert(&op_func_nodes); - is_build_ = true; UpdateSyncOpNum(); if (static_build_) { VLOG(4) << "RUN impl"; RunImpl(); } + is_build_ = true; } else { RunImpl(); } @@ -597,7 +600,7 @@ void InterpreterCore::BuildOperatorDependences() { // analysis the dependences between ops, add next_instr_list to each instr, // and set the dependecy_count_ size_t instr_num = vec_instruction_.size(); - dependecy_count_.resize(instr_num); + dependecy_count_ = std::vector(instr_num, 0); auto downstream_map = dependency_builder_.Build(vec_instruction_); for (size_t instr_id = 0; instr_id < instr_num; ++instr_id) { @@ -657,6 +660,7 @@ void InterpreterCore::Convert( auto& vec_meta_info = var_scope_.MutableVecMetaInfo(); auto nodes = *op_func_nodes; auto op_nums = nodes.size(); + vec_instruction_.clear(); vec_instruction_.reserve(op_nums); for (size_t op_idx = 0; op_idx < op_nums; ++op_idx) { auto& op_func_node = nodes[op_idx]; @@ -825,8 +829,6 @@ void InterpreterCore::Convert( BuildAndCacheInstructionCtx(&vec_instruction_[i]); } - BuildSkipShareLoDInfo(); - bool inplaced = false; for (const Instruction& inst : vec_instruction_) { if (inst.OpBase()->Type() == "share_buffer" || @@ -867,6 +869,10 @@ void InterpreterCore::BuildSkipShareLoDInfo() { } } } + if (can_skip_lod) { + VLOG(8) << "skip share lod for: " << vec_instruction_[i].OpBase()->Type() + << " (" << 
i << ")"; + } vec_instruction_[i].InnerInferShapeContext()->SetSkipLoD(can_skip_lod); } } @@ -1060,6 +1066,7 @@ void InterpreterCore::ExecuteInstructionList( // EOF is not a fatal error. if (exception_holder_.Type() != "EOF") { async_work_queue_->Cancel(); + async_work_queue_.reset(); } VLOG(4) << "Cancel ok"; PADDLE_ENFORCE_EQ( @@ -1297,11 +1304,12 @@ void InterpreterCore::Prepare(const std::vector& feed_names, // convert vec func_list to graph Convert(&op_func_nodes); UpdateSyncOpNum(); - is_build_ = true; if (static_build_) { VLOG(4) << "RUN impl"; RunImpl(); } + BuildSkipShareLoDInfo(); + is_build_ = true; } // NOTE: Because feed_tensor will be GC after // paddle::framework::BuildOpFuncList, so we should diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 62d91f5da8b58a57aa4c52923e5ce24747518f57..7fdc830ab268bc0e0080c4d32fccb2b24f101397 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -2753,13 +2753,6 @@ void OperatorWithKernel::ParseInputDataType( t = &(var->Get().value()); } else if (var->IsType()) { const phi::SparseCooTensor* sp_t = &(var->Get()); - PADDLE_ENFORCE_EQ( - sp_t->initialized(), - true, - platform::errors::InvalidArgument("The %s Op's Input Variable `%s` " - "contains uninitialized Tensor.", - Type(), - name)); *data_type = paddle::framework::TransToProtoVarType(sp_t->dtype()); return; } else if (var->IsType()) { diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc index 6eacc496db70d58d9cdeeded298abd9fc66efded..f2f47c5ffb0ffd03c0d6cae10684538c686a5a54 100644 --- a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc @@ -37,8 +37,23 @@ REGISTER_OP_WITHOUT_GRADIENT(c_sync_calc_stream, ops::CSyncCalcStreamOp, ops::CSyncCalcStreamOpMaker); -REGISTER_OP_CUDA_KERNEL(c_sync_calc_stream, ops::CSyncCalcStreamKernel); - -REGISTER_OP_NPU_KERNEL(c_sync_calc_stream, ops::CSyncCalcStreamKernel); - -REGISTER_OP_MLU_KERNEL(c_sync_calc_stream, ops::CSyncCalcStreamKernel); +REGISTER_OP_CUDA_KERNEL(c_sync_calc_stream, + ops::CSyncCalcStreamKernel, + ops::CSyncCalcStreamKernel, + ops::CSyncCalcStreamKernel, + ops::CSyncCalcStreamKernel, + ops::CSyncCalcStreamKernel); + +REGISTER_OP_NPU_KERNEL(c_sync_calc_stream, + ops::CSyncCalcStreamKernel, + ops::CSyncCalcStreamKernel, + ops::CSyncCalcStreamKernel, + ops::CSyncCalcStreamKernel, + ops::CSyncCalcStreamKernel); + +REGISTER_OP_MLU_KERNEL(c_sync_calc_stream, + ops::CSyncCalcStreamKernel, + ops::CSyncCalcStreamKernel, + ops::CSyncCalcStreamKernel, + ops::CSyncCalcStreamKernel, + ops::CSyncCalcStreamKernel); diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op.h b/paddle/fluid/operators/collective/c_sync_calc_stream_op.h index da3fdd345393473918b69f819dec4428fafc1135..369adc019c231d7a603ec210853871cd4e54780b 100644 --- a/paddle/fluid/operators/collective/c_sync_calc_stream_op.h +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op.h @@ -30,7 +30,8 @@ class CSyncCalcStreamOp : public framework::OperatorWithKernel { protected: phi::KernelKey GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(framework::proto::VarType::FP32, ctx.GetPlace()); + return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), + ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc index 
979e7a529b79ec7d55b50dd443e0177f3b91bb1d..1f5618b498ffa8c31144d38c9351f0803bb76a65 100644 --- a/paddle/fluid/operators/concat_op.cc +++ b/paddle/fluid/operators/concat_op.cc @@ -41,7 +41,7 @@ class ConcatOp : public framework::OperatorWithKernel { auto input_data_type = framework::proto::VarType::Type(0); bool flag = 0; for (auto *input : inputs) { - if (input->IsInitialized() && input->numel() > 0) { + if (input->IsInitialized()) { input_data_type = framework::TransToProtoVarType(input->dtype()); flag = 1; break; diff --git a/paddle/fluid/operators/partial_concat_op.cc b/paddle/fluid/operators/partial_concat_op.cc index a8a7d82e4627a5011ef9dffdc13ed7e0294eb07c..1fb9dceb4150c035a5d6d84b0e09b5eb852943aa 100644 --- a/paddle/fluid/operators/partial_concat_op.cc +++ b/paddle/fluid/operators/partial_concat_op.cc @@ -95,7 +95,7 @@ class PartialConcatOp : public framework::OperatorWithKernel { auto input_data_type = framework::proto::VarType::Type(0); bool flag = 0; for (auto *input : inputs) { - if (input->IsInitialized() && input->numel() > 0) { + if (input->IsInitialized()) { input_data_type = framework::TransToProtoVarType(input->dtype()); flag = 1; break; diff --git a/paddle/fluid/operators/partial_sum_op.cc b/paddle/fluid/operators/partial_sum_op.cc index a2255d8e07abf886158615916925044e875f52ee..9ef7ac0a21a4813d71658f3c0a63a03eba37810d 100644 --- a/paddle/fluid/operators/partial_sum_op.cc +++ b/paddle/fluid/operators/partial_sum_op.cc @@ -97,7 +97,7 @@ class PartialSumOp : public framework::OperatorWithKernel { auto input_data_type = framework::proto::VarType::Type(0); bool flag = 0; for (auto *input : inputs) { - if (input->IsInitialized() && input->numel() > 0) { + if (input->IsInitialized()) { input_data_type = framework::TransToProtoVarType(input->dtype()); flag = 1; break; diff --git a/paddle/fluid/operators/reader/buffered_reader.h b/paddle/fluid/operators/reader/buffered_reader.h index 9bec93e635ef4a0c3ee41f78c8baf3f485d74b76..442330b8caa6f47afea4ad35661059501e77777f 100644 --- a/paddle/fluid/operators/reader/buffered_reader.h +++ b/paddle/fluid/operators/reader/buffered_reader.h @@ -55,6 +55,8 @@ class BufferedReader : public framework::DecoratedReader { ~BufferedReader() override; + platform::Place GetPlace() const { return place_; } + private: void ReadTillBufferFullAsync(); diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index a4902a85fcba7b5ae173b173d8fe75cada079f44..588521c400329f403ea439c9415f5cf14416154a 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -54,7 +54,7 @@ class SumOp : public framework::OperatorWithKernel { x_vars_name[idx])); auto tensor = framework::GetLoDTensorOrSelectedRowsValueFromVar(*x_vars[idx]); - if (tensor->numel() <= 0 || (!tensor->IsInitialized())) { + if (!tensor->IsInitialized()) { continue; } if (dtype == -1) { diff --git a/paddle/fluid/operators/transfer_layout_op.cc b/paddle/fluid/operators/transfer_layout_op.cc index a197546b357bb11b5e8a2a8aa051dd1c65451ee4..01514ed4d1075fb050296f167dd42b2d49888b6f 100644 --- a/paddle/fluid/operators/transfer_layout_op.cc +++ b/paddle/fluid/operators/transfer_layout_op.cc @@ -57,9 +57,9 @@ class TransferLayoutOp : public framework::OperatorWithKernel { } auto place = in_tensor->IsInitialized() ? in_tensor->place() : platform::CPUPlace(); - - // dtype is not important - return phi::KernelKey(framework::proto::VarType::FP32, place); + phi::DataType dtype = in_tensor->IsInitialized() ? 
in_tensor->dtype() + : phi::DataType::FLOAT32; + return phi::KernelKey(phi::TransToProtoVarType(dtype), place); } phi::KernelKey GetKernelTypeForVar( diff --git a/paddle/phi/core/device_context.cc b/paddle/phi/core/device_context.cc index 97ef11c37fe96e3b2aa72cbcce5d5cac4daca032..5c8fc75ff0ee25a072d15a3a81b21a7436b033c3 100644 --- a/paddle/phi/core/device_context.cc +++ b/paddle/phi/core/device_context.cc @@ -146,15 +146,26 @@ struct DeviceContext::Impl { // NOTE(paddle-dev): In case of tensor has already hold allocation and // is going to allocate allocation on new place, we will clear its holder // firstly and then re-alloc it. - if (tensor->initialized() && tensor->place() != place) { - ClearHolder(tensor); + if (phi::DenseTensor::classof(tensor)) { + // NOTE(Ruibiao): The tensor hold zero-size allocation is not regarded as + // `initialized`. Fix other tensor class when needed. + if (static_cast(tensor)->Holder() && + tensor->place() != place) { + ClearHolder(tensor); + } + } else { + if (tensor->initialized() && tensor->place() != place) { + ClearHolder(tensor); + } } + auto* allocator = - (tensor->numel() == 0 || fake_alloc) && requested_size == 0 + (fake_alloc || tensor->numel() == 0) && requested_size == 0 ? zero_allocator_ : (pinned ? pinned_allocator_ : device_allocator_); #ifdef PADDLE_WITH_CUDA - bool must_cuda_graph_allocator = (tensor->numel() != 0) && !pinned; + bool must_cuda_graph_allocator = + (!fake_alloc && tensor->numel() != 0) && !pinned; if (must_cuda_graph_allocator && place.GetType() == phi::AllocationType::GPU && phi::backends::gpu::CUDAGraph::IsThisThreadCapturing()) { @@ -189,11 +200,22 @@ struct DeviceContext::Impl { if (dtype == DataType::UNDEFINED) { dtype = tensor->dtype(); } - if (tensor->initialized() && tensor->place() != CPUPlace()) { - ClearHolder(tensor); + + if (phi::DenseTensor::classof(tensor)) { + // NOTE(Ruibiao): The tensor holds zero-size allocation is not regarded as + // `initialized`. Fix other tensor class when needed. + if (static_cast(tensor)->Holder() && + tensor->place() != CPUPlace()) { + ClearHolder(tensor); + } + } else { + if (tensor->initialized() && tensor->place() != CPUPlace()) { + ClearHolder(tensor); + } } + auto* allocator = - (tensor->numel() == 0 || fake_alloc) && requested_size == 0 + (fake_alloc || tensor->numel() == 0) && requested_size == 0 ? host_zero_allocator_ : host_allocator_; return tensor->AllocateFrom( @@ -246,8 +268,6 @@ struct DeviceContext::Impl { private: void ClearHolder(TensorBase* tensor) const { - if (!tensor->initialized()) return; - if (DenseTensor::classof(tensor)) { static_cast(tensor)->clear(); } else if (SelectedRows::classof(tensor)) { diff --git a/paddle/phi/core/selected_rows.h b/paddle/phi/core/selected_rows.h index 08d02bee40d25be9ce8cf029cfe5c3d730960da9..aa528969fbf8f2105353213221429103bf195f01 100644 --- a/paddle/phi/core/selected_rows.h +++ b/paddle/phi/core/selected_rows.h @@ -139,10 +139,14 @@ class SelectedRows : public TensorBase, /// \return The data type of the tensor. DataType dtype() const noexcept override { return impl_->dtype(); } + void set_type(const DataType dtype) { impl_->set_type(dtype); } + /// \brief Returns the data layout of the tensor. /// \return The data layout of the tensor. DataLayout layout() const noexcept override { return impl_->layout(); } + void set_layout(const DataLayout layout) { impl_->set_layout(layout); } + /// \brief Returns the data place of the tensor. /// \return The data place of the tensor. 
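Note: the set_type/set_layout mutators added to SelectedRows (and to the sparse tensors and TensorArray below) exist because fake initialization allocates a zero-size buffer first and then stamps the dtype and layout the real kernel will produce onto the tensor. A toy illustration of that fix-up order; Tensor here is a stand-in struct, not a phi type.

#include <cstddef>
#include <string>

struct Tensor {
  std::string dtype = "undefined";
  std::string layout = "undefined";
  std::size_t bytes = 0;
  void set_type(const std::string& t) { dtype = t; }
  void set_layout(const std::string& l) { layout = l; }
};

int main() {
  Tensor out;
  out.bytes = 0;             // fake (zero-size) allocation
  out.set_type("float32");   // metadata stamped after the fake alloc
  out.set_layout("NCHW");
}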
const Place& place() const override { return impl_->place(); }; diff --git a/paddle/phi/core/selected_rows_impl.h b/paddle/phi/core/selected_rows_impl.h index d4a42a9653b87eef4e2b0d8ef60914f95acd423b..a1864ad3aa657f57053e7a800c4e9c868a1a42ad 100644 --- a/paddle/phi/core/selected_rows_impl.h +++ b/paddle/phi/core/selected_rows_impl.h @@ -159,10 +159,14 @@ class SelectedRowsImpl { /// \return The data type of the tensor. DataType dtype() const noexcept { return value_->dtype(); } + void set_type(const DataType dtype) { value_->set_type(dtype); } + /// \brief Returns the data layout of the tensor. /// \return The data layout of the tensor. DataLayout layout() const noexcept { return value_->layout(); } + void set_layout(const DataLayout layout) { value_->set_layout(layout); } + /// \brief Returns the data place of the tensor. /// \return The data place of the tensor. const Place& place() const { return value_->place(); } diff --git a/paddle/phi/core/sparse_coo_tensor.h b/paddle/phi/core/sparse_coo_tensor.h index 13fc7d444b4242b386b0ab25e6197f3f45028174..542f4e8627758a6cb0c090ae1a50d4dbf4423d30 100644 --- a/paddle/phi/core/sparse_coo_tensor.h +++ b/paddle/phi/core/sparse_coo_tensor.h @@ -104,11 +104,14 @@ class SparseCooTensor : public TensorBase, /// \brief Returns the data type of the tensor. /// \return The data type of the tensor. DataType dtype() const noexcept override { return meta_.dtype; } + void set_type(const DataType dtype) { meta_.dtype = dtype; } /// \brief Returns the data layout of the tensor. /// \return The data layout of the tensor. DataLayout layout() const noexcept override { return meta_.layout; } + void set_layout(const DataLayout layout) { meta_.layout = layout; } + /// \brief Returns the data place of the tensor. /// \return The data place of the tensor. const Place& place() const override { return non_zero_elements_.place(); } diff --git a/paddle/phi/core/sparse_csr_tensor.h b/paddle/phi/core/sparse_csr_tensor.h index 4d607188d2ebbe150f30feb5db74687a327b3b26..ec9dd7ab7907ebb75570eb971f2503940f8255d4 100644 --- a/paddle/phi/core/sparse_csr_tensor.h +++ b/paddle/phi/core/sparse_csr_tensor.h @@ -110,10 +110,14 @@ class SparseCsrTensor : public TensorBase, /// \return The data type of the tensor. DataType dtype() const noexcept override { return meta_.dtype; } + void set_type(const DataType dtype) { meta_.dtype = dtype; } + /// \brief Returns the data layout of the tensor. /// \return The data layout of the tensor. DataLayout layout() const noexcept override { return meta_.layout; } + void set_layout(const DataLayout layout) { meta_.layout = layout; } + /// \brief Returns the data place of the tensor. /// \return The data place of the tensor. const Place& place() const override { return non_zero_elements_.place(); } diff --git a/paddle/phi/core/tensor_array.cc b/paddle/phi/core/tensor_array.cc index 43089d952542f2ba2390085fe59e6f8f3b0f0671..e774bd0da448a76512bcba2db706d8aa3fe03bd9 100644 --- a/paddle/phi/core/tensor_array.cc +++ b/paddle/phi/core/tensor_array.cc @@ -23,8 +23,12 @@ TensorArray::TensorArray(const std::vector& vec) { /// \brief Test whether the tensor's storage in TensorArray is allocated. /// return Whether all tensors in TensorArray is allocated. 
bool TensorArray::initialized() const { + if (tensors_.empty()) { + return false; + } + for (auto tensor : tensors_) { - if (!tensor.IsInitialized()) { + if (!tensor.initialized()) { return false; } } @@ -42,18 +46,69 @@ const DDim& TensorArray::dims() const { } const Place& TensorArray::place() const { - PADDLE_THROW(errors::Unavailable("place() can't be used in TensorArray")); - return tensors_[0].place(); + PADDLE_ENFORCE_NE( + tensors_.size(), 0, errors::Unavailable("TensorArray is not assigned.")); + + const Place& place = tensors_[0].place(); + for (size_t i = 1; i < tensors_.size(); ++i) { + PADDLE_ENFORCE_EQ( + tensors_[i].place(), + place, + errors::Unavailable( + "The Place of all tensors in TensorArray must be consistent. The " + "current place is %s, but the previous place is %s.", + tensors_[i].place(), + place)); + } + return place; } DataType TensorArray::dtype() const { - PADDLE_THROW(errors::Unavailable("dtype() can't be used in TensorArray")); - return DataType::UNDEFINED; + PADDLE_ENFORCE_NE( + tensors_.size(), 0, errors::Unavailable("TensorArray is not assigned.")); + + const DataType dtype = tensors_[0].dtype(); + for (size_t i = 1; i < tensors_.size(); ++i) { + PADDLE_ENFORCE_EQ( + tensors_[i].dtype(), + dtype, + errors::Unavailable( + "The DataType of all tensors in TensorArray must be consistent. " + "The current dtype is %s, but the previous dtype is %s.", + tensors_[i].dtype(), + dtype)); + } + return dtype; +} + +void TensorArray::set_type(const DataType dtype) { + for (size_t i = 0; i < tensors_.size(); ++i) { + tensors_[i].set_type(dtype); + } } DataLayout TensorArray::layout() const { - PADDLE_THROW(errors::Unavailable("layout() can't be used in TensorArray")); - return DataLayout::UNDEFINED; + PADDLE_ENFORCE_NE( + tensors_.size(), 0, errors::Unavailable("TensorArray is not assigned.")); + + const DataLayout layout = tensors_[0].layout(); + for (size_t i = 1; i < tensors_.size(); ++i) { + PADDLE_ENFORCE_EQ( + tensors_[i].layout(), + layout, + errors::Unavailable( + "The DataLayout of all tensors in TensorArray must be consistent. " + "The current layout is %s, but the previous layout is %s.", + tensors_[i].layout(), + layout)); + } + return layout; +} + +void TensorArray::set_layout(DataLayout layout) { + for (size_t i = 0; i < tensors_.size(); ++i) { + tensors_[i].set_layout(layout); + } } bool TensorArray::valid() const { diff --git a/paddle/phi/core/tensor_array.h b/paddle/phi/core/tensor_array.h index 14679429ea7f0b58edb9cdd69365d82034ff8628..4fd8fe1df5e1f43840764ef0b3ad06f5dbb171f5 100644 --- a/paddle/phi/core/tensor_array.h +++ b/paddle/phi/core/tensor_array.h @@ -63,12 +63,14 @@ class TensorArray : public TensorBase, /// \brief This overrided function is not used in TensorArray. const Place& place() const override; - /// \brief This overrided function is not used in TensorArray. DataType dtype() const override; - /// \brief This overrided function is not used in TensorArray. + void set_type(const DataType dtype); + DataLayout layout() const override; + void set_layout(const DataLayout layout); + /// \brief This overrided function is not used in TensorArray. 
bool valid() const override; diff --git a/paddle/phi/core/tensor_utils.cc b/paddle/phi/core/tensor_utils.cc index 7876aa0437d3ca7e39aee9c27e4503451139743f..4fd11df211f9b7b8c45cf9d7623fff30d7367acf 100644 --- a/paddle/phi/core/tensor_utils.cc +++ b/paddle/phi/core/tensor_utils.cc @@ -316,6 +316,16 @@ void Copy(const Context& dev_ctx, dst->set_dims(src.dims()); } +template <typename Context> +void Copy(const Context& dev_ctx, + const TensorArray& src, + Place dst_place, + bool blocking, + TensorArray* dst) { + // NOTE(Ruibiao): Implement Copy() for TensorArray when needed. + PADDLE_THROW(errors::Unimplemented("Copy for TensorArray is unimplemented.")); +} + template void Copy(const CPUContext& dev_ctx, const DenseTensor& src, Place dst_place, @@ -363,6 +373,18 @@ template void Copy(const DeviceContext& dev_ctx, bool blocking, SparseCsrTensor* dst); +template void Copy(const CPUContext& dev_ctx, + const TensorArray& src, + Place dst_place, + bool blocking, + TensorArray* dst); + +template void Copy(const DeviceContext& dev_ctx, + const TensorArray& src, + Place dst_place, + bool blocking, + TensorArray* dst); + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) template void Copy(const GPUContext& dev_ctx, const DenseTensor& src, @@ -384,6 +406,11 @@ template void Copy(const GPUContext& dev_ctx, Place dst_place, bool blocking, SparseCsrTensor* dst); +template void Copy(const GPUContext& dev_ctx, + const TensorArray& src, + Place dst_place, + bool blocking, + TensorArray* dst); #endif #ifdef PADDLE_WITH_XPU @@ -392,6 +419,11 @@ template void Copy(const XPUContext& dev_ctx, Place dst_place, bool blocking, DenseTensor* dst); +template void Copy(const XPUContext& dev_ctx, + const TensorArray& src, + Place dst_place, + bool blocking, + TensorArray* dst); #endif #ifdef PADDLE_WITH_CUSTOM_DEVICE @@ -400,6 +432,11 @@ template void Copy(const CustomContext& dev_ctx, Place dst_place, bool blocking, DenseTensor* dst); +template void Copy(const CustomContext& dev_ctx, + const TensorArray& src, + Place dst_place, + bool blocking, + TensorArray* dst); #endif #ifdef PADDLE_WITH_MKLDNN @@ -408,6 +445,11 @@ template void Copy(const OneDNNContext& dev_ctx, Place dst_place, bool blocking, DenseTensor* dst); +template void Copy(const OneDNNContext& dev_ctx, + const TensorArray& src, + Place dst_place, + bool blocking, + TensorArray* dst); #endif template diff --git a/paddle/phi/core/tensor_utils.h b/paddle/phi/core/tensor_utils.h index 56a4f933ceb44f03798f7834afe8248a2ea8e5f8..e2679ffb206a8b31f02e3a86226a545e0c9f43d9 100644 --- a/paddle/phi/core/tensor_utils.h +++ b/paddle/phi/core/tensor_utils.h @@ -19,6 +19,7 @@ limitations under the License.
*/ #include "paddle/phi/core/selected_rows.h" #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/sparse_csr_tensor.h" +#include "paddle/phi/core/tensor_array.h" #include "paddle/phi/core/tensor_meta.h" namespace phi { @@ -109,6 +110,13 @@ void Copy(const Context& dev_ctx, bool blocking, SparseCsrTensor* dst); +template +void Copy(const Context& dev_ctx, + const TensorArray& src, + Place dst_place, + bool blocking, + TensorArray* dst); + template void TensorFromVector(const std::vector& src, const phi::DeviceContext& ctx, diff --git a/paddle/phi/kernels/coalesce_tensor_kernel.cc b/paddle/phi/kernels/coalesce_tensor_kernel.cc index 5e801769a55aa6ad46f3d277f476a334b7869180..5b9c1b4a3de21d952631292f826e8d6540b691c5 100644 --- a/paddle/phi/kernels/coalesce_tensor_kernel.cc +++ b/paddle/phi/kernels/coalesce_tensor_kernel.cc @@ -79,7 +79,7 @@ void GetMemSizeAndDtype(const std::vector &lod_tensors, size_of_dtype : static_cast(size); const void *ptr = - lod_tensors[i]->IsInitialized() ? lod_tensors[i]->data() : nullptr; + lod_tensors[i]->initialized() ? lod_tensors[i]->data() : nullptr; VLOG(4) << size << " " << len; ss << "input(" << i << "-th tensor) dim:(" << lod_tensors[i]->dims() << ") " << " addres:" << ptr << " len: " << len << ", "; @@ -127,7 +127,7 @@ void CoalesceTensorKernel(const Context &dev_ctx, output[i], errors::InvalidArgument("The %d-th output tensor cannot be nullptr.", i)); - if (!input[i]->IsInitialized()) { + if (!input[i]->initialized()) { has_not_init_in_vars = true; } } @@ -142,7 +142,7 @@ void CoalesceTensorKernel(const Context &dev_ctx, for (size_t i = 0; i < input.size(); ++i) { phi::DDim dims(concated_shapes.data() + accumulated_ranks, concated_ranks[i]); - if (!input[i]->IsInitialized()) { + if (!input[i]->initialized()) { PADDLE_ENFORCE_EQ( input[i], output[i], @@ -220,7 +220,7 @@ void CoalesceTensorKernel(const Context &dev_ctx, auto sub_tensor = fused_output->Slice(static_cast(offset), static_cast(offset + len)); // some var may not persistable, or persistable var may not init - if (output[i]->IsInitialized()) { + if (output[i]->initialized()) { phi::Copy(dev_ctx, *output[i], dev_ctx.GetPlace(), false, &sub_tensor); } offset += use_align @@ -270,7 +270,9 @@ PD_REGISTER_KERNEL(coalesce_tensor, phi::CoalesceTensorKernel, int, float, - double) {} + double) { + kernel->OutputAt(1).SetDataType(phi::DataType::UNDEFINED); +} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL(coalesce_tensor, @@ -282,6 +284,7 @@ PD_REGISTER_KERNEL(coalesce_tensor, float, double) { kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); + kernel->OutputAt(1).SetDataType(phi::DataType::UNDEFINED); } #endif @@ -295,5 +298,6 @@ PD_REGISTER_KERNEL(coalesce_tensor, float, double) { kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); + kernel->OutputAt(1).SetDataType(phi::DataType::UNDEFINED); } #endif diff --git a/paddle/phi/kernels/cpu/abs_kernel.cc b/paddle/phi/kernels/cpu/abs_kernel.cc index a10e0eed64aec894e8639bbcff54f4723295adc5..34e84c60cc1ff26204fb4d9b4601aa69a683413d 100644 --- a/paddle/phi/kernels/cpu/abs_kernel.cc +++ b/paddle/phi/kernels/cpu/abs_kernel.cc @@ -46,4 +46,6 @@ PD_REGISTER_KERNEL(abs, int, int64_t, phi::dtype::complex, - phi::dtype::complex) {} + phi::dtype::complex) { + kernel->OutputAt(0).SetDataType(phi::dtype::ToReal(kernel_key.dtype())); +} diff --git a/paddle/phi/kernels/cpu/accuracy_kernel.cc b/paddle/phi/kernels/cpu/accuracy_kernel.cc index 
4f39d28816ae3d99a7431aa0c146b05db9c66ecc..d426e98fbc5458a52beda733d24c2b8caf5031e5 100644 --- a/paddle/phi/kernels/cpu/accuracy_kernel.cc +++ b/paddle/phi/kernels/cpu/accuracy_kernel.cc @@ -97,6 +97,6 @@ PD_REGISTER_KERNEL( kernel->InputAt(1).SetDataType(phi::DataType::INT64); kernel->InputAt(2).SetDataType(phi::DataType::INT64); kernel->OutputAt(0).SetDataType(phi::DataType::FLOAT32); - kernel->OutputAt(1).SetDataType(phi::DataType::INT64); - kernel->OutputAt(2).SetDataType(phi::DataType::INT64); + kernel->OutputAt(1).SetDataType(phi::DataType::INT32); + kernel->OutputAt(2).SetDataType(phi::DataType::INT32); } diff --git a/paddle/phi/kernels/cpu/arg_min_max_kernel.cc b/paddle/phi/kernels/cpu/arg_min_max_kernel.cc index 7c487287bb256841e59175a2755ab2a7d0bb54e3..20dfd2faff8a42549530b74e7e30841215a451b2 100644 --- a/paddle/phi/kernels/cpu/arg_min_max_kernel.cc +++ b/paddle/phi/kernels/cpu/arg_min_max_kernel.cc @@ -201,7 +201,9 @@ PD_REGISTER_KERNEL(argmin, int32_t, int64_t, int16_t, - uint8_t) {} + uint8_t) { + kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED); +} PD_REGISTER_KERNEL(argmax, CPU, @@ -212,4 +214,6 @@ PD_REGISTER_KERNEL(argmax, int32_t, int64_t, int16_t, - uint8_t) {} + uint8_t) { + kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED); +} diff --git a/paddle/phi/kernels/cpu/as_complex_kernel.cc b/paddle/phi/kernels/cpu/as_complex_kernel.cc index 8166a548aa4d26cd8a3fec8f4a120dd156e3905c..9ffdbe5a0e5fe8340b6d3656a22521cbda1e1932 100644 --- a/paddle/phi/kernels/cpu/as_complex_kernel.cc +++ b/paddle/phi/kernels/cpu/as_complex_kernel.cc @@ -15,8 +15,11 @@ #include "paddle/phi/kernels/as_complex_kernel.h" #include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/type_traits.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/as_complex_impl.h" PD_REGISTER_KERNEL( - as_complex, CPU, ALL_LAYOUT, phi::AsComplexKernel, float, double) {} + as_complex, CPU, ALL_LAYOUT, phi::AsComplexKernel, float, double) { + kernel->OutputAt(0).SetDataType(phi::dtype::ToComplex(kernel_key.dtype())); +} diff --git a/paddle/phi/kernels/cpu/average_accumulates_kernel.cc b/paddle/phi/kernels/cpu/average_accumulates_kernel.cc index 14eb38d5b99b6e3870f54f334e034161e2a1f472..1a9921aecc985af664a46fc0bfb3155b484b98cd 100644 --- a/paddle/phi/kernels/cpu/average_accumulates_kernel.cc +++ b/paddle/phi/kernels/cpu/average_accumulates_kernel.cc @@ -53,4 +53,8 @@ PD_REGISTER_KERNEL(average_accumulates, ALL_LAYOUT, phi::AverageAccumulatesKernel, float, - double) {} + double) { + kernel->OutputAt(3).SetDataType(phi::DataType::INT64); + kernel->OutputAt(4).SetDataType(phi::DataType::INT64); + kernel->OutputAt(5).SetDataType(phi::DataType::INT64); +} diff --git a/paddle/phi/kernels/cpu/eig_kernel.cc b/paddle/phi/kernels/cpu/eig_kernel.cc index 0ef1e19965093d93f992bc1f2afa99d4a9a9eae5..b53b0c153e5410ef2a6defce1277889565998e49 100644 --- a/paddle/phi/kernels/cpu/eig_kernel.cc +++ b/paddle/phi/kernels/cpu/eig_kernel.cc @@ -105,6 +105,7 @@ PD_REGISTER_KERNEL(eig, double, phi::dtype::complex, phi::dtype::complex) { - kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED); - kernel->OutputAt(1).SetDataType(phi::DataType::UNDEFINED); + const phi::DataType& out_dtype = phi::dtype::ToComplex(kernel_key.dtype()); + kernel->OutputAt(0).SetDataType(out_dtype); + kernel->OutputAt(1).SetDataType(out_dtype); } diff --git a/paddle/phi/kernels/cpu/eigvals_kernel.cc b/paddle/phi/kernels/cpu/eigvals_kernel.cc index 
38864ee9a10d49fbb2baf918f8ce8981ccdc5a9a..555dbfb71dfb775f1ff6b1ce2e2db98e6d271eb7 100644 --- a/paddle/phi/kernels/cpu/eigvals_kernel.cc +++ b/paddle/phi/kernels/cpu/eigvals_kernel.cc @@ -258,5 +258,5 @@ PD_REGISTER_KERNEL(eigvals, double, phi::dtype::complex, phi::dtype::complex) { - kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED); + kernel->OutputAt(0).SetDataType(phi::dtype::ToComplex(kernel_key.dtype())); } diff --git a/paddle/phi/kernels/cpu/fft_grad_kernel.cc b/paddle/phi/kernels/cpu/fft_grad_kernel.cc index aecaf6c5c13f8f78e99bbde895be6e219a201bf0..a9e017ac794e5ba2ab18b41fd3cb4c9ecd67065f 100644 --- a/paddle/phi/kernels/cpu/fft_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/fft_grad_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/fft_grad_kernel.h" +#include "paddle/phi/common/type_traits.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/fft_grad_kernel_impl.h" @@ -23,10 +24,14 @@ PD_REGISTER_KERNEL(fft_c2c_grad, phi::dtype::complex, phi::dtype::complex) {} PD_REGISTER_KERNEL( - fft_c2r_grad, CPU, ALL_LAYOUT, phi::FFTC2RGradKernel, float, double) {} + fft_c2r_grad, CPU, ALL_LAYOUT, phi::FFTC2RGradKernel, float, double) { + kernel->OutputAt(0).SetDataType(phi::dtype::ToComplex(kernel_key.dtype())); +} PD_REGISTER_KERNEL(fft_r2c_grad, CPU, ALL_LAYOUT, phi::FFTR2CGradKernel, phi::dtype::complex, - phi::dtype::complex) {} + phi::dtype::complex) { + kernel->OutputAt(0).SetDataType(phi::dtype::ToReal(kernel_key.dtype())); +} diff --git a/paddle/phi/kernels/cpu/fft_kernel.cc b/paddle/phi/kernels/cpu/fft_kernel.cc index 4d64119206f6370f32ffeeb5742283bb01a5ac81..781490422371ffe835b93e46fbff69d764b9e0b9 100644 --- a/paddle/phi/kernels/cpu/fft_kernel.cc +++ b/paddle/phi/kernels/cpu/fft_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/phi/kernels/fft_kernel.h" +#include "paddle/phi/common/type_traits.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/fft_kernel_impl.h" @@ -28,8 +29,8 @@ PD_REGISTER_KERNEL(fft_c2r, phi::FFTC2RKernel, phi::dtype::complex, phi::dtype::complex) { - kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED); + kernel->OutputAt(0).SetDataType(phi::dtype::ToReal(kernel_key.dtype())); } PD_REGISTER_KERNEL(fft_r2c, CPU, ALL_LAYOUT, phi::FFTR2CKernel, float, double) { - kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED); + kernel->OutputAt(0).SetDataType(phi::dtype::ToComplex(kernel_key.dtype())); } diff --git a/paddle/phi/kernels/cpu/layer_norm_kernel.cc b/paddle/phi/kernels/cpu/layer_norm_kernel.cc index 4a2cfc5de36e23d68370ed4b7b224ac79c2b93d2..1c82866f0bbda06ed35a8e9390c80c3d6305015d 100644 --- a/paddle/phi/kernels/cpu/layer_norm_kernel.cc +++ b/paddle/phi/kernels/cpu/layer_norm_kernel.cc @@ -141,4 +141,7 @@ void LayerNormKernel(const Context& dev_ctx, } // namespace phi PD_REGISTER_KERNEL( - layer_norm, CPU, ALL_LAYOUT, phi::LayerNormKernel, float, double) {} + layer_norm, CPU, ALL_LAYOUT, phi::LayerNormKernel, float, double) { + kernel->OutputAt(1).SetDataType(phi::DataType::UNDEFINED); + kernel->OutputAt(2).SetDataType(phi::DataType::UNDEFINED); +} diff --git a/paddle/phi/kernels/cpu/reduce_sum_grad_kernel.cc b/paddle/phi/kernels/cpu/reduce_sum_grad_kernel.cc index b261e610d2073f3713906b6e2c0c6a09be76e592..8a5e3812950ece26090980e85d1649a93b98e71a 100644 --- a/paddle/phi/kernels/cpu/reduce_sum_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/reduce_sum_grad_kernel.cc @@ -54,4 +54,6 @@ PD_REGISTER_KERNEL(sum_grad, int, int64_t, phi::dtype::complex, - phi::dtype::complex) {} + phi::dtype::complex) { + kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED); +} diff --git a/paddle/phi/kernels/gpu/abs_kernel.cu b/paddle/phi/kernels/gpu/abs_kernel.cu index 9f27c986166f478582ba1c39a18253aed2537da9..8f55f49daf3de3c3d0804e11dfb3e0abbf31cdf7 100644 --- a/paddle/phi/kernels/gpu/abs_kernel.cu +++ b/paddle/phi/kernels/gpu/abs_kernel.cu @@ -76,4 +76,6 @@ PD_REGISTER_KERNEL(abs, phi::dtype::float16, phi::dtype::bfloat16, phi::dtype::complex, - phi::dtype::complex) {} + phi::dtype::complex) { + kernel->InputAt(0).SetDataType(phi::dtype::ToReal(kernel_key.dtype())); +} diff --git a/paddle/phi/kernels/gpu/accuracy_kernel.cu b/paddle/phi/kernels/gpu/accuracy_kernel.cu index a42d131a48d69d71c70305b453186d859edae796..c12afe94b0bdf5961a728ec372ad399c15d5f653 100644 --- a/paddle/phi/kernels/gpu/accuracy_kernel.cu +++ b/paddle/phi/kernels/gpu/accuracy_kernel.cu @@ -144,6 +144,6 @@ PD_REGISTER_KERNEL(accuracy, double) { kernel->InputAt(1).SetDataType(phi::DataType::INT64); kernel->InputAt(2).SetDataType(phi::DataType::INT64); - kernel->OutputAt(1).SetDataType(phi::DataType::INT64); - kernel->OutputAt(2).SetDataType(phi::DataType::INT64); + kernel->OutputAt(1).SetDataType(phi::DataType::INT32); + kernel->OutputAt(2).SetDataType(phi::DataType::INT32); } diff --git a/paddle/phi/kernels/gpu/as_complex_kernel.cu b/paddle/phi/kernels/gpu/as_complex_kernel.cu index 5f2bfaaea54ce1206e3623540f779187202682e1..a376d3a9a5415b3d655c565890cdd3c5f540e2cf 100644 --- a/paddle/phi/kernels/gpu/as_complex_kernel.cu +++ b/paddle/phi/kernels/gpu/as_complex_kernel.cu @@ -15,8 +15,11 @@ #include "paddle/phi/kernels/as_complex_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/type_traits.h" #include "paddle/phi/core/kernel_registry.h" #include 
"paddle/phi/kernels/impl/as_complex_impl.h" PD_REGISTER_KERNEL( - as_complex, GPU, ALL_LAYOUT, phi::AsComplexKernel, float, double) {} + as_complex, GPU, ALL_LAYOUT, phi::AsComplexKernel, float, double) { + kernel->OutputAt(0).SetDataType(phi::dtype::ToComplex(kernel_key.dtype())); +} diff --git a/paddle/phi/kernels/gpu/average_accumulates_kernel.cu b/paddle/phi/kernels/gpu/average_accumulates_kernel.cu index 7af4430427997432c06553db172343c492f5c2a6..ec2c8d3fdb330d59e2afd5c47c6694a1ed641d0e 100644 --- a/paddle/phi/kernels/gpu/average_accumulates_kernel.cu +++ b/paddle/phi/kernels/gpu/average_accumulates_kernel.cu @@ -97,4 +97,8 @@ PD_REGISTER_KERNEL(average_accumulates, ALL_LAYOUT, phi::AverageAccumulatesKernel, float, - double) {} + double) { + kernel->OutputAt(3).SetDataType(phi::DataType::INT64); + kernel->OutputAt(4).SetDataType(phi::DataType::INT64); + kernel->OutputAt(5).SetDataType(phi::DataType::INT64); +} diff --git a/paddle/phi/kernels/gpu/fft_grad_kernel.cu b/paddle/phi/kernels/gpu/fft_grad_kernel.cu index 69a95cffc3ee0e5ba71c4d96f35ad1b91ae6ccf6..d5f86292899c337e5c0a34639e9e2ebb1a0e48ce 100644 --- a/paddle/phi/kernels/gpu/fft_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/fft_grad_kernel.cu @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/fft_grad_kernel.h" +#include "paddle/phi/common/type_traits.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/fft_grad_kernel_impl.h" @@ -23,10 +24,14 @@ PD_REGISTER_KERNEL(fft_c2c_grad, phi::dtype::complex, phi::dtype::complex) {} PD_REGISTER_KERNEL( - fft_c2r_grad, GPU, ALL_LAYOUT, phi::FFTC2RGradKernel, float, double) {} + fft_c2r_grad, GPU, ALL_LAYOUT, phi::FFTC2RGradKernel, float, double) { + kernel->OutputAt(0).SetDataType(phi::dtype::ToComplex(kernel_key.dtype())); +} PD_REGISTER_KERNEL(fft_r2c_grad, GPU, ALL_LAYOUT, phi::FFTR2CGradKernel, phi::dtype::complex, - phi::dtype::complex) {} + phi::dtype::complex) { + kernel->OutputAt(0).SetDataType(phi::dtype::ToReal(kernel_key.dtype())); +} diff --git a/paddle/phi/kernels/gpu/fft_kernel.cu b/paddle/phi/kernels/gpu/fft_kernel.cu index 02c5fc036f2b1151cae8ada2cc876424e7016cdb..ae8fe365e3f3fb292dc95a4985ece9730b74ed0b 100644 --- a/paddle/phi/kernels/gpu/fft_kernel.cu +++ b/paddle/phi/kernels/gpu/fft_kernel.cu @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/phi/kernels/fft_kernel.h" +#include "paddle/phi/common/type_traits.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/fft_kernel_impl.h" @@ -28,8 +29,8 @@ PD_REGISTER_KERNEL(fft_c2r, phi::FFTC2RKernel, phi::dtype::complex, phi::dtype::complex) { - kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED); + kernel->OutputAt(0).SetDataType(phi::dtype::ToReal(kernel_key.dtype())); } PD_REGISTER_KERNEL(fft_r2c, GPU, ALL_LAYOUT, phi::FFTR2CKernel, float, double) { - kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED); + kernel->OutputAt(0).SetDataType(phi::dtype::ToComplex(kernel_key.dtype())); } diff --git a/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu index ba731e700e8ea2e6669c19b52d95949df59bcac2..e8fc640cdd508eb232246463729e9e9360241b2f 100644 --- a/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu @@ -117,7 +117,12 @@ PD_REGISTER_KERNEL(layer_norm_grad, ALL_LAYOUT, phi::LayerNormGradKernel, float, - phi::dtype::float16) {} + phi::dtype::float16) { + if (kernel_key.dtype() == phi::DataType::FLOAT16) { + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + } +} #elif CUDNN_VERSION_MIN(8, 1, 0) PD_REGISTER_KERNEL(layer_norm_grad, GPU, @@ -126,7 +131,12 @@ PD_REGISTER_KERNEL(layer_norm_grad, float, double, phi::dtype::float16, - phi::dtype::bfloat16) {} + phi::dtype::bfloat16) { + if (kernel_key.dtype() == phi::DataType::FLOAT16) { + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + } +} #else PD_REGISTER_KERNEL(layer_norm_grad, GPU, @@ -134,5 +144,10 @@ PD_REGISTER_KERNEL(layer_norm_grad, phi::LayerNormGradKernel, float, double, - phi::dtype::float16) {} + phi::dtype::float16) { + if (kernel_key.dtype() == phi::DataType::FLOAT16) { + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + } +} #endif diff --git a/paddle/phi/kernels/gpu/layer_norm_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_kernel.cu index 9f8122aa7589beb3fd5186ccd6993ccf89ef2278..34425d8cfcfe234cdad362a65842bc2faf678258 100644 --- a/paddle/phi/kernels/gpu/layer_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/layer_norm_kernel.cu @@ -673,7 +673,10 @@ PD_REGISTER_KERNEL(layer_norm, ALL_LAYOUT, phi::LayerNormKernel, float, - phi::dtype::float16) {} + phi::dtype::float16) { + kernel->OutputAt(1).SetDataType(phi::DataType::UNDEFINED); + kernel->OutputAt(2).SetDataType(phi::DataType::UNDEFINED); +} #elif CUDNN_VERSION_MIN(8, 1, 0) PD_REGISTER_KERNEL(layer_norm, GPU, @@ -682,7 +685,10 @@ PD_REGISTER_KERNEL(layer_norm, float, double, phi::dtype::float16, - phi::dtype::bfloat16) {} + phi::dtype::bfloat16) { + kernel->OutputAt(1).SetDataType(phi::DataType::UNDEFINED); + kernel->OutputAt(2).SetDataType(phi::DataType::UNDEFINED); +} #else PD_REGISTER_KERNEL(layer_norm, GPU, @@ -690,5 +696,8 @@ PD_REGISTER_KERNEL(layer_norm, phi::LayerNormKernel, float, double, - phi::dtype::float16) {} + phi::dtype::float16) { + kernel->OutputAt(1).SetDataType(phi::DataType::UNDEFINED); + kernel->OutputAt(2).SetDataType(phi::DataType::UNDEFINED); +} #endif diff --git a/paddle/phi/kernels/gpu/merged_momentum_kernel.cu b/paddle/phi/kernels/gpu/merged_momentum_kernel.cu index c6883caecd1a61430ea2ca2c353cb8c5419d664e..c8df58c13806337aaee202e9b20fe59f302f8fda 100644 --- 
a/paddle/phi/kernels/gpu/merged_momentum_kernel.cu +++ b/paddle/phi/kernels/gpu/merged_momentum_kernel.cu @@ -22,4 +22,9 @@ PD_REGISTER_KERNEL(merged_momentum, phi::MergedMomentumKernel, phi::dtype::float16, float, - double) {} + double) { + if (kernel_key.dtype() == phi::DataType::FLOAT16) { + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + } +} diff --git a/paddle/phi/kernels/gpu/momentum_kernel.cu b/paddle/phi/kernels/gpu/momentum_kernel.cu index 6d2b51dff64cb00bb968513c03be0cd989bda8cc..464c2c220d1501b6e041a89e01d5b346a8122e44 100644 --- a/paddle/phi/kernels/gpu/momentum_kernel.cu +++ b/paddle/phi/kernels/gpu/momentum_kernel.cu @@ -25,8 +25,10 @@ PD_REGISTER_KERNEL(momentum, float, double, phi::dtype::float16) { - kernel->OutputAt(1).SetDataType(phi::DataType::UNDEFINED); - kernel->OutputAt(2).SetDataType(phi::DataType::UNDEFINED); + if (kernel_key.dtype() == phi::DataType::FLOAT16) { + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + } } PD_REGISTER_KERNEL(momentum_dense_param_sparse_grad, @@ -36,6 +38,8 @@ PD_REGISTER_KERNEL(momentum_dense_param_sparse_grad, float, double, phi::dtype::float16) { - kernel->OutputAt(1).SetDataType(phi::DataType::UNDEFINED); - kernel->OutputAt(2).SetDataType(phi::DataType::UNDEFINED); + if (kernel_key.dtype() == phi::DataType::FLOAT16) { + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + } } diff --git a/paddle/phi/kernels/gpu/reduce_sum_grad_kernel.cu b/paddle/phi/kernels/gpu/reduce_sum_grad_kernel.cu index 04b3253178902f85462362a39f9485a6d0eadf11..15215c05d6361ac723ff587482bd2b591f2bae0f 100644 --- a/paddle/phi/kernels/gpu/reduce_sum_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/reduce_sum_grad_kernel.cu @@ -70,4 +70,6 @@ PD_REGISTER_KERNEL(sum_grad, int, int64_t, phi::dtype::complex, - phi::dtype::complex) {} + phi::dtype::complex) { + kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED); +} diff --git a/paddle/phi/kernels/gpu/sgd_kernel.cu b/paddle/phi/kernels/gpu/sgd_kernel.cu index d489ccb4cb223826ecb40f05c34c64e82a553e1e..3e379650f9525935ddee7e2f49a34df91e064d25 100644 --- a/paddle/phi/kernels/gpu/sgd_kernel.cu +++ b/paddle/phi/kernels/gpu/sgd_kernel.cu @@ -188,7 +188,9 @@ PD_REGISTER_KERNEL(sgd, phi::dtype::float16, float, double) { - kernel->OutputAt(1).SetDataType(phi::DataType::UNDEFINED); + if (kernel_key.dtype() == phi::DataType::FLOAT16) { + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + } } PD_REGISTER_KERNEL(sgd_dense_param_sparse_grad, diff --git a/paddle/phi/kernels/memcpy_kernel.cc b/paddle/phi/kernels/memcpy_kernel.cc index 489929fa87b05da3c45f0d3abac9713ac8bd3f35..cf2f6ac00a6d6b85a8ae907cacb0f3d3ad6f45f5 100644 --- a/paddle/phi/kernels/memcpy_kernel.cc +++ b/paddle/phi/kernels/memcpy_kernel.cc @@ -146,13 +146,17 @@ PD_REGISTER_GENERAL_KERNEL(memcpy_d2h, CPU, ALL_LAYOUT, phi::MemcpyD2HKernel, - ALL_DTYPE) {} + ALL_DTYPE) { + kernel->OutputAt(0).SetBackend(phi::Backend::CPU); +} PD_REGISTER_GENERAL_KERNEL(memcpy_d2h_multi_io, CPU, ALL_LAYOUT, phi::MemcpyD2HMultiIOKernel, - ALL_DTYPE) {} + ALL_DTYPE) { + kernel->OutputAt(0).SetBackend(phi::Backend::CPU); +} PD_REGISTER_GENERAL_KERNEL( memcpy, CPU, ALL_LAYOUT, phi::MemcpyKernel, ALL_DTYPE) { @@ -170,13 +174,17 @@ PD_REGISTER_GENERAL_KERNEL(memcpy_d2h, GPU, ALL_LAYOUT, phi::MemcpyD2HKernel, - ALL_DTYPE) {} + ALL_DTYPE) { + 
kernel->OutputAt(0).SetBackend(phi::Backend::CPU); +} PD_REGISTER_GENERAL_KERNEL(memcpy_d2h_multi_io, GPU, ALL_LAYOUT, phi::MemcpyD2HMultiIOKernel, - ALL_DTYPE) {} + ALL_DTYPE) { + kernel->OutputAt(0).SetBackend(phi::Backend::CPU); +} PD_REGISTER_GENERAL_KERNEL( memcpy, GPU, ALL_LAYOUT, phi::MemcpyKernel, ALL_DTYPE) { @@ -196,12 +204,16 @@ PD_REGISTER_GENERAL_KERNEL(memcpy_d2h, XPU, ALL_LAYOUT, phi::MemcpyD2HKernel, - ALL_DTYPE) {} + ALL_DTYPE) { + kernel->OutputAt(0).SetBackend(phi::Backend::CPU); +} PD_REGISTER_GENERAL_KERNEL(memcpy_d2h_multi_io, XPU, ALL_LAYOUT, phi::MemcpyD2HMultiIOKernel, - ALL_DTYPE) {} + ALL_DTYPE) { + kernel->OutputAt(0).SetBackend(phi::Backend::CPU); +} #endif diff --git a/paddle/phi/kernels/onednn/reduce_sum_grad_kernel.cc b/paddle/phi/kernels/onednn/reduce_sum_grad_kernel.cc index 10b914a2005cd836a540fb06dcda5bc0edd9addb..8967df2f80e3633821b32ab8c98c8855912c4e43 100644 --- a/paddle/phi/kernels/onednn/reduce_sum_grad_kernel.cc +++ b/paddle/phi/kernels/onednn/reduce_sum_grad_kernel.cc @@ -42,4 +42,5 @@ void SumGradKernel(const Context& dev_ctx, PD_REGISTER_KERNEL( sum_grad, OneDNN, ONEDNN, phi::SumGradKernel, float, phi::dtype::bfloat16) { + kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED); } diff --git a/paddle/phi/kernels/xpu/layer_norm_grad_kernel.cc b/paddle/phi/kernels/xpu/layer_norm_grad_kernel.cc index f83b72b79a080957bd85805e1a6ac5e0273244e0..3f99c1ace5176eaec44b0b0aebb2562b60c1065f 100644 --- a/paddle/phi/kernels/xpu/layer_norm_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/layer_norm_grad_kernel.cc @@ -129,4 +129,9 @@ PD_REGISTER_KERNEL(layer_norm_grad, ALL_LAYOUT, phi::LayerNormGradKernel, float, - phi::dtype::float16) {} + phi::dtype::float16) { + if (kernel_key.dtype() == phi::DataType::FLOAT16) { + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + } +} diff --git a/paddle/phi/kernels/xpu/momentum_kernel.cc b/paddle/phi/kernels/xpu/momentum_kernel.cc index 207bfef37f947ae4ae3bb93bad52fe831de840d9..ad9cb2e6ef86ef86d09636554bacdd9801b58b30 100644 --- a/paddle/phi/kernels/xpu/momentum_kernel.cc +++ b/paddle/phi/kernels/xpu/momentum_kernel.cc @@ -69,7 +69,4 @@ PD_REGISTER_KERNEL(momentum, ALL_LAYOUT, phi::MomentumDenseKernel, float, - phi::dtype::float16) { - kernel->OutputAt(1).SetDataType(phi::DataType::UNDEFINED); - kernel->OutputAt(2).SetDataType(phi::DataType::UNDEFINED); -} + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/xpu/reduce_sum_grad_kernel.cc b/paddle/phi/kernels/xpu/reduce_sum_grad_kernel.cc index 0ba67f68bccf3c2d37d280b8266189a35e220563..44be073894d81abf3d748af9fda8408dcf75ada4 100644 --- a/paddle/phi/kernels/xpu/reduce_sum_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/reduce_sum_grad_kernel.cc @@ -71,4 +71,5 @@ void ReduceSumGradKernel(const Context& dev_ctx, } // namespace phi PD_REGISTER_KERNEL(sum_grad, XPU, ALL_LAYOUT, phi::ReduceSumGradKernel, float) { + kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED); } diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 1b881051b041b7e850fcf1cfa9547e2251411248..57ef073e51243298fc7785bf7e36ae233fb263a0 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -913,6 +913,11 @@ if(WITH_DISTRIBUTE) set_tests_properties(test_dist_fleet_raw_program_optimizer_fuse_allreduce PROPERTIES TIMEOUT 60) set_tests_properties(test_dist_dygraph_apis PROPERTIES TIMEOUT 120) + + # 
NOTE(Ruibiao): Remove it after static build is enabled by default. + set_tests_properties( + test_dist_mnist_fp16_allreduce test_dist_mnist_pg + PROPERTIES ENVIRONMENT FLAGS_new_executor_static_build=true) endif() # setting timeout value as 15S @@ -1229,3 +1234,52 @@ set_tests_properties( set_tests_properties( test_cuda_graph_static_mode_error PROPERTIES ENVIRONMENT "FLAGS_CUDA_GRAPH_USE_STANDALONE_EXECUTOR=1") + +# These UTs are to temporarily test static build for standalone_executor, and will be removed after static build is enabled by default. +set(STATIC_BUILD_TESTS + test_adagrad_op + test_adamw_op + test_arg_min_max_op + test_bincount_op + test_decoupled_py_reader + test_fake_quantize_op + test_fetch_lod_tensor_array + test_imperative_optimizer + test_lamb_op + test_layer_norm_op + test_lookup_table_bf16_op + test_lookup_table_v2_op + test_matmul_op + test_matmul_v2_op + test_merged_adam_op + test_momentum_op + test_nce + test_paddle_save_load_binary + test_reduce_op + test_segment_ops + test_sparse_momentum_op + test_sgd_op_bf16 + test_softmax_mask_fuse_upper_triangle_op + test_sparse_conv_op + test_sparse_norm_op + test_sparse_pooling_op + test_tensor_array_to_tensor + test_while_op + test_one_hot_v2_op) + +foreach(STATIC_BUILD_TEST ${STATIC_BUILD_TESTS}) + py_test_modules( + ${STATIC_BUILD_TEST}_static_build MODULES ${STATIC_BUILD_TEST} ENVS + FLAGS_new_executor_static_build=true) +endforeach() + +set_tests_properties(test_decoupled_py_reader_static_build PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_optimizer_static_build PROPERTIES TIMEOUT 250) set_tests_properties(test_matmul_op_static_build PROPERTIES TIMEOUT 120) set_tests_properties(test_matmul_v2_op_static_build PROPERTIES TIMEOUT 120) set_tests_properties(test_layer_norm_op_static_build PROPERTIES TIMEOUT 1500) set_tests_properties(test_paddle_save_load_binary_static_build PROPERTIES TIMEOUT 120) set_tests_properties(test_reduce_op_static_build PROPERTIES TIMEOUT 500) diff --git a/python/paddle/fluid/tests/unittests/standalone_executor/CMakeLists.txt b/python/paddle/fluid/tests/unittests/standalone_executor/CMakeLists.txt index d6a1fa1c9be541fea4a5c07a8bddbee4c688d91c..2105cee7c478935b98af1e82f778a9722a717716 100644 --- a/python/paddle/fluid/tests/unittests/standalone_executor/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/standalone_executor/CMakeLists.txt @@ -24,6 +24,18 @@ py_test_modules( test_standalone_executor_stats MODULES test_standalone_executor ENVS FLAGS_host_trace_level=10 FLAGS_static_executor_perfstat_filepath=./perfstat) +# These UTs are to temporarily test static build for standalone_executor, and will be removed after static build is enabled by default.
+set(STATIC_BUILD_TESTS + test_standalone_controlflow test_standalone_cuda_graph_multi_stream + test_standalone_custom_stream test_standalone_executor + test_standalone_multiply_write) + +foreach(STATIC_BUILD_TEST ${STATIC_BUILD_TESTS}) + py_test_modules( + ${STATIC_BUILD_TEST}_static_build MODULES ${STATIC_BUILD_TEST} ENVS + FLAGS_new_executor_static_build=true) +endforeach() + set_tests_properties(test_standalone_cross_step_overlap PROPERTIES TIMEOUT 30) set_tests_properties(test_standalone_executor_aot_choose_kernel PROPERTIES TIMEOUT 60) diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 000e2955e464812a10d7899b5e418bd23ecdc25b..a4730bdccc1ac23f6437f634da49ca763d7106d6 100755 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -1705,6 +1705,7 @@ class TestDistBase(unittest.TestCase): "http_proxy": "", "NCCL_P2P_DISABLE": "1", "NCCL_SHM_DISABLE": "1", + "FLAGS_new_executor_static_build": "1", } if check_error_log: diff --git a/python/paddle/fluid/tests/unittests/test_softmax_mask_fuse_upper_triangle_op.py b/python/paddle/fluid/tests/unittests/test_softmax_mask_fuse_upper_triangle_op.py index b4811d926d5c6319c5b29b8e3d2aa9c2432ef22a..82dbaaf0e78c460e60bbcce660fd77bbfdd13898 100644 --- a/python/paddle/fluid/tests/unittests/test_softmax_mask_fuse_upper_triangle_op.py +++ b/python/paddle/fluid/tests/unittests/test_softmax_mask_fuse_upper_triangle_op.py @@ -69,13 +69,13 @@ class TestSoftmaxMaskFuseOp1(OpTest): def test_check_output(self): try: self.check_output_with_place(core.CPUPlace()) - except NotImplementedError: + except (NotImplementedError, RuntimeError): pass def test_check_grad(self): try: self.check_grad_with_place(core.CPUPlace(), ["X"], "Out") - except NotImplementedError: + except (NotImplementedError, RuntimeError): pass diff --git a/test/custom_op/CMakeLists.txt b/test/custom_op/CMakeLists.txt index e0538d417a6920fddabf11fa57d306e13969e5df..d7f32625db4abb0649329a21684da86f761ecaf9 100644 --- a/test/custom_op/CMakeLists.txt +++ b/test/custom_op/CMakeLists.txt @@ -49,6 +49,9 @@ if(WITH_TESTING) py_test(test_multi_out_jit SRCS test_multi_out_jit.py) py_test(test_custom_attrs_jit SRCS test_custom_attrs_jit.py) py_test(test_custom_concat SRCS test_custom_concat.py) + set_tests_properties( + test_custom_concat PROPERTIES ENVIRONMENT + FLAGS_new_executor_static_build=true) py_test(test_custom_conj SRCS test_custom_conj.py) py_test(test_custom_linear SRCS test_custom_linear.py) py_test(test_custom_simple_slice SRCS test_custom_simple_slice.py) diff --git a/test/mkldnn/CMakeLists.txt b/test/mkldnn/CMakeLists.txt index 50062d69bc8c920446d756d2650f01042ae47731..d3da5f2897da39edee1fe2d634ca6a6addbad23d 100644 --- a/test/mkldnn/CMakeLists.txt +++ b/test/mkldnn/CMakeLists.txt @@ -14,6 +14,14 @@ endif() foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach() + +# NOTE(Ruibiao): Remove it after static build is enabled by default.
+if(WITH_MKLDNN AND NOT WIN32) + py_test_modules( + test_dequantize_mkldnn_op_static_build MODULES test_dequantize_mkldnn_op + ENVS FLAGS_new_executor_static_build=true) +endif() + set_tests_properties(test_concat_mkldnn_op PROPERTIES TIMEOUT 120) set_tests_properties(test_conv3d_mkldnn_op PROPERTIES TIMEOUT 120) if(WITH_MKLDNN AND NOT WIN32) diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh index e0d4a6e062383dbefb2a89487d154656018008b3..39ece6a3b13466ed9e1f2d69deffc52b1574215b 100644 --- a/tools/check_file_diff_approvals.sh +++ b/tools/check_file_diff_approvals.sh @@ -306,6 +306,12 @@ if [ "${HAS_MODIFIED_SETUP}" != "" ] || ([ "${HAS_MODIFIED_SETUP_IN}" != "" ] && check_approval 1 risemeup1 zhangbo9674 fi +HAS_MODIFIED_STATIC_BUILD=`git diff --name-only upstream/$BRANCH | grep "new_executor/interpreter/static_build.cc" || true` +if [ "${HAS_MODIFIED_STATIC_BUILD}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then + echo_line="You must have one RD (From00 or zhiqiu) approval for file changes in new_executor/interpreter/static_build.cc.\n" + check_approval 1 From00 zhiqiu +fi + ALL_PADDLE_ENFORCE=`git diff -U0 upstream/$BRANCH |grep "^+" |grep -zoE "PADDLE_ENFORCE\(.[^,\);]+.[^;]*\);\s" || true` if [ "${ALL_PADDLE_ENFORCE}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then echo_line="PADDLE_ENFORCE is not recommended. Please use PADDLE_ENFORCE_EQ/NE/GT/GE/LT/LE or PADDLE_ENFORCE_NOT_NULL or PADDLE_ENFORCE_GPU_SUCCESS instead, see [ https://github.com/PaddlePaddle/Paddle/wiki/PADDLE_ENFORCE-Rewriting-Specification ] for details.\nYou must have one RD (chenwhql (Recommend), luotao1 (Recommend) or lanxianghit) approval for the usage (either add or delete) of PADDLE_ENFORCE.\n${ALL_PADDLE_ENFORCE}\n" diff --git a/tools/windows/run_unittests.sh b/tools/windows/run_unittests.sh index 20c84c778d8024a6e939e3c5b66905b4458fab8b..87b32b12932e0afd8a87dcb45ec9a2e7fdeabf55 100644 --- a/tools/windows/run_unittests.sh +++ b/tools/windows/run_unittests.sh @@ -20,6 +20,7 @@ disable_wingpu_test="^test_model$|\ ^test_add_reader_dependency$|\ ^test_add_reader_dependency_for_interpretercore$|\ ^test_decoupled_py_reader$|\ +^test_decoupled_py_reader_static_build$|\ ^test_generator_dataloader$|\ ^test_parallel_dygraph_sync_batch_norm$|\ ^test_py_reader_using_executor$|\ @@ -103,6 +104,7 @@ disable_win_inference_test="^trt_quant_int8_yolov3_r50_test$|\ ^test_conv3d_transpose_part2_op$|\ ^test_deform_conv2d$|\ ^test_matmul_op$|\ +^test_matmul_op_static_build$|\ ^test_basic_api_transformation$|\ ^test_deformable_conv_op$|\ ^test_variable$|\ @@ -153,6 +155,7 @@ disable_win_inference_test="^trt_quant_int8_yolov3_r50_test$|\ ^test_add_reader_dependency_for_interpretercore$|\ ^test_compat$|\ ^test_decoupled_py_reader$|\ +^test_decoupled_py_reader_static_build$|\ ^test_generator_dataloader$|\ ^test_py_reader_using_executor$|\ ^test_dataloader_keep_order$|\ @@ -223,6 +226,7 @@ long_time_test="^test_gru_op$|\ ^test_imperative_lod_tensor_to_selected_rows$|\ ^test_imperative_selected_rows_to_lod_tensor$|\ ^test_layer_norm_op$|\ +^test_layer_norm_op_static_build$|\ ^test_multiclass_nms_op$|\ ^test_nearest_interp_v2_op$|\ ^test_nn_grad$|\