Cherry pick for standalone executor (#42281)

* [cherry-pick] Support cinn_launch op in standalone executor (#42046) * Support cinn_launch OP in standalone executor * Remove some redundant code * [cherry-pick] Do not reset default stream for StreamSafeCUDAAllocator (#42149)

Cherry pick for standalone executor (#42281)
* [cherry-pick] Support cinn_launch op in standalone executor (#42046) * Support cinn_launch OP in standalone executor * Remove some redundant code * [cherry-pick] Do not reset default stream for StreamSafeCUDAAllocator (#42149)
6e7e4501 · Ruibiao Chen · GitHub · 9bc423b1 · 6e7e4501 · 6e7e4501
5 changed file
--- a/paddle/fluid/framework/new_executor/interpretercore.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore.cc
@@ -429,8 +429,17 @@ void InterpreterCore::BuildAndCacheInstructionCtx(Instruction* instr_node) {
    }
    outs_map.emplace(var_name_item.first, std::move(out_vars));
  }
+
  // set runtime_ctx and infershape_ctx_
-  instr_node->ResetContext(ins_map, outs_map);
+  if (instr_node->OpBase()->Type() == "cinn_launch") {  // OP use scope in
+                                                        // kernel
+    Scope* local_scope = create_local_scope_
+                             ? global_scope_->GetMutableLocalScope()
+                             : global_scope_->GetMutableScope();
+    instr_node->ResetContextWithScope(ins_map, outs_map, *local_scope);
+  } else {
+    instr_node->ResetContext(ins_map, outs_map);
+  }
 }

 void InterpreterCore::BuildSkipShareLoDInfo() {

--- a/paddle/fluid/framework/new_executor/interpretercore_util.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore_util.cc
@@ -393,8 +393,19 @@ void build_op_func_list(const platform::Place& place,
          platform::DeviceContextPool::Instance();
      auto* dev_ctx = pool.Get(place);
      Scope scope;
+      Scope* runtime_scope = &scope;
+      // NOTE(Ruibiao): We do not encourage directly using scope in OP kernel.
+      // But some OPs do have such behavior (e.g., cinn_launch OP). Here special
+      // treatment for them.
+      if (op_with_kernel->Type() == "cinn_launch") {
+        VLOG(6) << "OP(" << op_with_kernel->Type() << ") use scope in kernel, "
+                                                      "so pass a real scope to "
+                                                      "ExecutionContext";
+        runtime_scope = local_scope;
+      }
+
      auto expected_kernel_key = op_with_kernel->GetExpectedKernelType(
-          ExecutionContext(*op, scope, *dev_ctx, runtime_context));
+          ExecutionContext(*op, *runtime_scope, *dev_ctx, runtime_context));
      op_with_kernel->ResetKernelType(new OpKernelType(expected_kernel_key));

      // change device by the device_guard()
@@ -442,8 +453,8 @@ void build_op_func_list(const platform::Place& place,
        op_with_kernel->Info().infer_shape_(&infer_shape_ctx);
      }

-      auto exec_ctx =
-          ExecutionContext(*op_with_kernel, scope, *dev_ctx, runtime_context);
+      auto exec_ctx = ExecutionContext(*op_with_kernel, *runtime_scope,
+                                       *dev_ctx, runtime_context);

      auto run_phi_kernel = false;
      if (phi::KernelFactory::Instance().HasCompatiblePhiKernel(

--- a/paddle/fluid/framework/new_executor/new_executor_defs.cc
+++ b/paddle/fluid/framework/new_executor/new_executor_defs.cc
@@ -765,6 +765,16 @@ void Instruction::ResetContext(const VariableValueMap& in_vars,
      new ExecutionContext(*OpBase(), scope_, dev_ctx_, *runtime_ctx_.get()));
 }

+void Instruction::ResetContextWithScope(const VariableValueMap& in_vars,
+                                        const VariableValueMap& out_vars,
+                                        const framework::Scope& scope) {
+  runtime_ctx_.reset(new RuntimeContext(in_vars, out_vars));
+  infershape_ctx_.reset(
+      new InterpretercoreInferShapeContext(*OpBase(), *runtime_ctx_.get()));
+  execution_ctx_.reset(
+      new ExecutionContext(*OpBase(), scope, dev_ctx_, *runtime_ctx_.get()));
+}
+
 std::shared_ptr<RuntimeContext> Instruction::InnerRuntimeContext() const {
  return runtime_ctx_;
 }

--- a/paddle/fluid/framework/new_executor/new_executor_defs.h
+++ b/paddle/fluid/framework/new_executor/new_executor_defs.h
@@ -351,6 +351,10 @@ class Instruction {
  void ResetContext(const VariableValueMap& in_vars,
                    const VariableValueMap& out_vars);

+  void ResetContextWithScope(const VariableValueMap& in_vars,
+                             const VariableValueMap& out_vars,
+                             const framework::Scope& scope);
+
  std::shared_ptr<RuntimeContext> InnerRuntimeContext() const;

  std::shared_ptr<InterpretercoreInferShapeContext> InnerInferShapeContext()

--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -415,6 +415,23 @@ class AllocatorFacadePrivate {
  void SetDefaultStream(const platform::CUDAPlace& place, gpuStream_t stream) {
    const std::shared_ptr<StreamSafeCUDAAllocator>& allocator =
        GetDefaultStreamSafeCUDAAllocator(place);
+
+    // NOTE(Ruibiao): The default stream will be set when the CUDADeviceContext
+    // created. Normally, the DeviceContextPool is a global singleton and one
+    // Place only correspond to one DeviceContext. However, to support
+    // multi-stream scheduling, standalone executor creates two extra
+    // DeviceContextPools for H2D and D2H stream in StreamAnalyzer, which make
+    // one Place correspond to multiple DeviceContext and unexpectedly reset the
+    // default stream in runtime. To avoid this behavior, we do not allow
+    // changing default stream after initially setting.
+    if (allocator->GetDefaultStream() != nullptr) {
+      VLOG(5) << "The default stream for StreamSafeCUDAAllocator("
+              << allocator.get() << ") in " << place << " has been set to "
+              << allocator->GetDefaultStream()
+              << " before, not allow to change now.";
+      return;
+    }
+
    allocator->SetDefaultStream(stream);
    VLOG(8) << "Set default stream to " << stream
            << " for StreamSafeCUDAAllocator(" << allocator.get() << ") in "