cache scope and place on CinnLaunchContext and pass them to callback (#37983)

cinn_launch_op: cache scope and place on CinnLaunchContext to skip duplicate alloc/free callback construction

cache scope and place on CinnLaunchContext and pass them to callback (#37983)
cinn_launch_op: cache scope and place on CinnLaunchContext to skip duplicate alloc/free callback construction
151c5d74 · CtfGo · GitHub · 9776888a · 151c5d74 · 151c5d74
9 changed files
--- a/paddle/fluid/framework/paddle2cinn/CMakeLists.txt
+++ b/paddle/fluid/framework/paddle2cinn/CMakeLists.txt
@@ -2,7 +2,7 @@ cc_library(cinn_cache_key SRCS cinn_cache_key.cc DEPS boost graph graph_helper l
 cc_library(build_cinn_pass SRCS build_cinn_pass.cc DEPS pass subgraph_detector graph_pattern_detector cinn_compiler errors enforce)
 cc_library(transform_desc SRCS transform_desc.cc DEPS proto_desc cinn)
 cc_library(cinn_graph_symbolization SRCS cinn_graph_symbolization.cc DEPS lod_tensor graph transform_desc cinn)
-cc_library(cinn_compiler SRCS cinn_compiler.cc DEPS framework_proto graph lod_tensor cinn_cache_key cinn_graph_symbolization cinn)
+cc_library(cinn_compiler SRCS cinn_compiler.cc DEPS framework_proto graph lod_tensor cinn_cache_key cinn_graph_symbolization cinn cinn_launch_context)

 if (WITH_TESTING)
  cc_test(cinn_lib_test SRCS cinn_lib_test.cc DEPS cinn)

--- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc
+++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc
@@ -41,6 +41,7 @@
 #include "paddle/fluid/framework/rw_lock.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/inference/analysis/dot.h"
+#include "paddle/fluid/operators/cinn/cinn_launch_context.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/string/string_helper.h"

@@ -217,6 +218,9 @@ std::unique_ptr<CinnCompiledObject> CinnCompiler::CompileGraph(
  *compiled_obj = {std::move(graph_compiler),
                   std::move(compiled_res.runtime_program), scope,
                   symbol.var_model_to_program_map()};
+  compiled_obj->launch_context =
+      std::make_unique<operators::details::CinnLaunchContext>(
+          compiled_obj->paddle2cinn_varmap, compiled_obj->scope);
  return compiled_obj;
 }


--- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.h
+++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.h
@@ -31,6 +31,13 @@
 #include "paddle/fluid/platform/macros.h"

 namespace paddle {
+
+namespace operators {
+namespace details {
+class CinnLaunchContext;
+}
+}
+
 namespace framework {
 namespace paddle2cinn {

@@ -39,6 +46,7 @@ struct CinnCompiledObject {
  std::unique_ptr<::cinn::hlir::framework::Program> runtime_program;
  std::shared_ptr<::cinn::hlir::framework::Scope> scope;
  std::unordered_map<std::string, std::string> paddle2cinn_varmap;
+  std::unique_ptr<operators::details::CinnLaunchContext> launch_context;
 };

 // Entrance to use CINN.

--- a/paddle/fluid/operators/cinn/cinn_launch_context.cc
+++ b/paddle/fluid/operators/cinn/cinn_launch_context.cc
@@ -32,7 +32,30 @@ CinnLaunchContext::CinnLaunchContext(
      [](const auto& name_view) { return std::string(name_view.data()); });
 }

-bool CinnLaunchContext::IsVariableUsed(const std::string& paddle_name) {
+void CinnLaunchContext::UpdateCapturedEnv(const framework::Scope& scope,
+                                          const platform::Place& place) {
+  if (std::addressof(scope) == cached_scope_ &&
+      std::addressof(place) == cached_place_) {
+    VLOG(4) << "Captured scope:" << cached_scope_ << ", place:" << cached_place_
+            << " are not changed";
+    return;
+  }
+  VLOG(4) << "Captured env is update, scope:" << cached_scope_ << "->"
+          << std::addressof(scope) << ", place:" << cached_place_ << "->"
+          << std::addressof(place);  // logged first: members still hold OLD env
+  cached_scope_ = std::addressof(scope);  // overwritten only after logging
+  cached_place_ = std::addressof(place);
+  cached_temp_scope_ = scope.NewTmpScope();
+}
+
+bool CinnLaunchContext::IsArgumentsInitialized() const {
+  // Reports true only when both bookkeeping containers are non-empty:
+  // the held buffers and the name->argument map.
+  const bool has_buffers = !hold_buffers_.empty();
+  return has_buffers && !name2argument_.empty();
+}
+
+bool CinnLaunchContext::IsVariableUsed(const std::string& paddle_name) const {
  return paddle2cinn_varmap_.count(paddle_name) > 0 &&
         cinn_variable_names_.count(paddle2cinn_varmap_.at(paddle_name)) > 0;
 }
@@ -67,85 +90,83 @@ void CinnLaunchContext::CheckTensorEquivalent(const std::string& paddle_name,
  // TODO(CtfGo): check the underlying data type after CINN ready
 }

-void CinnLaunchContext::AssignExternalVariable(const std::string& paddle_name,
-                                               const platform::Place& place,
-                                               LoDTensor* paddle_tensor) {
+void CinnLaunchContext::AssignExternalVariable(const std::string& paddle_name) {
  PADDLE_ENFORCE_EQ(IsVariableUsed(paddle_name), true,
                    platform::errors::InvalidArgument(
                        "Paddle variable(%s) not used by cinn", paddle_name));

  const auto& cinn_name = paddle2cinn_varmap_.at(paddle_name);
+  const auto& paddle_tensor =
+      cached_scope_->GetVar(paddle_name)->Get<LoDTensor>();
  CinnTensor cinn_tensor = GetCinnTensor(cinn_name);
-  if (!paddle_tensor->IsInitialized()) {
-    paddle_tensor->Resize(framework::make_ddim(cinn_tensor->shape().data()));
+  if (paddle_tensor.IsInitialized()) {
+    CheckTensorEquivalent(paddle_name, paddle_tensor, cinn_tensor);
  }
-  CheckTensorEquivalent(paddle_name, *paddle_tensor, cinn_tensor);
-  return SetArgument(cinn_name, place, /* free_mem_callback = */ false,
-                     paddle_tensor);
+
+  auto cinn_buffer = std::make_unique<cinn_buffer_t>();
+  // assign dimensions and alloc/free callback of cinn_buffer_t
+  cinn_buffer->resize(cinn_tensor->shape().data().data(),
+                      cinn_tensor->shape().data().size());
+  cinn_buffer->external_malloc = new std::function<int(void*, cinn_buffer_t*)>(
+      [this, paddle_name](void* ctx, cinn_buffer_t* buffer) {
+        auto* tensor =
+            cached_scope_->GetVar(paddle_name)->GetMutable<LoDTensor>();
+        tensor->Resize(framework::DDim(buffer->dims, buffer->dimensions));
+        buffer->memory = reinterpret_cast<uint8_t*>(
+            tensor->mutable_data<float>(*cached_place_));
+        return 0;
+      });
+
+  // external variables will be recycled by global gc, so do nothing here
+  cinn_buffer->external_free = new std::function<int(void*, cinn_buffer_t*)>(
+      [](void* ctx, cinn_buffer_t* buffer) {
+        // Do nothing
+        return 0;
+      });
+
+  return SetArgument(cinn_name, std::move(cinn_buffer));
 }

-void CinnLaunchContext::AssignInternalVariable(const std::string& cinn_name,
-                                               const platform::Place& place,
-                                               LoDTensor* paddle_tensor) {
+void CinnLaunchContext::AssignInternalVariable(const std::string& cinn_name) {
  PADDLE_ENFORCE_GT(cinn_variable_names_.count(cinn_name), 0,
                    platform::errors::InvalidArgument(
                        "Variable(%s) not found in cinn socpe.", cinn_name));
  CinnTensor cinn_tensor = GetCinnTensor(cinn_name);
-  if (!paddle_tensor->IsInitialized()) {
-    paddle_tensor->Resize(framework::make_ddim(cinn_tensor->shape().data()));
-  }
-  CheckTensorEquivalent(cinn_name, *paddle_tensor, cinn_tensor);
-  return SetArgument(cinn_name, place, /* free_mem_callback = */ true,
-                     paddle_tensor);
-}
-
-std::unique_ptr<cinn_buffer_t> CinnLaunchContext::ShareTensorWithCinnBuffer(
-    const platform::Place& place, bool free_mem_callback, LoDTensor* tensor) {
-  // convert paddle dimensions array to cinn format
-  std::vector<cinn_dimension_t> cinn_dims(tensor->dims().size());
-  for (auto i = 0; i < tensor->dims().size(); ++i) {
-    cinn_dims[i] = static_cast<cinn_dimension_t>(tensor->dims().at(i));
-  }
-
  auto cinn_buffer = std::make_unique<cinn_buffer_t>();
-  // assign size and memory
-  cinn_buffer->resize(cinn_dims.data(), cinn_dims.size());
+  // assign dimensions and alloc/free callback of cinn_buffer_t
+  cinn_buffer->resize(cinn_tensor->shape().data().data(),
+                      cinn_tensor->shape().data().size());

  cinn_buffer->external_malloc = new std::function<int(void*, cinn_buffer_t*)>(
-      [place, tensor](void* ctx, cinn_buffer_t* buffer) {
-        buffer->memory =
-            reinterpret_cast<uint8_t*>(tensor->mutable_data<float>(place));
+      [this, cinn_name](void* ctx, cinn_buffer_t* buffer) {
+        auto* tensor =
+            cached_temp_scope_->Var(cinn_name)->GetMutable<LoDTensor>();
+        tensor->Resize(framework::DDim(buffer->dims, buffer->dimensions));
+        buffer->memory = reinterpret_cast<uint8_t*>(
+            tensor->mutable_data<float>(*cached_place_));
        return 0;
      });

-  if (free_mem_callback) {
-    cinn_buffer->external_free = new std::function<int(void*, cinn_buffer_t*)>(
-        [tensor](void* ctx, cinn_buffer_t* buffer) {
-          tensor->clear();
-          return 0;
-        });
-    return cinn_buffer;
-  }
-
+  // internal variables should release its buffer immediately
+  // if no instruction use it
  cinn_buffer->external_free = new std::function<int(void*, cinn_buffer_t*)>(
-      [](void* ctx, cinn_buffer_t* buffer) {
-        // Do nothing
+      [this, cinn_name](void* ctx, cinn_buffer_t* buffer) {
+        auto* tensor =
+            cached_temp_scope_->GetVar(cinn_name)->GetMutable<LoDTensor>();
+        tensor->clear();
        return 0;
      });
-  return cinn_buffer;
+  return SetArgument(cinn_name, std::move(cinn_buffer));
 }

 void CinnLaunchContext::SetArgument(const std::string& cinn_name,
-                                    const platform::Place& place,
-                                    bool free_mem_callback,
-                                    LoDTensor* paddle_tensor) {
-  auto buffer =
-      ShareTensorWithCinnBuffer(place, free_mem_callback, paddle_tensor);
+                                    std::unique_ptr<cinn_buffer_t>&& buffer) {
+  VLOG(4) << "SetArgument-" << name2argument_.size() << ": name(" << cinn_name
+          << "), dims(" << framework::DDim(buffer->dims, buffer->dimensions)
+          << ").";
+
  name2argument_.emplace(cinn_name, buffer.get());
  hold_buffers_.emplace_back(std::move(buffer));
-  VLOG(4) << "SetArgument-" << name2argument_.size() << ": "
-          << "name(" << cinn_name << "), dims(" << paddle_tensor->dims()
-          << ").";
 }

 const std::map<std::string, cinn_pod_value_t>&

--- a/paddle/fluid/operators/cinn/cinn_launch_context.h
+++ b/paddle/fluid/operators/cinn/cinn_launch_context.h
@@ -24,7 +24,7 @@
 #include "cinn/runtime/cinn_runtime.h"
 #include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/framework/scope.h"

 namespace paddle {
 namespace operators {
@@ -40,16 +40,22 @@ class CinnLaunchContext {
      const std::unordered_map<std::string, std::string>& paddle2cinn_varmap,
      const std::shared_ptr<CinnScope>& cinn_scope);

+  // explicitly update several environment variables captured
+  // by callback of execution arguments
+  void UpdateCapturedEnv(const framework::Scope& scope,
+                         const platform::Place& place);
+
+  // Return whether execution arguments has been initialized
+  bool IsArgumentsInitialized() const;
+
  // Return whether a Paddle variable used on compiled kernels
-  bool IsVariableUsed(const std::string& var_name);
+  bool IsVariableUsed(const std::string& paddle_name) const;

  // Assign tensor buffer to input or output variables
-  void AssignExternalVariable(const std::string& var_name,
-                              const platform::Place& place, LoDTensor* tensor);
+  void AssignExternalVariable(const std::string& paddle_name);

  // Assign tensor buffer to internal variables
-  void AssignInternalVariable(const std::string& var_name,
-                              const platform::Place& place, LoDTensor* tensor);
+  void AssignInternalVariable(const std::string& cinn_name);

  // Extract internal variable names from CinnScope
  // by excluding used input and output variables
@@ -58,10 +64,6 @@ class CinnLaunchContext {
  // Finalize all execution arguments and return them
  const std::map<std::string, cinn_pod_value_t>& FinalizeArguments() const;

-  std::vector<std::unique_ptr<cinn_buffer_t>> HandoverBuffers() {
-    return std::move(hold_buffers_);
-  }
-
 private:
  // Get CinnTensor with CINN variable name
  CinnTensor GetCinnTensor(const std::string& var_name);
@@ -72,16 +74,15 @@ class CinnLaunchContext {
                             const LoDTensor& paddle_tensor,
                             const CinnTensor& cinn_tensor);

-  // Share the buffer of a Paddle tensor to CINN by delivering memory address
-  // to a cinn_buffer_t object
-  std::unique_ptr<cinn_buffer_t> ShareTensorWithCinnBuffer(
-      const platform::Place& place, bool free_mem_callback, LoDTensor* tensor);
-
-  // Set an argument with (cinn name)->(paddle tensor) pair
-  void SetArgument(const std::string& cinn_name, const platform::Place& place,
-                   bool free_mem_callback, LoDTensor* paddle_tensor);
+  // Set an argument with (cinn name)->(cinn_buffer_t) pair
+  void SetArgument(const std::string& cinn_name,
+                   std::unique_ptr<cinn_buffer_t>&& buffer);

 private:
+  const framework::Scope* cached_scope_ = nullptr;
+  const platform::Place* cached_place_ = nullptr;
+  std::unique_ptr<framework::Scope> cached_temp_scope_ = nullptr;
+
  // a variable name map from paddle to cinn
  const std::unordered_map<std::string, std::string>& paddle2cinn_varmap_;
  // the variable scope of cinn

--- a/paddle/fluid/operators/cinn/cinn_launch_context_test.cc
+++ b/paddle/fluid/operators/cinn/cinn_launch_context_test.cc
@@ -45,81 +45,86 @@ std::unique_ptr<CinnLaunchContext> CreateDefaultLaunchContext() {
  return std::make_unique<CinnLaunchContext>(paddle2cinn_varmap, cinn_scope);
 }

-TEST(CinnLaunchContextTest, TestIsVariableUsed) {
+TEST(CinnLaunchContextTest, TestBasic) {
  auto launch_context = CreateDefaultLaunchContext();
-
+  // test IsVariableUsed
  ASSERT_EQ(launch_context->IsVariableUsed("var1"), true);
  ASSERT_EQ(launch_context->IsVariableUsed("var4"), false);
-}
-
-TEST(CinnLaunchContextTest, TestGetInternalVariableNames) {
-  auto launch_context = CreateDefaultLaunchContext();
-  auto internal_variable_names = launch_context->GetInternalVariableNames();
-  ASSERT_EQ(internal_variable_names.size(), 3);
-  EXPECT_NE(internal_variable_names.find("cinn_var2"),
-            internal_variable_names.end());
+  // test UpdateCapturedEnv
+  platform::CPUPlace place;
+  framework::Scope scope;
+  ASSERT_NO_THROW(launch_context->UpdateCapturedEnv(scope, place));
+  // test IsArgumentsInitialized
+  ASSERT_FALSE(launch_context->IsArgumentsInitialized());
 }

 TEST(CinnLaunchContextTest, TestCheckTensorEquivalent) {
-  auto launch_context = CreateDefaultLaunchContext();
  platform::CPUPlace place;
  framework::Scope scope;
+  auto launch_context = CreateDefaultLaunchContext();
+  launch_context->UpdateCapturedEnv(scope, place);
  auto* tensor1 = scope.Var("var1")->GetMutable<LoDTensor>();

  // CheckTensorEquivalent: tensor dimension not equivalent
  tensor1->mutable_data<float>(framework::make_ddim({3, 5}), place);
-  ASSERT_THROW(launch_context->AssignExternalVariable("var1", place, tensor1),
+  ASSERT_THROW(launch_context->AssignExternalVariable("var1"),
               paddle::platform::EnforceNotMet);
 }

 TEST(CinnLaunchContextTest, TestAssignVariablePreCondition) {
-  auto launch_context = CreateDefaultLaunchContext();
  platform::CPUPlace place;
  framework::Scope scope;
+  auto launch_context = CreateDefaultLaunchContext();
+  launch_context->UpdateCapturedEnv(scope, place);
  auto* tensor4 = scope.Var("var4")->GetMutable<LoDTensor>();

  // not used
-  ASSERT_THROW(launch_context->AssignExternalVariable("var4", place, tensor4),
+  ASSERT_THROW(launch_context->AssignExternalVariable("var4"),
               paddle::platform::EnforceNotMet);
  // not found
-  ASSERT_THROW(
-      launch_context->AssignExternalVariable("cinn_var4", place, tensor4),
-      paddle::platform::EnforceNotMet);
+  ASSERT_THROW(launch_context->AssignInternalVariable("cinn_var4"),
+               paddle::platform::EnforceNotMet);
 }

 TEST(CinnLaunchContextTest, TestSetArgument) {
+  platform::CPUPlace cpu_place;
+  platform::Place place(cpu_place);
+  framework::Scope scope;
  auto launch_context = CreateDefaultLaunchContext();
+  launch_context->UpdateCapturedEnv(scope, place);

-  platform::CPUPlace place;
-  framework::Scope scope;
+  // assign external variables
  auto* tensor1 = scope.Var("var1")->GetMutable<LoDTensor>();
  float* data1 =
      tensor1->mutable_data<float>(framework::make_ddim({3, 4}), place);
  data1[0] = 9.99f;
  data1[10] = 19.99f;
+  ASSERT_NO_THROW(launch_context->AssignExternalVariable("var1"));

-  // assign external variable
-  ASSERT_NO_THROW(
-      launch_context->AssignExternalVariable("var1", place, tensor1));
-  auto* tensor2 = scope.Var("var2")->GetMutable<LoDTensor>();
-  tensor2->mutable_data<float>(framework::make_ddim({6, 7, 8}), place);
-  ASSERT_NO_THROW(
-      launch_context->AssignInternalVariable("cinn_var2", place, tensor2));
-  // FinalizeArguments not missed check
-  ASSERT_THROW(launch_context->FinalizeArguments(),
-               paddle::platform::EnforceNotMet);
  auto* tensor3 = scope.Var("var3")->GetMutable<LoDTensor>();
  tensor3->mutable_data<float>(framework::make_ddim({10, 16}), place);
-  ASSERT_NO_THROW(
-      launch_context->AssignExternalVariable("var3", place, tensor3));
+  ASSERT_NO_THROW(launch_context->AssignExternalVariable("var3"));
+
+  // FinalizeArguments missed check
+  ASSERT_THROW(launch_context->FinalizeArguments(),
+               paddle::platform::EnforceNotMet);
+  // test get internal variables
+  auto internal_variable_names = launch_context->GetInternalVariableNames();
+  ASSERT_EQ(internal_variable_names.size(), 1);
+  EXPECT_EQ(*internal_variable_names.begin(), "cinn_var2");

+  auto* tensor2 = scope.Var("var2")->GetMutable<LoDTensor>();
+  tensor2->mutable_data<float>(framework::make_ddim({6, 7, 8}), place);
+  ASSERT_NO_THROW(launch_context->AssignInternalVariable("cinn_var2"));
+
+  // check argument is set correctly and alloc/free callbacks work well
  auto name2argument = launch_context->FinalizeArguments();
  ASSERT_EQ(name2argument.size(), 3);
  ASSERT_EQ(name2argument.count("cinn_var1"), 1);
-  // check ShareTensorWithCinnBuffer
+  ASSERT_TRUE(launch_context->IsArgumentsInitialized());
+
  auto* cinn_buffer =
      static_cast<cinn_buffer_t*>(name2argument.at("cinn_var1"));
-
  ASSERT_EQ(cinn_buffer->memory, nullptr);
  cinn_buffer->external_malloc->operator()(nullptr, cinn_buffer);
  ASSERT_NE(cinn_buffer->memory, nullptr);

--- a/paddle/fluid/operators/cinn/cinn_launch_op.cu.cc
+++ b/paddle/fluid/operators/cinn/cinn_launch_op.cu.cc
@@ -31,26 +31,6 @@ namespace operators {
 namespace details {

 #ifdef PADDLE_WITH_CUDA
-void CUDART_CB ReleaseScope(void* data) {
-  auto* temp_scope = static_cast<framework::Scope*>(data);
-  delete temp_scope;
-}
-
-void CUDART_CB ReleaseBuffers(void* data) {
-  auto* buffers =
-      static_cast<std::vector<std::unique_ptr<cinn_buffer_t>>*>(data);
-  delete buffers;
-}
-
-template <>
-void ReleaseResource<platform::CUDADeviceContext>(
-    const std::vector<void*>& resources, void* stream) {
-  PADDLE_ENFORCE_GPU_SUCCESS(cudaLaunchHostFunc(
-      static_cast<gpuStream_t>(stream), ReleaseScope, resources[0]));
-  PADDLE_ENFORCE_GPU_SUCCESS(cudaLaunchHostFunc(
-      static_cast<gpuStream_t>(stream), ReleaseBuffers, resources[1]));
-}
-
 template <>
 void* GetStream<platform::CUDADeviceContext>(
    const framework::ExecutionContext& ctx) {

--- a/paddle/fluid/operators/cinn/cinn_launch_op.h
+++ b/paddle/fluid/operators/cinn/cinn_launch_op.h
@@ -56,25 +56,12 @@ void LaunchCinnExecution(const CinnCompiledObject& compiled_obj,
 // Set cinn FLAGS (such as FLAGS_cinn_cudnn_deterministic) with paddle's FLAGS.
 void SetCinnRuntimeFlags();

-template <typename DeviceContext>
-void ReleaseResource(const std::vector<void*>& resources, void* stream) {
-  auto* temp_scope = static_cast<framework::Scope*>(resources[0]);
-  auto* buffers =
-      static_cast<std::vector<std::unique_ptr<cinn_buffer_t>>*>(resources[1]);
-  delete temp_scope;
-  delete buffers;
-}
-
 template <typename DeviceContext>
 void* GetStream(const framework::ExecutionContext& ctx) {
  return nullptr;
 }

 #ifdef PADDLE_WITH_CUDA
-template <>
-void ReleaseResource<platform::CUDADeviceContext>(
-    const std::vector<void*>& resources, void* stream);
-
 template <>
 void* GetStream<platform::CUDADeviceContext>(
    const framework::ExecutionContext& ctx);
@@ -116,56 +103,54 @@ class CinnLaunchOpKernel : public framework::OpKernel<T> {
        compilation_key, inputs_name2tensor, target, stream);
    details::DebugCinnCompiledResult(cinn_compiled_object);

-    auto launch_context = std::make_unique<details::CinnLaunchContext>(
-        cinn_compiled_object.paddle2cinn_varmap, cinn_compiled_object.scope);
-
+    const auto& launch_context = cinn_compiled_object.launch_context;
    // Step 3. Prepare arguments needed for the compiled executable program.
-    VLOG(4) << "CinnLaunchOp prepare arguments";
-
-    // 3.1 Prepare input variables: tensors of input variables have
-    //     been initialized before graph compiled, just check the
-    //     equiality between tensors of paddle and cinn.
-    for (const auto& var_name : input_variable_names) {
-      if (!launch_context->IsVariableUsed(var_name)) {
-        // some input variables don't need for cinn because they are
-        // eliminated by optimized passes or some cinn operators use
-        // less variables
-        VLOG(4) << "Input variable(" << var_name << ") not used by cinn";
-        continue;
+    launch_context->UpdateCapturedEnv(scope, place);
+    if (!launch_context->IsArgumentsInitialized()) {
+      VLOG(4) << "CinnLaunchOp prepare arguments";
+
+      // 3.1 Prepare input variables: tensors of input variables have
+      //     been initialized before graph compiled, just check the
+      //     equiality between tensors of paddle and cinn.
+      for (const auto& var_name : input_variable_names) {
+        if (!launch_context->IsVariableUsed(var_name)) {
+          // some input variables don't need for cinn because they are
+          // eliminated by optimized passes or some cinn operators use
+          // less variables
+          VLOG(4) << "Input variable(" << var_name << ") not used by cinn";
+          continue;
+        }
+
+        launch_context->AssignExternalVariable(var_name);
      }

-      launch_context->AssignExternalVariable(
-          var_name, place, scope.GetVar(var_name)->GetMutable<LoDTensor>());
-    }
-
-    // 3.2 Prepare output variables: all output variables should
-    //     be initialized and allocated buffer before
-    //     the runtime program start execution, the compilation result
-    //     includes details of their buffer assginment and we use that to
-    //     allocate space in Paddle. For those variables allocated yet,
-    //     like persistable parameters, just check the equiality between
-    //     Paddle allocation and CINN buffer assginment.
-    auto output_variable_names = ctx.OutputNames(kOutputs);
-    for (const auto var_name : output_variable_names) {
-      PADDLE_ENFORCE_EQ(launch_context->IsVariableUsed(var_name), true,
-                        platform::errors::InvalidArgument(
-                            "Output variable(%s) not used by cinn", var_name));
-
-      auto* tensor = scope.GetVar(var_name)->GetMutable<LoDTensor>();
-      launch_context->AssignExternalVariable(var_name, place, tensor);
-    }
+      // 3.2 Prepare output variables: all output variables should
+      //     be initialized and allocated buffer before
+      //     the runtime program start execution, the compilation result
+      //     includes details of their buffer assginment and we use that to
+      //     allocate space in Paddle. For those variables allocated yet,
+      //     like persistable parameters, just check the equiality between
+      //     Paddle allocation and CINN buffer assginment.
+      auto output_variable_names = ctx.OutputNames(kOutputs);
+      for (const auto var_name : output_variable_names) {
+        PADDLE_ENFORCE_EQ(
+            launch_context->IsVariableUsed(var_name), true,
+            platform::errors::InvalidArgument(
+                "Output variable(%s) not used by cinn", var_name));
+
+        launch_context->AssignExternalVariable(var_name);
+      }

-    // 3.3 Prepare internal or temporary variables: Create a temporary
-    //     scope to keep internal variables within graph or temporary
-    //     variables needed by the compiled runtime program in addition.
-    //     Here we directly use the names from CinnScope as Paddle variable
-    //     names, because they will not be used outside the graph
-    //     and should be destructed after computation finished.
-    auto internal_variable_names = launch_context->GetInternalVariableNames();
-    framework::Scope* temp_scope = scope.NewTmpScope().release();
-    for (const auto& var_name : internal_variable_names) {
-      auto* tensor = temp_scope->Var(var_name)->GetMutable<LoDTensor>();
-      launch_context->AssignInternalVariable(var_name, place, tensor);
+      // 3.3 Prepare internal or temporary variables: Create a temporary
+      //     scope to keep internal variables within graph or temporary
+      //     variables needed by the compiled runtime program in addition.
+      //     Here we directly use the names from CinnScope as Paddle variable
+      //     names, because they will not be used outside the graph
+      //     and should be destructed after computation finished.
+      auto internal_variable_names = launch_context->GetInternalVariableNames();
+      for (const auto& var_name : internal_variable_names) {
+        launch_context->AssignInternalVariable(var_name);
+      }
    }

    // Step 4. Set CINN runtime FLAGS, such as FLAGS_cinn_cudnn_deterministic.
@@ -175,12 +160,6 @@ class CinnLaunchOpKernel : public framework::OpKernel<T> {
    VLOG(4) << "Run Cinn compiled executable program with stream: " << stream;
    details::LaunchCinnExecution(cinn_compiled_object, *launch_context, stream);
    VLOG(4) << "CinnLaunchOp launch execution done.";
-
-    // Step 6. Release some resources, such as `temp_scope` and cinn_buffers.
-    auto* buffers_holder = new std::vector<std::unique_ptr<cinn_buffer_t>>{
-        launch_context->HandoverBuffers()};
-    details::ReleaseResource<DeviceContext>({temp_scope, buffers_holder},
-                                            stream);
  }
 };


--- a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc
+++ b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc
@@ -130,8 +130,9 @@ TEST(CinnLaunchOpTest, TestElementwiseAddPass) {
    scope.Var(test_out_name)->GetMutable<LoDTensor>();
    scope.Var(expected_out_name)->GetMutable<LoDTensor>();

-    cinn_launch_op->Run(scope, place);
-    elementwise_add_op->Run(scope, place);
+    platform::Place run_place(place);
+    cinn_launch_op->Run(scope, run_place);
+    elementwise_add_op->Run(scope, run_place);

    LoDTensor test_out, expected_out;
    TensorCopySync(scope.Var(test_out_name)->Get<LoDTensor>(),