Pass the stream created by Paddle to CINN. (#37337)

c249556d · Zhen Wang · GitHub · a4ef88ed · c249556d · c249556d
5 changed files
--- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc
+++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc
@@ -66,7 +66,7 @@ CinnCompiler* CinnCompiler::GetInstance() {
 const CinnCompiledObject& CinnCompiler::Compile(
    const Graph& graph,
    const std::map<std::string, const LoDTensor*>& input_tensors,
-    const Target& target) {
+    const Target& target, void* stream) {
  VLOG(1) << "-- The graph to be compiled is:\n" << VizGraph(graph);
  CinnCacheKey cur_key(graph, input_tensors, target.arch_str());
  bool exist = false;
@@ -77,7 +77,7 @@ const CinnCompiledObject& CinnCompiler::Compile(
  if (!exist) {
    std::int64_t compiled_num = real_compiled_num_.fetch_add(1);
    auto compiled_res =
-        CompileGraph(graph, input_tensors, target, compiled_num);
+        CompileGraph(graph, input_tensors, target, compiled_num, stream);
    AutoWRLock w_guard{&rwlock_};
    if (!cache_.count(cur_key)) {
      cache_[cur_key] = std::move(compiled_res);
@@ -91,9 +91,9 @@ const CinnCompiledObject& CinnCompiler::Compile(
 const CinnCompiledObject& CinnCompiler::Compile(
    const std::string& compilation_key,
    const std::map<std::string, const LoDTensor*>& input_tensors,
-    const Target& target) {
+    const Target& target, void* stream) {
  const auto& graph = FindGraph(compilation_key);
-  return Compile(graph, input_tensors, target);
+  return Compile(graph, input_tensors, target, stream);
 }

 std::string CinnCompiler::AddGraph(std::unique_ptr<Graph> graph) {
@@ -189,7 +189,7 @@ void CinnCompiler::Clear() {
 std::unique_ptr<CinnCompiledObject> CinnCompiler::CompileGraph(
    const ir::Graph& graph,
    const std::map<std::string, const LoDTensor*>& input_tensors,
-    const Target& target, std::int64_t compiled_num) const {
+    const Target& target, std::int64_t compiled_num, void* stream) const {
  CinnGraphSymbolization symbol{compiled_num, graph, target, input_tensors};
  auto frontend_program = symbol();
  ProgramPass::Apply(&frontend_program, target, {"Decomposer"});
@@ -209,7 +209,8 @@ std::unique_ptr<CinnCompiledObject> CinnCompiler::CompileGraph(
      std::make_unique<GraphCompiler>(target, scope, cinn_graph);
  GraphCompiler::CompileOptions options;
  options.with_instantiate_variables = false;
-  auto compiled_res = graph_compiler->Build(options, std::move(fetch_ids));
+  auto compiled_res =
+      graph_compiler->Build(options, std::move(fetch_ids), stream);
  auto compiled_obj = std::make_unique<CinnCompiledObject>();
  *compiled_obj = {std::move(graph_compiler),
                   std::move(compiled_res.runtime_program), scope,

--- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.h
+++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.h
@@ -55,12 +55,12 @@ class CinnCompiler {
  const CinnCompiledObject& Compile(
      const ir::Graph& graph,
      const std::map<std::string, const LoDTensor*>& input_tensors,
-      const ::cinn::common::Target& target);
+      const ::cinn::common::Target& target, void* stream = nullptr);

  const CinnCompiledObject& Compile(
      const std::string& compilation_key,
      const std::map<std::string, const LoDTensor*>& input_tensors,
-      const ::cinn::common::Target& target);
+      const ::cinn::common::Target& target, void* stream = nullptr);

  std::string AddGraph(std::unique_ptr<ir::Graph> graph);

@@ -83,7 +83,8 @@ class CinnCompiler {
  std::unique_ptr<CinnCompiledObject> CompileGraph(
      const ir::Graph& graph,
      const std::map<std::string, const LoDTensor*>& input_tensors,
-      const ::cinn::common::Target& target, std::int64_t compiled_num) const;
+      const ::cinn::common::Target& target, std::int64_t compiled_num,
+      void* stream = nullptr) const;

  std::unordered_map<std::string, std::unique_ptr<ir::Graph>> graphs_;
  std::unordered_map<CinnCacheKey, std::unique_ptr<CinnCompiledObject>,

--- a/paddle/fluid/operators/cinn_launch_op.cc
+++ b/paddle/fluid/operators/cinn_launch_op.cc
@@ -13,6 +13,7 @@
 // limitations under the License.

 #include "paddle/fluid/operators/cinn_launch_op.h"
+#include <vector>
 #include "paddle/fluid/string/string_helper.h"

 DECLARE_bool(cudnn_deterministic);
@@ -65,8 +66,8 @@ void DebugCinnCompiledResult(const CinnCompiledObject& result) {
 }

 void LaunchCinnExecution(const CinnCompiledObject& compiled_obj,
-                         const CinnLaunchContext& context) {
-  compiled_obj.runtime_program->Execute(&context.FinalizeArguments());
+                         const CinnLaunchContext& context, void* stream) {
+  compiled_obj.runtime_program->Execute(&context.FinalizeArguments(), stream);
 }

 void SetCinnRuntimeFlags() {

--- a/paddle/fluid/operators/cinn_launch_op.cu.cc
+++ b/paddle/fluid/operators/cinn_launch_op.cu.cc
@@ -13,6 +13,56 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/cinn_launch_op.h"
+#include <memory>
+#include <vector>
+#include "cinn/runtime/cinn_runtime.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/type_defs.h"
+
+#ifdef PADDLE_WITH_CUDA
+#include <cuda_runtime.h>
+#endif
+
+namespace paddle {
+namespace operators {
+namespace details {
+
+#ifdef PADDLE_WITH_CUDA
+void CUDART_CB ReleaseScope(void* data) {
+  auto* temp_scope = static_cast<framework::Scope*>(data);
+  delete temp_scope;
+}
+
+void CUDART_CB ReleaseBuffers(void* data) {
+  auto* buffers =
+      static_cast<std::vector<std::unique_ptr<cinn_buffer_t>>*>(data);
+  delete buffers;
+}
+
+template <>
+void ReleaseResource<platform::CUDADeviceContext>(
+    const std::vector<void*>& resources, void* stream) {
+  PADDLE_ENFORCE_CUDA_SUCCESS(cudaLaunchHostFunc(
+      static_cast<gpuStream_t>(stream), ReleaseScope, resources[0]));
+  PADDLE_ENFORCE_CUDA_SUCCESS(cudaLaunchHostFunc(
+      static_cast<gpuStream_t>(stream), ReleaseBuffers, resources[1]));
+}
+
+template <>
+void* GetStream<platform::CUDADeviceContext>(
+    const framework::ExecutionContext& ctx) {
+  const auto& dev_ctx =
+      ctx.template device_context<platform::CUDADeviceContext>();
+  return dev_ctx.stream();
+}
+#endif
+
+}  // namespace details
+}  // namespace operators
+}  // namespace paddle

 /* see [Why use single type kernel] */
 REGISTER_OP_CUDA_KERNEL(cinn_launch,

--- a/paddle/fluid/operators/cinn_launch_op.h
+++ b/paddle/fluid/operators/cinn_launch_op.h
@@ -67,6 +67,10 @@ class CinnLaunchContext {
  // Finalize all execution arguments and return them
  const std::map<std::string, cinn_pod_value_t>& FinalizeArguments() const;

+  std::vector<std::unique_ptr<cinn_buffer_t>> HandoverBuffers() {
+    return std::move(hold_buffers_);
+  }
+
 private:
  // Get CinnTensor with CINN variable name
  CinnTensor GetCinnTensor(const std::string& var_name);
@@ -110,10 +114,35 @@ void DebugCinnCompiledResult(const CinnCompiledObject& result);

 // Launch cinn to execute compiled executable program and wait done
 void LaunchCinnExecution(const CinnCompiledObject& compiled_obj,
-                         const CinnLaunchContext& context);
+                         const CinnLaunchContext& context, void* stream);

 // Set cinn FLAGS (such as FLAGS_cinn_cudnn_deterministic) with paddle's FLAGS.
 void SetCinnRuntimeFlags();
+
+template <typename DeviceContext>
+void ReleaseResource(const std::vector<void*>& resources, void* stream) {
+  auto* temp_scope = static_cast<framework::Scope*>(resources[0]);
+  auto* buffers =
+      static_cast<std::vector<std::unique_ptr<cinn_buffer_t>>*>(resources[1]);
+  delete temp_scope;
+  delete buffers;
+}
+
+template <typename DeviceContext>
+void* GetStream(const framework::ExecutionContext& ctx) {
+  return nullptr;
+}
+
+#ifdef PADDLE_WITH_CUDA
+template <>
+void ReleaseResource<platform::CUDADeviceContext>(
+    const std::vector<void*>& resources, void* stream);
+
+template <>
+void* GetStream<platform::CUDADeviceContext>(
+    const framework::ExecutionContext& ctx);
+#endif
+
 }  // namespace details

 template <typename DeviceContext, typename T>
@@ -122,6 +151,7 @@ class CinnLaunchOpKernel : public framework::OpKernel<T> {
  void Compute(const framework::ExecutionContext& ctx) const override {
    const auto& scope = ctx.scope();
    const auto& place = ctx.GetPlace();
+    void* stream = details::GetStream<DeviceContext>(ctx);
    // Step 1. Find graph object and prepare input
    PADDLE_ENFORCE_EQ(ctx.HasAttr(kCompilationKey), true,
                      platform::errors::NotFound(
@@ -146,7 +176,7 @@ class CinnLaunchOpKernel : public framework::OpKernel<T> {
    // Step 2. Get compilation result of the graph
    auto target = details::PlaceToCinnTarget(place);
    const auto& cinn_compiled_object = CinnCompiler::GetInstance()->Compile(
-        compilation_key, inputs_name2tensor, target);
+        compilation_key, inputs_name2tensor, target, stream);
    details::DebugCinnCompiledResult(cinn_compiled_object);

    auto launch_context =
@@ -199,7 +229,7 @@ class CinnLaunchOpKernel : public framework::OpKernel<T> {
    //     names, because they will not be used outside the graph
    //     and should be destructed after computation finished.
    auto internal_variable_names = launch_context->GetInternalVariableNames();
-    auto temp_scope = scope.NewTmpScope();
+    framework::Scope* temp_scope = scope.NewTmpScope().release();
    for (const auto& var_name : internal_variable_names) {
      auto* tensor = temp_scope->Var(var_name)->GetMutable<LoDTensor>();
      launch_context->MutableTensorData(var_name, place, tensor, true);
@@ -210,8 +240,15 @@ class CinnLaunchOpKernel : public framework::OpKernel<T> {
    details::SetCinnRuntimeFlags();

    // Step 5. Launch CINN to execute the compiled executable program
-    details::LaunchCinnExecution(cinn_compiled_object, *launch_context);
+    VLOG(4) << "Run Cinn compiled executable program with stream: " << stream;
+    details::LaunchCinnExecution(cinn_compiled_object, *launch_context, stream);
    VLOG(4) << "CinnLaunchOp launch execution done.";
+
+    // Step 6. Release some resources, such as `temp_scope` and cinn_buffers.
+    auto* buffers_holder = new std::vector<std::unique_ptr<cinn_buffer_t>>{
+        launch_context->HandoverBuffers()};
+    details::ReleaseResource<DeviceContext>({temp_scope, buffers_holder},
+                                            stream);
  }
 };