diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc index c870536362e913e3b6a239124915b115a98f89d8..3f1b6c78d8417d2616da5eb24b35c37eb3e13bdb 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc @@ -66,7 +66,7 @@ CinnCompiler* CinnCompiler::GetInstance() { const CinnCompiledObject& CinnCompiler::Compile( const Graph& graph, const std::map<std::string, const LoDTensor*>& input_tensors, - const Target& target) { + const Target& target, void* stream) { VLOG(1) << "-- The graph to be compiled is:\n" << VizGraph(graph); CinnCacheKey cur_key(graph, input_tensors, target.arch_str()); bool exist = false; @@ -77,7 +77,7 @@ const CinnCompiledObject& CinnCompiler::Compile( if (!exist) { std::int64_t compiled_num = real_compiled_num_.fetch_add(1); auto compiled_res = - CompileGraph(graph, input_tensors, target, compiled_num); + CompileGraph(graph, input_tensors, target, compiled_num, stream); AutoWRLock w_guard{&rwlock_}; if (!cache_.count(cur_key)) { cache_[cur_key] = std::move(compiled_res); @@ -91,9 +91,9 @@ const CinnCompiledObject& CinnCompiler::Compile( const std::string& compilation_key, const std::map<std::string, const LoDTensor*>& input_tensors, - const Target& target) { + const Target& target, void* stream) { const auto& graph = FindGraph(compilation_key); - return Compile(graph, input_tensors, target); + return Compile(graph, input_tensors, target, stream); } std::string CinnCompiler::AddGraph(std::unique_ptr<Graph> graph) { @@ -189,7 +189,7 @@ void CinnCompiler::Clear() { std::unique_ptr<CinnCompiledObject> CinnCompiler::CompileGraph( const ir::Graph& graph, const std::map<std::string, const LoDTensor*>& input_tensors, - const Target& target, std::int64_t compiled_num) const { + const Target& target, std::int64_t compiled_num, void* stream) const { CinnGraphSymbolization symbol{compiled_num, graph, target, input_tensors}; auto frontend_program = symbol(); ProgramPass::Apply(&frontend_program, target, 
{"Decomposer"}); @@ -209,7 +209,8 @@ std::unique_ptr<CinnCompiledObject> CinnCompiler::CompileGraph( std::make_unique<GraphCompiler>(target, scope, cinn_graph); GraphCompiler::CompileOptions options; options.with_instantiate_variables = false; - auto compiled_res = graph_compiler->Build(options, std::move(fetch_ids)); + auto compiled_res = + graph_compiler->Build(options, std::move(fetch_ids), stream); auto compiled_obj = std::make_unique<CinnCompiledObject>(); *compiled_obj = {std::move(graph_compiler), std::move(compiled_res.runtime_program), scope, diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.h b/paddle/fluid/framework/paddle2cinn/cinn_compiler.h index 39216866635c078750a6a7a80ad21a0cbcd20c91..71119acf1fb49ec464b752a69a51b999233aea20 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.h +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.h @@ -55,12 +55,12 @@ class CinnCompiler { const CinnCompiledObject& Compile( const ir::Graph& graph, const std::map<std::string, const LoDTensor*>& input_tensors, - const ::cinn::common::Target& target); + const ::cinn::common::Target& target, void* stream = nullptr); const CinnCompiledObject& Compile( const std::string& compilation_key, const std::map<std::string, const LoDTensor*>& input_tensors, - const ::cinn::common::Target& target); + const ::cinn::common::Target& target, void* stream = nullptr); std::string AddGraph(std::unique_ptr<ir::Graph> graph); @@ -83,7 +83,8 @@ class CinnCompiler { std::unique_ptr<CinnCompiledObject> CompileGraph( const ir::Graph& graph, const std::map<std::string, const LoDTensor*>& input_tensors, - const ::cinn::common::Target& target, std::int64_t compiled_num) const; + const ::cinn::common::Target& target, std::int64_t compiled_num, + void* stream = nullptr) const; std::unordered_map<std::string, std::unique_ptr<ir::Graph>> graphs_; std::unordered_map<CinnCacheKey, std::unique_ptr<CinnCompiledObject>, diff --git a/paddle/fluid/operators/cinn_launch_op.cc b/paddle/fluid/operators/cinn_launch_op.cc index 9e11884b8c1782c01f95735511b7ed0576d4fe62..615fa4493d05dcf92c228f993926b643a5b21e25 100644 --- a/paddle/fluid/operators/cinn_launch_op.cc +++ b/paddle/fluid/operators/cinn_launch_op.cc @@ -13,6 +13,7 @@ // limitations under the 
License. #include "paddle/fluid/operators/cinn_launch_op.h" +#include <memory> #include "paddle/fluid/string/string_helper.h" DECLARE_bool(cudnn_deterministic); @@ -65,8 +66,8 @@ void DebugCinnCompiledResult(const CinnCompiledObject& result) { } void LaunchCinnExecution(const CinnCompiledObject& compiled_obj, - const CinnLaunchContext& context) { - compiled_obj.runtime_program->Execute(&context.FinalizeArguments()); + const CinnLaunchContext& context, void* stream) { + compiled_obj.runtime_program->Execute(&context.FinalizeArguments(), stream); } void SetCinnRuntimeFlags() { diff --git a/paddle/fluid/operators/cinn_launch_op.cu.cc b/paddle/fluid/operators/cinn_launch_op.cu.cc index 7066cd4e598872781393f58b739e0e63249009d2..d557cfc7c08927900d702b9ee951faeaacccf620 100644 --- a/paddle/fluid/operators/cinn_launch_op.cu.cc +++ b/paddle/fluid/operators/cinn_launch_op.cu.cc @@ -13,6 +13,56 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/cinn_launch_op.h" +#include <memory> +#include <vector> +#include "cinn/runtime/cinn_runtime.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/type_defs.h" + +#ifdef PADDLE_WITH_CUDA +#include <cuda_runtime.h> +#endif + +namespace paddle { +namespace operators { +namespace details { + +#ifdef PADDLE_WITH_CUDA +void CUDART_CB ReleaseScope(void* data) { + auto* temp_scope = static_cast<framework::Scope*>(data); + delete temp_scope; +} + +void CUDART_CB ReleaseBuffers(void* data) { + auto* buffers = + static_cast<std::vector<std::unique_ptr<cinn_buffer_t>>*>(data); + delete buffers; +} + +template <> +void ReleaseResource<platform::CUDADeviceContext>( + const std::vector<void*>& resources, void* stream) { + PADDLE_ENFORCE_CUDA_SUCCESS(cudaLaunchHostFunc( + static_cast<cudaStream_t>(stream), ReleaseScope, resources[0])); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaLaunchHostFunc( + static_cast<cudaStream_t>(stream), ReleaseBuffers, resources[1])); +} + +template 
<> +void* GetStream<platform::CUDADeviceContext>( + const framework::ExecutionContext& ctx) { + const auto& dev_ctx = + ctx.template device_context<platform::CUDADeviceContext>(); + return dev_ctx.stream(); +} +#endif + +} // namespace details +} // namespace operators +} // namespace paddle /* see [Why use single type kernel] */ REGISTER_OP_CUDA_KERNEL(cinn_launch, diff --git a/paddle/fluid/operators/cinn_launch_op.h b/paddle/fluid/operators/cinn_launch_op.h index 99446d15aa208d76f126310f93bd73874f6ba113..53e6ff0d61387372583670b32d8e9762fd5140b6 100644 --- a/paddle/fluid/operators/cinn_launch_op.h +++ b/paddle/fluid/operators/cinn_launch_op.h @@ -67,6 +67,10 @@ class CinnLaunchContext { // Finalize all execution arguments and return them const std::map<std::string, cinn_pod_value_t>& FinalizeArguments() const; + std::vector<std::unique_ptr<cinn_buffer_t>> HandoverBuffers() { + return std::move(hold_buffers_); + } + private: // Get CinnTensor with CINN variable name CinnTensor GetCinnTensor(const std::string& var_name); @@ -110,10 +114,35 @@ void DebugCinnCompiledResult(const CinnCompiledObject& result); // Launch cinn to execute compiled executable program and wait done void LaunchCinnExecution(const CinnCompiledObject& compiled_obj, - const CinnLaunchContext& context); + const CinnLaunchContext& context, void* stream); // Set cinn FLAGS (such as FLAGS_cinn_cudnn_deterministic) with paddle's FLAGS. 
void SetCinnRuntimeFlags(); + +template <typename DeviceContext> +void ReleaseResource(const std::vector<void*>& resources, void* stream) { + auto* temp_scope = static_cast<framework::Scope*>(resources[0]); + auto* buffers = + static_cast<std::vector<std::unique_ptr<cinn_buffer_t>>*>(resources[1]); + delete temp_scope; + delete buffers; +} + +template <typename DeviceContext> +void* GetStream(const framework::ExecutionContext& ctx) { + return nullptr; +} + +#ifdef PADDLE_WITH_CUDA +template <> +void ReleaseResource<platform::CUDADeviceContext>( + const std::vector<void*>& resources, void* stream); + +template <> +void* GetStream<platform::CUDADeviceContext>( + const framework::ExecutionContext& ctx); +#endif + } // namespace details template <typename DeviceContext, typename T> @@ -122,6 +151,7 @@ class CinnLaunchOpKernel : public framework::OpKernel<T> { void Compute(const framework::ExecutionContext& ctx) const override { const auto& scope = ctx.scope(); const auto& place = ctx.GetPlace(); + void* stream = details::GetStream<DeviceContext>(ctx); // Step 1. Find graph object and prepare input PADDLE_ENFORCE_EQ(ctx.HasAttr(kCompilationKey), true, platform::errors::NotFound( @@ -146,7 +176,7 @@ class CinnLaunchOpKernel : public framework::OpKernel<T> { // Step 2. Get compilation result of the graph auto target = details::PlaceToCinnTarget(place); const auto& cinn_compiled_object = CinnCompiler::GetInstance()->Compile( - compilation_key, inputs_name2tensor, target); + compilation_key, inputs_name2tensor, target, stream); details::DebugCinnCompiledResult(cinn_compiled_object); auto launch_context = @@ -199,7 +229,7 @@ class CinnLaunchOpKernel : public framework::OpKernel<T> { // names, because they will not be used outside the graph // and should be destructed after computation finished. 
auto internal_variable_names = launch_context->GetInternalVariableNames(); - auto temp_scope = scope.NewTmpScope(); + framework::Scope* temp_scope = scope.NewTmpScope().release(); for (const auto& var_name : internal_variable_names) { auto* tensor = temp_scope->Var(var_name)->GetMutable<LoDTensor>(); launch_context->MutableTensorData(var_name, place, tensor, true); @@ -210,8 +240,15 @@ details::SetCinnRuntimeFlags(); // Step 5. Launch CINN to execute the compiled executable program - details::LaunchCinnExecution(cinn_compiled_object, *launch_context); + VLOG(4) << "Run Cinn compiled executable program with stream: " << stream; + details::LaunchCinnExecution(cinn_compiled_object, *launch_context, stream); VLOG(4) << "CinnLaunchOp launch execution done."; + + // Step 6. Release some resources, such as `temp_scope` and cinn_buffers. + auto* buffers_holder = new std::vector<std::unique_ptr<cinn_buffer_t>>{ + launch_context->HandoverBuffers()}; + details::ReleaseResource<DeviceContext>({temp_scope, buffers_holder}, + stream); + } };