Unverified commit 04b6035d authored by 傅剑寒, committed by GitHub

modify default parallel compile thread (#56282)

Parent 47686692
@@ -92,9 +92,7 @@ void ParallelCompiler::LaunchTask() {
   RunTask(&tasks_[0]);
   // syncthreads.
-  for (auto& worker : threads) {
-    worker.join();
-  }
+  for_each(threads.begin(), threads.end(), std::mem_fn(&std::thread::join));
 }

 ParallelCompiler::CompilationResult ParallelCompiler::MergeResult() {
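The per-thread join loop is collapsed into a single `for_each` call, using `std::mem_fn` to adapt `std::thread::join` into a callable. A minimal standalone sketch of the same idiom (the worker lambda and thread count are illustrative, not taken from the CINN sources):

```cpp
#include <algorithm>
#include <functional>
#include <thread>
#include <vector>

int main() {
  std::vector<std::thread> threads;
  for (int i = 0; i < 4; ++i) {
    threads.emplace_back([] { /* worker body would run one compile task */ });
  }
  // std::mem_fn wraps the member function std::thread::join so that
  // std::for_each can invoke it on every element of the vector.
  std::for_each(threads.begin(), threads.end(),
                std::mem_fn(&std::thread::join));
  return 0;
}
```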
@@ -113,7 +111,7 @@ ParallelCompiler::CompilationResult ParallelCompiler::MergeResult() {
       res.instructions.emplace_back(std::move(instruction));
     }
   }
-  return std::move(res);
+  return res;
 }

 void ParallelCompiler::Task::Lowering() {
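Dropping the `std::move` on the return statement is the right call: returning a named local with `std::move` blocks named return value optimization (NRVO), while a plain `return res;` lets the compiler elide the copy/move entirely (and still moves as a fallback). A small illustration with a hypothetical stand-in for the result type:

```cpp
#include <utility>
#include <vector>

struct Result {                // stand-in for CompilationResult
  std::vector<int> instructions;
};

Result BuildPessimized() {
  Result res;
  return std::move(res);       // disables NRVO; always performs a move
}

Result BuildOptimal() {
  Result res;
  return res;                  // NRVO may construct res directly in the caller
}

int main() {
  Result a = BuildPessimized();
  Result b = BuildOptimal();
  return static_cast<int>(a.instructions.size() + b.instructions.size());
}
```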
@@ -167,7 +165,6 @@ void ParallelCompiler::Task::CodegenAndJit() {
   auto cuda_c = codegen.Compile(dmodule);
   CHECK(!cuda_c.empty()) << "Compile CUDA C code failed from device module:\n"
                          << dmodule;
-  source_codes.emplace_back(cuda_c);
   cinn::backends::SourceCodePrint::GetInstance()->write(cuda_c);
@@ -175,13 +172,16 @@ void ParallelCompiler::Task::CodegenAndJit() {
   backends::nvrtc::Compiler compiler;
   auto ptx = compiler(cuda_c);
   CHECK(!ptx.empty()) << "Compile PTX failed from source code:\n" << cuda_c;
-  source_ptxs.emplace_back(ptx);
   // load cumodule
   cumodule.reset(new CUDAModule(ptx,
                                 compiler.compile_to_cubin()
                                     ? CUDAModule::Kind::CUBIN
                                     : CUDAModule::Kind::PTX));
+  source_codes.emplace_back(std::move(cuda_c));
+  source_ptxs.emplace_back(std::move(ptx));
   // register kernel
   backends::RuntimeSymbols symbols;
   for (auto& fn : dmodule.functions()) {
......
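Within `CodegenAndJit`, the `source_codes` / `source_ptxs` bookkeeping is deferred until after the last reads of `cuda_c` and `ptx` (the NVRTC compile and the `CUDAModule` construction), so both strings can be moved into the vectors instead of copied. A hedged sketch of the pattern with hypothetical stand-in steps:

```cpp
#include <iostream>
#include <string>
#include <utility>
#include <vector>

// Hypothetical stand-ins for the codegen and NVRTC compile steps.
std::string GenerateCudaC() { return "__global__ void k() {}"; }
std::string CompileToPtx(const std::string& cuda_c) { return "// ptx for: " + cuda_c; }

int main() {
  std::vector<std::string> source_codes;
  std::vector<std::string> source_ptxs;

  std::string cuda_c = GenerateCudaC();
  std::string ptx = CompileToPtx(cuda_c);  // last read of cuda_c
  // ... the real task builds a CUDA module from ptx here (last read of ptx) ...

  // Recording the buffers only after their last read makes the moves safe
  // and avoids copying two potentially large strings.
  source_codes.emplace_back(std::move(cuda_c));
  source_ptxs.emplace_back(std::move(ptx));

  std::cout << source_codes.size() << " source(s), "
            << source_ptxs.size() << " ptx blob(s)\n";
  return 0;
}
```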
@@ -19,7 +19,7 @@
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <unistd.h>
+#include <thread>
 #include <unordered_set>
 #include "paddle/cinn/common/target.h"
@@ -46,7 +46,8 @@ DEFINE_string(cinn_nvcc_cmd_path,
              "Setting nvcc default path!");
 DEFINE_int32(cinn_parallel_compile_thread,
-             Int32FromEnv("FLAGS_cinn_parallel_compile_thread", 16),
+             Int32FromEnv("FLAGS_cinn_parallel_compile_thread",
+                          (std::thread::hardware_concurrency() >> 1)),
              "How much thread the parallel compile used.");
 DEFINE_bool(cinn_use_op_fusion,
......
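The default for `FLAGS_cinn_parallel_compile_thread` changes from a hard-coded 16 to half of the detected hardware concurrency, which is why `<thread>` is now included in this file. A small sketch of the same default computation, with an added guard for the case where `std::thread::hardware_concurrency()` reports 0 (the flag definition itself does not guard this):

```cpp
#include <iostream>
#include <thread>

int main() {
  // hardware_concurrency() may return 0 when the value is not computable.
  unsigned int hw = std::thread::hardware_concurrency();
  unsigned int default_threads = hw > 1 ? (hw >> 1) : 1;
  std::cout << "default parallel compile threads: " << default_threads << "\n";
  return 0;
}
```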