[CINN] Dump more compilation result and optimize parallel compiler flags (#55935)

1. `Parallel Compiler`：
   - 合并`FLAGS_cinn_parallel_compile_size`和`FLAGS_cinn_parallel_compile_thread`，通过`FLAGS_cinn_parallel_compile_thread`即可指定编译时使用的线程数，所有的`fusion_groups`将会平均分配到可用的线程上
   - 增强编译完成后返回的信息，除`instruction`外，将`lowered_function`、`source_code`、`source_ptx`返回，供上层进一步使用
2. Debug信息：
   - 新增`FLAGS_cinn_dump_group_lowered_func`、`FLAGS_cinn_dump_group_source_code`、`FLAGS_cinn_dump_group_ptx`、`FLAGS_cinn_dump_group_instruction`，可分别按`fusion_groups`储存编译的每个阶段中的中间代码
   - 重新整理`graph_visualization`，所有的可视化图、单测代码均能正确分组储存
3. Bug修复：
   - 修复`MakeDirectory`不能正确创建文件夹的问题
4. 其他：
   - 清除了一些无用代码

[CINN] Dump more compilation result and optimize parallel compiler flags (#55935)
1. `Parallel Compiler`：
   - 合并`FLAGS_cinn_parallel_compile_size`和`FLAGS_cinn_parallel_compile_thread`，通过`FLAGS_cinn_parallel_compile_thread`即可指定编译时使用的线程数，所有的`fusion_groups`将会平均分配到可用的线程上
   - 增强编译完成后返回的信息，除`instruction`外，将`lowered_function`、`source_code`、`source_ptx`返回，供上层进一步使用
2. Debug信息：
   - 新增`FLAGS_cinn_dump_group_lowered_func`、`FLAGS_cinn_dump_group_source_code`、`FLAGS_cinn_dump_group_ptx`、`FLAGS_cinn_dump_group_instruction`，可分别按`fusion_groups`储存编译的每个阶段中的中间代码
   - 重新整理`graph_visualization`，所有的可视化图、单测代码均能正确分组储存
3. Bug修复：
   - 修复`MakeDirectory`不能正确创建文件夹的问题
4. 其他：
   - 清除了一些无用代码
39b59603 · Fisher · GitHub · 469a0392 · 39b59603 · 39b59603
13 changed files
--- a/paddle/cinn/auto_schedule/tests/performance_comparison_test.cc
+++ b/paddle/cinn/auto_schedule/tests/performance_comparison_test.cc
@@ -55,7 +55,6 @@ DEFINE_string(resnet50_model_dir,
 DEFINE_int32(evaluate_knobs,
             -1,
             "the options to control which schedule tests will be run.");
-DECLARE_int32(cinn_parallel_compile_size);
 namespace cinn {
 namespace auto_schedule {
@@ -78,8 +77,6 @@ class PerformanceTester : public ::testing::Test {
    std::bitset<3> evaluate_knobs = 0UL;
  };
-  void SetUp() override { FLAGS_cinn_parallel_compile_size = 0; }
  void Evaluate(const frontend::Program& program) {
    if (FLAGS_evaluate_knobs >= 0) {
      options_.evaluate_knobs = FLAGS_evaluate_knobs;

--- a/paddle/cinn/backends/compiler.cc
+++ b/paddle/cinn/backends/compiler.cc
@@ -18,6 +18,7 @@
 #include "paddle/cinn/backends/llvm/runtime_symbol_registry.h"
 #include "paddle/cinn/common/context.h"
+#include "paddle/cinn/hlir/framework/visualize_helper.h"
 #ifdef CINN_WITH_CUDA
 #include "paddle/cinn/backends/codegen_cuda_dev.h"
 #include "paddle/cinn/backends/codegen_cuda_host.h"
@@ -29,6 +30,10 @@
 #endif
 DECLARE_string(cinn_source_code_save_path);
+DECLARE_string(cinn_dump_group_lowered_func);
+DECLARE_string(cinn_dump_group_source_code);
+DECLARE_string(cinn_dump_group_ptx);
+DECLARE_string(cinn_dump_group_instruction);
 namespace cinn {
 namespace backends {
@@ -36,6 +41,81 @@ using ir::Module;
 static constexpr int DebugLogMaxLen = 30000;
+// Dumps the lowered function of every group to
+// "<FLAGS_cinn_dump_group_lowered_func>/fusion_group_<idx>/lowered_function.txt".
+// Does nothing when FLAGS_cinn_dump_group_lowered_func is empty.
+// Only the first lowered function of each group is written.
+void CompilationInfoDumper::DumpLoweredFunc() {
+  if (FLAGS_cinn_dump_group_lowered_func.empty()) {
+    return;
+  }
+  for (size_t idx = 0; idx < info_.lowered_funcs.size(); ++idx) {
+    const auto& group_funcs = info_.lowered_funcs[idx];
+    if (group_funcs.empty()) {
+      // Calling front() on an empty vector is undefined behavior, so a group
+      // without any lowered function is reported and skipped.
+      LOG(WARNING) << "No lowered function found for group " << idx
+                   << ", skip dumping it.";
+      continue;
+    }
+    std::stringstream content;
+    content << group_funcs.front();
+    Dump(FLAGS_cinn_dump_group_lowered_func,
+         static_cast<int>(idx),
+         "lowered_function.txt",
+         content.str());
+  }
+}
+// Writes each entry of info_.source_codes to
+// "<FLAGS_cinn_dump_group_source_code>/fusion_group_<idx>/source_code.cu",
+// where <idx> is the entry's position. No-op when the flag is empty.
+// NOTE(review): entries look like they are appended once per compile task
+// rather than once per fusion group - confirm that <idx> really is a group id.
+void CompilationInfoDumper::DumpSourceCode() {
+  const std::string& dump_dir = FLAGS_cinn_dump_group_source_code;
+  if (dump_dir.empty()) {
+    return;
+  }
+  int position = 0;
+  for (const auto& code : info_.source_codes) {
+    Dump(dump_dir, position, "source_code.cu", code);
+    ++position;
+  }
+}
+// Writes each entry of info_.source_ptxs to
+// "<FLAGS_cinn_dump_group_ptx>/fusion_group_<idx>/source_ptx.ptx", where
+// <idx> is the entry's position. No-op when the flag is empty.
+void CompilationInfoDumper::DumpPtxCode() {
+  const std::string& dump_dir = FLAGS_cinn_dump_group_ptx;
+  if (dump_dir.empty()) {
+    return;
+  }
+  int position = 0;
+  for (const auto& ptx : info_.source_ptxs) {
+    Dump(dump_dir, position, "source_ptx.ptx", ptx);
+    ++position;
+  }
+}
+// Writes the textual form of each instruction (Instruction::DumpInstruction)
+// to "<FLAGS_cinn_dump_group_instruction>/fusion_group_<idx>/instruction.txt",
+// where <idx> is the instruction's position. No-op when the flag is empty.
+void CompilationInfoDumper::DumpInstruction() {
+  const std::string& dump_dir = FLAGS_cinn_dump_group_instruction;
+  if (dump_dir.empty()) {
+    return;
+  }
+  int position = 0;
+  for (const auto& instruction : info_.instructions) {
+    Dump(dump_dir, position, "instruction.txt", instruction->DumpInstruction());
+    ++position;
+  }
+}
+// Writes `content` to "<base_path>/fusion_group_<idx>/<file_name>".
+//
+// The group directory is created first (mode rwxr-xr-x). Every failure is
+// reported with a warning and the function returns without raising an error.
+//
+// @param base_path  Root directory of the dump.
+// @param idx        Index used to build the "fusion_group_<idx>" sub-folder.
+// @param file_name  Name of the file created inside the group folder.
+// @param content    Text written to the file; an existing file is overwritten.
+void CompilationInfoDumper::Dump(const std::string& base_path,
+                                 const int idx,
+                                 const std::string& file_name,
+                                 const std::string& content) {
+  auto dump_path =
+      utils::StringFormat("%s/fusion_group_%d", base_path.c_str(), idx);
+  if (!hlir::framework::MakeDirectory(
+          dump_path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH)) {
+    // Name the file that is actually skipped: this helper is shared by the
+    // lowered-function, source-code, ptx and instruction dumps.
+    LOG(WARNING) << "Failed to make directory: \"" << dump_path << "\", the "
+                 << file_name << " for this group will not dump.";
+    return;
+  }
+
+  auto dump_file =
+      utils::StringFormat("%s/%s", dump_path.c_str(), file_name.c_str());
+  VLOG(7) << "Dump " << file_name << " to: " << dump_file;
+  std::ofstream of(dump_file, std::ios_base::out);
+  if (!of.is_open()) {
+    LOG(WARNING) << "Failed to open file: " << dump_file
+                 << ", please check your path.";
+    return;
+  }
+  of << content;
+  of.close();
+  // close() sets failbit if flushing fails, and a failed write leaves the
+  // stream in a failed state, so one check covers both.
+  if (of.fail()) {
+    LOG(WARNING) << "Failed to write file: " << dump_file
+                 << ", the dumped content may be incomplete.";
+  }
+}
 SourceCodePrint::SourceCodePrint() {
  if (!FLAGS_cinn_source_code_save_path.empty()) {
    LOG(INFO)

--- a/paddle/cinn/backends/compiler.h
+++ b/paddle/cinn/backends/compiler.h
@@ -24,6 +24,7 @@
 #include "paddle/cinn/backends/llvm/codegen_llvm.h"
 #include "paddle/cinn/backends/llvm/execution_engine.h"
 #include "paddle/cinn/backends/llvm/simple_jit.h"
+#include "paddle/cinn/hlir/framework/parallel_compiler.h"
 #include "paddle/cinn/lang/packed_func.h"
 #ifdef CINN_WITH_CUDA
 #include "paddle/cinn/runtime/cuda/cuda_module.h"
@@ -32,6 +33,38 @@
 namespace cinn {
 namespace backends {
+/**
+ * Dumps the artifacts produced by a compilation to disk, for debugging.
+ *
+ * All work happens in the constructor, which runs the four dump steps below
+ * in order. Each step is skipped when its flag is an empty string:
+ *   - FLAGS_cinn_dump_group_lowered_func: directory for lowered functions
+ *     (lowered_function.txt).
+ *   - FLAGS_cinn_dump_group_source_code: directory for the source code
+ *     (source_code.cu).
+ *   - FLAGS_cinn_dump_group_ptx: directory for ptx (source_ptx.ptx).
+ *   - FLAGS_cinn_dump_group_instruction: directory for instructions
+ *     (instruction.txt).
+ * Every file is written to "<flag directory>/fusion_group_<idx>/<file name>".
+ */
+class CompilationInfoDumper {
+ public:
+  explicit CompilationInfoDumper(
+      const hlir::framework::ParallelCompiler::CompilationResult& info)
+      : info_(info) {
+    DumpLoweredFunc();
+    DumpSourceCode();
+    DumpPtxCode();
+    DumpInstruction();
+  }
+ private:
+  void DumpLoweredFunc();  // writes lowered_function.txt per entry
+  void DumpSourceCode();   // writes source_code.cu per entry
+  void DumpPtxCode();      // writes source_ptx.ptx per entry
+  void DumpInstruction();  // writes instruction.txt per entry
+  void Dump(const std::string& base_path,
+            const int idx,
+            const std::string& file_name,
+            const std::string& content);  // shared directory/file writer
+  const hlir::framework::ParallelCompiler::CompilationResult& info_;  // held by reference, not copied
+};
 class SourceCodePrint {
 public:
  static SourceCodePrint* GetInstance() {

--- a/paddle/cinn/hlir/framework/graph.cc
+++ b/paddle/cinn/hlir/framework/graph.cc
@@ -308,66 +308,40 @@ void Graph::VisualizeGroupedGraph(
    return;
  }
-  int viz_id = viz_count_.fetch_add(1);
+  // Dump debug info for each group
-  {
+  LOG(INFO) << "Dump graph debug info to: "
-    // create base Directory
+            << FLAGS_cinn_fusion_groups_graphviz_dir;
-    viz_path_ =
-        utils::StringFormat("%s/fusion_groups_%d/",
-                            FLAGS_cinn_fusion_groups_graphviz_dir.c_str(),
-                            viz_id);
-    if (!MakeDirectory(viz_path_,
-                       S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH)) {
-      LOG_IF(WARNING, viz_id == 0)
-          << "Failed to make directory: \"" << viz_path_
-          << "\", the CINN subgraph's fusion group information will not print.";
-      viz_path_.clear();
-      return;
-    }
-    LOG_IF(INFO, viz_id == 0) << "The CINN subgraph's fusion group information "
-                                 "will writing into path: \""
-                              << FLAGS_cinn_fusion_groups_graphviz_dir << "\"";
-  }
  const auto& groups = RemoveAccCheckGroups(origin_groups);
-  {
+  const auto& group_dots = VisualizeGroups(groups, fetch_var_ids);
-    // save python test file
+  for (int idx = 0; idx < groups.size(); ++idx) {
-    std::string py_test_path = viz_path_ + "/tests/";
+    // Create fusion_group_x folder
-    if (!MakeDirectory(py_test_path,
+    auto group_path =
-                       S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH)) {
+        utils::StringFormat("%s/fusion_group_%d",
-      LOG_IF(WARNING, viz_id == 0)
+                            FLAGS_cinn_fusion_groups_graphviz_dir.c_str(),
-          << "Failed to make directory: \"" << py_test_path
+                            idx);
-          << "\", the CINN subgraph's python test file will not generate.";
-      py_test_path.clear();
-    }
-    if (!py_test_path.empty()) {
-      for (int i = 0; i < groups.size(); i++) {
-        WriteToFile(py_test_path + "test_group_" + std::to_string(i) + ".py",
-                    GenerateGroupPythonCode(groups[i], fetch_var_ids));
-      }
-    }
-  }
-  Summary(groups, viz_path_);
-  WriteToFile(viz_path_ + "grouped_graph.dot",
-              VisualizeGraph(groups, fetch_var_ids));
-  {
-    // save each group's graphviz dot file
-    std::string group_path = viz_path_ + "/groups/";
    if (!MakeDirectory(group_path,
                       S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH)) {
-      LOG_IF(WARNING, viz_id == 0)
+      LOG(WARNING) << "Failed to make directory: \"" << group_path
-          << "Failed to make directory: \"" << group_path
+                   << "\", skip dump info for this group.";
-          << "\", the CINN subgraph's group graphviz file will not save.";
+      continue;
-      group_path.clear();
-    }
-    if (!group_path.empty()) {
-      const auto& group_dots = VisualizeGroups(groups, fetch_var_ids);
-      for (int i = 0; i < group_dots.size(); ++i) {
-        WriteToFile(GetFilePathForGroup(groups, i, group_path), group_dots[i]);
-      }
    }
+    // Create test_group_x.py
+    auto python_test_file =
+        utils::StringFormat("%s/test_group_%d.py", group_path.c_str(), idx);
+    WriteToFile(python_test_file,
+                GenerateGroupPythonCode(groups[idx], fetch_var_ids));
+    // Create x_group_name.dot
+    auto graph_group_file =
+        utils::StringFormat("%s/graph_group_%d.dot", group_path.c_str(), idx);
+    WriteToFile(graph_group_file, group_dots[idx]);
  }
+  // Summary
+  Summary(groups, FLAGS_cinn_fusion_groups_graphviz_dir);
+  // Grouped graph
+  auto grouped_graph_file = utils::StringFormat(
+      "%s/grouped_graph.dot", FLAGS_cinn_fusion_groups_graphviz_dir.c_str());
+  WriteToFile(grouped_graph_file, VisualizeGraph(groups, fetch_var_ids));
 }
 std::string Graph::VisualizeGraph(
@@ -494,8 +468,6 @@ std::vector<std::string> Graph::VisualizeGroups(
  return dot_vec;
 }
-std::atomic_size_t Graph::viz_count_{0};
 std::unordered_set<NodeData*> Graph::Group::GetInputNodeDatas() {
  std::unordered_set<NodeData*> group_inputs;
@@ -543,25 +515,6 @@ std::unordered_set<NodeData*> Graph::Group::GetOutputNodeDatas() {
  return group_outputs;
 }
-void Graph::SaveSourceCode(const std::string& code) {
-  if (cinn::runtime::CheckStringFlagFalse(
-          FLAGS_cinn_fusion_groups_graphviz_dir) ||
-      viz_path_.empty()) {
-    return;
-  }
-  WriteToFile(viz_path_ + "source_code.cu", code);
-}
-void Graph::SavePTXCode(const std::string& ptx) {
-  if (cinn::runtime::CheckStringFlagFalse(
-          FLAGS_cinn_fusion_groups_graphviz_dir) ||
-      viz_path_.empty()) {
-    return;
-  }
-  WriteToFile(viz_path_ + "source_code.ptx", ptx);
-}
 }  // namespace framework
 }  // namespace hlir
 }  // namespace cinn
--- a/paddle/cinn/hlir/framework/graph.h
+++ b/paddle/cinn/hlir/framework/graph.h
@@ -283,9 +283,6 @@ class Graph : public cinn::common::Graph {
      const std::vector<std::vector<Node*>>& groups,
      const std::unordered_set<std::string>& fetch_var_ids = {});
-  void SaveSourceCode(const std::string& code);
-  void SavePTXCode(const std::string& ptx);
 private:
  std::string DebugGroupedGraph(
      const std::vector<std::vector<Node*>>& groups,
@@ -301,9 +298,6 @@ class Graph : public cinn::common::Graph {
  std::vector<std::vector<Node*>> FusionGroupsToGroups();
-  std::string viz_path_;
-  static std::atomic_size_t viz_count_;
  CINN_DISALLOW_COPY_AND_ASSIGN(Graph);
 };

--- a/paddle/cinn/hlir/framework/graph_compiler.cc
+++ b/paddle/cinn/hlir/framework/graph_compiler.cc
@@ -20,6 +20,7 @@
 #include <unordered_set>
 #include "paddle/cinn/backends/codegen_cuda_dev.h"
+#include "paddle/cinn/backends/compiler.h"
 #include "paddle/cinn/common/context.h"
 #include "paddle/cinn/hlir/framework/instruction.h"
 #include "paddle/cinn/hlir/framework/op_lowering_util.h"
@@ -77,21 +78,24 @@ GraphCompiler::CompilationResult GraphCompiler::Build(
  parallel_compiler_ =
      std::make_shared<ParallelCompiler>(scope_, graph_, option, target_);
-  auto instructions = (*parallel_compiler_.get())();
+  auto result = (*parallel_compiler_.get())();
+  // Dump compilation result
+  backends::CompilationInfoDumper dumper(result);
  if (options.remove_unused_variables) {
-    RemoveInvalidVariables(instructions);
+    RemoveInvalidVariables(result.instructions);
  }
  if (options.with_buffer_handle_instruction_inserted) {
    VLOG(3) << "option.with_buffer_handle_instruction_inserted enable";
-    InsertBufferHandlers(&instructions);
+    InsertBufferHandlers(&result.instructions);
  }
  VLOG(2) << "Compile With Parallel Compiler Done!";
  GraphCompiler::CompilationResult compilation_result;
  compilation_result.runtime_program.reset(
-      new Program(scope_, std::move(instructions)));
+      new Program(scope_, std::move(result.instructions)));
  return compilation_result;
 }

--- a/paddle/cinn/hlir/framework/instruction.cc
+++ b/paddle/cinn/hlir/framework/instruction.cc
@@ -365,6 +365,29 @@ void Instruction::Run(
  //   }
 }
+// Builds a human-readable description of this instruction.
+//
+// For every function name it lists the function pointer followed by the
+// input and output argument names. Argument names are sorted on a copy, so
+// the stored argument order is left untouched.
+//
+// The per-function vectors are bounds-checked before indexing: in_args_ can
+// be emptied through ClearInArgs(), after which it no longer has one entry
+// per function name. Sections that are not available are omitted.
+//
+// @return The multi-line textual dump.
+std::string Instruction::DumpInstruction() {
+  std::stringstream ss;
+  ss << "Instruction {" << '\n';
+  for (size_t i = 0; i < fn_names_.size(); ++i) {
+    ss << "  Function " << fn_names_[i] << ":" << '\n';
+    if (i < fn_ptrs_.size()) {
+      ss << "    function ptr: " << fn_ptrs_[i] << '\n';
+    }
+    if (i < in_args_.size()) {
+      auto in_arg = in_args_[i];
+      std::sort(in_arg.begin(), in_arg.end());
+      for (const auto& in_name : in_arg) {
+        ss << "    input: " << in_name << '\n';
+      }
+    }
+    if (i < out_args_.size()) {
+      auto out_arg = out_args_[i];
+      std::sort(out_arg.begin(), out_arg.end());
+      for (const auto& out_name : out_arg) {
+        ss << "    output: " << out_name << '\n';
+      }
+    }
+  }
+  ss << "}" << '\n';
+  return ss.str();
+}
 void Instruction::CheckResults(
    const std::map<std::string, cinn_pod_value_t>* name2podargs, void* stream) {
 #ifdef CINN_WITH_CUDA

--- a/paddle/cinn/hlir/framework/instruction.h
+++ b/paddle/cinn/hlir/framework/instruction.h
@@ -132,6 +132,8 @@ class Instruction {
  int size() { return fn_ptrs_.size(); }
+  std::string DumpInstruction();
  std::vector<std::vector<std::string>> GetInArgs() { return in_args_; }
  std::vector<std::vector<std::string>> GetOutArgs() { return out_args_; }
  void ClearInArgs() { in_args_.clear(); }

--- a/paddle/cinn/hlir/framework/parallel_compiler.cc
+++ b/paddle/cinn/hlir/framework/parallel_compiler.cc
@@ -30,15 +30,13 @@
 #include "paddle/cinn/ir/module.h"
 #include "paddle/cinn/runtime/flags.h"
-DECLARE_int32(cinn_parallel_compile_size);
 DECLARE_int32(cinn_parallel_compile_thread);
 namespace cinn {
 namespace hlir {
 namespace framework {
-static constexpr int DebugLogMaxLen = 30000;
-std::vector<std::unique_ptr<Instruction>> ParallelCompiler::operator()() {
+ParallelCompiler::CompilationResult ParallelCompiler::operator()() {
  if (graph_->fusion_groups.size() == 0) {
    hlir::framework::ApplyPasses(graph_.get(), {"BuildNonFusedGroupsPass"});
  }
@@ -50,48 +48,31 @@ std::vector<std::unique_ptr<Instruction>> ParallelCompiler::operator()() {
  return MergeResult();
 }
-OpPatternKind GetOpKind(const framework::Node* node) {
-  auto& op_pattern_dict =
-      framework::Operator::GetAttrs<OpPatternKind>("OpPattern");
-  CHECK(op_pattern_dict.Find(node->op()))
-      << "Don't find the pattern of op : " << node->id();
-  auto kind = op_pattern_dict[node->op()];
-  if (kind == framework::kBroadcast) {
-    // As binary op was defined as broadcast, actually it should be
-    // element-wise.
-    if (node->op()->name != "broadcast_to") {
-      return framework::kElementWise;
-    }
-  }
-  return kind;
-}
 void ParallelCompiler::SplitTask() {
  CHECK(graph_->fusion_groups.size());
  CHECK(graph_->fusion_groups.size() == option_.lowered_funcs.size() ||
        option_.lowered_funcs.size() == 0);
-  // split task
+  // Assign fusion_group to each task.
-  int max_task_num = FLAGS_cinn_parallel_compile_thread > 0
+  // The maximum number of tasks is determined by the number of threads.
-                         ? FLAGS_cinn_parallel_compile_thread
+  // Fusion_group is assigned to tasks in order and continuous.
-                         : graph_->fusion_groups.size();
+  int fusion_group_size = graph_->fusion_groups.size();
+  int thread_size = FLAGS_cinn_parallel_compile_thread > 0
-  int group_per_task = graph_->fusion_groups.size();
+                        ? FLAGS_cinn_parallel_compile_thread
-  if (max_task_num > 1) {
+                        : 1;
-    group_per_task = FLAGS_cinn_parallel_compile_size > 0
+  int group_per_task =
-                         ? FLAGS_cinn_parallel_compile_size
+      (graph_->fusion_groups.size() + thread_size - 1) / thread_size;
-                         : ((graph_->fusion_groups.size() + max_task_num - 1) /
-                            max_task_num);
-  }
  for (int idx = 0; idx < graph_->fusion_groups.size(); idx += group_per_task) {
-    tasks_.emplace_back(this, scope_, graph_, option_, target_);
+    Task task(this, scope_, graph_, option_, target_);
+    task.start_gidx = idx;
+    task.stop_gidx =
+        (idx + group_per_task > fusion_group_size ? fusion_group_size
+                                                  : idx + group_per_task);
+    tasks_.emplace_back(std::move(task));
  }
  VLOG(2) << "Split task to " << tasks_.size() << " sub-task!";
 }
-void RunTask(ParallelCompiler::Task* task) {
+void ParallelCompiler::RunTask(ParallelCompiler::Task* task) {
  VLOG(2) << "Stark run sub-task, Thread Id : " << std::this_thread::get_id();
  VLOG(4) << "Start Lowering";
  task->Lowering();
@@ -106,7 +87,7 @@ void ParallelCompiler::LaunchTask() {
  // start sub-task.
  std::vector<std::thread> threads;
  for (int idx = 1; idx < tasks_.size(); ++idx) {
-    threads.emplace_back(RunTask, &tasks_[idx]);
+    threads.emplace_back(&ParallelCompiler::RunTask, this, &tasks_[idx]);
  }
  RunTask(&tasks_[0]);
@@ -116,11 +97,20 @@ void ParallelCompiler::LaunchTask() {
  }
 }
-std::vector<std::unique_ptr<Instruction>> ParallelCompiler::MergeResult() {
+ParallelCompiler::CompilationResult ParallelCompiler::MergeResult() {
-  std::vector<std::unique_ptr<Instruction>> res(graph_->fusion_groups.size());
+  ParallelCompiler::CompilationResult res;
  for (auto& task : tasks_) {
-    for (int idx = 0; idx < task.gidx.size(); ++idx) {
+    for (auto& lowered_func : task.lowered_funcs) {
-      res[task.gidx[idx]] = std::move(task.instructions[idx]);
+      res.lowered_funcs.emplace_back(lowered_func);
+    }
+    for (auto& source_code : task.source_codes) {
+      res.source_codes.emplace_back(source_code);
+    }
+    for (auto& source_ptx : task.source_ptxs) {
+      res.source_ptxs.emplace_back(source_ptx);
+    }
+    for (auto& instruction : task.instructions) {
+      res.instructions.emplace_back(std::move(instruction));
    }
  }
  return std::move(res);
@@ -138,13 +128,7 @@ void ParallelCompiler::Task::Lowering() {
          "infershape");
  OpLowerer op_lowerer(dtype_dict, shape_dict, target);
-  while (true) {
+  for (int idx = start_gidx; idx < stop_gidx; ++idx) {
-    int idx = compiler->GetGroupIdx();
-    if (idx < 0) {
-      break;
-    }
-    gidx.push_back(idx);
    if (options.lowered_funcs.size()) {
      lowered_funcs.push_back(options.lowered_funcs[idx]);
      continue;
@@ -154,16 +138,15 @@ void ParallelCompiler::Task::Lowering() {
            << std::this_thread::get_id() << " :\n"
            << "Group " << idx << " {\n"
            << graph->DebugGroupedGraph(group->CollectNodes()) << "}\n";
-    lowered_funcs.emplace_back(std::move(op_lowerer.Lower(group)));
+    auto lowered_group = op_lowerer.Lower(group);
-    CHECK_EQ(lowered_funcs.back().size(), 1)
+    CHECK_EQ(lowered_group.size(), 1) << "Lowerd Function Is Not Equal 1!";
-        << "Lowerd Function Is Not Equal 1!";
+    lowered_funcs.emplace_back(std::move(lowered_group));
  }
 }
 void ParallelCompiler::Task::CodegenAndJit() {
-  VLOG(2) << "Start Codegen and JIT with Group ["
+  VLOG(2) << "Start Codegen and JIT with Group [" << start_gidx << "-"
-          << cinn::utils::Join(this->gidx, ", ") << "] at "
+          << stop_gidx << ") at thread" << std::this_thread::get_id();
-          << std::this_thread::get_id();
  // build module
  ir::Module::Builder builder(common::UniqName("module"), target);
  for (auto& func : lowered_funcs) {
@@ -172,7 +155,6 @@ void ParallelCompiler::Task::CodegenAndJit() {
  }
  auto ir_module = builder.Build();
-  // codegen compile
  if (target == common::DefaultNVGPUTarget()) {
 #ifdef CINN_WITH_CUDA
    auto splited_module = backends::SplitCudaAndHostModule(ir_module);
@@ -185,14 +167,15 @@ void ParallelCompiler::Task::CodegenAndJit() {
    auto cuda_c = codegen.Compile(dmodule);
    CHECK(!cuda_c.empty()) << "Compile CUDA C code failed from device module:\n"
                           << dmodule;
+    source_codes.emplace_back(cuda_c);
    cinn::backends::SourceCodePrint::GetInstance()->write(cuda_c);
-    graph->SaveSourceCode(cuda_c);
    using runtime::cuda::CUDAModule;
    backends::nvrtc::Compiler compiler;
    auto ptx = compiler(cuda_c);
    CHECK(!ptx.empty()) << "Compile PTX failed from source code:\n" << cuda_c;
+    source_ptxs.emplace_back(ptx);
    // load cumodule
    cumodule.reset(new CUDAModule(ptx,
                                  compiler.compile_to_cubin()
@@ -218,7 +201,7 @@ void ParallelCompiler::Task::CodegenAndJit() {
 void ParallelCompiler::Task::BuildInstruction() {
  // create instruction.
-  for (int idx : gidx) {
+  for (int idx = start_gidx; idx < stop_gidx; ++idx) {
    VLOG(2) << "Start BuildInstruction of Group " << idx << " at "
            << std::this_thread::get_id();
    auto& group = graph->fusion_groups[idx];
@@ -240,15 +223,6 @@ void ParallelCompiler::Task::BuildInstruction() {
  }
 }
-int ParallelCompiler::GetGroupIdx() {
-  std::lock_guard<std::mutex> lock(mtx_);
-  if (index < graph_->fusion_groups.size()) {
-    return index++;
-  } else {
-    return -1;
-  }
-}
 }  // namespace framework
 }  // namespace hlir
 }  // namespace cinn
--- a/paddle/cinn/hlir/framework/parallel_compiler.h
+++ b/paddle/cinn/hlir/framework/parallel_compiler.h
@@ -35,23 +35,18 @@ class ParallelCompiler {
    std::vector<std::vector<ir::LoweredFunc>> lowered_funcs;
  };
- public:
+  struct CompilationResult {
-  explicit ParallelCompiler(std::shared_ptr<Scope>& scope,  // NOLINT
+    // Lower result
-                            std::shared_ptr<Graph>& graph,  // NOLINT
+    std::vector<std::vector<ir::LoweredFunc>> lowered_funcs;
-                            const CompileOptions& option,
+    // Host/CUDA codegen result
-                            const common::Target& target)
+    std::vector<std::string> source_codes;
-      : scope_(scope), graph_(graph), option_(option), target_(target) {}
+    // CUDA ptx result
-  ~ParallelCompiler() {}
+    std::vector<std::string> source_ptxs;
-  std::vector<std::unique_ptr<Instruction>> operator()();
+    // Instruction result
+    std::vector<std::unique_ptr<Instruction>> instructions;
- private:
+  };
-  void SplitTask();
-  void LaunchTask();
-  std::vector<std::unique_ptr<Instruction>> MergeResult();
- public:
  struct Task {
-   public:
    Task(ParallelCompiler* p,
         std::shared_ptr<Scope>& s,  // NOLINT
         std::shared_ptr<Graph>& g,  // NOLINT
@@ -62,30 +57,40 @@ class ParallelCompiler {
    void CodegenAndJit();
    void BuildInstruction();
-   public:
    const Target target;
    ParallelCompiler* compiler;
    std::shared_ptr<Scope> scope;
    std::shared_ptr<Graph> graph;
    const CompileOptions& options;
-    std::vector<int> gidx;
+    int start_gidx;
+    int stop_gidx;
    std::vector<std::unique_ptr<Instruction>> instructions;
    std::vector<std::vector<ir::LoweredFunc>> lowered_funcs;
+    std::vector<std::string> source_codes;
+    std::vector<std::string> source_ptxs;
-   public:
    std::unique_ptr<backends::ExecutionEngine> engine;
 #ifdef CINN_WITH_CUDA
    std::unique_ptr<runtime::cuda::CUDAModule> cumodule;
 #endif
  };
-  std::vector<Task> tasks_;
-  int GetGroupIdx();
+  explicit ParallelCompiler(std::shared_ptr<Scope>& scope,  // NOLINT
+                            std::shared_ptr<Graph>& graph,  // NOLINT
+                            const CompileOptions& option,
+                            const common::Target& target)
+      : scope_(scope), graph_(graph), option_(option), target_(target) {}
+  ~ParallelCompiler() {}
+  CompilationResult operator()();
 private:
-  int index{0};
+  void SplitTask();
-  std::mutex mtx_;
+  void LaunchTask();
+  void RunTask(Task* task);
+  CompilationResult MergeResult();
+  std::vector<Task> tasks_;
  const common::Target target_;
  const CompileOptions& option_;
  std::shared_ptr<Scope> scope_;

--- a/paddle/cinn/hlir/framework/visualize_helper.cc
+++ b/paddle/cinn/hlir/framework/visualize_helper.cc
@@ -148,66 +148,30 @@ bool PassPrinter::End() {
 }
 bool MakeDirectory(const std::string& dirname, mode_t mode) {
-  auto len = dirname.length();
+  struct stat st;
-  std::vector<char> dir_path(len + 1, '\0');
+  std::string path;
-  strncpy(dir_path.data(), dirname.c_str(), len);
+  for (int i = 0; i < dirname.size(); ++i) {
-  char* path = dir_path.data();
+    path.push_back(dirname[i]);
-  for (char* p = strchr(path + 1, '/'); p; p = strchr(p + 1, '/')) {
+    if (!(dirname[i] == '/' || i + 1 == dirname.size())) {
-    *p = '\0';
+      continue;
-    if (mkdir(path, mode) == -1) {
+    }
-      if (errno != EEXIST) {
+    if (stat(path.c_str(), &st) == 0) {
-        *p = '/';
+      if (S_ISDIR(st.st_mode)) {
+        continue;
+      } else {
+        LOG(WARNING) << path << " is not a directory, please check your path.";
        return false;
      }
-    }
+    } else {
-    *p = '/';
+      if (mkdir(path.c_str(), mode) == 0) {
-  }
+        continue;
-  return true;
+      } else {
-}
+        LOG(WARNING) << "Make directory fail: " << path;
+        return false;
-std::string GetFilePathForGroup(const std::vector<std::vector<Node*>>& groups,
-                                const int group_id,
-                                const std::string& viz_path) {
-  std::string filename = "";
-  for (auto* node : groups[group_id]) {
-    filename += "_" + node->id();
-  }
-  int max_len = 50;
-  std::string simplified_filename = filename;
-  if (filename.size() > max_len) {
-    static std::unordered_map<std::string, std::string> funcname_map = {
-        {"const_scalar", "scalar"},
-        {"fill_constant", "fill"},
-        {"identity", "copy"},
-        {"broadcast_to", "broadcast"},
-        {"elementwise_add", "add"},
-        {"subtract", "sub"},
-        {"elementwise_mul", "mul"},
-        {"divide", "div"},
-        {"reduce_sum", "reduce"},
-        {"reduce_prod", "reduce"},
-        {"reduce_max", "reduce"},
-        {"reduce_min", "reduce"}};
-    for (auto& item : funcname_map) {
-      size_t index = 0;
-      while (true) {
-        index = simplified_filename.find(item.first, index);
-        if (index == std::string::npos) {
-          break;
-        }
-        simplified_filename.replace(index, item.first.size(), item.second);
-        index += item.second.size();
      }
    }
  }
+  return true;
-  int width = std::to_string(groups.size()).size();
-  std::stringstream ss;
-  ss << viz_path;
-  ss << std::setw(width) << std::setfill('0') << group_id;
-  ss << simplified_filename.substr(0, 50) << ".dot";
-  return ss.str();
 }
 std::string GenNodeDataLabel(
@@ -313,7 +277,7 @@ void Summary(const std::vector<std::vector<Node*>>& groups,
     << "Numbers\n";
  print_table(fusion_group_detail);
-  std::string filepath = viz_path + "summary.txt";
+  std::string filepath = viz_path + "/summary.txt";
  WriteToFile(filepath, ss.str());
 }

--- a/paddle/cinn/hlir/framework/visualize_helper.h
+++ b/paddle/cinn/hlir/framework/visualize_helper.h
@@ -133,10 +133,6 @@ inline std::vector<utils::DotAttr> GetGroupAttrs(size_t group_size) {
 bool MakeDirectory(const std::string& dirname, mode_t mode);
-std::string GetFilePathForGroup(const std::vector<std::vector<Node*>>& groups,
-                                const int group_id,
-                                const std::string& viz_path);
 std::string GenNodeDataLabel(
    const NodeData* node,
    const absl::flat_hash_map<std::string, shape_t>& shape_dict,

--- a/paddle/cinn/runtime/flags.cc
+++ b/paddle/cinn/runtime/flags.cc
@@ -44,13 +44,8 @@ DEFINE_string(cinn_nvcc_cmd_path,
              StringFromEnv("FLAGS_cinn_nvcc_cmd_path", "/usr/local/cuda/bin"),
              "Setting nvcc default path!");
-DEFINE_int32(cinn_parallel_compile_size,
-             Int32FromEnv("FLAGS_cinn_parallel_compile_size", 16),
-             "When use parallel compile, set the number of group compiled by "
-             "each thread.");
 DEFINE_int32(cinn_parallel_compile_thread,
-             Int32FromEnv("FLAGS_cinn_parallel_compile_thread", -1),
+             Int32FromEnv("FLAGS_cinn_parallel_compile_thread", 16),
             "How much thread the parallel compile used.");
 DEFINE_bool(cinn_use_op_fusion,
@@ -131,6 +126,26 @@ DEFINE_string(cinn_source_code_save_path,
              "Specify the directory path of generated source code, which is "
              "used for debug.");
+DEFINE_string(cinn_dump_group_lowered_func,
+              StringFromEnv("FLAGS_cinn_dump_group_lowered_func", ""),
+              "Specify the path for dump lowered functions by group, which is "
+              "used for debug.");
+DEFINE_string(
+    cinn_dump_group_source_code,
+    StringFromEnv("FLAGS_cinn_dump_group_source_code", ""),
+    "Specify the path for dump source code by group, which is used for debug.");
+DEFINE_string(
+    cinn_dump_group_ptx,
+    StringFromEnv("FLAGS_cinn_dump_group_ptx", ""),
+    "Specify the path for dump ptx by group, which is used for debug.");
+DEFINE_string(
+    cinn_dump_group_instruction,
+    StringFromEnv("FLAGS_cinn_dump_group_instruction", ""),
+    "Specify the path for dump instruction by group, which is used for debug.");
 DEFINE_string(cinn_pass_visualize_dir,
              StringFromEnv("FLAGS_cinn_pass_visualize_dir", ""),
              "Specify the directory path of pass visualize file of graph, "