diff --git a/paddle/cinn/auto_schedule/tests/performance_comparison_test.cc b/paddle/cinn/auto_schedule/tests/performance_comparison_test.cc
index 628d8909f270a38d468ea77f19f33a78ec74c0df..c47543564c4497f13ad5b2250f8e5c3940cf7559 100644
--- a/paddle/cinn/auto_schedule/tests/performance_comparison_test.cc
+++ b/paddle/cinn/auto_schedule/tests/performance_comparison_test.cc
@@ -55,7 +55,6 @@ DEFINE_string(resnet50_model_dir,
 DEFINE_int32(evaluate_knobs,
              -1,
              "the options to control which schedule tests will be run.");
-DECLARE_int32(cinn_parallel_compile_size);
 
 namespace cinn {
 namespace auto_schedule {
@@ -78,8 +77,6 @@ class PerformanceTester : public ::testing::Test {
     std::bitset<3> evaluate_knobs = 0UL;
   };
 
-  void SetUp() override { FLAGS_cinn_parallel_compile_size = 0; }
-
   void Evaluate(const frontend::Program& program) {
     if (FLAGS_evaluate_knobs >= 0) {
       options_.evaluate_knobs = FLAGS_evaluate_knobs;
diff --git a/paddle/cinn/backends/compiler.cc b/paddle/cinn/backends/compiler.cc
index 72d69ccc3fed3e3d5b5656486e04bc034f3cb734..57b4116370782941618cee7895086c042ff78a4f 100644
--- a/paddle/cinn/backends/compiler.cc
+++ b/paddle/cinn/backends/compiler.cc
@@ -18,6 +18,7 @@
 
 #include "paddle/cinn/backends/llvm/runtime_symbol_registry.h"
 #include "paddle/cinn/common/context.h"
+#include "paddle/cinn/hlir/framework/visualize_helper.h"
 #ifdef CINN_WITH_CUDA
 #include "paddle/cinn/backends/codegen_cuda_dev.h"
 #include "paddle/cinn/backends/codegen_cuda_host.h"
@@ -29,6 +30,10 @@
 #endif
 
 DECLARE_string(cinn_source_code_save_path);
+DECLARE_string(cinn_dump_group_lowered_func);
+DECLARE_string(cinn_dump_group_source_code);
+DECLARE_string(cinn_dump_group_ptx);
+DECLARE_string(cinn_dump_group_instruction);
 
 namespace cinn {
 namespace backends {
@@ -36,6 +41,81 @@ using ir::Module;
 
 static constexpr int DebugLogMaxLen = 30000;
 
+// Dump each group's first lowered function to lowered_function.txt.
+void CompilationInfoDumper::DumpLoweredFunc() {
+  const std::string& dir = FLAGS_cinn_dump_group_lowered_func;
+  if (dir.empty()) return;  // dumping is disabled
+  const int group_num = static_cast<int>(info_.lowered_funcs.size());
+  for (int idx = 0; idx < group_num; ++idx) {
+    const auto& funcs = info_.lowered_funcs[idx];
+    if (funcs.empty()) continue;  // front() on an empty vector is undefined
+    std::stringstream content;
+    content << funcs.front();
+    Dump(dir, idx, "lowered_function.txt", content.str());
+  }
+}
+
+// Dump info_.source_codes[i] to <dir>/fusion_group_<i>/source_code.cu, where
+// <dir> is FLAGS_cinn_dump_group_source_code; a no-op when the flag is empty.
+void CompilationInfoDumper::DumpSourceCode() {
+  const std::string& code_dir = FLAGS_cinn_dump_group_source_code;
+  if (code_dir.empty()) return;
+  int code_idx = 0;
+  for (const auto& code : info_.source_codes) {
+    Dump(code_dir, code_idx, "source_code.cu", code);
+    ++code_idx;
+  }
+}
+
+// Dump info_.source_ptxs[i] to <dir>/fusion_group_<i>/source_ptx.ptx, where
+// <dir> is FLAGS_cinn_dump_group_ptx; nothing is written if the flag is empty.
+void CompilationInfoDumper::DumpPtxCode() {
+  const std::string& ptx_dir = FLAGS_cinn_dump_group_ptx;
+  if (ptx_dir.empty()) return;
+  int ptx_idx = 0;
+  for (const auto& ptx : info_.source_ptxs) {
+    Dump(ptx_dir, ptx_idx, "source_ptx.ptx", ptx);
+    ++ptx_idx;
+  }
+}
+
+// Dump the text of info_.instructions[i] (Instruction::DumpInstruction) to
+// <FLAGS_cinn_dump_group_instruction>/fusion_group_<i>/instruction.txt.
+void CompilationInfoDumper::DumpInstruction() {
+  const std::string& inst_dir = FLAGS_cinn_dump_group_instruction;
+  if (inst_dir.empty()) return;
+  int inst_idx = 0;
+  for (const auto& inst : info_.instructions) {
+    Dump(inst_dir, inst_idx, "instruction.txt", inst->DumpInstruction());
+    ++inst_idx;
+  }
+}
+
+// Write `content` to <base_path>/fusion_group_<idx>/<file_name>, creating the
+// directory first. Failures are reported with LOG(WARNING) and then ignored.
+void CompilationInfoDumper::Dump(const std::string& base_path,
+                                 const int idx,
+                                 const std::string& file_name,
+                                 const std::string& content) {
+  const std::string dump_path =
+      utils::StringFormat("%s/fusion_group_%d", base_path.c_str(), idx);
+  if (!hlir::framework::MakeDirectory(
+          dump_path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH)) {
+    LOG(WARNING) << "Failed to make directory: \"" << dump_path << "\", "
+                 << file_name << " for this group will not dump.";
+    return;
+  }
+  const std::string dump_file = dump_path + "/" + file_name;
+  VLOG(7) << "Dump " << file_name << " to: " << dump_file;
+  std::ofstream of(dump_file, std::ios_base::out);
+  if (!of.is_open()) {
+    LOG(WARNING) << "Failed to open file: " << dump_file
+                 << ", please check your path.";
+    return;
+  }
+  of << content;  // flushed and closed by the ofstream destructor
+}
+
 SourceCodePrint::SourceCodePrint() {
   if (!FLAGS_cinn_source_code_save_path.empty()) {
     LOG(INFO)
diff --git a/paddle/cinn/backends/compiler.h b/paddle/cinn/backends/compiler.h
index 1293125129a8117ccc8cfcfff614f613b8d3941e..c943a9890fd5d5415df91f4532a3c8964696597f 100644
--- a/paddle/cinn/backends/compiler.h
+++ b/paddle/cinn/backends/compiler.h
@@ -24,6 +24,7 @@
 #include "paddle/cinn/backends/llvm/codegen_llvm.h"
 #include "paddle/cinn/backends/llvm/execution_engine.h"
 #include "paddle/cinn/backends/llvm/simple_jit.h"
+#include "paddle/cinn/hlir/framework/parallel_compiler.h"
 #include "paddle/cinn/lang/packed_func.h"
 #ifdef CINN_WITH_CUDA
 #include "paddle/cinn/runtime/cuda/cuda_module.h"
@@ -32,6 +33,38 @@
 namespace cinn {
 namespace backends {
 
+/**
+ * Dumps the contents of a ParallelCompiler::CompilationResult to files.
+ * Everything is written from the constructor, one kind of data per flag:
+ *   FLAGS_cinn_dump_group_lowered_func  - directory for lowered functions
+ *   FLAGS_cinn_dump_group_source_code   - directory for source code
+ *   FLAGS_cinn_dump_group_ptx           - directory for ptx
+ *   FLAGS_cinn_dump_group_instruction   - directory for instructions
+ */
+class CompilationInfoDumper {
+ public:
+  explicit CompilationInfoDumper(
+      const hlir::framework::ParallelCompiler::CompilationResult& info)
+      : info_(info) {
+    DumpLoweredFunc();
+    DumpSourceCode();
+    DumpPtxCode();
+    DumpInstruction();
+  }
+
+ private:
+  void DumpLoweredFunc();
+  void DumpSourceCode();
+  void DumpPtxCode();
+  void DumpInstruction();
+  void Dump(const std::string& base_path,
+            const int idx,
+            const std::string& file_name,
+            const std::string& content);
+
+  const hlir::framework::ParallelCompiler::CompilationResult& info_;
+};
+
 class SourceCodePrint {
  public:
   static SourceCodePrint* GetInstance() {
diff --git a/paddle/cinn/hlir/framework/graph.cc b/paddle/cinn/hlir/framework/graph.cc
index 6ebd405aeed7f06f6f983f280fe043a862d96f78..0e783cdda9272e17535432681c7b1fd116f6ec14 100644
--- a/paddle/cinn/hlir/framework/graph.cc
+++ b/paddle/cinn/hlir/framework/graph.cc
@@ -308,66 +308,40 @@ void Graph::VisualizeGroupedGraph(
     return;
   }
 
-  int viz_id = viz_count_.fetch_add(1);
-  {
-    // create base Directory
-    viz_path_ =
-        utils::StringFormat("%s/fusion_groups_%d/",
-                            FLAGS_cinn_fusion_groups_graphviz_dir.c_str(),
-                            viz_id);
-    if (!MakeDirectory(viz_path_,
-                       S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH)) {
-      LOG_IF(WARNING, viz_id == 0)
-          << "Failed to make directory: \"" << viz_path_
-          << "\", the CINN subgraph's fusion group information will not print.";
-      viz_path_.clear();
-      return;
-    }
-    LOG_IF(INFO, viz_id == 0) << "The CINN subgraph's fusion group information "
-                                 "will writing into path: \""
-                              << FLAGS_cinn_fusion_groups_graphviz_dir << "\"";
-  }
-
+  // Dump debug info for each group
+  LOG(INFO) << "Dump graph debug info to: "
+            << FLAGS_cinn_fusion_groups_graphviz_dir;
   const auto& groups = RemoveAccCheckGroups(origin_groups);
-  {
-    // save python test file
-    std::string py_test_path = viz_path_ + "/tests/";
-    if (!MakeDirectory(py_test_path,
-                       S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH)) {
-      LOG_IF(WARNING, viz_id == 0)
-          << "Failed to make directory: \"" << py_test_path
-          << "\", the CINN subgraph's python test file will not generate.";
-      py_test_path.clear();
-    }
-    if (!py_test_path.empty()) {
-      for (int i = 0; i < groups.size(); i++) {
-        WriteToFile(py_test_path + "test_group_" + std::to_string(i) + ".py",
-                    GenerateGroupPythonCode(groups[i], fetch_var_ids));
-      }
-    }
-  }
-
-  Summary(groups, viz_path_);
-  WriteToFile(viz_path_ + "grouped_graph.dot",
-              VisualizeGraph(groups, fetch_var_ids));
-
-  {
-    // save each group's graphviz dot file
-    std::string group_path = viz_path_ + "/groups/";
+  const auto& group_dots = VisualizeGroups(groups, fetch_var_ids);
+  for (int idx = 0; idx < groups.size(); ++idx) {
+    // Create fusion_group_x folder
+    auto group_path =
+        utils::StringFormat("%s/fusion_group_%d",
+                            FLAGS_cinn_fusion_groups_graphviz_dir.c_str(),
+                            idx);
     if (!MakeDirectory(group_path,
                        S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH)) {
-      LOG_IF(WARNING, viz_id == 0)
-          << "Failed to make directory: \"" << group_path
-          << "\", the CINN subgraph's group graphviz file will not save.";
-      group_path.clear();
-    }
-    if (!group_path.empty()) {
-      const auto& group_dots = VisualizeGroups(groups, fetch_var_ids);
-      for (int i = 0; i < group_dots.size(); ++i) {
-        WriteToFile(GetFilePathForGroup(groups, i, group_path), group_dots[i]);
-      }
+      LOG(WARNING) << "Failed to make directory: \"" << group_path
+                   << "\", skip dump info for this group.";
+      continue;
     }
+    // Create test_group_x.py
+    auto python_test_file =
+        utils::StringFormat("%s/test_group_%d.py", group_path.c_str(), idx);
+    WriteToFile(python_test_file,
+                GenerateGroupPythonCode(groups[idx], fetch_var_ids));
+    // Create x_group_name.dot
+    auto graph_group_file =
+        utils::StringFormat("%s/graph_group_%d.dot", group_path.c_str(), idx);
+    WriteToFile(graph_group_file, group_dots[idx]);
   }
+
+  // Summary
+  Summary(groups, FLAGS_cinn_fusion_groups_graphviz_dir);
+  // Grouped graph
+  auto grouped_graph_file = utils::StringFormat(
+      "%s/grouped_graph.dot", FLAGS_cinn_fusion_groups_graphviz_dir.c_str());
+  WriteToFile(grouped_graph_file, VisualizeGraph(groups, fetch_var_ids));
 }
 
 std::string Graph::VisualizeGraph(
@@ -494,8 +468,6 @@ std::vector<std::string> Graph::VisualizeGroups(
   return dot_vec;
 }
 
-std::atomic_size_t Graph::viz_count_{0};
-
 std::unordered_set<NodeData*> Graph::Group::GetInputNodeDatas() {
   std::unordered_set<NodeData*> group_inputs;
 
@@ -543,25 +515,6 @@ std::unordered_set<NodeData*> Graph::Group::GetOutputNodeDatas() {
   return group_outputs;
 }
 
-void Graph::SaveSourceCode(const std::string& code) {
-  if (cinn::runtime::CheckStringFlagFalse(
-          FLAGS_cinn_fusion_groups_graphviz_dir) ||
-      viz_path_.empty()) {
-    return;
-  }
-  WriteToFile(viz_path_ + "source_code.cu", code);
-}
-
-void Graph::SavePTXCode(const std::string& ptx) {
-  if (cinn::runtime::CheckStringFlagFalse(
-          FLAGS_cinn_fusion_groups_graphviz_dir) ||
-      viz_path_.empty()) {
-    return;
-  }
-
-  WriteToFile(viz_path_ + "source_code.ptx", ptx);
-}
-
 }  // namespace framework
 }  // namespace hlir
 }  // namespace cinn
diff --git a/paddle/cinn/hlir/framework/graph.h b/paddle/cinn/hlir/framework/graph.h
index 5f4d2e4d9791fbd48345800896d081aacfe598f2..7f99ed004da7ed534b3951c50559ccb01428e436 100644
--- a/paddle/cinn/hlir/framework/graph.h
+++ b/paddle/cinn/hlir/framework/graph.h
@@ -283,9 +283,6 @@ class Graph : public cinn::common::Graph {
       const std::vector<std::vector<Node*>>& groups,
       const std::unordered_set<std::string>& fetch_var_ids = {});
 
-  void SaveSourceCode(const std::string& code);
-  void SavePTXCode(const std::string& ptx);
-
  private:
   std::string DebugGroupedGraph(
       const std::vector<std::vector<Node*>>& groups,
@@ -301,9 +298,6 @@ class Graph : public cinn::common::Graph {
 
   std::vector<std::vector<Node*>> FusionGroupsToGroups();
 
-  std::string viz_path_;
-  static std::atomic_size_t viz_count_;
-
   CINN_DISALLOW_COPY_AND_ASSIGN(Graph);
 };
 
diff --git a/paddle/cinn/hlir/framework/graph_compiler.cc b/paddle/cinn/hlir/framework/graph_compiler.cc
index 16d6366f92745f1d2bb77b0d24516c5e5cd9b0f1..7fc6b57932c51660d7f5a2fbcb02154135e08866 100644
--- a/paddle/cinn/hlir/framework/graph_compiler.cc
+++ b/paddle/cinn/hlir/framework/graph_compiler.cc
@@ -20,6 +20,7 @@
 #include <unordered_set>
 
 #include "paddle/cinn/backends/codegen_cuda_dev.h"
+#include "paddle/cinn/backends/compiler.h"
 #include "paddle/cinn/common/context.h"
 #include "paddle/cinn/hlir/framework/instruction.h"
 #include "paddle/cinn/hlir/framework/op_lowering_util.h"
@@ -77,21 +78,24 @@ GraphCompiler::CompilationResult GraphCompiler::Build(
 
   parallel_compiler_ =
       std::make_shared<ParallelCompiler>(scope_, graph_, option, target_);
-  auto instructions = (*parallel_compiler_.get())();
+  auto result = (*parallel_compiler_.get())();
+
+  // Dump compilation result
+  backends::CompilationInfoDumper dumper(result);
 
   if (options.remove_unused_variables) {
-    RemoveInvalidVariables(instructions);
+    RemoveInvalidVariables(result.instructions);
   }
 
   if (options.with_buffer_handle_instruction_inserted) {
     VLOG(3) << "option.with_buffer_handle_instruction_inserted enable";
-    InsertBufferHandlers(&instructions);
+    InsertBufferHandlers(&result.instructions);
   }
   VLOG(2) << "Compile With Parallel Compiler Done!";
 
   GraphCompiler::CompilationResult compilation_result;
   compilation_result.runtime_program.reset(
-      new Program(scope_, std::move(instructions)));
+      new Program(scope_, std::move(result.instructions)));
   return compilation_result;
 }
 
diff --git a/paddle/cinn/hlir/framework/instruction.cc b/paddle/cinn/hlir/framework/instruction.cc
index abd86b8a6d4dedb87e8336f18f087ae683a5ea27..01fb89aa2ac0d8805e1d316d9711b4a8696ccc6c 100644
--- a/paddle/cinn/hlir/framework/instruction.cc
+++ b/paddle/cinn/hlir/framework/instruction.cc
@@ -365,6 +365,29 @@ void Instruction::Run(
   //   }
 }
 
+// Render this instruction as text: for every function its name, its pointer
+// and the sorted names of its input and output arguments.
+std::string Instruction::DumpInstruction() {
+  // in_args_ can be emptied via ClearInArgs(), so bound-check before indexing.
+  auto dump_args = [](std::stringstream& os, const char* prefix, size_t i,
+                      const std::vector<std::vector<std::string>>& args) {
+    if (i >= args.size()) return;
+    auto names = args[i];  // copy: sorting must not reorder the real args
+    std::sort(names.begin(), names.end());
+    for (const auto& name : names) os << prefix << name << "\n";
+  };
+  std::stringstream ss;
+  ss << "Instruction {\n";
+  for (size_t i = 0; i < fn_names_.size(); ++i) {
+    ss << "  Function " << fn_names_[i] << ":\n";
+    ss << "    function ptr: " << fn_ptrs_[i] << "\n";
+    dump_args(ss, "    input: ", i, in_args_);
+    dump_args(ss, "    output: ", i, out_args_);
+  }
+  ss << "}\n";
+  return ss.str();
+}
+
 void Instruction::CheckResults(
     const std::map<std::string, cinn_pod_value_t>* name2podargs, void* stream) {
 #ifdef CINN_WITH_CUDA
diff --git a/paddle/cinn/hlir/framework/instruction.h b/paddle/cinn/hlir/framework/instruction.h
index 225cf6d08fd568944dc041cada8ac9a375b19a9e..23da5c532c4cd38bd50a8ac04b04f69fe7572e8c 100644
--- a/paddle/cinn/hlir/framework/instruction.h
+++ b/paddle/cinn/hlir/framework/instruction.h
@@ -132,6 +132,8 @@ class Instruction {
 
   int size() { return fn_ptrs_.size(); }
 
+  std::string DumpInstruction();
+
   std::vector<std::vector<std::string>> GetInArgs() { return in_args_; }
   std::vector<std::vector<std::string>> GetOutArgs() { return out_args_; }
   void ClearInArgs() { in_args_.clear(); }
diff --git a/paddle/cinn/hlir/framework/parallel_compiler.cc b/paddle/cinn/hlir/framework/parallel_compiler.cc
index 5004b6de8b5e3c30ec2bbb17ac8d2bf3a69cd56a..1b2cbca9e056d73ddaaaf9554408011ad5fa35e1 100644
--- a/paddle/cinn/hlir/framework/parallel_compiler.cc
+++ b/paddle/cinn/hlir/framework/parallel_compiler.cc
@@ -30,15 +30,13 @@
 #include "paddle/cinn/ir/module.h"
 #include "paddle/cinn/runtime/flags.h"
 
-DECLARE_int32(cinn_parallel_compile_size);
 DECLARE_int32(cinn_parallel_compile_thread);
 
 namespace cinn {
 namespace hlir {
 namespace framework {
-static constexpr int DebugLogMaxLen = 30000;
 
-std::vector<std::unique_ptr<Instruction>> ParallelCompiler::operator()() {
+ParallelCompiler::CompilationResult ParallelCompiler::operator()() {
   if (graph_->fusion_groups.size() == 0) {
     hlir::framework::ApplyPasses(graph_.get(), {"BuildNonFusedGroupsPass"});
   }
@@ -50,48 +48,31 @@ std::vector<std::unique_ptr<Instruction>> ParallelCompiler::operator()() {
   return MergeResult();
 }
 
-OpPatternKind GetOpKind(const framework::Node* node) {
-  auto& op_pattern_dict =
-      framework::Operator::GetAttrs<OpPatternKind>("OpPattern");
-  CHECK(op_pattern_dict.Find(node->op()))
-      << "Don't find the pattern of op : " << node->id();
-  auto kind = op_pattern_dict[node->op()];
-
-  if (kind == framework::kBroadcast) {
-    // As binary op was defined as broadcast, actually it should be
-    // element-wise.
-    if (node->op()->name != "broadcast_to") {
-      return framework::kElementWise;
-    }
-  }
-
-  return kind;
-}
-
 void ParallelCompiler::SplitTask() {
   CHECK(graph_->fusion_groups.size());
   CHECK(graph_->fusion_groups.size() == option_.lowered_funcs.size() ||
         option_.lowered_funcs.size() == 0);
-  // split task
-  int max_task_num = FLAGS_cinn_parallel_compile_thread > 0
-                         ? FLAGS_cinn_parallel_compile_thread
-                         : graph_->fusion_groups.size();
-
-  int group_per_task = graph_->fusion_groups.size();
-  if (max_task_num > 1) {
-    group_per_task = FLAGS_cinn_parallel_compile_size > 0
-                         ? FLAGS_cinn_parallel_compile_size
-                         : ((graph_->fusion_groups.size() + max_task_num - 1) /
-                            max_task_num);
-  }
-
+  // Partition the fusion groups into contiguous index ranges, one per task.
+  // At most FLAGS_cinn_parallel_compile_thread tasks are created (a single
+  // task when the flag is not positive); each covers [start_gidx, stop_gidx).
+  int fusion_group_size = graph_->fusion_groups.size();
+  int thread_size = FLAGS_cinn_parallel_compile_thread > 0
+                        ? FLAGS_cinn_parallel_compile_thread
+                        : 1;
+  int group_per_task =
+      (graph_->fusion_groups.size() + thread_size - 1) / thread_size;
   for (int idx = 0; idx < graph_->fusion_groups.size(); idx += group_per_task) {
-    tasks_.emplace_back(this, scope_, graph_, option_, target_);
+    Task task(this, scope_, graph_, option_, target_);
+    task.start_gidx = idx;
+    task.stop_gidx =
+        (idx + group_per_task > fusion_group_size ? fusion_group_size
+                                                  : idx + group_per_task);
+    tasks_.emplace_back(std::move(task));
   }
   VLOG(2) << "Split task to " << tasks_.size() << " sub-task!";
 }
 
-void RunTask(ParallelCompiler::Task* task) {
+void ParallelCompiler::RunTask(ParallelCompiler::Task* task) {
   VLOG(2) << "Stark run sub-task, Thread Id : " << std::this_thread::get_id();
   VLOG(4) << "Start Lowering";
   task->Lowering();
@@ -106,7 +87,7 @@ void ParallelCompiler::LaunchTask() {
   // start sub-task.
   std::vector<std::thread> threads;
   for (int idx = 1; idx < tasks_.size(); ++idx) {
-    threads.emplace_back(RunTask, &tasks_[idx]);
+    threads.emplace_back(&ParallelCompiler::RunTask, this, &tasks_[idx]);
   }
 
   RunTask(&tasks_[0]);
@@ -116,11 +97,20 @@ void ParallelCompiler::LaunchTask() {
   }
 }
 
-std::vector<std::unique_ptr<Instruction>> ParallelCompiler::MergeResult() {
-  std::vector<std::unique_ptr<Instruction>> res(graph_->fusion_groups.size());
+ParallelCompiler::CompilationResult ParallelCompiler::MergeResult() {
+  ParallelCompiler::CompilationResult res;
   for (auto& task : tasks_) {
-    for (int idx = 0; idx < task.gidx.size(); ++idx) {
-      res[task.gidx[idx]] = std::move(task.instructions[idx]);
+    for (auto& lowered_func : task.lowered_funcs) {
+      res.lowered_funcs.emplace_back(lowered_func);
+    }
+    for (auto& source_code : task.source_codes) {
+      res.source_codes.emplace_back(source_code);
+    }
+    for (auto& source_ptx : task.source_ptxs) {
+      res.source_ptxs.emplace_back(source_ptx);
+    }
+    for (auto& instruction : task.instructions) {
+      res.instructions.emplace_back(std::move(instruction));
     }
   }
   return std::move(res);
@@ -138,13 +128,7 @@ void ParallelCompiler::Task::Lowering() {
           "infershape");
 
   OpLowerer op_lowerer(dtype_dict, shape_dict, target);
-  while (true) {
-    int idx = compiler->GetGroupIdx();
-    if (idx < 0) {
-      break;
-    }
-
-    gidx.push_back(idx);
+  for (int idx = start_gidx; idx < stop_gidx; ++idx) {
     if (options.lowered_funcs.size()) {
       lowered_funcs.push_back(options.lowered_funcs[idx]);
       continue;
@@ -154,16 +138,15 @@ void ParallelCompiler::Task::Lowering() {
             << std::this_thread::get_id() << " :\n"
             << "Group " << idx << " {\n"
             << graph->DebugGroupedGraph(group->CollectNodes()) << "}\n";
-    lowered_funcs.emplace_back(std::move(op_lowerer.Lower(group)));
-    CHECK_EQ(lowered_funcs.back().size(), 1)
-        << "Lowerd Function Is Not Equal 1!";
+    auto lowered_group = op_lowerer.Lower(group);
+    CHECK_EQ(lowered_group.size(), 1) << "Lowerd Function Is Not Equal 1!";
+    lowered_funcs.emplace_back(std::move(lowered_group));
   }
 }
 
 void ParallelCompiler::Task::CodegenAndJit() {
-  VLOG(2) << "Start Codegen and JIT with Group ["
-          << cinn::utils::Join(this->gidx, ", ") << "] at "
-          << std::this_thread::get_id();
+  VLOG(2) << "Start Codegen and JIT with Group [" << start_gidx << "-"
+          << stop_gidx << ") at thread" << std::this_thread::get_id();
   // build module
   ir::Module::Builder builder(common::UniqName("module"), target);
   for (auto& func : lowered_funcs) {
@@ -172,7 +155,6 @@ void ParallelCompiler::Task::CodegenAndJit() {
   }
 
   auto ir_module = builder.Build();
-  // codegen compile
   if (target == common::DefaultNVGPUTarget()) {
 #ifdef CINN_WITH_CUDA
     auto splited_module = backends::SplitCudaAndHostModule(ir_module);
@@ -185,14 +167,15 @@ void ParallelCompiler::Task::CodegenAndJit() {
     auto cuda_c = codegen.Compile(dmodule);
     CHECK(!cuda_c.empty()) << "Compile CUDA C code failed from device module:\n"
                            << dmodule;
+    source_codes.emplace_back(cuda_c);
 
     cinn::backends::SourceCodePrint::GetInstance()->write(cuda_c);
-    graph->SaveSourceCode(cuda_c);
 
     using runtime::cuda::CUDAModule;
     backends::nvrtc::Compiler compiler;
     auto ptx = compiler(cuda_c);
     CHECK(!ptx.empty()) << "Compile PTX failed from source code:\n" << cuda_c;
+    source_ptxs.emplace_back(ptx);
     // load cumodule
     cumodule.reset(new CUDAModule(ptx,
                                   compiler.compile_to_cubin()
@@ -218,7 +201,7 @@ void ParallelCompiler::Task::CodegenAndJit() {
 
 void ParallelCompiler::Task::BuildInstruction() {
   // create instruction.
-  for (int idx : gidx) {
+  for (int idx = start_gidx; idx < stop_gidx; ++idx) {
     VLOG(2) << "Start BuildInstruction of Group " << idx << " at "
             << std::this_thread::get_id();
     auto& group = graph->fusion_groups[idx];
@@ -240,15 +223,6 @@ void ParallelCompiler::Task::BuildInstruction() {
   }
 }
 
-int ParallelCompiler::GetGroupIdx() {
-  std::lock_guard<std::mutex> lock(mtx_);
-  if (index < graph_->fusion_groups.size()) {
-    return index++;
-  } else {
-    return -1;
-  }
-}
-
 }  // namespace framework
 }  // namespace hlir
 }  // namespace cinn
diff --git a/paddle/cinn/hlir/framework/parallel_compiler.h b/paddle/cinn/hlir/framework/parallel_compiler.h
index 45fc4fef77a8d2a363a0e6dd50b82bc5104986c6..fad32d2c8a7eeae6047f91fe1e82ded1a0cdf028 100644
--- a/paddle/cinn/hlir/framework/parallel_compiler.h
+++ b/paddle/cinn/hlir/framework/parallel_compiler.h
@@ -35,23 +35,18 @@ class ParallelCompiler {
     std::vector<std::vector<ir::LoweredFunc>> lowered_funcs;
   };
 
- public:
-  explicit ParallelCompiler(std::shared_ptr<Scope>& scope,  // NOLINT
-                            std::shared_ptr<Graph>& graph,  // NOLINT
-                            const CompileOptions& option,
-                            const common::Target& target)
-      : scope_(scope), graph_(graph), option_(option), target_(target) {}
-  ~ParallelCompiler() {}
-  std::vector<std::unique_ptr<Instruction>> operator()();
-
- private:
-  void SplitTask();
-  void LaunchTask();
-  std::vector<std::unique_ptr<Instruction>> MergeResult();
+  struct CompilationResult {
+    // Lowered functions gathered from all tasks, appended in task order.
+    std::vector<std::vector<ir::LoweredFunc>> lowered_funcs;
+    // Generated source code; NOTE(review): seems one entry per task - confirm.
+    std::vector<std::string> source_codes;
+    // Compiled ptx; NOTE(review): seems one entry per task - confirm.
+    std::vector<std::string> source_ptxs;
+    // Instructions moved out of all tasks, appended in task order.
+    std::vector<std::unique_ptr<Instruction>> instructions;
+  };
 
- public:
   struct Task {
-   public:
     Task(ParallelCompiler* p,
          std::shared_ptr<Scope>& s,  // NOLINT
          std::shared_ptr<Graph>& g,  // NOLINT
@@ -62,30 +57,40 @@ class ParallelCompiler {
     void CodegenAndJit();
     void BuildInstruction();
 
-   public:
     const Target target;
     ParallelCompiler* compiler;
     std::shared_ptr<Scope> scope;
     std::shared_ptr<Graph> graph;
     const CompileOptions& options;
 
-    std::vector<int> gidx;
+    int start_gidx;
+    int stop_gidx;
     std::vector<std::unique_ptr<Instruction>> instructions;
     std::vector<std::vector<ir::LoweredFunc>> lowered_funcs;
+    std::vector<std::string> source_codes;
+    std::vector<std::string> source_ptxs;
 
-   public:
     std::unique_ptr<backends::ExecutionEngine> engine;
 #ifdef CINN_WITH_CUDA
     std::unique_ptr<runtime::cuda::CUDAModule> cumodule;
 #endif
   };
-  std::vector<Task> tasks_;
-  int GetGroupIdx();
+
+  explicit ParallelCompiler(std::shared_ptr<Scope>& scope,  // NOLINT
+                            std::shared_ptr<Graph>& graph,  // NOLINT
+                            const CompileOptions& option,
+                            const common::Target& target)
+      : scope_(scope), graph_(graph), option_(option), target_(target) {}
+  ~ParallelCompiler() {}
+  CompilationResult operator()();
 
  private:
-  int index{0};
-  std::mutex mtx_;
+  void SplitTask();
+  void LaunchTask();
+  void RunTask(Task* task);
+  CompilationResult MergeResult();
 
+  std::vector<Task> tasks_;
   const common::Target target_;
   const CompileOptions& option_;
   std::shared_ptr<Scope> scope_;
diff --git a/paddle/cinn/hlir/framework/visualize_helper.cc b/paddle/cinn/hlir/framework/visualize_helper.cc
index a310ac2a0fb8a4914276c321f37a015cce37c048..e370737b67e3f36127807cca541d606282b6d128 100644
--- a/paddle/cinn/hlir/framework/visualize_helper.cc
+++ b/paddle/cinn/hlir/framework/visualize_helper.cc
@@ -148,66 +148,30 @@ bool PassPrinter::End() {
 }
 
 bool MakeDirectory(const std::string& dirname, mode_t mode) {
-  auto len = dirname.length();
-  std::vector<char> dir_path(len + 1, '\0');
-  strncpy(dir_path.data(), dirname.c_str(), len);
-  char* path = dir_path.data();
-  for (char* p = strchr(path + 1, '/'); p; p = strchr(p + 1, '/')) {
-    *p = '\0';
-    if (mkdir(path, mode) == -1) {
-      if (errno != EEXIST) {
-        *p = '/';
+  // Create `dirname` and any missing parent, one path component at a time.
+  // Returns false when a component is a non-directory or cannot be created.
+  struct stat st;
+  std::string path;
+  for (size_t i = 0; i < dirname.size(); ++i) {
+    path.push_back(dirname[i]);
+    // Only act once a whole component (or the full path) is accumulated.
+    if (dirname[i] != '/' && i + 1 != dirname.size()) continue;
+    if (stat(path.c_str(), &st) == 0) {
+      if (S_ISDIR(st.st_mode)) {
+        continue;
+      } else {
+        LOG(WARNING) << path << " is not a directory, please check your path.";
         return false;
       }
-    }
-    *p = '/';
-  }
-  return true;
-}
-
-std::string GetFilePathForGroup(const std::vector<std::vector<Node*>>& groups,
-                                const int group_id,
-                                const std::string& viz_path) {
-  std::string filename = "";
-  for (auto* node : groups[group_id]) {
-    filename += "_" + node->id();
-  }
-
-  int max_len = 50;
-  std::string simplified_filename = filename;
-  if (filename.size() > max_len) {
-    static std::unordered_map<std::string, std::string> funcname_map = {
-        {"const_scalar", "scalar"},
-        {"fill_constant", "fill"},
-        {"identity", "copy"},
-        {"broadcast_to", "broadcast"},
-        {"elementwise_add", "add"},
-        {"subtract", "sub"},
-        {"elementwise_mul", "mul"},
-        {"divide", "div"},
-        {"reduce_sum", "reduce"},
-        {"reduce_prod", "reduce"},
-        {"reduce_max", "reduce"},
-        {"reduce_min", "reduce"}};
-    for (auto& item : funcname_map) {
-      size_t index = 0;
-      while (true) {
-        index = simplified_filename.find(item.first, index);
-        if (index == std::string::npos) {
-          break;
-        }
-        simplified_filename.replace(index, item.first.size(), item.second);
-        index += item.second.size();
+    } else {
+      // EEXIST: another thread or process created it after stat() failed.
+      if (mkdir(path.c_str(), mode) != 0 && errno != EEXIST) {
+        LOG(WARNING) << "Make directory fail: " << path;
+        return false;
       }
     }
   }
-
-  int width = std::to_string(groups.size()).size();
-  std::stringstream ss;
-  ss << viz_path;
-  ss << std::setw(width) << std::setfill('0') << group_id;
-  ss << simplified_filename.substr(0, 50) << ".dot";
-  return ss.str();
+  return true;
 }
 
 std::string GenNodeDataLabel(
@@ -313,7 +277,7 @@ void Summary(const std::vector<std::vector<Node*>>& groups,
      << "Numbers\n";
   print_table(fusion_group_detail);
 
-  std::string filepath = viz_path + "summary.txt";
+  std::string filepath = viz_path + "/summary.txt";
   WriteToFile(filepath, ss.str());
 }
 
diff --git a/paddle/cinn/hlir/framework/visualize_helper.h b/paddle/cinn/hlir/framework/visualize_helper.h
index 032f5f2ec4e0c8508a44a7b3b254fd3aedba52a6..3afd3a974db0c83dc6e83b85957fd97673e295eb 100644
--- a/paddle/cinn/hlir/framework/visualize_helper.h
+++ b/paddle/cinn/hlir/framework/visualize_helper.h
@@ -133,10 +133,6 @@ inline std::vector<utils::DotAttr> GetGroupAttrs(size_t group_size) {
 
 bool MakeDirectory(const std::string& dirname, mode_t mode);
 
-std::string GetFilePathForGroup(const std::vector<std::vector<Node*>>& groups,
-                                const int group_id,
-                                const std::string& viz_path);
-
 std::string GenNodeDataLabel(
     const NodeData* node,
     const absl::flat_hash_map<std::string, shape_t>& shape_dict,
diff --git a/paddle/cinn/runtime/flags.cc b/paddle/cinn/runtime/flags.cc
index 5e55dbd029cf317608dae32bfd6e00f86122c0f9..20181835b881b442daf4e25f5cfde01279f85c97 100644
--- a/paddle/cinn/runtime/flags.cc
+++ b/paddle/cinn/runtime/flags.cc
@@ -44,13 +44,8 @@ DEFINE_string(cinn_nvcc_cmd_path,
               StringFromEnv("FLAGS_cinn_nvcc_cmd_path", "/usr/local/cuda/bin"),
               "Setting nvcc default path!");
 
-DEFINE_int32(cinn_parallel_compile_size,
-             Int32FromEnv("FLAGS_cinn_parallel_compile_size", 16),
-             "When use parallel compile, set the number of group compiled by "
-             "each thread.");
-
 DEFINE_int32(cinn_parallel_compile_thread,
-             Int32FromEnv("FLAGS_cinn_parallel_compile_thread", -1),
+             Int32FromEnv("FLAGS_cinn_parallel_compile_thread", 16),
              "How much thread the parallel compile used.");
 
 DEFINE_bool(cinn_use_op_fusion,
@@ -131,6 +126,26 @@ DEFINE_string(cinn_source_code_save_path,
               "Specify the directory path of generated source code, which is "
               "used for debug.");
 
+DEFINE_string(cinn_dump_group_lowered_func,
+              StringFromEnv("FLAGS_cinn_dump_group_lowered_func", ""),
+              "Specify the path for dump lowered functions by group, which is "
+              "used for debug.");
+
+DEFINE_string(
+    cinn_dump_group_source_code,
+    StringFromEnv("FLAGS_cinn_dump_group_source_code", ""),
+    "Specify the path for dump source code by group, which is used for debug.");
+
+DEFINE_string(
+    cinn_dump_group_ptx,
+    StringFromEnv("FLAGS_cinn_dump_group_ptx", ""),
+    "Specify the path for dump ptx by group, which is used for debug.");
+
+DEFINE_string(
+    cinn_dump_group_instruction,
+    StringFromEnv("FLAGS_cinn_dump_group_instruction", ""),
+    "Specify the path for dump instruction by group, which is used for debug.");
+
 DEFINE_string(cinn_pass_visualize_dir,
               StringFromEnv("FLAGS_cinn_pass_visualize_dir", ""),
               "Specify the directory path of pass visualize file of graph, "