diff --git a/paddle/cinn/auto_schedule/tests/performance_comparison_test.cc b/paddle/cinn/auto_schedule/tests/performance_comparison_test.cc index 628d8909f270a38d468ea77f19f33a78ec74c0df..c47543564c4497f13ad5b2250f8e5c3940cf7559 100644 --- a/paddle/cinn/auto_schedule/tests/performance_comparison_test.cc +++ b/paddle/cinn/auto_schedule/tests/performance_comparison_test.cc @@ -55,7 +55,6 @@ DEFINE_string(resnet50_model_dir, DEFINE_int32(evaluate_knobs, -1, "the options to control which schedule tests will be run."); -DECLARE_int32(cinn_parallel_compile_size); namespace cinn { namespace auto_schedule { @@ -78,8 +77,6 @@ class PerformanceTester : public ::testing::Test { std::bitset<3> evaluate_knobs = 0UL; }; - void SetUp() override { FLAGS_cinn_parallel_compile_size = 0; } - void Evaluate(const frontend::Program& program) { if (FLAGS_evaluate_knobs >= 0) { options_.evaluate_knobs = FLAGS_evaluate_knobs; diff --git a/paddle/cinn/backends/compiler.cc b/paddle/cinn/backends/compiler.cc index 72d69ccc3fed3e3d5b5656486e04bc034f3cb734..57b4116370782941618cee7895086c042ff78a4f 100644 --- a/paddle/cinn/backends/compiler.cc +++ b/paddle/cinn/backends/compiler.cc @@ -18,6 +18,7 @@ #include "paddle/cinn/backends/llvm/runtime_symbol_registry.h" #include "paddle/cinn/common/context.h" +#include "paddle/cinn/hlir/framework/visualize_helper.h" #ifdef CINN_WITH_CUDA #include "paddle/cinn/backends/codegen_cuda_dev.h" #include "paddle/cinn/backends/codegen_cuda_host.h" @@ -29,6 +30,10 @@ #endif DECLARE_string(cinn_source_code_save_path); +DECLARE_string(cinn_dump_group_lowered_func); +DECLARE_string(cinn_dump_group_source_code); +DECLARE_string(cinn_dump_group_ptx); +DECLARE_string(cinn_dump_group_instruction); namespace cinn { namespace backends { @@ -36,6 +41,81 @@ using ir::Module; static constexpr int DebugLogMaxLen = 30000; +void CompilationInfoDumper::DumpLoweredFunc() { + if (FLAGS_cinn_dump_group_lowered_func.empty()) { + return; + } + for (int idx = 0; idx < info_.lowered_funcs.size(); ++idx) { + std::stringstream content; + content << info_.lowered_funcs[idx].front(); + Dump(FLAGS_cinn_dump_group_lowered_func, + idx, + "lowered_function.txt", + content.str()); + } +} + +void CompilationInfoDumper::DumpSourceCode() { + if (FLAGS_cinn_dump_group_source_code.empty()) { + return; + } + for (int idx = 0; idx < info_.source_codes.size(); ++idx) { + Dump(FLAGS_cinn_dump_group_source_code, + idx, + "source_code.cu", + info_.source_codes[idx]); + } +} + +void CompilationInfoDumper::DumpPtxCode() { + if (FLAGS_cinn_dump_group_ptx.empty()) { + return; + } + for (int idx = 0; idx < info_.source_ptxs.size(); ++idx) { + Dump(FLAGS_cinn_dump_group_ptx, + idx, + "source_ptx.ptx", + info_.source_ptxs[idx]); + } +} + +void CompilationInfoDumper::DumpInstruction() { + if (FLAGS_cinn_dump_group_instruction.empty()) { + return; + } + for (int idx = 0; idx < info_.instructions.size(); ++idx) { + Dump(FLAGS_cinn_dump_group_instruction, + idx, + "instruction.txt", + info_.instructions[idx]->DumpInstruction()); + } +} + +void CompilationInfoDumper::Dump(const std::string& base_path, + const int idx, + const std::string& file_name, + const std::string& content) { + auto dump_path = + utils::StringFormat("%s/fusion_group_%d", base_path.c_str(), idx); + if (!hlir::framework::MakeDirectory( + dump_path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH)) { + LOG(WARNING) << "Failed to make directory: \"" << dump_path + << "\", the instruction for this group will not dump."; + } else { + auto dump_file = + utils::StringFormat("%s/%s", dump_path.c_str(), file_name.c_str()); + VLOG(7) << "Dump instruction to: " << dump_file; + std::ofstream of(dump_file, std::ios_base::out); + if (of.is_open()) { + of << content; + of.close(); + } else { + LOG(WARNING) << "Failed to open file: " << dump_file + << ", please check your path."; + } + } +} + SourceCodePrint::SourceCodePrint() { if (!FLAGS_cinn_source_code_save_path.empty()) { LOG(INFO) diff --git a/paddle/cinn/backends/compiler.h b/paddle/cinn/backends/compiler.h index 1293125129a8117ccc8cfcfff614f613b8d3941e..c943a9890fd5d5415df91f4532a3c8964696597f 100644 --- a/paddle/cinn/backends/compiler.h +++ b/paddle/cinn/backends/compiler.h @@ -24,6 +24,7 @@ #include "paddle/cinn/backends/llvm/codegen_llvm.h" #include "paddle/cinn/backends/llvm/execution_engine.h" #include "paddle/cinn/backends/llvm/simple_jit.h" +#include "paddle/cinn/hlir/framework/parallel_compiler.h" #include "paddle/cinn/lang/packed_func.h" #ifdef CINN_WITH_CUDA #include "paddle/cinn/runtime/cuda/cuda_module.h" @@ -32,6 +33,38 @@ namespace cinn { namespace backends { +/** + * A class for dumping the code after compilation. + * Use FLAGS_cinn_dump_group_lowered_func to specify the directory to dump + * lowered function. Use FLAGS_cinn_dump_group_source_code to specify the + * directory to dump the source code. Use FLAGS_cinn_dump_group_ptx to specify + * the directory to dump ptx. Use FLAGS_cinn_dump_group_instruction to specify + * the directory to dump instruction. + */ +class CompilationInfoDumper { + public: + explicit CompilationInfoDumper( + const hlir::framework::ParallelCompiler::CompilationResult& info) + : info_(info) { + DumpLoweredFunc(); + DumpSourceCode(); + DumpPtxCode(); + DumpInstruction(); + } + + private: + void DumpLoweredFunc(); + void DumpSourceCode(); + void DumpPtxCode(); + void DumpInstruction(); + void Dump(const std::string& base_path, + const int idx, + const std::string& file_name, + const std::string& content); + + const hlir::framework::ParallelCompiler::CompilationResult& info_; +}; + class SourceCodePrint { public: static SourceCodePrint* GetInstance() { diff --git a/paddle/cinn/hlir/framework/graph.cc b/paddle/cinn/hlir/framework/graph.cc index 6ebd405aeed7f06f6f983f280fe043a862d96f78..0e783cdda9272e17535432681c7b1fd116f6ec14 100644 --- a/paddle/cinn/hlir/framework/graph.cc +++ b/paddle/cinn/hlir/framework/graph.cc @@ -308,66 +308,40 @@ void Graph::VisualizeGroupedGraph( return; } - int viz_id = viz_count_.fetch_add(1); - { - // create base Directory - viz_path_ = - utils::StringFormat("%s/fusion_groups_%d/", - FLAGS_cinn_fusion_groups_graphviz_dir.c_str(), - viz_id); - if (!MakeDirectory(viz_path_, - S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH)) { - LOG_IF(WARNING, viz_id == 0) - << "Failed to make directory: \"" << viz_path_ - << "\", the CINN subgraph's fusion group information will not print."; - viz_path_.clear(); - return; - } - LOG_IF(INFO, viz_id == 0) << "The CINN subgraph's fusion group information " - "will writing into path: \"" - << FLAGS_cinn_fusion_groups_graphviz_dir << "\""; - } - + // Dump debug info for each group + LOG(INFO) << "Dump graph debug info to: " + << FLAGS_cinn_fusion_groups_graphviz_dir; const auto& groups = RemoveAccCheckGroups(origin_groups); - { - // save python test file - std::string py_test_path = viz_path_ + "/tests/"; - if (!MakeDirectory(py_test_path, - S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH)) { - LOG_IF(WARNING, viz_id == 0) - << "Failed to make directory: \"" << py_test_path - << "\", the CINN subgraph's python test file will not generate."; - py_test_path.clear(); - } - if (!py_test_path.empty()) { - for (int i = 0; i < groups.size(); i++) { - WriteToFile(py_test_path + "test_group_" + std::to_string(i) + ".py", - GenerateGroupPythonCode(groups[i], fetch_var_ids)); - } - } - } - - Summary(groups, viz_path_); - WriteToFile(viz_path_ + "grouped_graph.dot", - VisualizeGraph(groups, fetch_var_ids)); - - { - // save each group's graphviz dot file - std::string group_path = viz_path_ + "/groups/"; + const auto& group_dots = VisualizeGroups(groups, fetch_var_ids); + for (int idx = 0; idx < groups.size(); ++idx) { + // Create fusion_group_x folder + auto group_path = + utils::StringFormat("%s/fusion_group_%d", + FLAGS_cinn_fusion_groups_graphviz_dir.c_str(), + idx); if (!MakeDirectory(group_path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH)) { - LOG_IF(WARNING, viz_id == 0) - << "Failed to make directory: \"" << group_path - << "\", the CINN subgraph's group graphviz file will not save."; - group_path.clear(); - } - if (!group_path.empty()) { - const auto& group_dots = VisualizeGroups(groups, fetch_var_ids); - for (int i = 0; i < group_dots.size(); ++i) { - WriteToFile(GetFilePathForGroup(groups, i, group_path), group_dots[i]); - } + LOG(WARNING) << "Failed to make directory: \"" << group_path + << "\", skip dump info for this group."; + continue; } + // Create test_group_x.py + auto python_test_file = + utils::StringFormat("%s/test_group_%d.py", group_path.c_str(), idx); + WriteToFile(python_test_file, + GenerateGroupPythonCode(groups[idx], fetch_var_ids)); + // Create x_group_name.dot + auto graph_group_file = + utils::StringFormat("%s/graph_group_%d.dot", group_path.c_str(), idx); + WriteToFile(graph_group_file, group_dots[idx]); } + + // Summary + Summary(groups, FLAGS_cinn_fusion_groups_graphviz_dir); + // Grouped graph + auto grouped_graph_file = utils::StringFormat( + "%s/grouped_graph.dot", FLAGS_cinn_fusion_groups_graphviz_dir.c_str()); + WriteToFile(grouped_graph_file, VisualizeGraph(groups, fetch_var_ids)); } std::string Graph::VisualizeGraph( @@ -494,8 +468,6 @@ std::vector Graph::VisualizeGroups( return dot_vec; } -std::atomic_size_t Graph::viz_count_{0}; - std::unordered_set Graph::Group::GetInputNodeDatas() { std::unordered_set group_inputs; @@ -543,25 +515,6 @@ std::unordered_set Graph::Group::GetOutputNodeDatas() { return group_outputs; } -void Graph::SaveSourceCode(const std::string& code) { - if (cinn::runtime::CheckStringFlagFalse( - FLAGS_cinn_fusion_groups_graphviz_dir) || - viz_path_.empty()) { - return; - } - WriteToFile(viz_path_ + "source_code.cu", code); -} - -void Graph::SavePTXCode(const std::string& ptx) { - if (cinn::runtime::CheckStringFlagFalse( - FLAGS_cinn_fusion_groups_graphviz_dir) || - viz_path_.empty()) { - return; - } - - WriteToFile(viz_path_ + "source_code.ptx", ptx); -} - } // namespace framework } // namespace hlir } // namespace cinn diff --git a/paddle/cinn/hlir/framework/graph.h b/paddle/cinn/hlir/framework/graph.h index 5f4d2e4d9791fbd48345800896d081aacfe598f2..7f99ed004da7ed534b3951c50559ccb01428e436 100644 --- a/paddle/cinn/hlir/framework/graph.h +++ b/paddle/cinn/hlir/framework/graph.h @@ -283,9 +283,6 @@ class Graph : public cinn::common::Graph { const std::vector>& groups, const std::unordered_set& fetch_var_ids = {}); - void SaveSourceCode(const std::string& code); - void SavePTXCode(const std::string& ptx); - private: std::string DebugGroupedGraph( const std::vector>& groups, @@ -301,9 +298,6 @@ class Graph : public cinn::common::Graph { std::vector> FusionGroupsToGroups(); - std::string viz_path_; - static std::atomic_size_t viz_count_; - CINN_DISALLOW_COPY_AND_ASSIGN(Graph); }; diff --git a/paddle/cinn/hlir/framework/graph_compiler.cc b/paddle/cinn/hlir/framework/graph_compiler.cc index 16d6366f92745f1d2bb77b0d24516c5e5cd9b0f1..7fc6b57932c51660d7f5a2fbcb02154135e08866 100644 --- a/paddle/cinn/hlir/framework/graph_compiler.cc +++ b/paddle/cinn/hlir/framework/graph_compiler.cc @@ -20,6 +20,7 @@ #include #include "paddle/cinn/backends/codegen_cuda_dev.h" +#include "paddle/cinn/backends/compiler.h" #include "paddle/cinn/common/context.h" #include "paddle/cinn/hlir/framework/instruction.h" #include "paddle/cinn/hlir/framework/op_lowering_util.h" @@ -77,21 +78,24 @@ GraphCompiler::CompilationResult GraphCompiler::Build( parallel_compiler_ = std::make_shared(scope_, graph_, option, target_); - auto instructions = (*parallel_compiler_.get())(); + auto result = (*parallel_compiler_.get())(); + + // Dump compilation result + backends::CompilationInfoDumper dumper(result); if (options.remove_unused_variables) { - RemoveInvalidVariables(instructions); + RemoveInvalidVariables(result.instructions); } if (options.with_buffer_handle_instruction_inserted) { VLOG(3) << "option.with_buffer_handle_instruction_inserted enable"; - InsertBufferHandlers(&instructions); + InsertBufferHandlers(&result.instructions); } VLOG(2) << "Compile With Parallel Compiler Done!"; GraphCompiler::CompilationResult compilation_result; compilation_result.runtime_program.reset( - new Program(scope_, std::move(instructions))); + new Program(scope_, std::move(result.instructions))); return compilation_result; } diff --git a/paddle/cinn/hlir/framework/instruction.cc b/paddle/cinn/hlir/framework/instruction.cc index abd86b8a6d4dedb87e8336f18f087ae683a5ea27..01fb89aa2ac0d8805e1d316d9711b4a8696ccc6c 100644 --- a/paddle/cinn/hlir/framework/instruction.cc +++ b/paddle/cinn/hlir/framework/instruction.cc @@ -365,6 +365,29 @@ void Instruction::Run( // } } +std::string Instruction::DumpInstruction() { + std::stringstream ss; + ss << "Instruction {" << std::endl; + for (size_t i = 0; i < fn_names_.size(); ++i) { + ss << " Function " << fn_names_[i] << ":" << std::endl; + ss << " function ptr: " << fn_ptrs_[i] << std::endl; + + auto in_arg = in_args_[i]; + std::sort(in_arg.begin(), in_arg.end()); + for (auto& in_name : in_arg) { + ss << " input: " << in_name << std::endl; + } + + auto out_arg = out_args_[i]; + std::sort(out_arg.begin(), out_arg.end()); + for (auto& out_name : out_arg) { + ss << " output: " << out_name << std::endl; + } + } + ss << "}" << std::endl; + return ss.str(); +} + void Instruction::CheckResults( const std::map* name2podargs, void* stream) { #ifdef CINN_WITH_CUDA diff --git a/paddle/cinn/hlir/framework/instruction.h b/paddle/cinn/hlir/framework/instruction.h index 225cf6d08fd568944dc041cada8ac9a375b19a9e..23da5c532c4cd38bd50a8ac04b04f69fe7572e8c 100644 --- a/paddle/cinn/hlir/framework/instruction.h +++ b/paddle/cinn/hlir/framework/instruction.h @@ -132,6 +132,8 @@ class Instruction { int size() { return fn_ptrs_.size(); } + std::string DumpInstruction(); + std::vector> GetInArgs() { return in_args_; } std::vector> GetOutArgs() { return out_args_; } void ClearInArgs() { in_args_.clear(); } diff --git a/paddle/cinn/hlir/framework/parallel_compiler.cc b/paddle/cinn/hlir/framework/parallel_compiler.cc index 5004b6de8b5e3c30ec2bbb17ac8d2bf3a69cd56a..1b2cbca9e056d73ddaaaf9554408011ad5fa35e1 100644 --- a/paddle/cinn/hlir/framework/parallel_compiler.cc +++ b/paddle/cinn/hlir/framework/parallel_compiler.cc @@ -30,15 +30,13 @@ #include "paddle/cinn/ir/module.h" #include "paddle/cinn/runtime/flags.h" -DECLARE_int32(cinn_parallel_compile_size); DECLARE_int32(cinn_parallel_compile_thread); namespace cinn { namespace hlir { namespace framework { -static constexpr int DebugLogMaxLen = 30000; -std::vector> ParallelCompiler::operator()() { +ParallelCompiler::CompilationResult ParallelCompiler::operator()() { if (graph_->fusion_groups.size() == 0) { hlir::framework::ApplyPasses(graph_.get(), {"BuildNonFusedGroupsPass"}); } @@ -50,48 +48,31 @@ std::vector> ParallelCompiler::operator()() { return MergeResult(); } -OpPatternKind GetOpKind(const framework::Node* node) { - auto& op_pattern_dict = - framework::Operator::GetAttrs("OpPattern"); - CHECK(op_pattern_dict.Find(node->op())) - << "Don't find the pattern of op : " << node->id(); - auto kind = op_pattern_dict[node->op()]; - - if (kind == framework::kBroadcast) { - // As binary op was defined as broadcast, actually it should be - // element-wise. - if (node->op()->name != "broadcast_to") { - return framework::kElementWise; - } - } - - return kind; -} - void ParallelCompiler::SplitTask() { CHECK(graph_->fusion_groups.size()); CHECK(graph_->fusion_groups.size() == option_.lowered_funcs.size() || option_.lowered_funcs.size() == 0); - // split task - int max_task_num = FLAGS_cinn_parallel_compile_thread > 0 - ? FLAGS_cinn_parallel_compile_thread - : graph_->fusion_groups.size(); - - int group_per_task = graph_->fusion_groups.size(); - if (max_task_num > 1) { - group_per_task = FLAGS_cinn_parallel_compile_size > 0 - ? FLAGS_cinn_parallel_compile_size - : ((graph_->fusion_groups.size() + max_task_num - 1) / - max_task_num); - } - + // Assign fusion_group to each task. + // The maximum number of tasks is determined by the number of threads. + // Fusion_group is assigned to tasks in order and continuous. + int fusion_group_size = graph_->fusion_groups.size(); + int thread_size = FLAGS_cinn_parallel_compile_thread > 0 + ? FLAGS_cinn_parallel_compile_thread + : 1; + int group_per_task = + (graph_->fusion_groups.size() + thread_size - 1) / thread_size; for (int idx = 0; idx < graph_->fusion_groups.size(); idx += group_per_task) { - tasks_.emplace_back(this, scope_, graph_, option_, target_); + Task task(this, scope_, graph_, option_, target_); + task.start_gidx = idx; + task.stop_gidx = + (idx + group_per_task > fusion_group_size ? fusion_group_size + : idx + group_per_task); + tasks_.emplace_back(std::move(task)); } VLOG(2) << "Split task to " << tasks_.size() << " sub-task!"; } -void RunTask(ParallelCompiler::Task* task) { +void ParallelCompiler::RunTask(ParallelCompiler::Task* task) { VLOG(2) << "Stark run sub-task, Thread Id : " << std::this_thread::get_id(); VLOG(4) << "Start Lowering"; task->Lowering(); @@ -106,7 +87,7 @@ void ParallelCompiler::LaunchTask() { // start sub-task. std::vector threads; for (int idx = 1; idx < tasks_.size(); ++idx) { - threads.emplace_back(RunTask, &tasks_[idx]); + threads.emplace_back(&ParallelCompiler::RunTask, this, &tasks_[idx]); } RunTask(&tasks_[0]); @@ -116,11 +97,20 @@ void ParallelCompiler::LaunchTask() { } } -std::vector> ParallelCompiler::MergeResult() { - std::vector> res(graph_->fusion_groups.size()); +ParallelCompiler::CompilationResult ParallelCompiler::MergeResult() { + ParallelCompiler::CompilationResult res; for (auto& task : tasks_) { - for (int idx = 0; idx < task.gidx.size(); ++idx) { - res[task.gidx[idx]] = std::move(task.instructions[idx]); + for (auto& lowered_func : task.lowered_funcs) { + res.lowered_funcs.emplace_back(lowered_func); + } + for (auto& source_code : task.source_codes) { + res.source_codes.emplace_back(source_code); + } + for (auto& source_ptx : task.source_ptxs) { + res.source_ptxs.emplace_back(source_ptx); + } + for (auto& instruction : task.instructions) { + res.instructions.emplace_back(std::move(instruction)); } } return std::move(res); @@ -138,13 +128,7 @@ void ParallelCompiler::Task::Lowering() { "infershape"); OpLowerer op_lowerer(dtype_dict, shape_dict, target); - while (true) { - int idx = compiler->GetGroupIdx(); - if (idx < 0) { - break; - } - - gidx.push_back(idx); + for (int idx = start_gidx; idx < stop_gidx; ++idx) { if (options.lowered_funcs.size()) { lowered_funcs.push_back(options.lowered_funcs[idx]); continue; @@ -154,16 +138,15 @@ void ParallelCompiler::Task::Lowering() { << std::this_thread::get_id() << " :\n" << "Group " << idx << " {\n" << graph->DebugGroupedGraph(group->CollectNodes()) << "}\n"; - lowered_funcs.emplace_back(std::move(op_lowerer.Lower(group))); - CHECK_EQ(lowered_funcs.back().size(), 1) - << "Lowerd Function Is Not Equal 1!"; + auto lowered_group = op_lowerer.Lower(group); + CHECK_EQ(lowered_group.size(), 1) << "Lowerd Function Is Not Equal 1!"; + lowered_funcs.emplace_back(std::move(lowered_group)); } } void ParallelCompiler::Task::CodegenAndJit() { - VLOG(2) << "Start Codegen and JIT with Group [" - << cinn::utils::Join(this->gidx, ", ") << "] at " - << std::this_thread::get_id(); + VLOG(2) << "Start Codegen and JIT with Group [" << start_gidx << "-" + << stop_gidx << ") at thread" << std::this_thread::get_id(); // build module ir::Module::Builder builder(common::UniqName("module"), target); for (auto& func : lowered_funcs) { @@ -172,7 +155,6 @@ void ParallelCompiler::Task::CodegenAndJit() { } auto ir_module = builder.Build(); - // codegen compile if (target == common::DefaultNVGPUTarget()) { #ifdef CINN_WITH_CUDA auto splited_module = backends::SplitCudaAndHostModule(ir_module); @@ -185,14 +167,15 @@ void ParallelCompiler::Task::CodegenAndJit() { auto cuda_c = codegen.Compile(dmodule); CHECK(!cuda_c.empty()) << "Compile CUDA C code failed from device module:\n" << dmodule; + source_codes.emplace_back(cuda_c); cinn::backends::SourceCodePrint::GetInstance()->write(cuda_c); - graph->SaveSourceCode(cuda_c); using runtime::cuda::CUDAModule; backends::nvrtc::Compiler compiler; auto ptx = compiler(cuda_c); CHECK(!ptx.empty()) << "Compile PTX failed from source code:\n" << cuda_c; + source_ptxs.emplace_back(ptx); // load cumodule cumodule.reset(new CUDAModule(ptx, compiler.compile_to_cubin() @@ -218,7 +201,7 @@ void ParallelCompiler::Task::CodegenAndJit() { void ParallelCompiler::Task::BuildInstruction() { // create instruction. - for (int idx : gidx) { + for (int idx = start_gidx; idx < stop_gidx; ++idx) { VLOG(2) << "Start BuildInstruction of Group " << idx << " at " << std::this_thread::get_id(); auto& group = graph->fusion_groups[idx]; @@ -240,15 +223,6 @@ void ParallelCompiler::Task::BuildInstruction() { } } -int ParallelCompiler::GetGroupIdx() { - std::lock_guard lock(mtx_); - if (index < graph_->fusion_groups.size()) { - return index++; - } else { - return -1; - } -} - } // namespace framework } // namespace hlir } // namespace cinn diff --git a/paddle/cinn/hlir/framework/parallel_compiler.h b/paddle/cinn/hlir/framework/parallel_compiler.h index 45fc4fef77a8d2a363a0e6dd50b82bc5104986c6..fad32d2c8a7eeae6047f91fe1e82ded1a0cdf028 100644 --- a/paddle/cinn/hlir/framework/parallel_compiler.h +++ b/paddle/cinn/hlir/framework/parallel_compiler.h @@ -35,23 +35,18 @@ class ParallelCompiler { std::vector> lowered_funcs; }; - public: - explicit ParallelCompiler(std::shared_ptr& scope, // NOLINT - std::shared_ptr& graph, // NOLINT - const CompileOptions& option, - const common::Target& target) - : scope_(scope), graph_(graph), option_(option), target_(target) {} - ~ParallelCompiler() {} - std::vector> operator()(); - - private: - void SplitTask(); - void LaunchTask(); - std::vector> MergeResult(); + struct CompilationResult { + // Lower result + std::vector> lowered_funcs; + // Host/CUDA codegen result + std::vector source_codes; + // CUDA ptx result + std::vector source_ptxs; + // Instruction result + std::vector> instructions; + }; - public: struct Task { - public: Task(ParallelCompiler* p, std::shared_ptr& s, // NOLINT std::shared_ptr& g, // NOLINT @@ -62,30 +57,40 @@ class ParallelCompiler { void CodegenAndJit(); void BuildInstruction(); - public: const Target target; ParallelCompiler* compiler; std::shared_ptr scope; std::shared_ptr graph; const CompileOptions& options; - std::vector gidx; + int start_gidx; + int stop_gidx; std::vector> instructions; std::vector> lowered_funcs; + std::vector source_codes; + std::vector source_ptxs; - public: std::unique_ptr engine; #ifdef CINN_WITH_CUDA std::unique_ptr cumodule; #endif }; - std::vector tasks_; - int GetGroupIdx(); + + explicit ParallelCompiler(std::shared_ptr& scope, // NOLINT + std::shared_ptr& graph, // NOLINT + const CompileOptions& option, + const common::Target& target) + : scope_(scope), graph_(graph), option_(option), target_(target) {} + ~ParallelCompiler() {} + CompilationResult operator()(); private: - int index{0}; - std::mutex mtx_; + void SplitTask(); + void LaunchTask(); + void RunTask(Task* task); + CompilationResult MergeResult(); + std::vector tasks_; const common::Target target_; const CompileOptions& option_; std::shared_ptr scope_; diff --git a/paddle/cinn/hlir/framework/visualize_helper.cc b/paddle/cinn/hlir/framework/visualize_helper.cc index a310ac2a0fb8a4914276c321f37a015cce37c048..e370737b67e3f36127807cca541d606282b6d128 100644 --- a/paddle/cinn/hlir/framework/visualize_helper.cc +++ b/paddle/cinn/hlir/framework/visualize_helper.cc @@ -148,66 +148,30 @@ bool PassPrinter::End() { } bool MakeDirectory(const std::string& dirname, mode_t mode) { - auto len = dirname.length(); - std::vector dir_path(len + 1, '\0'); - strncpy(dir_path.data(), dirname.c_str(), len); - char* path = dir_path.data(); - for (char* p = strchr(path + 1, '/'); p; p = strchr(p + 1, '/')) { - *p = '\0'; - if (mkdir(path, mode) == -1) { - if (errno != EEXIST) { - *p = '/'; + struct stat st; + std::string path; + for (int i = 0; i < dirname.size(); ++i) { + path.push_back(dirname[i]); + if (!(dirname[i] == '/' || i + 1 == dirname.size())) { + continue; + } + if (stat(path.c_str(), &st) == 0) { + if (S_ISDIR(st.st_mode)) { + continue; + } else { + LOG(WARNING) << path << " is not a directory, please check your path."; return false; } - } - *p = '/'; - } - return true; -} - -std::string GetFilePathForGroup(const std::vector>& groups, - const int group_id, - const std::string& viz_path) { - std::string filename = ""; - for (auto* node : groups[group_id]) { - filename += "_" + node->id(); - } - - int max_len = 50; - std::string simplified_filename = filename; - if (filename.size() > max_len) { - static std::unordered_map funcname_map = { - {"const_scalar", "scalar"}, - {"fill_constant", "fill"}, - {"identity", "copy"}, - {"broadcast_to", "broadcast"}, - {"elementwise_add", "add"}, - {"subtract", "sub"}, - {"elementwise_mul", "mul"}, - {"divide", "div"}, - {"reduce_sum", "reduce"}, - {"reduce_prod", "reduce"}, - {"reduce_max", "reduce"}, - {"reduce_min", "reduce"}}; - for (auto& item : funcname_map) { - size_t index = 0; - while (true) { - index = simplified_filename.find(item.first, index); - if (index == std::string::npos) { - break; - } - simplified_filename.replace(index, item.first.size(), item.second); - index += item.second.size(); + } else { + if (mkdir(path.c_str(), mode) == 0) { + continue; + } else { + LOG(WARNING) << "Make directory fail: " << path; + return false; } } } - - int width = std::to_string(groups.size()).size(); - std::stringstream ss; - ss << viz_path; - ss << std::setw(width) << std::setfill('0') << group_id; - ss << simplified_filename.substr(0, 50) << ".dot"; - return ss.str(); + return true; } std::string GenNodeDataLabel( @@ -313,7 +277,7 @@ void Summary(const std::vector>& groups, << "Numbers\n"; print_table(fusion_group_detail); - std::string filepath = viz_path + "summary.txt"; + std::string filepath = viz_path + "/summary.txt"; WriteToFile(filepath, ss.str()); } diff --git a/paddle/cinn/hlir/framework/visualize_helper.h b/paddle/cinn/hlir/framework/visualize_helper.h index 032f5f2ec4e0c8508a44a7b3b254fd3aedba52a6..3afd3a974db0c83dc6e83b85957fd97673e295eb 100644 --- a/paddle/cinn/hlir/framework/visualize_helper.h +++ b/paddle/cinn/hlir/framework/visualize_helper.h @@ -133,10 +133,6 @@ inline std::vector GetGroupAttrs(size_t group_size) { bool MakeDirectory(const std::string& dirname, mode_t mode); -std::string GetFilePathForGroup(const std::vector>& groups, - const int group_id, - const std::string& viz_path); - std::string GenNodeDataLabel( const NodeData* node, const absl::flat_hash_map& shape_dict, diff --git a/paddle/cinn/runtime/flags.cc b/paddle/cinn/runtime/flags.cc index 5e55dbd029cf317608dae32bfd6e00f86122c0f9..20181835b881b442daf4e25f5cfde01279f85c97 100644 --- a/paddle/cinn/runtime/flags.cc +++ b/paddle/cinn/runtime/flags.cc @@ -44,13 +44,8 @@ DEFINE_string(cinn_nvcc_cmd_path, StringFromEnv("FLAGS_cinn_nvcc_cmd_path", "/usr/local/cuda/bin"), "Setting nvcc default path!"); -DEFINE_int32(cinn_parallel_compile_size, - Int32FromEnv("FLAGS_cinn_parallel_compile_size", 16), - "When use parallel compile, set the number of group compiled by " - "each thread."); - DEFINE_int32(cinn_parallel_compile_thread, - Int32FromEnv("FLAGS_cinn_parallel_compile_thread", -1), + Int32FromEnv("FLAGS_cinn_parallel_compile_thread", 16), "How much thread the parallel compile used."); DEFINE_bool(cinn_use_op_fusion, @@ -131,6 +126,26 @@ DEFINE_string(cinn_source_code_save_path, "Specify the directory path of generated source code, which is " "used for debug."); +DEFINE_string(cinn_dump_group_lowered_func, + StringFromEnv("FLAGS_cinn_dump_group_lowered_func", ""), + "Specify the path for dump lowered functions by group, which is " + "used for debug."); + +DEFINE_string( + cinn_dump_group_source_code, + StringFromEnv("FLAGS_cinn_dump_group_source_code", ""), + "Specify the path for dump source code by group, which is used for debug."); + +DEFINE_string( + cinn_dump_group_ptx, + StringFromEnv("FLAGS_cinn_dump_group_ptx", ""), + "Specify the path for dump ptx by group, which is used for debug."); + +DEFINE_string( + cinn_dump_group_instruction, + StringFromEnv("FLAGS_cinn_dump_group_instruction", ""), + "Specify the path for dump instruction by group, which is used for debug."); + DEFINE_string(cinn_pass_visualize_dir, StringFromEnv("FLAGS_cinn_pass_visualize_dir", ""), "Specify the directory path of pass visualize file of graph, "