未验证 提交 7adb4703 编写于 作者: A Aurelius84 提交者: GitHub

[NewIR]Part-2.1 Refactor NewIRCompiler to support Group Ops (#56762)

* [NewIR]Part-2.1 Refactor NewIRCompiler to support Group Ops

* fix gflags link error

* fix include ir_printer.h

* fix unittest

* fix conflict

* fix flags

* fix comment
上级 1dd6f6fa
......@@ -605,6 +605,7 @@ if(WITH_CINN)
add_definitions(-DPADDLE_WITH_CINN)
if(CINN_ONLY)
add_definitions(-DCINN_WITH_ONLY)
if(WITH_PYTHON)
add_subdirectory(python)
endif()
......
......@@ -168,8 +168,8 @@ cinn_cc_library(
add_dependencies(cinnapi GEN_LLVM_RUNTIME_IR_HEADER ZLIB::ZLIB)
add_dependencies(cinnapi GEN_LLVM_RUNTIME_IR_HEADER ${core_deps})
if(NOT CINN_ONLY)
target_link_libraries(cinnapi phi)
add_dependencies(cinnapi phi)
target_link_libraries(cinnapi pd_dialect phi)
add_dependencies(cinnapi pd_dialect phi)
endif()
target_link_libraries(cinnapi ${PYTHON_LIBRARIES})
......@@ -226,8 +226,8 @@ function(gen_cinncore LINKTYPE)
add_dependencies(${CINNCORE_TARGET} GEN_LLVM_RUNTIME_IR_HEADER ZLIB::ZLIB)
add_dependencies(${CINNCORE_TARGET} GEN_LLVM_RUNTIME_IR_HEADER ${core_deps})
if(NOT CINN_ONLY)
target_link_libraries(${CINNCORE_TARGET} phi)
add_dependencies(${CINNCORE_TARGET} phi)
target_link_libraries(${CINNCORE_TARGET} pd_dialect phi)
add_dependencies(${CINNCORE_TARGET} pd_dialect phi)
endif()
add_dependencies(${CINNCORE_TARGET} pybind)
......
......@@ -63,8 +63,8 @@ void AutoTuner::Initialize(const Config& config,
const auto& shape_dict = graph_->GetAttrs<
absl::flat_hash_map<std::string, hlir::framework::shape_t>>("infershape");
op_lowerer_ = std::make_unique<hlir::framework::OpLowerer>(
dtype_dict, shape_dict, target_);
op_lowerer_ = std::make_unique<hlir::framework::OpLowerer<GroupPtr>>(
new hlir::framework::OpLowererImpl(dtype_dict, shape_dict, target_));
InitialTaskRegistry* task_registry = InitialTaskRegistry::Global();
for (auto i = 0; i < tasks_.size(); ++i) {
auto&& task = tasks_[i];
......
......@@ -30,11 +30,11 @@
namespace cinn {
namespace auto_schedule {
// This class is entrance of auto-tune, users can use it
// to tune graph (not supported yet) and search a series of schedules
// that maybe more likely to obtain better performance.
// Internally, it creates necessary components and use them to perform tuning.
using GroupPtr = hlir::framework::GroupPtr;
class AutoTuner {
public:
// configure how to perform auto-tune, such as
......@@ -58,7 +58,7 @@ class AutoTuner {
private:
const common::Target& target_;
hlir::framework::Graph* graph_;
std::unique_ptr<hlir::framework::OpLowerer> op_lowerer_;
std::unique_ptr<hlir::framework::OpLowerer<GroupPtr>> op_lowerer_;
// Tasks to tune
std::vector<TuneTask> tasks_;
......
......@@ -26,6 +26,7 @@
#include "paddle/cinn/frontend/syntax.h"
#include "paddle/cinn/hlir/framework/graph_compiler.h"
#include "paddle/cinn/hlir/framework/graph_compiler_util.h"
#include "paddle/cinn/hlir/framework/op_lowering.h"
#include "paddle/cinn/runtime/flags.h"
namespace cinn {
......@@ -75,12 +76,12 @@ class TestMeasurer : public ::testing::Test {
absl::flat_hash_map<std::string, hlir::framework::shape_t>>(
"infershape");
auto op_lowerer = std::make_unique<hlir::framework::OpLowerer>(
dtype_dict, shape_dict, target);
auto op_lowerer =
hlir::framework::CreateOpLowerer(dtype_dict, shape_dict, target);
inputs.reserve(tasks.size());
for (int i = 0; i < tasks.size(); ++i) {
auto* task = &tasks[i];
task->Initialize(shape_dict, dtype_dict, op_lowerer.get());
task->Initialize(shape_dict, dtype_dict, &op_lowerer);
MeasureInput input;
input.task = task;
input.lowered_funcs = task->lowered_funcs;
......
......@@ -161,12 +161,12 @@ TEST(AutoInline, AddReluInline) {
"inferdtype");
const auto& shape_dict = graph->GetAttrs<
absl::flat_hash_map<std::string, hlir::framework::shape_t>>("infershape");
auto op_lowerer = std::make_unique<hlir::framework::OpLowerer>(
dtype_dict, shape_dict, target);
auto op_lowerer =
hlir::framework::CreateOpLowerer(dtype_dict, shape_dict, target);
EXPECT_EQ(graph->fusion_groups.size(), 1UL);
std::vector<ir::LoweredFunc> funcs =
op_lowerer->Lower(graph->fusion_groups[0],
op_lowerer.Lower(graph->fusion_groups[0],
/*apply_op_schedule = */ false,
/*apply_group_schedule=*/false);
......
......@@ -61,7 +61,8 @@ ir::IRSchedule TestAutoGenRuleBase::MakeIRSchedule(
"inferdtype");
auto& shape_dict = graph->GetMutableAttrs<
absl::flat_hash_map<std::string, hlir::framework::shape_t>>("infershape");
hlir::framework::OpLowerer op_lowerer(dtype_dict, shape_dict, target_);
auto op_lowerer =
hlir::framework::CreateOpLowerer(dtype_dict, shape_dict, target_);
lowered_funcs_ =
op_lowerer.Lower(graph->fusion_groups.front(),
......
......@@ -27,6 +27,7 @@
#include "paddle/cinn/auto_schedule/task/task_registry.h"
#include "paddle/cinn/auto_schedule/task/tune_task.h"
#include "paddle/cinn/auto_schedule/tuning.h"
#include "paddle/cinn/hlir/framework/op_lowering.h"
#include "paddle/cinn/ir/ir_base.h"
#include "paddle/cinn/ir/schedule/ir_schedule.h"
#include "test/cpp/cinn/program_builder.h"
......@@ -44,11 +45,11 @@ std::vector<TuneTask> CreateTasks(const frontend::Program& program,
"inferdtype");
const auto& shape_dict = graph->GetAttrs<
absl::flat_hash_map<std::string, hlir::framework::shape_t>>("infershape");
auto op_lowerer = std::make_unique<hlir::framework::OpLowerer>(
dtype_dict, shape_dict, target);
auto op_lowerer =
hlir::framework::CreateOpLowerer(dtype_dict, shape_dict, target);
InitialTaskRegistry* task_registry = InitialTaskRegistry::Global();
for (auto i = 0; i < tasks.size(); ++i) {
tasks[i].Initialize(shape_dict, dtype_dict, op_lowerer.get());
tasks[i].Initialize(shape_dict, dtype_dict, &op_lowerer);
task_registry->Regist(tasks[i].serialized_key,
ir::ModuleExpr(tasks[i].GetLoweredFuncBodyExprs()));
}
......
......@@ -45,11 +45,10 @@ std::vector<TuneTask> CreateTasks(hlir::framework::Graph* graph,
const auto& shape_dict = graph->GetAttrs<
absl::flat_hash_map<std::string, hlir::framework::shape_t>>("infershape");
std::unique_ptr<hlir::framework::OpLowerer> op_lowerer =
std::make_unique<hlir::framework::OpLowerer>(
dtype_dict, shape_dict, target);
auto op_lowerer =
hlir::framework::CreateOpLowerer(dtype_dict, shape_dict, target);
for (TuneTask& task : tasks) {
task.Initialize(shape_dict, dtype_dict, op_lowerer.get());
task.Initialize(shape_dict, dtype_dict, &op_lowerer);
VLOG(3) << "Add a task with serialized_key:\n" << task.serialized_key;
}
......
......@@ -34,7 +34,7 @@ void TuneTask::Initialize(
const absl::flat_hash_map<std::string, hlir::framework::shape_t>&
shape_dict,
const absl::flat_hash_map<std::string, cinn::common::Type>& dtype_dict,
hlir::framework::OpLowerer* lower_handler) {
hlir::framework::OpLowerer<GroupPtr>* lower_handler) {
CHECK(lower_handler != nullptr) << "op_lowerer can't be nullptr";
op_lowerer = lower_handler;
......
......@@ -34,16 +34,17 @@ namespace cinn {
namespace auto_schedule {
class TuneTask {
using GroupPtr = hlir::framework::GroupPtr;
public:
TuneTask() = default;
explicit TuneTask(std::shared_ptr<hlir::framework::Graph::Group> group)
: subgraph(group) {}
explicit TuneTask(GroupPtr group) : subgraph(group) {}
// Initialize a task
void Initialize(
const absl::flat_hash_map<std::string, hlir::framework::shape_t>&
shape_dict,
const absl::flat_hash_map<std::string, cinn::common::Type>& dtype_dict,
hlir::framework::OpLowerer* lower_handler);
hlir::framework::OpLowerer<GroupPtr>* lower_handler);
// Extract bodies in lowered_funcs() and return
std::vector<ir::Expr> GetLoweredFuncBodyExprs() const;
......@@ -51,7 +52,7 @@ class TuneTask {
// sub-graph (if an op won't be fused, it will be a Group with size=1).
std::shared_ptr<hlir::framework::Graph::Group> subgraph;
// Lower handler, Not owned
hlir::framework::OpLowerer* op_lowerer;
hlir::framework::OpLowerer<GroupPtr>* op_lowerer;
// target of this task
common::Target target;
// stores the initial (un-optimized) LoweredFuncs
......
......@@ -75,7 +75,8 @@ TEST(TuneTask, GraphToUnoptLoweredFunc_NoPass) {
const auto& dtype_dict =
graph->GetAttrs<absl::flat_hash_map<std::string, common::Type>>(
"inferdtype");
OpLowerer op_lowerer(dtype_dict, shape_dict, target);
auto op_lowerer =
hlir::framework::CreateOpLowerer(dtype_dict, shape_dict, target);
std::stringstream ss;
for (TuneTask& task : tasks) {
......@@ -187,7 +188,8 @@ TEST(TuneTask, GraphToUnoptLoweredFunc_ApplyPass) {
graph->GetAttrs<absl::flat_hash_map<std::string, common::Type>>(
"inferdtype");
OpLowerer op_lowerer(dtype_dict, shape_dict, target);
OpLowerer op_lowerer(
new hlir::framework::OpLowererImpl(dtype_dict, shape_dict, target));
std::stringstream ss;
for (TuneTask& task : tasks) {
......@@ -291,7 +293,8 @@ TEST(TuneTask, SerializeToString) {
const auto& dtype_dict =
graph->GetAttrs<absl::flat_hash_map<std::string, common::Type>>(
"inferdtype");
OpLowerer op_lowerer(dtype_dict, shape_dict, target);
OpLowerer op_lowerer(
new hlir::framework::OpLowererImpl(dtype_dict, shape_dict, target));
ASSERT_EQ(single_tasks.size(), 2UL);
for (auto&& task : single_tasks) {
task.Initialize(shape_dict, dtype_dict, &op_lowerer);
......
......@@ -27,6 +27,7 @@
#include "paddle/cinn/hlir/framework/graph_compiler.h"
#include "paddle/cinn/hlir/framework/graph_compiler_util.h"
#include "paddle/cinn/hlir/framework/node.h"
#include "paddle/cinn/hlir/framework/op_lowering.h"
#include "paddle/cinn/hlir/framework/pass.h"
#include "paddle/cinn/ir/ir_base.h"
#include "paddle/cinn/runtime/flags.h"
......@@ -143,9 +144,8 @@ class PerformanceTester : public ::testing::Test {
absl::flat_hash_map<std::string, hlir::framework::shape_t>>(
"infershape");
std::shared_ptr<hlir::framework::OpLowerer> op_lowerer =
std::make_unique<hlir::framework::OpLowerer>(
dtype_dict, shape_dict, target_);
auto op_lowerer =
hlir::framework::CreateOpLowerer(dtype_dict, shape_dict, target_);
CompilationContext& context = graph_compiler->GetCompilationContext();
context.with_instantiate_variables = true;
......@@ -157,7 +157,7 @@ class PerformanceTester : public ::testing::Test {
for (auto group : graph->fusion_groups) {
context.lowered_funcs.push_back(
op_lowerer->Lower(group,
op_lowerer.Lower(group,
/*apply_op_schedule = */ false,
/*apply_group_schedule=*/false));
}
......
......@@ -19,6 +19,7 @@
#include "paddle/cinn/backends/llvm/runtime_symbol_registry.h"
#include "paddle/cinn/common/context.h"
#include "paddle/cinn/hlir/framework/visualize_helper.h"
#include "paddle/cinn/ir/utils/ir_printer.h"
#ifdef CINN_WITH_CUDA
#include "paddle/cinn/backends/codegen_cuda_dev.h"
#include "paddle/cinn/backends/codegen_cuda_host.h"
......
add_subdirectory(new_ir)
core_gather_headers()
gather_srcs(
......@@ -17,8 +18,8 @@ gather_srcs(
node.cc
pass.cc
op_strategy.cc
op_lowering.cc
op_lowering_util.cc
op_lowering_impl.cc
accuracy_checker.cc
visualize_helper.cc)
......
if(NOT CINN_ONLY)
core_gather_headers()
gather_srcs(cinnapi_src SRCS utils.cc op_lowering_impl.cc)
endif()
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "paddle/cinn/hlir/framework/op.h"
#include "paddle/ir/core/operation.h"
namespace cinn {
namespace hlir {
namespace framework {
namespace newir {
using framework::OpPatternKind;
// TODO(Aurelius84): Need to be replaced with CinnGroupOp
struct Group {
public:
explicit Group(const std::vector<::ir::Operation*>& group_ops)
: ops(group_ops) {
op_pattern_kind = OpPatternKind::kElementWise;
fn_name = "fn_";
for (auto& op : group_ops) {
fn_name += "_" + op->name();
}
}
std::vector<::ir::Operation*> ops;
std::vector<std::string> input_names;
std::vector<std::string> output_names;
int group_id;
// FIXME(Aurelius84): This should be refactored with CinnGroupOp
OpPatternKind op_pattern_kind;
std::string fn_name;
};
} // namespace newir
} // namespace framework
} // namespace hlir
} // namespace cinn
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/hlir/framework/new_ir/op_lowering_impl.h"
#include <string>
#include "paddle/cinn/hlir/framework/op_lowering_util.h"
#include "paddle/cinn/hlir/op/external_api_registry.h"
#include "paddle/cinn/ir/schedule/ir_schedule.h"
#include "paddle/cinn/optim/transform_gpu_forloop.h"
#include "paddle/cinn/hlir/framework/new_ir/utils.h"
#include "paddle/cinn/lang/placeholder.h"
#include "paddle/cinn/utils/attribute_util.h"
#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_type.h"
#include "paddle/phi/core/ddim.h"
DECLARE_bool(cinn_use_cuda_vectorize);
namespace cinn {
namespace hlir {
namespace framework {
namespace newir {
using cinn::hlir::op::ExternalApiRegistry;
using common::Type;
using framework::OpPatternKind;
using framework::StrategyFunction;
namespace details {
ir::Tensor GetTensor(const ::ir::Value& value) {
auto type_info = value.type().dyn_cast<paddle::dialect::DenseTensorType>();
auto in_shape = phi::vectorize<int>(type_info.dims());
auto dtype = type_info.dtype();
std::string input_id = CompatibleInfo::InputName(value);
return lang::CreatePlaceHolder(
in_shape, utils::ConvertIRType(dtype), input_id);
}
std::vector<ir::Tensor> CollectInputTensor(
const ::ir::Operation* op,
std::vector<ir::Tensor>* func_args,
std::unordered_map<::ir::Value, ir::Tensor>* tensor_map) {
std::vector<ir::Tensor> tensors;
for (auto& operand : op->operands()) {
CHECK(operand);
auto in_value = operand.source();
ir::Tensor tensor;
if (!tensor_map->count(in_value)) {
tensor = details::GetTensor(in_value);
// record tensor.
(*tensor_map)[in_value] = tensor;
// record func input args
if (func_args != nullptr) func_args->push_back(tensor);
} else {
tensor = tensor_map->at(in_value);
}
tensors.push_back(tensor);
}
return tensors;
}
void CollectOutputInfo(const ::ir::Operation* op,
std::vector<Type>* out_types,
std::vector<std::vector<int>>* out_shapes) {
auto op_results = op->results();
for (auto& out_value : op_results) {
std::string output_id = CompatibleInfo::OutputName(out_value);
// group->output_names.push_back(output_id);
auto type_info =
out_value.type().dyn_cast<paddle::dialect::DenseTensorType>();
out_types->push_back(utils::ConvertIRType(type_info.dtype()));
auto out_shape = phi::vectorize<int>(type_info.dims());
out_shapes->push_back(std::move(out_shape));
}
}
NodeAttr CollectAttrs(const ::ir::Operation& op) {
NodeAttr node_attrs;
VLOG(4) << "op.attributes():" << op.attributes().size();
auto attrs = utils::ConvertAttributes(op.attributes());
node_attrs.node_name = CompatibleInfo::OpName(op);
node_attrs.attr_store = std::move(attrs);
return node_attrs;
}
} // namespace details
OpLowererImpl::OpLowererImpl(const Target& target) : target_(target) {}
std::vector<ir::LoweredFunc> OpLowererImpl::Lower(const GroupPtr& group,
bool apply_op_schedule,
bool apply_group_schedule) {
VLOG(3) << "Lowering Group : " << group->group_id
<< " , Op Pattern : " << group->op_pattern_kind;
group->input_names.clear();
group->output_names.clear();
switch (group->op_pattern_kind) {
case framework::kElementWise:
case framework::kBroadcast:
case framework::kInjective:
return LowerGroup(group,
apply_op_schedule,
apply_group_schedule,
&OpLowererImpl::ElementwiseScheduleDetermineFunction);
case framework::kReduction:
return LowerGroup(group,
apply_op_schedule,
apply_group_schedule,
&OpLowererImpl::ReduceScheduleDetermineFunction);
case framework::kOutFusible:
LOG(FATAL) << "Group Pattern Kind kOutFusible Is Not Implemented!";
case framework::kNonFusible:
return LowerGroup(group,
apply_op_schedule,
apply_group_schedule,
&OpLowererImpl::NonFusibleScheduleDetermineFunction);
default:
LOG(FATAL) << "Group Pattern Kind Is Unknown!";
}
}
bool OpLowererImpl::ElementwiseScheduleDetermineFunction(::ir::Operation* op) {
return true;
}
bool OpLowererImpl::ReduceScheduleDetermineFunction(::ir::Operation* op) {
// TODO(Aurelius84): Support this.
// auto& op_pattern_dict = Operator::GetAttrs<OpPatternKind>("OpPattern");
// return op_pattern_dict[op] == framework::kReduction;
return true;
}
bool OpLowererImpl::NonFusibleScheduleDetermineFunction(::ir::Operation* op) {
return true;
}
std::vector<ir::LoweredFunc> OpLowererImpl::LowerGroup(
const GroupPtr& group,
bool apply_op_schedule,
bool apply_group_schedule,
ScheduleDetermineFunction schedule_determine_func) {
// 1.Do compute, lower and schedule for each op.
auto& ops = group->ops;
if (ops.size() == 1 && ops[0]->name() == "custom_call") {
return LowerCustomCall(group);
}
std::vector<ir::Tensor> group_func_arg_tensors;
std::unordered_map<::ir::Value, ir::Tensor> tensor_map;
bool do_op_schedule = apply_group_schedule || apply_op_schedule;
std::vector<ir::Expr> func_bodies = LowerOps(ops,
do_op_schedule,
schedule_determine_func,
&group_func_arg_tensors,
&tensor_map);
// 2.Do group schedule.
ir::ModuleExpr mod_expr(func_bodies);
ir::IRSchedule ir_sch(mod_expr);
ir_sch.MergeExprs();
VLOG(3) << "After lower, ir is: \n" << ir_sch.GetModule().GetExprs().at(0);
// TODO(Aurelius84): Support this.
// if (apply_group_schedule) {
// DoGroupSchedule(ir_sch, group, tensor_map);
// VLOG(3) << "After group schedule, ir is: \n"
// << ir_sch.GetModule().GetExprs().at(0);
// }
// 3.Do post-processing,
// including preparing function args and temporary variables,
// applying low-level optimization passes, etc.
return PostProcess(
group, tensor_map, do_op_schedule, &ir_sch, &group_func_arg_tensors);
}
std::vector<ir::LoweredFunc> OpLowererImpl::LowerCustomCall(
const GroupPtr& group) {
auto& ops = group->ops;
CHECK_EQ(ops.size(), 1);
::ir::Operation* op = ops[0];
std::unordered_map<::ir::Value, ir::Tensor> tensor_map;
std::vector<ir::Tensor> op_func_arg_tensors =
details::CollectInputTensor(op, nullptr, &tensor_map);
VLOG(4) << "inputs.size(): " << op_func_arg_tensors.size();
std::vector<Type> out_types;
std::vector<std::vector<int>> out_shapes;
details::CollectOutputInfo(op, &out_types, &out_shapes);
VLOG(4) << "out_types.size(): " << out_types.size();
NodeAttr node_attrs = details::CollectAttrs(*op);
auto& cinn_strategy = Operator::GetAttrs<StrategyFunction>("CINNStrategy");
const hlir::framework::Operator* cinn_op =
Operator::Get(node_attrs.node_name);
auto impl = OpStrategy::SelectImpl(cinn_strategy[cinn_op](
node_attrs, op_func_arg_tensors, out_types, out_shapes, target_));
// TODO(Arelius84): Support extern API
std::string external_api;
// if (node_attrs.attr_store.count("custom_call")) {
// external_api =
// absl::get<std::string>(node_attrs.attr_store.at("custom_call"));
// } else {
// external_api = ExternalApiRegistry::Global()->GetExternalApi(node,
// target_);
// }
std::vector<common::CINNValue> compute_args = {
common::CINNValue(group->fn_name), common::CINNValue(external_api)};
common::CINNValuePack pack =
impl->fcompute(common::CINNValuePack{compute_args});
CHECK_EQ(pack.size(), 1UL);
// reset input names as extern api input args can't be remove duplicate.
// group->input_names.clear();
// for (auto& inode : node->inlinks_in_order()) {
// group->input_names.push_back(inode->source()->as<NodeData>()->id());
// }
return {pack[0].operator ir::Expr().as_lowered_func_ref()};
}
std::vector<ir::LoweredFunc> OpLowererImpl::PostProcess(
const GroupPtr& group,
const std::unordered_map<::ir::Value, ir::Tensor>& tensor_map,
bool done_op_schedule,
ir::IRSchedule* ir_sch,
std::vector<ir::Tensor>* group_func_arg_tensors) {
// 1.Prepare function args
group->input_names.clear();
std::vector<ir::Argument> group_func_args;
std::unordered_set<std::string> arg_name_set;
for (auto& arg_tensor : *group_func_arg_tensors) {
// input data name.
group->input_names.push_back(arg_tensor->name);
// input args
group_func_args.emplace_back(arg_tensor->buffer, ir::Argument::IO::kInput);
arg_name_set.insert(arg_tensor->buffer->name);
}
group->output_names.clear();
// FIXME(Aurelius84): Do we need to use output_ops?
// Currently we regards all ops as output_ops.
for (auto& op : group->ops) {
// collect all output tensor.
for (auto opresult : op->results()) {
if (tensor_map.count(opresult) == 0) {
continue;
}
auto tensor = tensor_map.at(opresult);
if (arg_name_set.count(tensor->buffer->name) != 0) {
continue;
}
// output arg tensors
group_func_arg_tensors->push_back(tensor);
// output args
group_func_args.emplace_back(tensor->buffer, ir::Argument::IO::kOutput);
arg_name_set.insert(tensor->buffer->name);
}
}
if (!done_op_schedule) {
std::unordered_set<std::string> args_set;
for (auto arg : group_func_args) {
args_set.insert(arg.name());
}
for (auto& tensor_pair : tensor_map) {
if (args_set.count("_" + tensor_pair.second->name)) {
continue;
}
group_func_arg_tensors->push_back(tensor_pair.second);
// use the underlying tensor name to be consistent with the argument name
// in the lowered function
group->output_names.push_back(tensor_pair.second->name);
group_func_args.emplace_back(tensor_pair.second->buffer,
ir::Argument::IO::kOutput);
}
}
auto func_body = ir_sch->GetModule().GetExprs().at(0);
#ifdef CINN_WITH_CUDA
optim::OptimizeExprGPU(&(func_body));
#endif
// 2.Prepare temp buffers
poly::StageMap stages;
auto temp_buffers =
lang::GetTempBuffers(*group_func_arg_tensors, stages, func_body);
// 3.Building LoweredFunc
auto func = ir::_LoweredFunc_::Make(group->fn_name,
group_func_args,
ir_sch->GetModule().GetExprs().at(0),
temp_buffers);
if (!done_op_schedule) {
func->PrepareBufferCastExprs();
}
// 4.Apply low level pass
func = optim::Optimize(Expr(func), target_, false).as_lowered_func_ref();
return {func};
}
std::vector<ir::Expr> OpLowererImpl::LowerOps(
const std::vector<::ir::Operation*>& ops,
bool apply_op_schedule,
ScheduleDetermineFunction schedule_determine_func,
std::vector<ir::Tensor>* group_func_arg_tensors,
std::unordered_map<::ir::Value, ir::Tensor>* tensor_map) {
auto& strategy = Operator::GetAttrs<StrategyFunction>("CINNStrategy");
std::vector<Expr> func_bodies;
for (auto* op : ops) {
// 1.Select Op impl
std::vector<Type> out_types;
std::vector<std::vector<int>> out_shapes;
details::CollectOutputInfo(op, &out_types, &out_shapes);
VLOG(4) << "out_types.size(): " << out_types.size();
NodeAttr node_attrs = details::CollectAttrs(*op);
std::vector<ir::Tensor> op_func_arg_tensors =
details::CollectInputTensor(op, group_func_arg_tensors, tensor_map);
std::string cinn_op_name = CompatibleInfo::OpName(*op);
const hlir::framework::Operator* cinn_op = Operator::Get(cinn_op_name);
auto op_impl = OpStrategy::SelectImpl(strategy[cinn_op](
node_attrs, op_func_arg_tensors, out_types, out_shapes, this->target_));
// 2.Perform the lower process of Op
std::vector<ir::LoweredFunc> funcs =
DoOpLower(op_impl, op, tensor_map, &op_func_arg_tensors);
if (apply_op_schedule && (this->*schedule_determine_func)(op)) {
// 3.Perform the schedule of Op
func_bodies.push_back(DoOpSchedule(op_impl, op_func_arg_tensors, funcs));
} else {
for (const ir::LoweredFunc& func : funcs) {
func_bodies.push_back(func->body);
}
}
}
return func_bodies;
}
std::vector<ir::LoweredFunc> OpLowererImpl::DoOpLower(
std::shared_ptr<hlir::framework::OpImpl> op_impl,
const ::ir::Operation* op,
std::unordered_map<::ir::Value, ir::Tensor>* tensor_map,
std::vector<ir::Tensor>* op_func_arg_tensors) {
VLOG(4) << "Do lower with Compute, op: " << op->name();
std::vector<common::CINNValue> cinn_inputs;
for (const ir::Tensor& tensor : *op_func_arg_tensors) {
cinn_inputs.push_back(common::CINNValue(ir::Expr(tensor)));
}
// set tensor name = operand hash name
auto op_results = op->results();
for (const auto& result : op_results) {
std::string output_id = CompatibleInfo::OutputName(result);
cinn_inputs.push_back(common::CINNValue(output_id));
}
// 1.Do compute
common::CINNValuePack pack =
op_impl->fcompute(common::CINNValuePack{cinn_inputs});
poly::StageMap tmp_stages = pack.back();
std::string post = "";
for (int idx = 0; idx < pack.size() - 1; ++idx) {
Expr expr = pack[idx];
// Insert the output tensor defined by Compute into the tensor_map
if (pack.size() - 1 > op_results.size()) {
// Some op may output multiple temp tensors in their Compute
// definition, but only one output in the graph, and we use id +
// "_0"/"_1" as key.
// FIXME(Aurelius84): It seems that the implementation is relate with
// string name.
// (*tensor_map)[op_results[0] + post] = expr.as_tensor_ref();
// post = "_" + std::to_string(idx);
} else {
// If the number of output tensors defined by Compute is less equal than
// the output node_data on the graph, then there is a one-to-one
// correspondence, and the redundant output node_data contact empty.
(*tensor_map)[op_results[idx]] = expr.as_tensor_ref();
}
// Insert output tensors into function arg
if (!expr.as_tensor_ref()->buffer.defined() ||
this->target_ != common::DefaultNVGPUTarget()) {
op_func_arg_tensors->push_back(expr.as_tensor_ref());
expr.as_tensor_ref()->WithBuffer();
}
}
// 2.Do lower
std::string lower_fn_name = CompatibleInfo::OpFuncName(*op);
std::vector<ir::LoweredFunc> funcs = lang::LowerVec(lower_fn_name,
tmp_stages,
*op_func_arg_tensors,
{},
{},
nullptr,
this->target_,
true);
VLOG(4) << "Lower op: " << lower_fn_name << ", get " << funcs.size()
<< " LoweredFunc:\n";
op_func_arg_tensors->clear();
for (int idx = 0; idx < pack.size() - 1; ++idx) {
CHECK(pack[idx].is_tensor());
op_func_arg_tensors->push_back(
pack[idx].operator ir::Expr().as_tensor_ref());
}
return funcs;
}
ir::Expr OpLowererImpl::DoOpSchedule(
std::shared_ptr<hlir::framework::OpImpl> op_impl,
const std::vector<ir::Tensor>& op_func_arg_tensors,
const std::vector<ir::LoweredFunc>& lowered_funcs) {
VLOG(4) << "Do op schedule";
std::vector<common::CINNValue> schedule_inputs;
// 1.Collect tensors
for (const ir::Tensor& op_func_arg_tensor : op_func_arg_tensors) {
schedule_inputs.push_back(common::CINNValue(op_func_arg_tensor));
}
// 2.Collect bodies to be scheduled
for (const ir::LoweredFunc& func : lowered_funcs) {
schedule_inputs.push_back(common::CINNValue(func->body));
}
// 3.Do schedule on AST
common::CINNValuePack expr_pack =
op_impl->fschedule(common::CINNValuePack{schedule_inputs});
VLOG(4) << "After op schedule: " << expr_pack[0].operator ir::Expr();
return expr_pack[0].operator ir::Expr();
}
} // namespace newir
} // namespace framework
} // namespace hlir
} // namespace cinn
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <vector>
#include "paddle/cinn/common/target.h"
#include "paddle/cinn/hlir/framework/instruction.h"
#include "paddle/cinn/hlir/framework/new_ir/group.h"
#include "paddle/cinn/hlir/framework/op_lowering_impl_base.h"
#include "paddle/cinn/hlir/framework/op_strategy.h"
#include "paddle/cinn/ir/lowered_func.h"
#include "paddle/cinn/ir/schedule/ir_schedule.h"
#include "paddle/cinn/ir/schedule/ir_schedule_util.h"
#include "paddle/cinn/lang/packed_func.h"
#include "paddle/ir/core/operation.h"
// Fusion Op lowering, there are four kinds of lowering function:
// Elementwise/Broadcast/Injective,Reduce,OutEWiseFusable,NonFusible.
// Elementwise/Broadcast/Injective Ops is with same shcedule.
// Reduce,OutEWiseFusable,NonFusible are using different schedule.
namespace cinn {
namespace hlir {
namespace framework {
namespace newir {
using GroupPtr = std::shared_ptr<Group>;
using common::Target;
class OpLowererImpl;
typedef bool (OpLowererImpl::*ScheduleDetermineFunction)(::ir::Operation*);
class OpLowererImpl : public OpLowererImplBase<GroupPtr> {
public:
explicit OpLowererImpl(const Target&);
/**
* @brief Lower a group to CINN IR.
* @param group The group to be lowered.
* @param apply_op_schedule Whether to schedule at Op level.
* @param apply_group_schedule Whether to schedule at group level.
* @return The lowered funcs.
*/
std::vector<ir::LoweredFunc> Lower(const GroupPtr& group,
bool apply_op_schedule = true,
bool apply_group_schedule = true);
private:
/**
* @brief Lower a group to CINN IR.
* @param group The group to be lowered.
* @param apply_op_schedule Whether to schedule at Op level.
* @param apply_group_schedule Whether to schedule at group level.
* @param schedule_determine_func Function used to determine which Ops to
* schedule.
* @return The lowered funcs.
*/
std::vector<ir::LoweredFunc> LowerGroup(
const GroupPtr& group,
bool apply_op_schedule,
bool apply_group_schedule,
ScheduleDetermineFunction schedule_determine_func);
/**
* @brief Lower a group composed of CustomCall Op.
* @param group The group to be lowered.
* @return The lowered funcs.
*/
std::vector<ir::LoweredFunc> LowerCustomCall(const GroupPtr& group);
/**
* @brief Post processing, including preparing function args and temporary
* variables, applying low-level optimization passes, etc.
* @param group The group to be lowered.
* @param tensor_map All tensors used for calculating the group.
* @param done_op_schedule Mark whether the Op level schedule has been
* applied.
* @param ir_sch The IRSchedule object of group.
* @param group_func_arg_tensors Tensors used as the group function arguments.
* @return The lowered funcs after the post processing.
*/
std::vector<ir::LoweredFunc> PostProcess(
const GroupPtr& group,
const std::unordered_map<::ir::Value, ir::Tensor>& tensor_map,
bool done_op_schedule,
ir::IRSchedule* ir_sch,
std::vector<ir::Tensor>* group_func_arg_tensors);
/**
* @brief Lower an Op set to CINN IR.
* Compute, Lower and optional Schedule will be performed one by one
* for each Op.
* @param ops The Op to be lowered.
* @param apply_op_schedule Whether to schedule at Op level.
* @param schedule_determine_func Function used to determine which Ops to
* schedule.
* @param group_func_arg_tensors Tensors used as the group function arguments.
* @param tensor_map All tensors used for calculating the group.
* @return The lowered func bodies of Op set.
*/
std::vector<ir::Expr> LowerOps(
const std::vector<::ir::Operation*>& ops,
bool apply_op_schedule,
ScheduleDetermineFunction schedule_determine_func,
std::vector<ir::Tensor>* group_func_arg_tensors,
std::unordered_map<::ir::Value, ir::Tensor>* tensor_map);
/**
* @brief Lower an Op to CINN IR. The Compute and Lower processes will be
* called sequentially.
* @param op_impl The Op implementation defining Compute and Schedule.
* @param op The Op to be lowered.
* @param tensor_map All tensors used for calculating the group.
* @param op_func_arg_tensors Tensors used as the Op function arguments.
* @return The lowered func of the Op.
*/
std::vector<ir::LoweredFunc> DoOpLower(
std::shared_ptr<hlir::framework::OpImpl> op_impl,
const ::ir::Operation* op,
std::unordered_map<::ir::Value, ir::Tensor>* tensor_map,
std::vector<ir::Tensor>* op_func_arg_tensors);
/**
* @brief Apply schedule on an Op.
* @param op_impl The Op implementation defining Compute and Schedule.
* @param op_func_arg_tensors Tensors used as the Op function arguments.
* @param lowered_funcs The lowered funcs of an Op to be scheduled.
* @return The lowered func body after schedule of the Op.
*/
ir::Expr DoOpSchedule(std::shared_ptr<hlir::framework::OpImpl> op_impl,
const std::vector<ir::Tensor>& op_func_arg_tensors,
const std::vector<ir::LoweredFunc>& lowered_funcs);
// Functions used to determine which Ops to schedule at op level, define a
// policy for each type of group.
inline bool ReduceScheduleDetermineFunction(::ir::Operation* op);
inline bool ElementwiseScheduleDetermineFunction(::ir::Operation* op);
inline bool NonFusibleScheduleDetermineFunction(::ir::Operation* op);
private:
Target target_;
};
} // namespace newir
} // namespace framework
} // namespace hlir
} // namespace cinn
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/hlir/framework/new_ir/utils.h"
namespace cinn {
namespace hlir {
namespace framework {
namespace newir {
const std::unordered_map<std::string, std::string> CompatibleInfo::OP_NAMES = {
{"pd.full", "fill_constant"}};
std::string CompatibleInfo::OpName(const ::ir::Operation& op) {
std::string name = op.name();
if (OP_NAMES.count(name)) {
return OP_NAMES.at(name);
}
auto pos = name.find(".");
if (pos == std::string::npos) {
return name;
}
auto cinn_op_name = name.substr(pos + 1);
VLOG(4) << "GetOpName: " << name << " -> " << cinn_op_name;
return cinn_op_name;
}
std::string CompatibleInfo::InputName(const ::ir::Value& value) {
return CompatibleInfo::kInputPrefix +
std::to_string(std::hash<::ir::Value>()(value));
}
std::string CompatibleInfo::OutputName(const ::ir::Value& value) {
return CompatibleInfo::kOutputPrefix +
std::to_string(std::hash<::ir::Value>()(value));
}
std::string CompatibleInfo::OpFuncName(const ::ir::Operation& op) {
std::string op_name = OpName(op);
std::string func_name =
cinn::common::Context::Global().NewName("fn_" + op_name);
return func_name;
}
std::string CompatibleInfo::GroupOpsName(
const std::vector<::ir::Operation*>& ops) {
std::string name = "fn_";
for (auto* op : ops) {
std::string op_name = OpName(*op);
name += cinn::common::Context::Global().NewName(op_name);
}
return name;
}
std::vector<std::string> CompatibleInfo::InputNames(const ::ir::Operation& op,
bool allow_duplicate) {
std::vector<std::string> names;
std::unordered_set<std::string> repeat;
for (int i = 0; i < op.num_operands(); ++i) {
auto value = op.operand_source(i);
std::string name = CompatibleInfo::InputName(value);
if (!allow_duplicate && repeat.count(name)) {
continue;
}
repeat.insert(name);
names.push_back(name);
}
return names;
}
std::vector<std::string> CompatibleInfo::OutputNames(
const ::ir::Operation& op) {
std::vector<std::string> names;
for (int i = 0; i < op.num_results(); ++i) {
auto value = op.result(i);
std::string name = CompatibleInfo::OutputName(value);
names.push_back(std::move(name));
}
return names;
}
} // namespace newir
} // namespace framework
} // namespace hlir
} // namespace cinn
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <unordered_map>
#include "paddle/cinn/common/context.h"
#include "paddle/ir/core/operation.h"
namespace cinn {
namespace hlir {
namespace framework {
namespace newir {
struct CompatibleInfo {
static constexpr char* kInputPrefix = "input_";
static constexpr char* kOutputPrefix = "output_";
// TODO(Aurelius): Need add name mapping logic in REGISTER_CINN_OP
// macros or attempt to unify Op name with Paddle and CINN.
static const std::unordered_map<std::string, std::string> OP_NAMES;
static std::string OpName(const ::ir::Operation& op);
static std::string InputName(const ::ir::Value& value);
static std::string OutputName(const ::ir::Value& value);
static std::string OpFuncName(const ::ir::Operation& op);
static std::string GroupOpsName(const std::vector<::ir::Operation*>& ops);
static std::vector<std::string> InputNames(const ::ir::Operation& op,
bool allow_duplicate = false);
static std::vector<std::string> OutputNames(const ::ir::Operation& op);
};
} // namespace newir
} // namespace framework
} // namespace hlir
} // namespace cinn
......@@ -15,9 +15,7 @@
#include "paddle/cinn/hlir/framework/new_ir_compiler.h"
#include <absl/types/variant.h>
#include "paddle/cinn/hlir/framework/op_strategy.h"
#include "paddle/cinn/lang/lower.h"
#include "paddle/cinn/lang/placeholder.h"
#include "paddle/cinn/hlir/framework/new_ir/utils.h"
#include "paddle/cinn/utils/attribute_util.h"
#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_type.h"
#include "paddle/ir/core/builtin_type.h"
......@@ -25,25 +23,31 @@
namespace cinn {
namespace hlir {
namespace framework {
const std::unordered_map<std::string, std::string> CompatibleInfo::OP_NAMES = {
{"pd.full", "fill_constant"}, {"pd.matmul", "matmul"}};
using newir::CompatibleInfo;
// TODO(Aurelius84): Need abstract this logic to implement Proxy for
// the co-existance with GraphCompiler.
std::unique_ptr<Program> NewIRCompiler::Build() {
m_builder_.Clear();
// NOTE(Aurelius84): Currently only support each op for one group
std::vector<std::vector<::ir::Operation*>> groups;
std::vector<newir::GroupPtr> groups;
for (auto it = program_.block()->begin(); it != program_.block()->end();
++it) {
groups.push_back({*it});
std::vector<::ir::Operation*> ops = {*it};
groups.push_back(std::make_shared<newir::Group>(ops));
groups.back()->fn_name = CompatibleInfo::GroupOpsName(groups.back()->ops);
}
VLOG(4) << "Groups size: " << groups.size();
return std::move(Build(groups));
}
std::unique_ptr<Program> NewIRCompiler::Build(
const std::vector<newir::GroupPtr>& groups) {
auto op_lowerer = CreateOpLowerer<newir::GroupPtr>(target_);
std::vector<std::vector<ir::LoweredFunc>> lowered_funcs;
for (int i = 0; i < groups.size(); ++i) {
lowered_funcs.emplace_back(GetOpFunc(*groups[i][0], i));
lowered_funcs.emplace_back(op_lowerer.Lower(groups[i]));
}
for (auto&& lowered_func : lowered_funcs) {
......@@ -70,83 +74,6 @@ std::unique_ptr<Program> NewIRCompiler::Build() {
return std::make_unique<Program>(scope_, std::move(instructions));
}
std::vector<ir::LoweredFunc> NewIRCompiler::GetOpFunc(const ::ir::Operation& op,
int idx) {
std::vector<ir::Tensor> inputs;
std::vector<common::CINNValue> cinn_inputs;
auto op_name = op.name();
VLOG(4) << "GetOpFunc for op: " << op_name;
// step 1: Deal with Oprands
for (int i = 0; i < op.num_operands(); ++i) {
auto in_value = op.operand_source(i);
// TODO(Aurelius84): For now, use addr as name but it's not wise.
std::string input_id = CompatibleInfo::kInputPrefix +
std::to_string(std::hash<::ir::Value>()(in_value));
auto type_info =
in_value.type().dyn_cast<paddle::dialect::DenseTensorType>();
auto in_shape = phi::vectorize<int>(type_info.dims());
auto dtype = type_info.dtype();
ir::Tensor temp = lang::CreatePlaceHolder(
in_shape, utils::ConvertIRType(dtype), input_id);
inputs.push_back(temp);
cinn_inputs.push_back(common::CINNValue(temp));
}
for (auto out_name : OpGetOutputNames(op)) {
cinn_inputs.push_back(common::CINNValue(out_name));
}
VLOG(4) << "inputs.size(): " << inputs.size();
// step 2: Deal with OpResult
std::vector<Type> out_types;
std::vector<std::vector<int>> out_shapes;
for (int i = 0; i < op.num_results(); ++i) {
auto out_value = op.result(i);
auto type_info =
out_value.type().dyn_cast<paddle::dialect::DenseTensorType>();
out_types.push_back(utils::ConvertIRType(type_info.dtype()));
auto out_shape = phi::vectorize<int>(type_info.dims());
out_shapes.push_back(std::move(out_shape));
}
VLOG(4) << "out_types.size(): " << out_types.size();
NodeAttr node_attrs;
{
VLOG(4) << "op.attributes():" << op.attributes().size();
auto attrs = utils::ConvertAttributes(op.attributes());
node_attrs.node_name = CompatibleInfo::OP_NAMES.at(op_name);
node_attrs.attr_store = std::move(attrs);
}
auto& strategy = Operator::GetAttrs<StrategyFunction>("CINNStrategy");
// NOTE(Aurelius84): Do we need replace all hlir::framework Operator with
// ::ir::Program ?
const hlir::framework::Operator* cinn_op =
Operator::Get(CompatibleInfo::OP_NAMES.at(op_name));
auto impl = OpStrategy::SelectImpl(
strategy[cinn_op](node_attrs, inputs, out_types, out_shapes, target_));
common::CINNValuePack C = impl->fcompute(common::CINNValuePack{cinn_inputs});
poly::StageMap stages = C.back();
// make sure all the tensors in the stages before schedule launch.
for (int i = 0; i < C->size() - 1; i++) {
ir::Expr temp = C[i];
stages->InsertLazily(temp.as_tensor_ref());
}
C = impl->fschedule(C);
for (int i = 0; i < C->size() - 1; i++) {
ir::Expr temp = C[i];
// checkout whether the tensor is with buffer.
if ((!temp.as_tensor_ref()->buffer.defined() ||
this->target_ != common::DefaultNVGPUTarget()) &&
!stages[temp.as_tensor_ref()]->inlined()) {
inputs.push_back(temp.as_tensor_ref());
}
}
auto func = lang::LowerVec(
GenOpFuncName(op, idx), stages, inputs, {}, {}, nullptr, target_);
return func;
}
void NewIRCompiler::ProcessFunction(
const std::vector<ir::LoweredFunc>& lowered_funcs) {
for (auto&& func : lowered_funcs) {
......@@ -173,71 +100,32 @@ void NewIRCompiler::ProcessFunction(
}
std::vector<std::unique_ptr<Instruction>> NewIRCompiler::BuildInstructions(
const std::vector<std::vector<::ir::Operation*>>& groups) {
const std::vector<newir::GroupPtr>& groups) {
std::vector<std::unique_ptr<Instruction>> instructions;
for (int idx = 0; idx < groups.size(); ++idx) {
// TODO(Aurelius84): only support single op in groups
auto& op = *groups[idx][0];
auto instr_name = op.name();
auto instr =
std::unique_ptr<Instruction>(new Instruction(target_,
auto& op = *(groups[idx]->ops[0]);
auto& fn_name = groups[idx]->fn_name;
auto instr = std::unique_ptr<Instruction>(
new Instruction(target_,
scope_.get(),
OpGetInputNames(op),
OpGetOutputNames(op),
instr_name));
auto& op_func_name = GenOpFuncName(op, idx);
auto* fn_ptr = compiler_->Lookup(op_func_name);
CompatibleInfo::InputNames(op),
CompatibleInfo::OutputNames(op),
fn_name));
VLOG(1) << "Lookup kernel name: " << fn_name;
auto* fn_ptr = compiler_->Lookup(fn_name);
CHECK(fn_ptr);
instr->SetLoweredFunc(reinterpret_cast<void*>(fn_ptr), op_func_name);
instr->SetLoweredFunc(reinterpret_cast<void*>(fn_ptr), fn_name);
// As some instruction like reduce, will generate more than one kernel.
// So try to find the rest kernel, if it exists.
// SetSubKernels(instr.get(), op_func_name);
instr->Finalize();
instructions.push_back(std::move(instr));
}
return instructions;
}
const std::string& NewIRCompiler::GenOpFuncName(const ::ir::Operation& op,
int idx) {
// TODO(Aurelius84): . will raise compiler error in pd.xxx, need more
// elegant way to generate function name.
std::string op_name = op.name().substr(3) + "_" + std::to_string(idx);
std::string func_name = Context::Global().NewName("fn_" + op_name);
func_names_.try_emplace(op_name, func_name);
return func_names_.at(op_name);
}
std::vector<std::string> NewIRCompiler::OpGetInputNames(
const ::ir::Operation& op) {
std::vector<std::string> names;
std::unordered_set<std::string> repeat;
for (int i = 0; i < op.num_operands(); ++i) {
auto value = op.operand_source(i);
std::string name = CompatibleInfo::kInputPrefix +
std::to_string(std::hash<::ir::Value>()(value));
if (repeat.count(name)) {
continue;
}
repeat.insert(name);
names.push_back(name);
}
return names;
}
std::vector<std::string> NewIRCompiler::OpGetOutputNames(
const ::ir::Operation& op) {
std::vector<std::string> names;
for (int i = 0; i < op.num_results(); ++i) {
auto value = op.result(i);
std::string name = CompatibleInfo::kOutputPrefix +
std::to_string(std::hash<::ir::Value>()(value));
names.push_back(std::move(name));
}
return names;
}
std::shared_ptr<Scope> BuildScope(const Target& target,
const ::ir::Program& program) {
std::unordered_set<::ir::Value> visited;
......
......@@ -20,19 +20,12 @@
#include "paddle/ir/core/program.h"
#include "paddle/cinn/hlir/framework/graph_compiler.h"
#include "paddle/cinn/hlir/framework/op_lowering.h"
namespace cinn {
namespace hlir {
namespace framework {
struct CompatibleInfo {
static constexpr char* kInputPrefix = "input_";
static constexpr char* kOutputPrefix = "output_";
// TODO(Aurelius): Need add name mapping logic in REGISTER_CINN_OP
// macros or attempt to unify Op name with Paddle and CINN.
static const std::unordered_map<std::string, std::string> OP_NAMES;
};
// TODO(Aurelius84): Need abstract this logic to implement Proxy for
// the co-existance with GraphCompiler.
class NewIRCompiler final {
......@@ -46,21 +39,18 @@ class NewIRCompiler final {
scope_(scope) {}
std::unique_ptr<Program> Build();
std::vector<ir::LoweredFunc> GetOpFunc(const ::ir::Operation& op, int idx);
void ProcessFunction(const std::vector<ir::LoweredFunc>& lowered_funcs);
std::vector<std::unique_ptr<Instruction>> BuildInstructions(
const std::vector<std::vector<::ir::Operation*>>& groups);
private:
CINN_DISALLOW_COPY_AND_ASSIGN(NewIRCompiler);
protected:
const std::string& GenOpFuncName(const ::ir::Operation& op, int idx);
std::unique_ptr<Program> Build(const std::vector<newir::GroupPtr>& groups);
std::vector<std::string> OpGetInputNames(const ::ir::Operation& op);
std::vector<ir::LoweredFunc> GetOpFunc(const ::ir::Operation& op, int idx);
std::vector<std::string> OpGetOutputNames(const ::ir::Operation& op);
void ProcessFunction(const std::vector<ir::LoweredFunc>& lowered_funcs);
private:
CINN_DISALLOW_COPY_AND_ASSIGN(NewIRCompiler);
std::vector<std::unique_ptr<Instruction>> BuildInstructions(
const std::vector<newir::GroupPtr>& groups);
const ::ir::Program& program_;
ir::Module::Builder m_builder_;
......
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......@@ -13,165 +13,65 @@
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "paddle/cinn/common/target.h"
#include "paddle/cinn/hlir/framework/graph.h"
#include "paddle/cinn/hlir/framework/instruction.h"
#include "paddle/cinn/hlir/framework/op_strategy.h"
#include "paddle/cinn/ir/lowered_func.h"
#include "paddle/cinn/ir/schedule/ir_schedule.h"
#include "paddle/cinn/ir/schedule/ir_schedule_util.h"
#include "paddle/cinn/hlir/framework/op_lowering_impl.h"
#include "paddle/cinn/hlir/framework/op_lowering_impl_base.h"
#include "paddle/cinn/lang/packed_func.h"
// Fusion Op lowering, there are four kinds of lowering function:
// Elementwise/Broadcast/Injective,Reduce,OutEWiseFusable,NonFusible.
// Elementwise/Broadcast/Injective Ops is with same shcedule.
// Reduce,OutEWiseFusable,NonFusible are using different schedule.
#ifndef CINN_WITH_ONLY
#include "paddle/cinn/hlir/framework/new_ir/op_lowering_impl.h"
#endif
namespace cinn {
namespace hlir {
namespace framework {
using GroupPtr = std::shared_ptr<Graph::Group>;
using common::Target;
using GroupPtr = std::shared_ptr<hlir::framework::Graph::Group>;
class OpLowerer;
typedef bool (OpLowerer::*ScheduleDetermineFunction)(Node*);
template <typename T>
class OpLowerer {
public:
OpLowerer(const absl::flat_hash_map<std::string, Type>&,
const absl::flat_hash_map<std::string, shape_t>&,
const Target&);
explicit OpLowerer(OpLowererImplBase<T>* impl) { impl_.reset(impl); }
~OpLowerer() {}
/**
* @brief Lower a group to CINN IR.
* @param group The group to be lowered.
* @param apply_op_schedule Whether to schedule at Op level.
* @param apply_group_schedule Whether to schedule at group level.
* @return The lowered funcs.
*/
std::vector<ir::LoweredFunc> Lower(const GroupPtr& group,
std::vector<ir::LoweredFunc> Lower(const T& group,
bool apply_op_schedule = true,
bool apply_group_schedule = true);
bool apply_group_schedule = true) {
return impl_->Lower(group, apply_op_schedule, apply_group_schedule);
}
private:
/**
* @brief Lower a group to CINN IR.
* @param group The group to be lowered.
* @param apply_op_schedule Whether to schedule at Op level.
* @param apply_group_schedule Whether to schedule at group level.
* @param schedule_determine_func Function used to determine which Ops to
* schedule.
* @return The lowered funcs.
*/
std::vector<ir::LoweredFunc> LowerGroup(
const GroupPtr& group,
bool apply_op_schedule,
bool apply_group_schedule,
ScheduleDetermineFunction schedule_determine_func);
/**
* @brief Lower a group composed of CustomCall Op.
* @param group The group to be lowered.
* @return The lowered funcs.
*/
std::vector<ir::LoweredFunc> LowerCustomCall(const GroupPtr& group);
/**
* @brief Post processing, including preparing function args and temporary
* variables, applying low-level optimization passes, etc.
* @param group The group to be lowered.
* @param tensor_map All tensors used for calculating the group.
* @param done_op_schedule Mark whether the Op level schedule has been
* applied.
* @param ir_sch The IRSchedule object of group.
* @param group_func_arg_tensors Tensors used as the group function arguments.
* @return The lowered funcs after the post processing.
*/
std::vector<ir::LoweredFunc> PostProcess(
const GroupPtr& group,
const std::unordered_map<std::string, ir::Tensor>& tensor_map,
bool done_op_schedule,
ir::IRSchedule* ir_sch,
std::vector<ir::Tensor>* group_func_arg_tensors);
/**
* @brief Lower an Op set to CINN IR.
* Compute, Lower and optional Schedule will be performed one by one
* for each Op.
* @param nodes The Op nodes to be lowered.
* @param apply_op_schedule Whether to schedule at Op level.
* @param schedule_determine_func Function used to determine which Ops to
* schedule.
* @param group_func_arg_tensors Tensors used as the group function arguments.
* @param tensor_map All tensors used for calculating the group.
* @return The lowered func bodies of Op set.
*/
std::vector<ir::Expr> LowerOps(
const std::vector<Node*>& nodes,
bool apply_op_schedule,
ScheduleDetermineFunction schedule_determine_func,
std::vector<ir::Tensor>* group_func_arg_tensors,
std::unordered_map<std::string, ir::Tensor>* tensor_map);
/**
* @brief Lower an Op to CINN IR. The Compute and Lower processes will be
* called sequentially.
* @param op_impl The Op implementation defining Compute and Schedule.
* @param node The Op node to be lowered.
* @param tensor_map All tensors used for calculating the group.
* @param op_func_arg_tensors Tensors used as the Op function arguments.
* @return The lowered func of the Op node.
*/
std::vector<ir::LoweredFunc> DoOpLower(
std::shared_ptr<hlir::framework::OpImpl> op_impl,
Node* node,
std::unordered_map<std::string, ir::Tensor>* tensor_map,
std::vector<ir::Tensor>* op_func_arg_tensors);
/**
* @brief Apply schedule on an Op.
* @param op_impl The Op implementation defining Compute and Schedule.
* @param op_func_arg_tensors Tensors used as the Op function arguments.
* @param lowered_funcs The lowered funcs of an Op to be scheduled.
* @return The lowered func body after schedule of the Op.
*/
ir::Expr DoOpSchedule(std::shared_ptr<hlir::framework::OpImpl> op_impl,
const std::vector<ir::Tensor>& op_func_arg_tensors,
const std::vector<ir::LoweredFunc>& lowered_funcs);
/**
* @brief Apply schedule on a group.
* @param ir_sch The IRSchedule containing the entire group's lowered func
* bodies.
* @param group The group to be scheduled.
* @param tensor_map All tensors used for calculating the group.
* @return The lowered func body after schedule of the group.
*/
ir::Expr DoGroupSchedule(
ir::IRSchedule& ir_sch, // NOLINT
const GroupPtr& group,
const std::unordered_map<std::string, ir::Tensor>& tensor_map);
// Functions used to determine which Ops to schedule at op level, define a
// policy for each type of group.
inline bool ReduceScheduleDetermineFunction(Node* node);
inline bool ElementwiseScheduleDetermineFunction(Node* node);
inline bool NonFusibleScheduleDetermineFunction(Node* node);
std::shared_ptr<OpLowererImplBase<T>> impl_;
};
private:
Target target_;
const absl::flat_hash_map<std::string, Type>& type_dict_;
const absl::flat_hash_map<std::string, shape_t>& shape_dict_;
template <typename T = GroupPtr>
OpLowerer<T> CreateOpLowerer(const absl::flat_hash_map<std::string, Type>&,
const absl::flat_hash_map<std::string, shape_t>&,
const Target&);
// fucntion name prefix
const std::string func_name_prefix = "fn_";
};
template <>
inline OpLowerer<GroupPtr> CreateOpLowerer(
const absl::flat_hash_map<std::string, Type>& type_dict,
const absl::flat_hash_map<std::string, shape_t>& shape_dict,
const Target& target) {
auto* impl_base = new OpLowererImpl(type_dict, shape_dict, target);
return OpLowerer<GroupPtr>(impl_base);
}
#ifndef CINN_WITH_ONLY
template <typename T = newir::GroupPtr>
OpLowerer<T> CreateOpLowerer(const Target&);
template <>
inline OpLowerer<newir::GroupPtr> CreateOpLowerer(const Target& target) {
auto* impl_base = new newir::OpLowererImpl(target);
return OpLowerer<newir::GroupPtr>(impl_base);
}
#endif
} // namespace framework
} // namespace hlir
......
......@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/hlir/framework/op_lowering.h"
#include "paddle/cinn/hlir/framework/op_lowering_impl.h"
#include "paddle/cinn/hlir/framework/op_lowering_util.h"
#include "paddle/cinn/hlir/op/external_api_registry.h"
......@@ -38,13 +38,13 @@ using common::Type;
using cinn::hlir::op::ExternalApiRegistry;
OpLowerer::OpLowerer(
OpLowererImpl::OpLowererImpl(
const absl::flat_hash_map<std::string, Type>& type_dict,
const absl::flat_hash_map<std::string, shape_t>& shape_dict,
const Target& target)
: type_dict_(type_dict), shape_dict_(shape_dict), target_(target) {}
std::vector<ir::LoweredFunc> OpLowerer::Lower(const GroupPtr& group,
std::vector<ir::LoweredFunc> OpLowererImpl::Lower(const GroupPtr& group,
bool apply_op_schedule,
bool apply_group_schedule) {
VLOG(3) << "Lowering Group : " << group->group_id
......@@ -58,36 +58,38 @@ std::vector<ir::LoweredFunc> OpLowerer::Lower(const GroupPtr& group,
return LowerGroup(group,
apply_op_schedule,
apply_group_schedule,
&OpLowerer::ElementwiseScheduleDetermineFunction);
&OpLowererImpl::ElementwiseScheduleDetermineFunction);
case framework::kReduction:
return LowerGroup(group,
apply_op_schedule,
apply_group_schedule,
&OpLowerer::ReduceScheduleDetermineFunction);
&OpLowererImpl::ReduceScheduleDetermineFunction);
case framework::kOutFusible:
LOG(FATAL) << "Group Pattern Kind kOutFusible Is Not Implemented!";
case framework::kNonFusible:
return LowerGroup(group,
apply_op_schedule,
apply_group_schedule,
&OpLowerer::NonFusibleScheduleDetermineFunction);
&OpLowererImpl::NonFusibleScheduleDetermineFunction);
default:
LOG(FATAL) << "Group Pattern Kind Is Unknown!";
}
}
bool OpLowerer::ElementwiseScheduleDetermineFunction(Node* node) {
bool OpLowererImpl::ElementwiseScheduleDetermineFunction(Node* node) {
return true;
}
bool OpLowerer::ReduceScheduleDetermineFunction(Node* node) {
bool OpLowererImpl::ReduceScheduleDetermineFunction(Node* node) {
auto& op_pattern_dict = Operator::GetAttrs<OpPatternKind>("OpPattern");
return op_pattern_dict[node->op()] == framework::kReduction;
}
bool OpLowerer::NonFusibleScheduleDetermineFunction(Node* node) { return true; }
bool OpLowererImpl::NonFusibleScheduleDetermineFunction(Node* node) {
return true;
}
std::vector<ir::LoweredFunc> OpLowerer::LowerGroup(
std::vector<ir::LoweredFunc> OpLowererImpl::LowerGroup(
const GroupPtr& group,
bool apply_op_schedule,
bool apply_group_schedule,
......@@ -126,7 +128,8 @@ std::vector<ir::LoweredFunc> OpLowerer::LowerGroup(
group, tensor_map, do_op_schedule, &ir_sch, &group_func_arg_tensors);
}
std::vector<ir::LoweredFunc> OpLowerer::LowerCustomCall(const GroupPtr& group) {
std::vector<ir::LoweredFunc> OpLowererImpl::LowerCustomCall(
const GroupPtr& group) {
std::vector<Node*> nodes = group->CollectNodes();
CHECK_EQ(nodes.size(), 1);
Node* node = nodes[0];
......@@ -178,7 +181,7 @@ std::vector<ir::LoweredFunc> OpLowerer::LowerCustomCall(const GroupPtr& group) {
return {pack[0].operator ir::Expr().as_lowered_func_ref()};
}
std::vector<ir::LoweredFunc> OpLowerer::PostProcess(
std::vector<ir::LoweredFunc> OpLowererImpl::PostProcess(
const GroupPtr& group,
const std::unordered_map<std::string, ir::Tensor>& tensor_map,
bool done_op_schedule,
......@@ -260,7 +263,7 @@ std::vector<ir::LoweredFunc> OpLowerer::PostProcess(
return {func};
}
std::vector<ir::Expr> OpLowerer::LowerOps(
std::vector<ir::Expr> OpLowererImpl::LowerOps(
const std::vector<Node*>& nodes,
bool apply_op_schedule,
ScheduleDetermineFunction schedule_determine_func,
......@@ -307,7 +310,7 @@ std::vector<ir::Expr> OpLowerer::LowerOps(
return func_bodies;
}
std::vector<ir::LoweredFunc> OpLowerer::DoOpLower(
std::vector<ir::LoweredFunc> OpLowererImpl::DoOpLower(
std::shared_ptr<hlir::framework::OpImpl> op_impl,
Node* node,
std::unordered_map<std::string, ir::Tensor>* tensor_map,
......@@ -375,7 +378,7 @@ std::vector<ir::LoweredFunc> OpLowerer::DoOpLower(
return funcs;
}
ir::Expr OpLowerer::DoOpSchedule(
ir::Expr OpLowererImpl::DoOpSchedule(
std::shared_ptr<hlir::framework::OpImpl> op_impl,
const std::vector<ir::Tensor>& op_func_arg_tensors,
const std::vector<ir::LoweredFunc>& lowered_funcs) {
......@@ -398,7 +401,7 @@ ir::Expr OpLowerer::DoOpSchedule(
}
// group schedule
ir::Expr OpLowerer::DoGroupSchedule(
ir::Expr OpLowererImpl::DoGroupSchedule(
ir::IRSchedule& ir_sch,
const GroupPtr& group,
const std::unordered_map<std::string, ir::Tensor>& tensor_map) {
......
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "paddle/cinn/common/target.h"
#include "paddle/cinn/hlir/framework/graph.h"
#include "paddle/cinn/hlir/framework/instruction.h"
#include "paddle/cinn/hlir/framework/op_lowering_impl_base.h"
#include "paddle/cinn/hlir/framework/op_strategy.h"
#include "paddle/cinn/ir/schedule/ir_schedule.h"
#include "paddle/cinn/ir/schedule/ir_schedule_util.h"
#include "paddle/cinn/lang/packed_func.h"
// Fusion Op lowering, there are four kinds of lowering function:
// Elementwise/Broadcast/Injective,Reduce,OutEWiseFusable,NonFusible.
// Elementwise/Broadcast/Injective Ops is with same shcedule.
// Reduce,OutEWiseFusable,NonFusible are using different schedule.
namespace cinn {
namespace hlir {
namespace framework {
using GroupPtr = std::shared_ptr<Graph::Group>;
using common::Target;
class OpLowererImpl;
typedef bool (OpLowererImpl::*ScheduleDetermineFunction)(Node*);
class OpLowererImpl : public OpLowererImplBase<GroupPtr> {
public:
OpLowererImpl(const absl::flat_hash_map<std::string, Type>&,
const absl::flat_hash_map<std::string, shape_t>&,
const Target&);
/**
* @brief Lower a group to CINN IR.
* @param group The group to be lowered.
* @param apply_op_schedule Whether to schedule at Op level.
* @param apply_group_schedule Whether to schedule at group level.
* @return The lowered funcs.
*/
std::vector<ir::LoweredFunc> Lower(const GroupPtr& group,
bool apply_op_schedule = true,
bool apply_group_schedule = true);
private:
/**
* @brief Lower a group to CINN IR.
* @param group The group to be lowered.
* @param apply_op_schedule Whether to schedule at Op level.
* @param apply_group_schedule Whether to schedule at group level.
* @param schedule_determine_func Function used to determine which Ops to
* schedule.
* @return The lowered funcs.
*/
std::vector<ir::LoweredFunc> LowerGroup(
const GroupPtr& group,
bool apply_op_schedule,
bool apply_group_schedule,
ScheduleDetermineFunction schedule_determine_func);
/**
* @brief Lower a group composed of CustomCall Op.
* @param group The group to be lowered.
* @return The lowered funcs.
*/
std::vector<ir::LoweredFunc> LowerCustomCall(const GroupPtr& group);
/**
* @brief Post processing, including preparing function args and temporary
* variables, applying low-level optimization passes, etc.
* @param group The group to be lowered.
* @param tensor_map All tensors used for calculating the group.
* @param done_op_schedule Mark whether the Op level schedule has been
* applied.
* @param ir_sch The IRSchedule object of group.
* @param group_func_arg_tensors Tensors used as the group function arguments.
* @return The lowered funcs after the post processing.
*/
std::vector<ir::LoweredFunc> PostProcess(
const GroupPtr& group,
const std::unordered_map<std::string, ir::Tensor>& tensor_map,
bool done_op_schedule,
ir::IRSchedule* ir_sch,
std::vector<ir::Tensor>* group_func_arg_tensors);
/**
* @brief Lower an Op set to CINN IR.
* Compute, Lower and optional Schedule will be performed one by one
* for each Op.
* @param nodes The Op nodes to be lowered.
* @param apply_op_schedule Whether to schedule at Op level.
* @param schedule_determine_func Function used to determine which Ops to
* schedule.
* @param group_func_arg_tensors Tensors used as the group function arguments.
* @param tensor_map All tensors used for calculating the group.
* @return The lowered func bodies of Op set.
*/
std::vector<ir::Expr> LowerOps(
const std::vector<Node*>& nodes,
bool apply_op_schedule,
ScheduleDetermineFunction schedule_determine_func,
std::vector<ir::Tensor>* group_func_arg_tensors,
std::unordered_map<std::string, ir::Tensor>* tensor_map);
/**
* @brief Lower an Op to CINN IR. The Compute and Lower processes will be
* called sequentially.
* @param op_impl The Op implementation defining Compute and Schedule.
* @param node The Op node to be lowered.
* @param tensor_map All tensors used for calculating the group.
* @param op_func_arg_tensors Tensors used as the Op function arguments.
* @return The lowered func of the Op node.
*/
std::vector<ir::LoweredFunc> DoOpLower(
std::shared_ptr<hlir::framework::OpImpl> op_impl,
Node* node,
std::unordered_map<std::string, ir::Tensor>* tensor_map,
std::vector<ir::Tensor>* op_func_arg_tensors);
/**
* @brief Apply schedule on an Op.
* @param op_impl The Op implementation defining Compute and Schedule.
* @param op_func_arg_tensors Tensors used as the Op function arguments.
* @param lowered_funcs The lowered funcs of an Op to be scheduled.
* @return The lowered func body after schedule of the Op.
*/
ir::Expr DoOpSchedule(std::shared_ptr<hlir::framework::OpImpl> op_impl,
const std::vector<ir::Tensor>& op_func_arg_tensors,
const std::vector<ir::LoweredFunc>& lowered_funcs);
/**
* @brief Apply schedule on a group.
* @param ir_sch The IRSchedule containing the entire group's lowered func
* bodies.
* @param group The group to be scheduled.
* @param tensor_map All tensors used for calculating the group.
* @return The lowered func body after schedule of the group.
*/
ir::Expr DoGroupSchedule(
ir::IRSchedule& ir_sch, // NOLINT
const GroupPtr& group,
const std::unordered_map<std::string, ir::Tensor>& tensor_map);
// Functions used to determine which Ops to schedule at op level, define a
// policy for each type of group.
inline bool ReduceScheduleDetermineFunction(Node* node);
inline bool ElementwiseScheduleDetermineFunction(Node* node);
inline bool NonFusibleScheduleDetermineFunction(Node* node);
private:
Target target_;
const absl::flat_hash_map<std::string, Type>& type_dict_;
const absl::flat_hash_map<std::string, shape_t>& shape_dict_;
// fucntion name prefix
const std::string func_name_prefix = "fn_";
};
} // namespace framework
} // namespace hlir
} // namespace cinn
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include "paddle/cinn/ir/lowered_func.h"
// Fusion Op lowering, there are four kinds of lowering function:
// Elementwise/Broadcast/Injective,Reduce,OutEWiseFusable,NonFusible.
// Elementwise/Broadcast/Injective Ops is with same shcedule.
// Reduce,OutEWiseFusable,NonFusible are using different schedule.
namespace cinn {
namespace hlir {
namespace framework {
template <typename T>
class OpLowererImplBase {
public:
OpLowererImplBase() = default;
~OpLowererImplBase() = default;
virtual std::vector<ir::LoweredFunc> Lower(
const T& group,
bool apply_op_schedule = true,
bool apply_group_schedule = true) = 0;
};
} // namespace framework
} // namespace hlir
} // namespace cinn
......@@ -72,7 +72,7 @@ void Compile(NetBuilder& net_builder) { // NOLINT
graph->GetMutableAttrs<absl::flat_hash_map<std::string, shape_t>>(
"infershape");
OpLowerer op_lowerer(dtype_dict, shape_dict, target);
auto op_lowerer = CreateOpLowerer(dtype_dict, shape_dict, target);
for (auto& fusion_op : graph->fusion_groups) {
auto lowered_func = op_lowerer.Lower(fusion_op);
CHECK_EQ(lowered_func.size(), 1);
......
......@@ -16,7 +16,7 @@
#include <queue>
#include "paddle/cinn/hlir/framework/op_lowering.h"
#include "paddle/cinn/hlir/framework/op_lowering_impl.h"
namespace cinn {
namespace hlir {
......
......@@ -27,6 +27,7 @@
#include "paddle/cinn/backends/nvrtc/nvrtc_util.h"
#include "paddle/cinn/common/context.h"
#include "paddle/cinn/hlir/framework/graph_compiler_util.h"
#include "paddle/cinn/hlir/framework/op_lowering.h"
#include "paddle/cinn/hlir/framework/pass.h"
#include "paddle/cinn/ir/module.h"
#include "paddle/cinn/runtime/flags.h"
......@@ -124,7 +125,7 @@ void ParallelCompiler::Task::Lowering() {
context->graph
->GetMutableAttrs<absl::flat_hash_map<std::string, shape_t>>(
"infershape");
OpLowerer op_lowerer(dtype_dict, shape_dict, context->target);
auto op_lowerer = CreateOpLowerer(dtype_dict, shape_dict, context->target);
auto& group = context->graph->fusion_groups[group_id];
VLOG(4) << "Start Lowering Group " << group_id << " at "
<< std::this_thread::get_id() << " :\n"
......
......@@ -21,7 +21,6 @@
#include "paddle/cinn/hlir/framework/graph.h"
#include "paddle/cinn/hlir/framework/graph_compiler_util.h"
#include "paddle/cinn/hlir/framework/instruction.h"
#include "paddle/cinn/hlir/framework/op_lowering.h"
#include "paddle/cinn/ir/lowered_func.h"
#ifdef CINN_WITH_CUDA
#include "paddle/cinn/runtime/cuda/cuda_module.h"
......
......@@ -34,11 +34,6 @@ CINNSchedule GetElementwiseScheduleFunc(
common::CINNValuePack arg_pack = args[0];
CHECK_GT(arg_pack.size(), 0U)
<< "arg_pack.size() must contains at least one element.";
// TODO(Aurelius84): For NewIrCompiler, the outputs of Compute are
// tensor_ref and not Expr.
bool is_tensor_stages = arg_pack.size() == 2U && arg_pack[0].is_tensor() &&
arg_pack[1].is_stagemap();
if (!is_tensor_stages) {
std::vector<Expr> vec_ast;
for (int i = 0; i < arg_pack.size(); i++) {
if (arg_pack[i].is_expr()) {
......@@ -54,25 +49,6 @@ CINNSchedule GetElementwiseScheduleFunc(
std::vector<common::CINNValue> res{
common::CINNValue(ir_sch.GetModule().GetExprs().at(0))};
*ret = common::CINNValuePack{res};
} else {
CHECK(!args.empty()) << "The input argument of ElementwiseSchedule is "
"empty! Please check.\n";
common::CINNValuePack arg_pack = args[0];
Expr out = arg_pack[0];
poly::StageMap stages = arg_pack[1];
CHECK(out.as_tensor());
CHECK_EQ(arg_pack.size(), 2UL);
if (target.arch == Target::Arch::NVGPU) {
pe::CudaScheduleInjective(
stages[out.as_tensor_ref()], output_shapes.front(), target);
} else if (target.arch == Target::Arch::X86) {
pe::ScheduleInjectiveCPU(stages[out.as_tensor_ref()],
output_shapes.front(),
target,
vectorizable);
}
*ret = arg_pack;
}
});
}
......
......@@ -38,7 +38,8 @@ IRSchedule MakeIRSchedule(frontend::Program* program) {
"inferdtype");
auto& shape_dict = graph->GetMutableAttrs<
absl::flat_hash_map<std::string, hlir::framework::shape_t>>("infershape");
hlir::framework::OpLowerer op_lowerer(dtype_dict, shape_dict, target);
auto op_lowerer =
hlir::framework::CreateOpLowerer(dtype_dict, shape_dict, target);
std::vector<LoweredFunc> lowered_funcs =
op_lowerer.Lower(graph->fusion_groups.front(), false, false);
......
......@@ -16,14 +16,10 @@ limitations under the License. */
#include "glog/logging.h"
#include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/platform/flags.h"
#include "paddle/phi/core/flags.h"
PD_DECLARE_bool(benchmark);
PADDLE_DEFINE_EXPORTED_bool(
eager_delete_scope,
true,
"Delete local scope eagerly. It will reduce GPU memory usage but "
"slow down the destruction of variables.(around 1% performance harm)");
PHI_DECLARE_bool(eager_delete_scope);
#define SCOPE_KIDS_READER_LOCK phi::AutoRDLock auto_lock(&kids_lock_);
#define SCOPE_KIDS_WRITER_LOCK phi::AutoWRLock auto_lock(&kids_lock_);
......
......@@ -32,15 +32,10 @@ limitations under the License. */
#endif
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/flags.h"
#include "paddle/fluid/platform/os_info.h"
PADDLE_DEFINE_EXPORTED_bool(enable_rpc_profiler,
false,
"Enable rpc profiler or not.");
#include "paddle/phi/core/flags.h"
PD_DEFINE_bool(enable_record_memory,
false,
"enable memory recorder"); // NOLINT
PHI_DECLARE_bool(enable_record_memory);
#if defined(_WIN32) && defined(PHI_SHARED)
phi::ProfilerState phi::ProfilerHelper::g_state = phi::ProfilerState::kDisabled;
......@@ -610,12 +605,6 @@ MemEvenRecorder::RecordMemEvent::~RecordMemEvent() { // NOLINT
PopMemEvent(start_ns_, end_ns_, bytes_, place_, annotation_free);
}
/*RecordRPCEvent::RecordRPCEvent(const std::string &name) {
if (FLAGS_enable_rpc_profiler) {
event_.reset(new platform::RecordEvent(name));
}
}*/
RecordBlock::RecordBlock(int block_id)
: is_enabled_(false), start_ns_(PosixInNsec()) {
// lock is not needed, the code below is thread-safe
......
......@@ -37,13 +37,6 @@
#include "paddle/phi/backends/device_manager.h"
#endif
// Used to filter events, works like glog VLOG(level).
// RecordEvent will works if host_trace_level >= level.
PADDLE_DEFINE_EXPORTED_int64(host_trace_level,
1,
"RecordEvent will works "
"if host_trace_level >= level.");
namespace paddle {
namespace platform {
......
......@@ -27,7 +27,7 @@
#include "paddle/fluid/platform/profiler/event_python.h"
#include "paddle/fluid/platform/profiler/tracer_base.h"
PD_DECLARE_int64(host_trace_level);
PHI_DECLARE_int64(host_trace_level);
namespace paddle {
namespace platform {
......
......@@ -1300,3 +1300,18 @@ PHI_DEFINE_EXPORTED_bool(enable_new_ir_api,
PHI_DEFINE_EXPORTED_bool(enable_new_ir_in_executor_trace_run,
false,
"Enable new IR in executor");
PHI_DEFINE_EXPORTED_bool(enable_record_memory, false, "Enable memory recorder");
PHI_DEFINE_EXPORTED_bool(
eager_delete_scope,
true,
"Delete local scope eagerly. It will reduce GPU memory usage but "
"slow down the destruction of variables.(around 1% performance harm)");
// Used to filter events, works like glog VLOG(level).
// RecordEvent will works if host_trace_level >= level.
PHI_DEFINE_EXPORTED_int64(host_trace_level,
1,
"RecordEvent will works "
"if host_trace_level >= level.");
......@@ -200,6 +200,9 @@ if(${len} GREATER_EQUAL 1)
if(NOT ((NOT WITH_PYTHON) AND ON_INFER))
target_link_libraries(${test_name} ${PYTHON_LIBRARIES})
endif()
if(WITH_CINN AND NOT CINN_ONLY)
target_link_libraries(${test_name} $<TARGET_LINKER_FILE:cinnapi>)
endif()
if(WITH_XPU)
target_link_libraries(${test_name} xpulib)
endif()
......
......@@ -37,28 +37,33 @@ std::unique_ptr<::ir::Program> BuildProgram() {
auto program = std::make_unique<::ir::Program>(ctx);
::ir::Builder builder = ::ir::Builder(ctx, program->block());
const float value = 2.0;
const float value_one = 1.0; // relu(tan(1.)) = 1.5;
const float value_two = 2.0; // relu(tan(2.)) = 0.
auto full_op_x =
builder.Build<paddle::dialect::FullOp>(std::vector<int64_t>{64, 128},
value,
value_one,
phi::DataType::FLOAT32,
phi::GPUPlace());
auto full_op_y =
builder.Build<paddle::dialect::FullOp>(std::vector<int64_t>{128, 64},
value,
builder.Build<paddle::dialect::FullOp>(std::vector<int64_t>{64, 128},
value_two,
phi::DataType::FLOAT32,
phi::GPUPlace());
// TODO(Aurelius84): test more op
// auto add_z = builder.Build<paddle::dialect::MatmulOp>(full_op_x->result(0),
// full_op_y->result(0));
auto tanh_op_x = builder.Build<paddle::dialect::TanOp>(full_op_x->result(0));
auto relu_op_x = builder.Build<paddle::dialect::ReluOp>(tanh_op_x->result(0));
auto tanh_op_y = builder.Build<paddle::dialect::TanOp>(full_op_y->result(0));
auto relu_op_y = builder.Build<paddle::dialect::ReluOp>(tanh_op_y->result(0));
return std::move(program);
}
TEST(NewIRCompier, CompilerAndRun) {
// Step 1: Construct ir::Program
std::unique_ptr<::ir::Program> program = BuildProgram();
EXPECT_EQ(program->block()->size(), 2u);
EXPECT_EQ(program->block()->size(), 6u);
LOG(INFO) << program->block()->size();
std::stringstream ss;
program->Print(ss);
......@@ -67,21 +72,19 @@ TEST(NewIRCompier, CompilerAndRun) {
// Step 2: Compiler New ir::Program into Runtime Program
auto target = cinn::common::DefaultNVGPUTarget();
auto scope = cinn::hlir::framework::BuildScope(target, *program);
ASSERT_EQ(scope->var_names().size(), 2);
ASSERT_EQ(scope->var_names().size(), 6);
cinn::hlir::framework::NewIRCompiler ir_compiler(*program, target, scope);
auto runtime_program = ir_compiler.Build();
// Step 3: Execute Runtime Instruction and check Scope.
ASSERT_NO_THROW(runtime_program->Execute());
const float value = 2.0;
for (auto& var_name : scope->var_names()) {
std::string name = {var_name.begin(), var_name.end()};
std::vector<float> data =
cinn::GetTensorData<float>(scope->GetTensor(name), target);
for (int i = 0; i < data.size(); ++i) {
LOG_FIRST_N(INFO, 3) << "data: " << data[i];
ASSERT_NEAR(data[i], value, 1e-5);
for (int i = 0; i < 1; ++i) {
LOG_FIRST_N(INFO, 10) << "data: " << data[i];
}
}
}
......@@ -89,12 +92,12 @@ TEST(NewIRCompier, CompilerAndRun) {
TEST(RuntimeDialect, CompilerAndRun) {
// Step 1: Construct ir::Program
std::unique_ptr<::ir::Program> program = BuildProgram();
EXPECT_EQ(program->block()->size(), 2u);
EXPECT_EQ(program->block()->size(), 6u);
// Step 2: Compiler New ir::Program into Runtime Program
auto target = cinn::common::DefaultNVGPUTarget();
auto scope = cinn::hlir::framework::BuildScope(target, *program);
ASSERT_EQ(scope->var_names().size(), 2);
ASSERT_EQ(scope->var_names().size(), 6u);
cinn::hlir::framework::NewIRCompiler ir_compiler(*program, target, scope);
auto runtime_program = ir_compiler.Build();
......@@ -119,14 +122,12 @@ TEST(RuntimeDialect, CompilerAndRun) {
#endif
// Step 5: Check Scope Tensor Value.
const float value = 2.0;
for (auto& var_name : scope->var_names()) {
std::string name = {var_name.begin(), var_name.end()};
std::vector<float> data =
cinn::GetTensorData<float>(scope->GetTensor(name), target);
for (int i = 0; i < data.size(); ++i) {
LOG_FIRST_N(INFO, 3) << "data: " << data[i];
ASSERT_NEAR(data[i], value, 1e-5);
for (int i = 0; i < 1; ++i) {
LOG_FIRST_N(INFO, 10) << "data: " << data[i];
}
}
}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册