// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include #include "paddle/fluid/ir/transforms/pd_op_to_kernel_pass.h" #include "paddle/fluid/ir/dialect/kernel_attribute.h" #include "paddle/fluid/ir/dialect/kernel_dialect.h" #include "paddle/fluid/ir/dialect/kernel_op.h" #include "paddle/fluid/ir/dialect/kernel_type.h" #include "paddle/fluid/ir/dialect/op_yaml_info_util.h" #include "paddle/fluid/ir/dialect/pd_attribute.h" #include "paddle/fluid/ir/dialect/pd_dialect.h" #include "paddle/fluid/ir/dialect/utils.h" #include "paddle/fluid/ir/interface/op_yaml_info.h" #include "paddle/fluid/ir/interface/op_yaml_info_parser.h" #include "paddle/fluid/ir/trait/inplace.h" #include "paddle/phi/api/lib/data_transform.h" #include "paddle/phi/api/lib/kernel_dispatch.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/kernel_factory.h" namespace paddle { namespace dialect { const int init_on_gpu_threashold = 1000; std::unordered_map Str2PhiDataType = { {"DataType::FLOAT16", phi::DataType::FLOAT16}, {"DataType::BFLOAT16", phi::DataType::BFLOAT16}, {"DataType::FLOAT32", phi::DataType::FLOAT32}, {"DataType::FLOAT64", phi::DataType::FLOAT64}, {"DataType::INT16", phi::DataType::INT16}, {"DataType::INT32", phi::DataType::INT32}, {"DataType::INT64", phi::DataType::INT64}, {"DataType::INT8", phi::DataType::INT8}, {"DataType::BOOL", phi::DataType::BOOL}, }; const std::unordered_set UnchangeOutputOps = { "pd.data", "builtin.combine", "builtin.slice", "pd.feed", "pd.fetch", "builtin.set_parameter", "builtin.get_parameter", "pd.shadow_output"}; bool NeedFallBackCpu(const ir::Operation* op, const std::string& kernel_fn_name, const phi::KernelKey& kernel_key) { if (UnchangeOutputOps.count(op->name())) { return false; } if (kernel_fn_name == "") { return false; } if (phi::KernelFactory::Instance().HasKernel(kernel_fn_name, kernel_key)) { return false; } phi::KernelKey copy_kernel_key = kernel_key; if (copy_kernel_key.backend() == phi::Backend::GPUDNN) { copy_kernel_key.set_backend(phi::Backend::GPU); if (phi::KernelFactory::Instance().HasKernel(kernel_fn_name, copy_kernel_key)) { return false; } } copy_kernel_key.set_backend(phi::Backend::CPU); if (phi::KernelFactory::Instance().HasKernel(kernel_fn_name, copy_kernel_key)) { return true; } return false; } bool NeedFallBackFromGPUDNN2GPU(ir::Operation* op, const phi::KernelKey kernel_key) { // NOTE(phlrain): keep the same kernel select strategy with // GetExepectKernelKey if (op->name() == "pd.pool2d" || op->name() == "pd.pool2d_grad") { if (kernel_key.backend() == phi::Backend::GPUDNN && (op->attributes().at("adaptive").dyn_cast().data() == true)) { return true; } } return false; } ir::OpResult AddPlaceTransferOp(ir::OpResult in, ir::Type out_type, const phi::Place& src_place, const phi::Place& dst_place, const phi::KernelKey& kernel_key, ir::Program* program) { ir::IrContext* ctx = ir::IrContext::Instance(); std::string op_name = paddle::dialect::PhiKernelOp::name(); ir::OpInfo op_info = ctx->GetRegisteredOpInfo(op_name); if ((src_place.GetType() == phi::AllocationType::CPU) && (dst_place.GetType() == phi::AllocationType::GPU)) { auto copy_kernel_key = kernel_key; copy_kernel_key.set_backend(phi::Backend::GPU); std::unordered_map op_attribute{ {"op_name", ir::StrAttribute::get(ctx, "pd.memcpy_h2d")}, {"kernel_name", ir::StrAttribute::get(ctx, "memcpy_h2d")}, {"kernel_key", dialect::KernelAttribute::get(ctx, copy_kernel_key)}, {"dst_place_type", ir::Int32Attribute::get(ctx, 1)}}; ir::Operation* op = ir::Operation::Create({in}, op_attribute, {out_type}, op_info); program->block()->push_back(op); auto new_in = op->result(0); return new_in; } else if ((src_place.GetType() == phi::AllocationType::GPU) && (dst_place.GetType() == phi::AllocationType::CPU)) { auto copy_kernel_key = kernel_key; copy_kernel_key.set_backend(phi::Backend::GPU); std::unordered_map op_attribute{ {"op_name", ir::StrAttribute::get(ctx, "pd.memcpy_d2h")}, {"kernel_name", ir::StrAttribute::get(ctx, "memcpy_d2h")}, {"kernel_key", dialect::KernelAttribute::get(ctx, copy_kernel_key)}, {"dst_place_type", ir::Int32Attribute::get(ctx, 0)}}; ir::Operation* op = ir::Operation::Create({in}, op_attribute, {out_type}, op_info); program->block()->push_back(op); auto new_in = op->result(0); return new_in; } else { PADDLE_THROW( phi::errors::Unimplemented("Only support cpu to gpu and gpu to cpu")); } } phi::KernelKey GetKernelKey( ir::Operation* op, const phi::Place& place, const std::unordered_map& map_value_pair, dialect::OpYamlInfoParser* op_info_parser = nullptr) { if (op->name() == "pd.feed") { // NOTE, for now feed op don't need a kernel, so the data type from Op // Result the next op use base program datatype return {phi::Backend::CPU, phi::DataLayout::ANY, TransToPhiDataType( op->result(0).type().dyn_cast().dtype())}; } if (op->name() == "pd.data") { // NOTE, for now feed op don't need a kernel, so the data type from Op // Result the next op use base program datatype auto t = op->attributes().at("place").dyn_cast().data(); auto backend = paddle::experimental::ParseBackend(t); return {backend, phi::DataLayout::ANY, TransToPhiDataType( op->result(0).type().dyn_cast().dtype())}; } phi::Backend kernel_backend = phi::Backend::UNDEFINED; phi::DataLayout kernel_layout = phi::DataLayout::UNDEFINED; phi::DataType kernel_data_type = phi::DataType::UNDEFINED; if (op_info_parser != nullptr) { // only suppurt non vector input for now int tensor_input_number = op_info_parser->InputTensorNumber(); auto attr_map = op->attributes(); auto& data_type_info = op_info_parser->OpRuntimeInfo().kernel_key_dtype; if (!data_type_info.empty() && !data_type_info[0].empty()) { // only support single input and attribute auto slot_name = data_type_info[0]; auto& input_map = op_info_parser->InputName2Id(); auto find_it = Str2PhiDataType.find(slot_name); if (find_it != Str2PhiDataType.end()) { kernel_data_type = find_it->second; } else if (input_map.count(slot_name)) { // parse from input int in_index = input_map.at(slot_name); auto type = map_value_pair.at(op->operand_source(in_index)).type(); if (type.isa()) { kernel_data_type = TransToPhiDataType( type.dyn_cast() .dtype()); } else if (type.isa()) { auto vec_data = type.dyn_cast().data(); if (vec_data.empty()) { kernel_data_type = phi::DataType::UNDEFINED; } else { if (vec_data[0].isa()) { kernel_data_type = TransToPhiDataType( vec_data[0] .dyn_cast() .dtype()); } else { PADDLE_THROW(phi::errors::Unimplemented( "Only support DenseTensorType in vector")); } } } else if (type.isa()) { kernel_data_type = TransToPhiDataType( type.dyn_cast() .dtype()); } else { PADDLE_THROW(phi::errors::Unimplemented( "Only support DenseTensorType, SelectedRows, VectorType")); } } else { PADDLE_ENFORCE_EQ(attr_map.count(slot_name), true, phi::errors::PreconditionNotMet( "[%s] MUST in attribute map", slot_name)); auto attr_type = op_info_parser->AttrTypeName(slot_name); PADDLE_ENFORCE_EQ(attr_type, "paddle::dialect::DataTypeAttribute", phi::errors::PreconditionNotMet( "Type of [%s] should be DataType", slot_name)); kernel_data_type = attr_map.at(slot_name) .dyn_cast() .data(); } } // parse all the input tensor if (tensor_input_number == 0 || op->name() == "pd.full_") { // all the information have to get from attribute and context if (op->name() == "pd.uniform") { // try to process uniform, use shape to determin backend // TODO(phlrain): shuold support other initilize op auto define_op = op->operand_source(0).GetDefiningOp(); if (define_op->name() == "pd.full_int_array") { auto shape = define_op->attributes() .at("value") .dyn_cast() .data() .GetData(); size_t numel = 1; for (auto& s : shape) { numel *= s; } if (numel > init_on_gpu_threashold) { kernel_backend = phi::Backend::GPU; } } } if (kernel_backend == phi::Backend::UNDEFINED) { kernel_backend = paddle::experimental::ParseBackend(place); } } } if (op->num_operands() > 0) { paddle::experimental::detail::KernelKeyParser kernel_key_parser; for (size_t i = 0; i < op->num_operands(); ++i) { // NOTE, only op with OpYamlInfo can have TensorArr if (op_info_parser != nullptr && op_info_parser->IsTensorAttribute(i)) { continue; } auto input_tmp = op->operand_source(i); // NOTE: if not input_tmp, it's an optional input if (!input_tmp) { continue; } auto new_input_tmp = map_value_pair.at(input_tmp); auto input_type = new_input_tmp.type(); dialect::AllocatedDenseTensorType type; if (input_type.isa()) { type = input_type.dyn_cast(); } else if (input_type.isa()) { if (!input_type.dyn_cast().empty()) { type = input_type.dyn_cast()[0] .dyn_cast(); } else { continue; } } // fake tensor here auto ptr = new phi::Allocation(nullptr, 0, type.place()); std::shared_ptr holder(ptr); auto dtype = TransToPhiDataType(type.dtype()); phi::DenseTensorMeta meta( dtype, type.dims(), type.data_layout(), type.lod(), type.offset()); phi::DenseTensor fake_tensor(holder, meta); kernel_key_parser.AssignKernelKeySet(fake_tensor); } auto kernel_key_set = kernel_key_parser.key_set; auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); if (kernel_backend == phi::Backend::UNDEFINED) { kernel_backend = kernel_key.backend(); } if (kernel_layout == phi::DataLayout::UNDEFINED) { kernel_layout = kernel_key.layout(); } if (kernel_data_type == phi::DataType::UNDEFINED) { kernel_data_type = kernel_key.dtype(); } } if (kernel_backend == phi::Backend::UNDEFINED) { kernel_backend = paddle::experimental::ParseBackend(place); } phi::KernelKey res(kernel_backend, kernel_layout, kernel_data_type); return res; } std::unique_ptr PdOpLowerToKernelPass(ir::Program* prog, phi::Place place) { auto program = std::make_unique(ir::IrContext::Instance()); auto block = prog->block(); ir::IrContext* ctx = ir::IrContext::Instance(); ctx->GetOrRegisterDialect(); ctx->GetOrRegisterDialect(); std::unordered_map map_op_pair; std::unordered_map map_value_pair; std::string phi_kernel_op_name = paddle::dialect::PhiKernelOp::name(); ir::OpInfo phi_kernel_op_info = ctx->GetRegisteredOpInfo(phi_kernel_op_name); for (auto op_item : *block) { VLOG(6) << "op name " << op_item->name(); if (op_item->name() == "builtin.combine") { std::vector out_places; // Copy op inputs std::vector vec_inputs; if (op_item->num_operands() > 0) { for (size_t i = 0; i < op_item->num_operands(); ++i) { auto cur_in = op_item->operand_source(i); if (!cur_in) { vec_inputs.emplace_back(); continue; } PADDLE_ENFORCE_EQ(map_value_pair.count(cur_in), true, phi::errors::PreconditionNotMet( "[%d]'s input of [%s] op MUST in map pair", i, op_item->name())); auto new_in = map_value_pair.at(cur_in); vec_inputs.push_back(new_in); if (new_in.type().isa()) { out_places.push_back( new_in.type() .dyn_cast() .place()); } else { PADDLE_THROW(phi::errors::Unimplemented( "only support dense tensor type for now")); } } } // Copy op output type std::vector op_output_types; if (op_item->num_results() > 0) { for (size_t i = 0; i < op_item->num_results(); ++i) { auto result_type = op_item->result(i).type(); if (!result_type) { op_output_types.push_back(result_type); } else if (result_type.isa()) { std::vector vec_inner_types; auto base_types = result_type.dyn_cast().data(); for (size_t idx = 0; idx < base_types.size(); idx++) { auto& base_type = base_types[idx]; if (base_type) { if (base_type.isa()) { auto allocated_dense_tensor_dtype = paddle::dialect::AllocatedDenseTensorType::get( ctx, out_places[idx], base_type.dyn_cast()); vec_inner_types.push_back(allocated_dense_tensor_dtype); } else { PADDLE_THROW(phi::errors::Unimplemented( "only support dense tensor in vector type for now")); } } else { // NOTE(phlrain), kernel not support a nullptr in output ir::Type fp32_dtype = ir::Float32Type::get(ctx); phi::DDim dims = {}; phi::DataLayout data_layout = phi::DataLayout::NCHW; phi::LoD lod = {{}}; size_t offset = 0; auto dense_tensor_dtype = paddle::dialect::DenseTensorType::get( ctx, fp32_dtype, dims, data_layout, lod, offset); vec_inner_types.push_back(dense_tensor_dtype); } } ir::Type t1 = ir::VectorType::get(ctx, vec_inner_types); op_output_types.push_back(t1); } else { PADDLE_THROW(phi::errors::Unimplemented( "builtin.combine Result type only support " "VectorType")); } } } // Get op info ir::OpInfo op_info = ctx->GetRegisteredOpInfo(op_item->name()); // Generate new op ir::Operation* op = ir::Operation::Create( vec_inputs, op_item->attributes(), op_output_types, op_info); program->block()->push_back(op); map_op_pair[op_item] = op; // only deal with single output if (op_item->num_results() > 0) { for (size_t i = 0; i < op_item->num_results(); ++i) { map_value_pair[op_item->result(i)] = op->result(i); } } VLOG(6) << "Deep copy a new builtin op: " << op_item->name(); continue; } if (op_item->name() == "builtin.slice") { phi::Place out_place = place; // Copy op inputs std::vector vec_inputs; if (op_item->num_operands() > 0) { for (size_t i = 0; i < op_item->num_operands(); ++i) { auto cur_in = op_item->operand_source(i); if (!cur_in) { vec_inputs.emplace_back(); continue; } PADDLE_ENFORCE_EQ(map_value_pair.count(cur_in), true, phi::errors::PreconditionNotMet( "[%d]'s input of [%s] op MUST in map pair", i, op_item->name())); auto new_in = map_value_pair.at(cur_in); vec_inputs.push_back(new_in); if (new_in.type().isa()) { auto vec_types = new_in.type().dyn_cast().data(); out_place = vec_types[op_item->attributes() .at("index") .dyn_cast() .data()] .dyn_cast() .place(); } else { PADDLE_THROW( phi::errors::Unimplemented("only support vector type for now")); } } } // Copy op output type std::vector op_output_types; if (op_item->num_results() > 0) { for (size_t i = 0; i < op_item->num_results(); ++i) { auto result_type = op_item->result(i).type(); if (!result_type) { op_output_types.push_back(result_type); } else if (result_type.isa()) { auto allocated_dense_tensor_dtype = paddle::dialect::AllocatedDenseTensorType::get( ctx, out_place, result_type.dyn_cast()); op_output_types.push_back(allocated_dense_tensor_dtype); } else { PADDLE_THROW(phi::errors::Unimplemented( "builtin.combine Result type only support DenseTensorType")); } } } // Get op info ir::OpInfo op_info = ctx->GetRegisteredOpInfo(op_item->name()); // Generate new op ir::Operation* op = ir::Operation::Create( vec_inputs, op_item->attributes(), op_output_types, op_info); program->block()->push_back(op); map_op_pair[op_item] = op; // only deal with single output if (op_item->num_results() > 0) { for (size_t i = 0; i < op_item->num_results(); ++i) { map_value_pair[op_item->result(i)] = op->result(i); } } VLOG(6) << "Deep copy a new builtin op: " << op_item->name(); continue; } // Lower from PaddleDialect to KernelDialect paddle::dialect::OpYamlInfoInterface op_info_interface = op_item->dyn_cast(); std::unique_ptr op_info_parser(nullptr); if (op_info_interface) { op_info_parser = std::make_unique(op_info_interface.GetOpInfo()); } std::string kernel_fn_str; if (op_info_parser != nullptr) { kernel_fn_str = op_info_parser->OpRuntimeInfo().kernel_func[0]; } auto kernel_key = GetKernelKey(op_item, place, map_value_pair, op_info_parser.get()); VLOG(6) << "kernel type " << kernel_key; if (op_item->name() == "pd.load_combine") { kernel_key.set_dtype(phi::DataType::FLOAT32); } if (NeedFallBackCpu((op_item), kernel_fn_str, kernel_key)) { kernel_key.set_backend(phi::Backend::CPU); } if (NeedFallBackFromGPUDNN2GPU(op_item, kernel_key)) { kernel_key.set_backend(phi::Backend::GPU); } // only for single output // need update new kernel key layout and data tyep std::vector op_output_types; if (op_item->num_results() > 0) { auto phi_kernel = phi::KernelFactory::Instance().SelectKernelWithGPUDNN( kernel_fn_str, kernel_key); auto args_def = phi_kernel.args_def(); auto output_defs = args_def.output_defs(); if (!UnchangeOutputOps.count(op_item->name()) && !IsLegacyOp(op_item->name())) { PADDLE_ENFORCE_EQ( op_item->num_results(), output_defs.size(), phi::errors::PreconditionNotMet( "op [%s] kernel output args defs should equal op outputs", op_item->name())); } for (size_t i = 0; i < op_item->num_results(); ++i) { phi::Place out_place; if ((!UnchangeOutputOps.count(op_item->name())) && (!IsLegacyOp(op_item->name())) && phi_kernel.IsValid()) { out_place = phi::TransToPhiPlace(output_defs[i].backend); } else { out_place = phi::TransToPhiPlace(kernel_key.backend()); } auto result_type = op_item->result(i).type(); if (!result_type) { op_output_types.push_back(result_type); } else if (result_type.isa()) { auto allocated_dense_tensor_dtype = paddle::dialect::AllocatedDenseTensorType::get( ctx, out_place, result_type.dyn_cast()); op_output_types.push_back(allocated_dense_tensor_dtype); } else if (result_type.isa()) { std::vector vec_inner_types; auto base_types = result_type.dyn_cast().data(); for (auto& base_type : base_types) { if (base_type) { if (base_type.isa()) { auto allocated_dense_tensor_dtype = paddle::dialect::AllocatedDenseTensorType::get( ctx, out_place, base_type.dyn_cast()); vec_inner_types.push_back(allocated_dense_tensor_dtype); } else { PADDLE_THROW(phi::errors::Unimplemented( "only support dense tensor in vector type for now")); } } else { // NOTE(phlrain), kernel not support a nullptr in output ir::Type fp32_dtype = ir::Float32Type::get(ctx); phi::DDim dims = {}; phi::DataLayout data_layout = phi::DataLayout::NCHW; phi::LoD lod = {{}}; size_t offset = 0; auto dense_tensor_dtype = paddle::dialect::DenseTensorType::get( ctx, fp32_dtype, dims, data_layout, lod, offset); auto allocated_dense_tensor_dtype = paddle::dialect::AllocatedDenseTensorType::get( ctx, out_place, dense_tensor_dtype); vec_inner_types.push_back(allocated_dense_tensor_dtype); } } ir::Type t1 = ir::VectorType::get(ctx, vec_inner_types); op_output_types.push_back(t1); } else if (result_type.isa()) { auto allocated_selected_rows_dtype = paddle::dialect::AllocatedSelectedRowsType::get( ctx, out_place, result_type.dyn_cast()); op_output_types.emplace_back(allocated_selected_rows_dtype); } else { PADDLE_THROW(phi::errors::Unimplemented( "Result type only support DenseTensorType and VectorType")); } } } // constuct input std::vector vec_inputs; if (op_item->num_operands() > 0) { for (size_t i = 0; i < op_item->num_operands(); ++i) { auto cur_in = op_item->operand_source(i); if (!cur_in) { vec_inputs.emplace_back(); continue; } PADDLE_ENFORCE_EQ(map_value_pair.count(cur_in), true, phi::errors::PreconditionNotMet( "[%d]'s input of [%s] op MUST in map pair", i, op_item->name())); auto new_in = map_value_pair.at(cur_in); auto new_in_type = new_in.type(); auto& kernel = phi::KernelFactory::Instance().SelectKernelWithGPUDNN( kernel_fn_str, kernel_key); if (kernel.IsValid() && (!UnchangeOutputOps.count(op_item->name()))) { if (new_in_type.isa()) { // allocated type auto place = new_in_type.dyn_cast() .place(); // get input args def type auto args_def = kernel.args_def(); auto input_defs = args_def.input_defs(); bool need_trans = (place.GetType() != phi::AllocationType::UNDEFINED) && (op_info_parser != nullptr && !op_info_parser->IsTensorAttribute(i)) && (paddle::experimental::NeedTransformPlace( place, kernel.InputAt(i).backend, {})); if (need_trans) { VLOG(6) << "need trans from " << place << " to " << kernel_key.backend(); // build memcopy op new_in = AddPlaceTransferOp( new_in, new_in_type, place, phi::TransToPhiPlace(kernel.InputAt(i).backend), kernel_key, program.get()); } } else if (new_in_type.isa()) { // [ todo need update here, support combine data transfomer] } else if (new_in_type.isa()) { // do nothing here } else { PADDLE_THROW(phi::errors::Unimplemented( "only support allocated dense tensor type for now")); } } vec_inputs.push_back(new_in); } } std::unordered_map op_attribute{ {"op_name", ir::StrAttribute::get(ctx, op_item->name())}, {"kernel_name", ir::StrAttribute::get(ctx, kernel_fn_str)}, {"kernel_key", dialect::KernelAttribute::get(ctx, kernel_key)}}; auto op_attr_map = op_item->attributes(); for (auto& map_item : op_attr_map) { op_attribute.emplace(map_item.first, map_item.second); } if (op_item->HasTrait()) { op_attribute.emplace("is_inplace", ir::BoolAttribute::get(ctx, true)); } ir::Operation* op = ir::Operation::Create( vec_inputs, op_attribute, op_output_types, phi_kernel_op_info); map_op_pair[op_item] = op; // only deal with single output if (op_item->num_results() > 0) { for (size_t i = 0; i < op_item->num_results(); ++i) { map_value_pair[op_item->result(i)] = op->result(i); } } program->block()->push_back(op); if (op_item->name() == "pd.feed" && platform::is_gpu_place(place)) { // add shadow feed op phi::KernelKey shadow_key{ phi::Backend::GPU, phi::DataLayout::ANY, TransToPhiDataType( op_item->result(0).type().dyn_cast().dtype())}; std::unordered_map attr_map{ {"op_name", ir::StrAttribute::get(ctx, "pd.shadow_feed")}, {"kernel_name", ir::StrAttribute::get(ctx, "shadow_feed")}, {"kernel_key", dialect::KernelAttribute::get(ctx, shadow_key)}}; auto out_type = paddle::dialect::AllocatedDenseTensorType::get( ctx, phi::TransToPhiPlace(shadow_key.backend()), op_item->result(0).type().dyn_cast()); ir::Operation* shadow_op = ir::Operation::Create( {op->result(0)}, attr_map, {out_type}, phi_kernel_op_info); map_op_pair[op_item] = shadow_op; program->block()->push_back(shadow_op); if (op_item->num_results() > 0) { for (size_t i = 0; i < shadow_op->num_results(); ++i) { map_value_pair[op_item->result(i)] = shadow_op->result(i); } } } } return program; } } // namespace dialect } // namespace paddle