Unverified commit dd1379e4 · authored by hong · committed by GitHub

[NewIR] New IR AOT placement refactor (#55810)

* refactor AOT

* update

* fix bugs

* remove some test

* fix bug

* fix bug

* fix bug

* fix bug

* update
Parent 39b59603
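At a high level, the patch replaces "every output lives on the kernel key's backend" with a per-output rule: each result takes its place from the selected kernel's output defs, and Backend::UNDEFINED means "resolve the place from the inputs at run time". A minimal sketch of that rule, distilled from the pass changes below (ChooseOutPlace is a hypothetical helper, not part of the patch):

phi::Place ChooseOutPlace(const phi::Kernel& kernel,
                          size_t i,
                          const phi::KernelKey& key,
                          bool unchanged_output_op) {
  // Prefer the kernel's per-output backend; fall back to the kernel key's
  // backend when the kernel is invalid or the op's outputs must not change.
  if (!unchanged_output_op && kernel.IsValid()) {
    return phi::TransToPhiPlace(kernel.args_def().output_defs()[i].backend);
  }
  return phi::TransToPhiPlace(key.backend());
}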
@@ -387,7 +387,7 @@ std::unique_ptr<::ir::Program> ConstructFowardIrProgram(
}
for (auto &param : params) {
auto name = param.name();
auto &name = param.name();
auto place = param.place().GetType();
auto op_desc = local_program.MutableBlock(0)->PrependOp();
......
@@ -424,6 +424,10 @@ void FakeInitializeOutputsForFunctionKernel(
if (beta1_pow->place() == beta2_pow->place()) {
backend = phi::TransToPhiBackend(beta1_pow->place());
}
} else if (op_type == "reshape2") {
phi::TensorBase* x =
GetTensorFormVar(runtime_ctx.inputs.find("X")->second.at(0));
backend = phi::TransToPhiBackend(x->place());
} else {
PADDLE_THROW(phi::errors::Unimplemented(
"Unsupported UNDEFINED backend for op: %s, parameter: %s",
......
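This reshape2 branch pairs with the kernel-registration change at the bottom of the patch, where reshape's second output is registered with phi::Backend::UNDEFINED; when fake initialization meets that UNDEFINED backend, the output's place is recovered from input X instead of raising the Unimplemented error below.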
@@ -49,11 +49,120 @@ std::unordered_map<std::string, phi::DataType> Str2PhiDataType = {
{"DataType::BOOL", phi::DataType::BOOL},
};
const std::unordered_set<std::string> UnchangeOutputOps = {
"pd.feed_with_place",
"builtin.combine",
"builtin.slice",
"pd.feed",
"pd.fetch",
"builtin.set_parameter",
"builtin.get_parameter",
"pd.shadow_output"};
bool NeedFallBackCpu(const ir::Operation* op,
const std::string& kernel_fn_name,
const phi::KernelKey& kernel_key) {
if (UnchangeOutputOps.count(op->name())) {
return false;
}
if (kernel_fn_name == "") {
return false;
}
if (phi::KernelFactory::Instance().HasKernel(kernel_fn_name, kernel_key)) {
return false;
}
phi::KernelKey copy_kernel_key = kernel_key;
if (copy_kernel_key.backend() == phi::Backend::GPUDNN) {
copy_kernel_key.set_backend(phi::Backend::GPU);
if (phi::KernelFactory::Instance().HasKernel(kernel_fn_name,
copy_kernel_key)) {
return false;
}
}
copy_kernel_key.set_backend(phi::Backend::CPU);
if (phi::KernelFactory::Instance().HasKernel(kernel_fn_name,
copy_kernel_key)) {
return true;
}
return false;
}
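A hedged usage sketch of NeedFallBackCpu, mirroring its call site later in the pass (op and kernel_fn_str are assumed to be the current operation and its kernel name):

// If the kernel is missing under the current GPU/GPUDNN key but a CPU
// registration exists, retarget the key to CPU.
phi::KernelKey key(phi::Backend::GPU, phi::DataLayout::ANY, phi::DataType::FLOAT32);
if (NeedFallBackCpu(op, kernel_fn_str, key)) {
  key.set_backend(phi::Backend::CPU);
}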
bool NeedFallBackFromGPUDNN2GPU(ir::Operation* op,
const phi::KernelKey kernel_key) {
// NOTE(phlrain): keep the same kernel selection strategy as
// GetExepectKernelKey: cuDNN pooling has no adaptive mode, so adaptive
// pool2d/pool2d_grad must fall back from GPUDNN to the plain GPU kernel
if (op->name() == "pd.pool2d" || op->name() == "pd.pool2d_grad") {
if (kernel_key.backend() == phi::Backend::GPUDNN &&
(op->attributes().at("adaptive").dyn_cast<ir::BoolAttribute>().data() ==
true)) {
return true;
}
}
return false;
}
ir::OpResult AddPlaceTransferOp(ir::OpResult in,
ir::Type out_type,
const phi::Place& src_place,
const phi::Place& dst_place,
const phi::KernelKey& kernel_key,
ir::Program* program) {
ir::IrContext* ctx = ir::IrContext::Instance();
std::string op_name = paddle::dialect::PhiKernelOp::name();
ir::OpInfo op_info = ctx->GetRegisteredOpInfo(op_name);
if ((src_place.GetType() == phi::AllocationType::CPU) &&
(dst_place.GetType() == phi::AllocationType::GPU)) {
auto copy_kernel_key = kernel_key;
copy_kernel_key.set_backend(phi::Backend::GPU);
std::unordered_map<std::string, ir::Attribute> op_attribute{
{"op_name", ir::StrAttribute::get(ctx, "pd.memcpy_h2d")},
{"kernel_name", ir::StrAttribute::get(ctx, "memcpy_h2d")},
{"kernel_key", dialect::KernelAttribute::get(ctx, copy_kernel_key)},
{"dst_place_type", ir::Int32Attribute::get(ctx, 1)}};
ir::Operation* op =
ir::Operation::Create({in}, op_attribute, {out_type}, op_info);
program->block()->push_back(op);
auto new_in = op->result(0);
return new_in;
} else if ((src_place.GetType() == phi::AllocationType::GPU) &&
(dst_place.GetType() == phi::AllocationType::CPU)) {
auto copy_kernel_key = kernel_key;
copy_kernel_key.set_backend(phi::Backend::GPU);
std::unordered_map<std::string, ir::Attribute> op_attribute{
{"op_name", ir::StrAttribute::get(ctx, "pd.memcpy_d2h")},
{"kernel_name", ir::StrAttribute::get(ctx, "memcpy_d2h")},
{"kernel_key", dialect::KernelAttribute::get(ctx, copy_kernel_key)},
{"dst_place_type", ir::Int32Attribute::get(ctx, 0)}};
ir::Operation* op =
ir::Operation::Create({in}, op_attribute, {out_type}, op_info);
program->block()->push_back(op);
auto new_in = op->result(0);
return new_in;
} else {
PADDLE_THROW(
phi::errors::Unimplemented("Only support cpu to gpu and gpu to cpu"));
}
}
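A hedged usage sketch of AddPlaceTransferOp, assuming cpu_in is a CPU-placed ir::OpResult, out_type is the matching GPU-allocated tensor type, and program is the kernel program being built:

// Insert a pd.memcpy_h2d op so a GPU kernel can consume a CPU-resident value.
ir::OpResult gpu_in = AddPlaceTransferOp(cpu_in,
                                         out_type,
                                         phi::CPUPlace(),
                                         phi::GPUPlace(),
                                         kernel_key,
                                         program);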
phi::KernelKey GetKernelKey(
ir::Operation* op,
const phi::Place& place,
const std::unordered_map<ir::Value, ir::OpResult>& map_value_pair,
std::unique_ptr<dialect::OpYamlInfoParser> op_info_parser = nullptr) {
dialect::OpYamlInfoParser* op_info_parser = nullptr) {
if (op->name() == "pd.feed") {
// NOTE: for now the feed op doesn't need a kernel, so the data type comes
// from the OpResult; the next op uses the base program's data type
@@ -70,6 +179,7 @@ phi::KernelKey GetKernelKey(
op->attributes().at("place").dyn_cast<dialect::PlaceAttribute>().data();
auto backend = paddle::experimental::ParseBackend(t);
return {backend,
phi::DataLayout::ANY,
TransToPhiDataType(
@@ -258,7 +368,7 @@ std::unique_ptr<ir::Program> PdOpLowerToKernelPass(ir::Program* prog,
VLOG(6) << "op name " << op_item->name();
paddle::dialect::OpYamlInfoInterface op_info_interface =
op_item->dyn_cast<paddle::dialect::OpYamlInfoInterface>();
std::unique_ptr<OpYamlInfoParser> op_info_parser;
std::unique_ptr<OpYamlInfoParser> op_info_parser(nullptr);
if (op_info_interface) {
op_info_parser =
std::make_unique<OpYamlInfoParser>(op_info_interface.GetOpInfo());
@@ -270,13 +380,45 @@ std::unique_ptr<ir::Program> PdOpLowerToKernelPass(ir::Program* prog,
}
auto kernel_key =
GetKernelKey(op_item, place, map_value_pair, std::move(op_info_parser));
GetKernelKey(op_item, place, map_value_pair, op_info_parser.get());
VLOG(6) << "kernel type " << kernel_key;
if (NeedFallBackCpu((op_item), kernel_fn_str, kernel_key)) {
kernel_key.set_backend(phi::Backend::CPU);
}
if (NeedFallBackFromGPUDNN2GPU(op_item, kernel_key)) {
kernel_key.set_backend(phi::Backend::GPU);
}
// only for single output
// need to update the new kernel key's layout and data type
std::vector<ir::Type> op_output_types;
if (op_item->num_results() > 0) {
auto phi_kernel = phi::KernelFactory::Instance().SelectKernelWithGPUDNN(
kernel_fn_str, kernel_key);
auto args_def = phi_kernel.args_def();
auto output_defs = args_def.output_defs();
if (!UnchangeOutputOps.count(op_item->name())) {
PADDLE_ENFORCE_EQ(
op_item->num_results(),
output_defs.size(),
phi::errors::PreconditionNotMet(
"op [%s] kernel output args defs should equal op outputs",
op_item->name()));
}
for (size_t i = 0; i < op_item->num_results(); ++i) {
phi::Place out_place;
if ((!UnchangeOutputOps.count(op_item->name())) &&
phi_kernel.IsValid()) {
out_place = phi::TransToPhiPlace(output_defs[i].backend);
} else {
out_place = phi::TransToPhiPlace(kernel_key.backend());
}
auto result_type = op_item->result(i).type();
if (!result_type) {
op_output_types.push_back(result_type);
@@ -284,7 +426,7 @@ std::unique_ptr<ir::Program> PdOpLowerToKernelPass(ir::Program* prog,
auto allocated_dense_tensor_dtype =
paddle::dialect::AllocatedDenseTensorType::get(
ctx,
phi::TransToPhiPlace(kernel_key.backend()),
out_place,
result_type.dyn_cast<dialect::DenseTensorType>());
op_output_types.push_back(allocated_dense_tensor_dtype);
} else if (result_type.isa<ir::VectorType>()) {
@@ -296,7 +438,7 @@ std::unique_ptr<ir::Program> PdOpLowerToKernelPass(ir::Program* prog,
auto allocated_dense_tensor_dtype =
paddle::dialect::AllocatedDenseTensorType::get(
ctx,
phi::TransToPhiPlace(kernel_key.backend()),
out_place,
base_type.dyn_cast<dialect::DenseTensorType>());
vec_inner_types.push_back(allocated_dense_tensor_dtype);
} else {
@@ -314,9 +456,7 @@ std::unique_ptr<ir::Program> PdOpLowerToKernelPass(ir::Program* prog,
ctx, fp32_dtype, dims, data_layout, lod, offset);
auto allocated_dense_tensor_dtype =
paddle::dialect::AllocatedDenseTensorType::get(
ctx,
phi::TransToPhiPlace(kernel_key.backend()),
dense_tensor_dtype);
ctx, out_place, dense_tensor_dtype);
vec_inner_types.push_back(allocated_dense_tensor_dtype);
}
}
@@ -327,7 +467,7 @@ std::unique_ptr<ir::Program> PdOpLowerToKernelPass(ir::Program* prog,
auto allocated_selected_rows_dtype =
paddle::dialect::AllocatedSelectedRowsType::get(
ctx,
phi::TransToPhiPlace(kernel_key.backend()),
out_place,
result_type.dyn_cast<dialect::SelectedRowsType>());
op_output_types.emplace_back(allocated_selected_rows_dtype);
} else {
@@ -360,39 +500,34 @@ std::unique_ptr<ir::Program> PdOpLowerToKernelPass(ir::Program* prog,
auto& kernel = phi::KernelFactory::Instance().SelectKernelWithGPUDNN(
kernel_fn_str, kernel_key);
if (kernel.IsValid()) {
if (kernel.IsValid() && (!UnchangeOutputOps.count(op_item->name()))) {
if (new_in_type.isa<dialect::AllocatedDenseTensorType>()) {
// allocated type
auto place =
new_in_type.dyn_cast<dialect::AllocatedDenseTensorType>()
.place();
// get input args def type
auto args_def = kernel.args_def();
auto input_defs = args_def.input_defs();
bool need_trans =
(place.GetType() != phi::AllocationType::UNDEFINED) &&
(op_info_parser != nullptr &&
!op_info_parser->IsTensorAttribute(i)) &&
(place != phi::TransToPhiPlace(kernel_key.backend()));
(paddle::experimental::NeedTransformPlace(
place, kernel.InputAt(i).backend, {}));
if (need_trans) {
if (paddle::experimental::NeedTransformPlace(
place, kernel.InputAt(i).backend, {})) {
VLOG(6) << "need trans from " << place << " to "
<< kernel_key.backend();
// build memcpy op
auto copy_kernel_key = kernel_key;
copy_kernel_key.set_backend(phi::Backend::GPU);
std::unordered_map<std::string, ir::Attribute> op_attribute{
{"op_name", ir::StrAttribute::get(ctx, "pd.memcpy_h2d")},
{"kernel_name", ir::StrAttribute::get(ctx, "memcpy_h2d")},
{"kernel_key",
dialect::KernelAttribute::get(ctx, copy_kernel_key)},
{"dst_place_type", ir::Int32Attribute::get(ctx, 1)}};
ir::Operation* op = ir::Operation::Create(
{new_in}, op_attribute, {new_in_type}, op_info);
program->block()->push_back(op);
new_in = op_item->result(0);
}
VLOG(6) << "need trans from " << place << " to "
<< kernel_key.backend();
// build memcpy op
new_in = AddPlaceTransferOp(
new_in,
new_in_type,
place,
phi::TransToPhiPlace(kernel.InputAt(i).backend),
kernel_key,
program.get());
}
} else if (new_in_type.isa<ir::VectorType>()) {
// [TODO: need update here, support combine data transformer]
......
@@ -39,6 +39,8 @@ Backend TransToPhiBackend(const phi::Place& place) {
return Backend::XPU;
case AllocationType::IPU:
return Backend::IPU;
case AllocationType::UNDEFINED:
return Backend::UNDEFINED;
case AllocationType::CUSTOM:
return static_cast<Backend>(
static_cast<size_t>(Backend::NUM_BACKENDS) +
@@ -57,6 +59,8 @@ phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id) {
switch (backend) {
case phi::Backend::CPU:
return phi::CPUPlace();
case phi::Backend::UNDEFINED:
return phi::Place();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
case phi::Backend::GPU:
return phi::GPUPlace(
......
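With the two new branches, UNDEFINED now round-trips cleanly between Backend and Place:

phi::Place p = phi::TransToPhiPlace(phi::Backend::UNDEFINED);  // default phi::Place()
phi::Backend b = phi::TransToPhiBackend(p);                    // phi::Backend::UNDEFINED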
@@ -95,4 +95,6 @@ PD_REGISTER_KERNEL_FOR_ALL_BACKEND_DTYPE(reshape_infer,
phi::ReshapeInferKernel) {}
PD_REGISTER_KERNEL_FOR_ALL_BACKEND_DTYPE(reshape,
ALL_LAYOUT,
phi::ReshapeKernel) {}
phi::ReshapeKernel) {
kernel->OutputAt(1).SetBackend(phi::Backend::UNDEFINED);
}
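reshape's second output is the auxiliary XShape tensor, which records only the input's dims for the backward pass and holds no real buffer, so its backend is left UNDEFINED here; the reshape2 branch added above in FakeInitializeOutputsForFunctionKernel then derives its place from input X.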
@@ -175,13 +175,7 @@ class TestNewIrDygraph(unittest.TestCase):
z = func(x, y)
gold_res = np.ones([2, 2], dtype="float32") * 2
self.assertEqual(
np.array_equal(
z.numpy(),
gold_res,
),
True,
)
np.testing.assert_array_equal(z.numpy(), gold_res)
class TestNewIrBackwardDygraph(unittest.TestCase):
@@ -202,13 +196,35 @@ class TestNewIrBackwardDygraph(unittest.TestCase):
loss = z.mean()
loss.backward()
gold_res = np.ones([2, 2], dtype="float32")
self.assertEqual(
np.array_equal(
z.numpy(),
gold_res,
),
True,
)
np.testing.assert_array_equal(z.numpy(), gold_res)
gold_res = np.ones([2, 2], dtype="float32") * 0.25
np.testing.assert_array_equal(x.gradient(), gold_res)
np.testing.assert_array_equal(y.gradient(), gold_res)
class TestNewIrReshapeBackwardDygraph(unittest.TestCase):
def test_with_new_ir(self):
paddle.disable_static()
build_strategy = paddle.static.BuildStrategy()
build_strategy.enable_inplace = False
@paddle.jit.to_static(build_strategy=build_strategy)
def func(x, y):
x = x.reshape([-1, 2, 2])
y = y.reshape([-1, 2, 2])
return x * y
x = paddle.ones([2, 2], dtype='float32')
y = paddle.ones([2, 2], dtype='float32')
x.stop_gradient = False
y.stop_gradient = False
z = func(x, y)
loss = z.mean()
loss.backward()
gold_res = np.ones([1, 2, 2], dtype="float32")
np.testing.assert_array_equal(z.numpy(), gold_res)
gold_res = np.ones([2, 2], dtype="float32") * 0.25
np.testing.assert_array_equal(x.gradient(), gold_res)
......
@@ -836,4 +836,5 @@ class TestImperativeOptimizerList(unittest.TestCase):
if __name__ == '__main__':
paddle.enable_static()
unittest.main()