Unverified commit d520029f, authored by zyfncg, committed by GitHub

Merge prepare_data and prepare_phi_data in static graph (#43878)

* merge prepare_data and prepare_phi_data in static graph

* fix bug

* fix data transform bug

* fix bug

* fix unittest of cinn
Parent: 9c32099d
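For orientation before the diff: the core of this change is that OperatorWithKernel::PrepareData now serves both kernel paths. The per-input body is factored into a prepare_input_data lambda, the phi-kernel path walks the kernel signature's input_defs and hands each phi::TensorArgDef to that lambda, and the separate PreparePhiData pass is deleted. The code below is a minimal, self-contained sketch of that dispatch pattern only, not Paddle's real API; every type and name in it (Backend, TensorArgDef, Variable, PrepareInputs) is a simplified stand-in.

// Minimal stand-alone sketch of the merged data-preparation flow.
// NOTE: every type and name here is a simplified stand-in, not Paddle's API.
#include <iostream>
#include <map>
#include <string>
#include <vector>

enum class Backend { kAllBackend, kCPU, kGPU };

struct TensorArgDef {            // stand-in for phi::TensorArgDef
  Backend backend = Backend::kAllBackend;
};

struct Variable {                // stand-in for framework::Variable
  std::string name;
  Backend place = Backend::kCPU;
};

// One routine prepares every input. The phi path passes a per-input def so
// the target backend comes from the kernel's argument definition; the fluid
// path passes nullptr and leaves the variable where it already lives.
void PrepareInputs(std::map<std::string, std::vector<Variable*>>* inputs,
                   const std::map<std::string, TensorArgDef>* phi_input_defs) {
  auto prepare_input_data = [](const std::string& in_name,
                               std::vector<Variable*>* in_vars,
                               const TensorArgDef* in_def) {
    for (Variable* var : *in_vars) {
      Backend target = var->place;
      if (in_def != nullptr && in_def->backend != Backend::kAllBackend) {
        target = in_def->backend;  // the phi kernel asked for a concrete backend
      }
      if (target != var->place) {
        // Stands in for TransformData + creating the var in a transfer scope.
        std::cout << "transfer " << in_name << "/" << var->name << "\n";
        var->place = target;
      }
    }
  };

  if (phi_input_defs != nullptr) {    // corresponds to the run_phi_kernel_ path
    for (const auto& kv : *phi_input_defs) {
      auto iter = inputs->find(kv.first);
      if (iter == inputs->end()) continue;
      prepare_input_data(kv.first, &iter->second, &kv.second);
    }
  } else {                            // legacy fluid-kernel path
    for (auto& kv : *inputs) {
      prepare_input_data(kv.first, &kv.second, nullptr);
    }
  }
}

int main() {
  Variable x{"x", Backend::kCPU};
  std::map<std::string, std::vector<Variable*>> inputs{{"X", {&x}}};
  std::map<std::string, TensorArgDef> defs{{"X", {Backend::kGPU}}};
  PrepareInputs(&inputs, &defs);    // phi path: "X" is moved to the GPU backend
  PrepareInputs(&inputs, nullptr);  // fluid path: nothing to transfer
  return 0;
}

In the real diff, the lambda corresponds to prepare_input_data inside PrepareData, the def-aware branch to the new_expected_kernel_key logic, and the nullptr branch to the original fluid-kernel loop.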
...@@ -1622,9 +1622,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope, ...@@ -1622,9 +1622,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
platform::EventRole::kInnerOp); platform::EventRole::kInnerOp);
if (run_phi_kernel_) { if (run_phi_kernel_) {
phi::KernelContext pt_kernel_context; phi::KernelContext pt_kernel_context;
// Do data transform before building KernelContext
// TODO(zhiqiu): support TransferInplaceVarsBack
PreparePhiData(exec_scope, *pt_kernel_, *kernel_signature_, runtime_ctx);
if (enable_cache_runtime_context_ && !need_prepare_phi_data_ && if (enable_cache_runtime_context_ && !need_prepare_phi_data_ &&
!need_prepare_data_) { !need_prepare_data_) {
impl_ = impl_ =
@@ -2007,15 +2004,15 @@ Scope* OperatorWithKernel::PrepareData(
     }
   }
-  for (auto& var_name_item : Inputs()) {
-    bool should_skip_input =
-        no_buffer_ins && no_buffer_ins->count(var_name_item.first) > 0;
-    std::vector<Variable*>& input_vars = ctx->inputs[var_name_item.first];
-    for (size_t i = 0; i < var_name_item.second.size(); ++i) {
-      auto& var_name = var_name_item.second[i];
-      auto* var = input_vars[i];
+  const auto& name_map = Inputs();
+  auto prepare_input_data = [&](const std::string& in_name,
+                                std::vector<Variable*>* in_vars,
+                                const phi::TensorArgDef* in_def,
+                                bool should_skip_input) -> void {
+    auto& name_vec = name_map.at(in_name);
+    for (size_t i = 0; i < in_vars->size(); ++i) {
+      const auto& var_name = name_vec[i];
+      auto* var = in_vars->at(i);
       // Only tensor can be tranfer to another device.
       if (var == nullptr || !VarIsTensor(*var)) {
@@ -2046,17 +2043,17 @@ Scope* OperatorWithKernel::PrepareData(
           new_scope = &scope.NewScope();
         }
         auto* trans_var = new_scope->Var(var_name);
-        input_vars[i] = trans_var;
+        in_vars->at(i) = trans_var;
         auto out = trans_var->GetMutable<LoDTensor>();
         out->Resize(tensor_in->dims());
         platform::MatchShapeToLayout(
             out, tensor_in->layout(), DataLayout::kNHWC);
         VLOG(7) << "Created reshaped dummy input based on MKL-DNN Tensor , "
                    "but kNHWC layout"
-                << var_name_item.first << " in Operator " << type_;
+                << in_name << " in Operator " << type_;
       } else {
-        VLOG(7) << "Skip scanning input " << var_name_item.first
-                << " in Operator " << type_;
+        VLOG(7) << "Skip scanning input " << in_name << " in Operator "
+                << type_;
       }
 #endif
       continue;
@@ -2066,15 +2063,46 @@ Scope* OperatorWithKernel::PrepareData(
         continue;
       }
-      auto kernel_type_for_var = GetKernelTypeForVar(
-          var_name_item.first, *tensor_in, expected_kernel_key);
-      if (!NeedTransform(kernel_type_for_var, expected_kernel_key)) {
-        continue;
+      auto kernel_type_for_var =
+          GetKernelTypeForVar(in_name, *tensor_in, expected_kernel_key);
+      bool need_trans_dtype =
+          kernel_type_for_var.data_type_ != expected_kernel_key.data_type_;
+      bool need_trans_layout = NeedTransformLayout(
+          kernel_type_for_var.data_layout_, expected_kernel_key.data_layout_);
+      if (!need_trans_dtype && !need_trans_layout) {
+        if (!run_phi_kernel_ &&
+            platform::places_are_same_class(kernel_type_for_var.place_,
+                                            expected_kernel_key.place_)) {
+          continue;
+        }
+      }
+      std::unique_ptr<OpKernelType> new_expected_kernel_key = nullptr;
+      if (run_phi_kernel_ && in_def->backend != phi::Backend::ALL_BACKEND) {
+        auto tensor_backend = phi::TransToPhiBackend(tensor_in->place());
+        if ((in_def->backend != tensor_backend &&
+             (in_def->backend != phi::Backend::GPUDNN ||
+              tensor_backend != phi::Backend::GPU)) ||
+            tensor_in->place().GetType() == AllocationType::GPUPINNED) {
+          new_expected_kernel_key = std::make_unique<OpKernelType>(
+              expected_kernel_key.data_type_,
+              phi::TransToPhiPlace(in_def->backend),
+              expected_kernel_key.data_layout_,
+              expected_kernel_key.library_type_,
+              expected_kernel_key.customized_type_value_);
+        }
+      }
+      if (!need_trans_dtype && !need_trans_layout) {
+        if (run_phi_kernel_ && new_expected_kernel_key == nullptr) {
+          continue;
+        }
       }
       VLOG(3) << "Transform Variable " << var_name << " from "
-              << kernel_type_for_var << " to " << expected_kernel_key;
+              << kernel_type_for_var << " to "
+              << (new_expected_kernel_key ? *new_expected_kernel_key
+                                          : expected_kernel_key);
       // In the inference scenerio, the scopes will be reused across the
       // batches, so the `new_scope` here will result in GPU memroy explosion
@@ -2094,13 +2122,22 @@ Scope* OperatorWithKernel::PrepareData(
       // not do transfer scope caching, and cpu inference performance is not
       // impacted by test.
       enable_cache_transfer_scope_ = false;
-      if (!run_by_executor_ &&
-          (platform::is_gpu_place(kernel_type_for_var.place_) ||
-           platform::is_gpu_place(expected_kernel_key.place_))) {
-        new_scope = TryCreateTransferScope(
-            kernel_type_for_var, expected_kernel_key, &scope);
-        enable_cache_transfer_scope_ = true;
+      if (!run_by_executor_) {
+        if (new_expected_kernel_key) {
+          if ((platform::is_gpu_place(kernel_type_for_var.place_) ||
+               platform::is_gpu_place(new_expected_kernel_key->place_))) {
+            new_scope = TryCreateTransferScope(
+                kernel_type_for_var, *new_expected_kernel_key, &scope);
+            enable_cache_transfer_scope_ = true;
+          }
+        } else if ((platform::is_gpu_place(kernel_type_for_var.place_) ||
+                    platform::is_gpu_place(expected_kernel_key.place_))) {
+          new_scope = TryCreateTransferScope(
+              kernel_type_for_var, expected_kernel_key, &scope);
+          enable_cache_transfer_scope_ = true;
+        }
       }
       if (!new_scope) {
         new_scope = &scope.NewScope();
       }
@@ -2117,7 +2154,7 @@ Scope* OperatorWithKernel::PrepareData(
       // Create new var with the same name in transfer scopes
       auto* trans_var = new_scope->Var(var_name);
-      input_vars[i] = trans_var;
+      in_vars->at(i) = trans_var;
       // Find if inplace exists between input and output
       // If inplace exists, set the new created var to inplaced output, and
@@ -2125,7 +2162,7 @@ Scope* OperatorWithKernel::PrepareData(
       for (auto& pair : Outputs()) {
         for (size_t j = 0; j < pair.second.size(); ++j) {
           if (pair.second[j] == var_name) {
-            VLOG(4) << "Found inplace between input(" << var_name_item.first
+            VLOG(4) << "Found inplace between input(" << in_name
                     << ") and output(" << pair.first
                     << "), the variable name is " << var_name;
             ctx->outputs[pair.first][j] = trans_var;
@@ -2136,9 +2173,47 @@ Scope* OperatorWithKernel::PrepareData(
       // Do transfer
       Tensor out;
-      TransformData(expected_kernel_key, kernel_type_for_var, *tensor_in, &out);
+      TransformData(new_expected_kernel_key ? *new_expected_kernel_key
+                                            : expected_kernel_key,
+                    kernel_type_for_var,
+                    *tensor_in,
+                    &out);
       SetTensorToVariable(*var, out, trans_var);
     }
+  };
+  if (run_phi_kernel_) {
+    const auto& input_names = kernel_signature_->input_names;
+    const auto& input_defs = pt_kernel_->args_def().input_defs();
+    PADDLE_ENFORCE_EQ(input_names.size(),
+                      input_defs.size(),
+                      platform::errors::InvalidArgument(
+                          "The size of inputs_args names (%d) must be equal to "
+                          "the size of kernel input_defs (%d).",
+                          input_names.size(),
+                          input_defs.size()));
+    for (size_t i = 0; i < input_defs.size(); ++i) {
+      const auto& input_defs = pt_kernel_->args_def().input_defs();
+      auto& in_def = input_defs.at(i);
+      std::string input_name = input_names[i];
+      auto iter = ctx->inputs.find(input_name);
+      if (iter == ctx->inputs.end()) {
+        continue;
+      }
+      auto& ins_vector = iter->second;
+      bool should_skip_input =
+          no_buffer_ins && no_buffer_ins->count(input_name) > 0;
+      prepare_input_data(input_name, &ins_vector, &in_def, should_skip_input);
+    }
+  } else {
+    for (auto& var_name_item : Inputs()) {
+      bool should_skip_input =
+          no_buffer_ins && no_buffer_ins->count(var_name_item.first) > 0;
+      std::vector<Variable*>& input_vars = ctx->inputs[var_name_item.first];
+      prepare_input_data(
+          var_name_item.first, &input_vars, nullptr, should_skip_input);
+    }
   }
   // If pre_scope = &scope, it means that scope is cached and the op is not in
@@ -2381,107 +2456,6 @@ phi::KernelSignature OperatorWithKernel::GetExpectedPhiKernelArgs(
   return (*arg_map_fn_)(arg_mapping_ctx);
 }
-Scope* OperatorWithKernel::PreparePhiData(
-    const Scope& scope,
-    const phi::Kernel& pt_kernel,
-    const phi::KernelSignature& pt_kernel_signature,
-    RuntimeContext* ctx) const {
-  const auto& input_names = pt_kernel_signature.input_names;
-  auto input_defs = pt_kernel.args_def().input_defs();
-  PADDLE_ENFORCE_EQ(input_names.size(),
-                    input_defs.size(),
-                    platform::errors::InvalidArgument(
-                        "The size of inputs_args names (%d) must be equal to "
-                        "the size of kernel input_defs (%d).",
-                        input_names.size(),
-                        input_defs.size()));
-  Scope* new_scope = nullptr;
-  auto& name_map = Inputs();
-  const std::unordered_set<std::string>* no_buffer_ins = nullptr;
-  if (info_) {
-    auto& no_buffer_inferer = info_->NoNeedBufferVarsInferer();
-    // Some op may not register NoNeedBufferVarsInferer
-    if (no_buffer_inferer) {
-      no_buffer_ins = &(no_buffer_inferer(Inputs(), Outputs(), Attrs()));
-      if (no_buffer_ins->empty()) no_buffer_ins = nullptr;
-    }
-  }
-  for (size_t i = 0; i < input_defs.size(); ++i) {
-    auto& in_def = input_defs.at(i);
-    if (ctx->inputs.find(input_names[i]) == ctx->inputs.end()) {
-      continue;
-    }
-    auto& ins_vector = ctx->inputs.at(input_names[i]);
-    auto& name_vec = name_map.at(input_names[i]);
-    bool should_skip_input =
-        no_buffer_ins && no_buffer_ins->count(input_names[i]) > 0;
-    for (size_t offset = 0; offset < ins_vector.size(); ++offset) {
-      // Only tensor can be tranfer to another device.
-      auto* var = ins_vector[offset];
-      if (var == nullptr || !VarIsTensor(*var)) {
-        continue;
-      }
-      auto* tensor_in = GetLoDTensorOrSelectedRowsValueFromVar(*var);
-      // When no_buffer_ins then checking of Tensor::holder_ is
-      // not a thread safe. And for infershape scenario checks
-      // to be omitted are not really needed
-      if (should_skip_input == true) {
-        // TODO(YuanRisheng) : There need to supplement MKLDNN code later
-        continue;
-      }
-      if (!tensor_in->IsInitialized()) {
-        continue;
-      }
-      if (in_def.backend == phi::Backend::ALL_BACKEND) {
-        continue;
-      }
-      auto tensor_backend = phi::TransToPhiBackend(tensor_in->place());
-      if (in_def.backend == tensor_backend ||
-          (in_def.backend == phi::Backend::GPUDNN &&
-           tensor_backend == phi::Backend::GPU)) {
-        continue;
-      }
-      auto expected_place = phi::TransToPhiPlace(in_def.backend);
-      VLOG(3) << "phi Transform Variable " << input_names[i] << " from "
-              << tensor_in->place() << " to " << expected_place;
-      if (!new_scope) {
-        new_scope = &scope.NewScope();
-      }
-      // For inference, if a gpu model has an op which could only run on CPU,
-      // each result of different input will be the same with the first one.
-      // The reason is that if a gpu tensor is the input of a cpu kernel,
-      // we will create a new cpu tensor in new scope.
-      // However, if enable_cache_runtime_context_, we get the cpu tensor each
-      // time, not the gpu tensor. Thus, we set pre_scope_ = nullptr
-      // to trigger `new RuntimeContext()` in RunImpl().
-      if (enable_cache_runtime_context_) {
-        pre_scope_ = nullptr;
-      }
-      // Create new var with the same name in transfer scopes
-      auto* trans_var = new_scope->Var(name_vec[offset]);
-      ins_vector[offset] = trans_var;
-      // Do transfer
-      Tensor out;
-      framework::TensorCopySync(*tensor_in, expected_place, &out);
-      SetTensorToVariable(*var, out, trans_var);
-      need_prepare_phi_data_ = true;
-    }
-  }
-  return new_scope;
-}
 void OperatorWithKernel::BuildPhiKernelContext(
     const RuntimeContext& ctx,
     platform::DeviceContext* dev_ctx,
......
@@ -646,14 +646,6 @@ class OperatorWithKernel : public OperatorBase {
   phi::KernelKey ChoosePhiKernel(const ExecutionContext& ctx) const;
   void ChooseKernel(const ExecutionContext& ctx) const;
-  /**
-   * Transfer data place for phi kernel
-   * Is this really needed?
-   */
-  Scope* PreparePhiData(const Scope& scope,
-                        const phi::Kernel& pt_kernel,
-                        const phi::KernelSignature& pt_kernel_signature,
-                        RuntimeContext* ctx) const;
   void BuildPhiKernelContext(const RuntimeContext& ctx,
                              platform::DeviceContext* dev_ctx,
......
@@ -89,31 +89,37 @@ class TestCinnLaunchOp : public ::testing::Test {
   void TearDown() override { CinnCompiler::GetInstance()->Clear(); }
 };
-TEST_F(TestCinnLaunchOp, TestRunInstructionByPE) {
-  // CPU
+TEST_F(TestCinnLaunchOp, TestRunCPUInstructionByPE) {
   RunAndCheck(platform::CPUPlace());
   // the second run on the same place is to check the cache logic
   RunAndCheck(platform::CPUPlace());
+}
 #ifdef PADDLE_WITH_CUDA
-  // GPU
+TEST_F(TestCinnLaunchOp, TestRunGPUInstructionByPE) {
   RunAndCheck(platform::CUDAPlace());
   RunAndCheck(platform::CUDAPlace());
-#endif
 }
+#endif
-TEST_F(TestCinnLaunchOp, TestRunInstructionByCinnProgram) {
+TEST_F(TestCinnLaunchOp, TestRunCPUInstructionByCinnProgram) {
   // set FLAGS_enable_pe_launch_cinn=false to switch to use
   // default scheduler of CINN to execute the compiled program
   FLAGS_enable_pe_launch_cinn = false;
   RunAndCheck(platform::CPUPlace());
   RunAndCheck(platform::CPUPlace());
+}
 #ifdef PADDLE_WITH_CUDA
-  // GPU
+TEST_F(TestCinnLaunchOp, TestRunGPUInstructionByCinnProgram) {
+  // set FLAGS_enable_pe_launch_cinn=false to switch to use
+  // default scheduler of CINN to execute the compiled program
+  FLAGS_enable_pe_launch_cinn = false;
   RunAndCheck(platform::CUDAPlace());
   RunAndCheck(platform::CUDAPlace());
-#endif
 }
+#endif
 TEST_F(TestCinnLaunchOp, TestRunWithAutoTuneEnabled) {
   FLAGS_enable_cinn_auto_tune = true;
......