diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 2be93f0dc91785fe49d09f3e9543fadcf690a5dd..0a5de2bd3f2622d1863db7b5b327a5c98c247fcc 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1622,9 +1622,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope, platform::EventRole::kInnerOp); if (run_phi_kernel_) { phi::KernelContext pt_kernel_context; - // Do data transform before building KernelContext - // TODO(zhiqiu): support TransferInplaceVarsBack - PreparePhiData(exec_scope, *pt_kernel_, *kernel_signature_, runtime_ctx); if (enable_cache_runtime_context_ && !need_prepare_phi_data_ && !need_prepare_data_) { impl_ = @@ -2007,15 +2004,15 @@ Scope* OperatorWithKernel::PrepareData( } } - for (auto& var_name_item : Inputs()) { - bool should_skip_input = - no_buffer_ins && no_buffer_ins->count(var_name_item.first) > 0; - - std::vector& input_vars = ctx->inputs[var_name_item.first]; - - for (size_t i = 0; i < var_name_item.second.size(); ++i) { - auto& var_name = var_name_item.second[i]; - auto* var = input_vars[i]; + const auto& name_map = Inputs(); + auto prepare_input_data = [&](const std::string& in_name, + std::vector* in_vars, + const phi::TensorArgDef* in_def, + bool should_skip_input) -> void { + auto& name_vec = name_map.at(in_name); + for (size_t i = 0; i < in_vars->size(); ++i) { + const auto& var_name = name_vec[i]; + auto* var = in_vars->at(i); // Only tensor can be tranfer to another device. if (var == nullptr || !VarIsTensor(*var)) { @@ -2046,17 +2043,17 @@ Scope* OperatorWithKernel::PrepareData( new_scope = &scope.NewScope(); } auto* trans_var = new_scope->Var(var_name); - input_vars[i] = trans_var; + in_vars->at(i) = trans_var; auto out = trans_var->GetMutable(); out->Resize(tensor_in->dims()); platform::MatchShapeToLayout( out, tensor_in->layout(), DataLayout::kNHWC); VLOG(7) << "Created reshaped dummy input based on MKL-DNN Tensor , " "but kNHWC layout" - << var_name_item.first << " in Operator " << type_; + << in_name << " in Operator " << type_; } else { - VLOG(7) << "Skip scanning input " << var_name_item.first - << " in Operator " << type_; + VLOG(7) << "Skip scanning input " << in_name << " in Operator " + << type_; } #endif continue; @@ -2066,15 +2063,46 @@ Scope* OperatorWithKernel::PrepareData( continue; } - auto kernel_type_for_var = GetKernelTypeForVar( - var_name_item.first, *tensor_in, expected_kernel_key); + auto kernel_type_for_var = + GetKernelTypeForVar(in_name, *tensor_in, expected_kernel_key); + bool need_trans_dtype = + kernel_type_for_var.data_type_ != expected_kernel_key.data_type_; + bool need_trans_layout = NeedTransformLayout( + kernel_type_for_var.data_layout_, expected_kernel_key.data_layout_); + if (!need_trans_dtype && !need_trans_layout) { + if (!run_phi_kernel_ && + platform::places_are_same_class(kernel_type_for_var.place_, + expected_kernel_key.place_)) { + continue; + } + } - if (!NeedTransform(kernel_type_for_var, expected_kernel_key)) { - continue; + std::unique_ptr new_expected_kernel_key = nullptr; + if (run_phi_kernel_ && in_def->backend != phi::Backend::ALL_BACKEND) { + auto tensor_backend = phi::TransToPhiBackend(tensor_in->place()); + if ((in_def->backend != tensor_backend && + (in_def->backend != phi::Backend::GPUDNN || + tensor_backend != phi::Backend::GPU)) || + tensor_in->place().GetType() == AllocationType::GPUPINNED) { + new_expected_kernel_key = std::make_unique( + expected_kernel_key.data_type_, + phi::TransToPhiPlace(in_def->backend), + expected_kernel_key.data_layout_, + expected_kernel_key.library_type_, + expected_kernel_key.customized_type_value_); + } + } + + if (!need_trans_dtype && !need_trans_layout) { + if (run_phi_kernel_ && new_expected_kernel_key == nullptr) { + continue; + } } VLOG(3) << "Transform Variable " << var_name << " from " - << kernel_type_for_var << " to " << expected_kernel_key; + << kernel_type_for_var << " to " + << (new_expected_kernel_key ? *new_expected_kernel_key + : expected_kernel_key); // In the inference scenerio, the scopes will be reused across the // batches, so the `new_scope` here will result in GPU memroy explosion @@ -2094,13 +2122,22 @@ Scope* OperatorWithKernel::PrepareData( // not do transfer scope caching, and cpu inference performance is not // impacted by test. enable_cache_transfer_scope_ = false; - if (!run_by_executor_ && - (platform::is_gpu_place(kernel_type_for_var.place_) || - platform::is_gpu_place(expected_kernel_key.place_))) { - new_scope = TryCreateTransferScope( - kernel_type_for_var, expected_kernel_key, &scope); - enable_cache_transfer_scope_ = true; + if (!run_by_executor_) { + if (new_expected_kernel_key) { + if ((platform::is_gpu_place(kernel_type_for_var.place_) || + platform::is_gpu_place(new_expected_kernel_key->place_))) { + new_scope = TryCreateTransferScope( + kernel_type_for_var, *new_expected_kernel_key, &scope); + enable_cache_transfer_scope_ = true; + } + } else if ((platform::is_gpu_place(kernel_type_for_var.place_) || + platform::is_gpu_place(expected_kernel_key.place_))) { + new_scope = TryCreateTransferScope( + kernel_type_for_var, expected_kernel_key, &scope); + enable_cache_transfer_scope_ = true; + } } + if (!new_scope) { new_scope = &scope.NewScope(); } @@ -2117,7 +2154,7 @@ Scope* OperatorWithKernel::PrepareData( // Create new var with the same name in transfer scopes auto* trans_var = new_scope->Var(var_name); - input_vars[i] = trans_var; + in_vars->at(i) = trans_var; // Find if inplace exists between input and output // If inplace exists, set the new created var to inplaced output, and @@ -2125,7 +2162,7 @@ Scope* OperatorWithKernel::PrepareData( for (auto& pair : Outputs()) { for (size_t j = 0; j < pair.second.size(); ++j) { if (pair.second[j] == var_name) { - VLOG(4) << "Found inplace between input(" << var_name_item.first + VLOG(4) << "Found inplace between input(" << in_name << ") and output(" << pair.first << "), the variable name is " << var_name; ctx->outputs[pair.first][j] = trans_var; @@ -2136,9 +2173,47 @@ Scope* OperatorWithKernel::PrepareData( // Do transfer Tensor out; - TransformData(expected_kernel_key, kernel_type_for_var, *tensor_in, &out); + TransformData(new_expected_kernel_key ? *new_expected_kernel_key + : expected_kernel_key, + kernel_type_for_var, + *tensor_in, + &out); SetTensorToVariable(*var, out, trans_var); } + }; + + if (run_phi_kernel_) { + const auto& input_names = kernel_signature_->input_names; + const auto& input_defs = pt_kernel_->args_def().input_defs(); + PADDLE_ENFORCE_EQ(input_names.size(), + input_defs.size(), + platform::errors::InvalidArgument( + "The size of inputs_args names (%d) must be equal to " + "the size of kernel input_defs (%d).", + input_names.size(), + input_defs.size())); + for (size_t i = 0; i < input_defs.size(); ++i) { + const auto& input_defs = pt_kernel_->args_def().input_defs(); + auto& in_def = input_defs.at(i); + std::string input_name = input_names[i]; + auto iter = ctx->inputs.find(input_name); + if (iter == ctx->inputs.end()) { + continue; + } + auto& ins_vector = iter->second; + bool should_skip_input = + no_buffer_ins && no_buffer_ins->count(input_name) > 0; + prepare_input_data(input_name, &ins_vector, &in_def, should_skip_input); + } + } else { + for (auto& var_name_item : Inputs()) { + bool should_skip_input = + no_buffer_ins && no_buffer_ins->count(var_name_item.first) > 0; + + std::vector& input_vars = ctx->inputs[var_name_item.first]; + prepare_input_data( + var_name_item.first, &input_vars, nullptr, should_skip_input); + } } // If pre_scope = &scope, it means that scope is cached and the op is not in @@ -2381,107 +2456,6 @@ phi::KernelSignature OperatorWithKernel::GetExpectedPhiKernelArgs( return (*arg_map_fn_)(arg_mapping_ctx); } -Scope* OperatorWithKernel::PreparePhiData( - const Scope& scope, - const phi::Kernel& pt_kernel, - const phi::KernelSignature& pt_kernel_signature, - RuntimeContext* ctx) const { - const auto& input_names = pt_kernel_signature.input_names; - auto input_defs = pt_kernel.args_def().input_defs(); - PADDLE_ENFORCE_EQ(input_names.size(), - input_defs.size(), - platform::errors::InvalidArgument( - "The size of inputs_args names (%d) must be equal to " - "the size of kernel input_defs (%d).", - input_names.size(), - input_defs.size())); - Scope* new_scope = nullptr; - auto& name_map = Inputs(); - const std::unordered_set* no_buffer_ins = nullptr; - if (info_) { - auto& no_buffer_inferer = info_->NoNeedBufferVarsInferer(); - // Some op may not register NoNeedBufferVarsInferer - if (no_buffer_inferer) { - no_buffer_ins = &(no_buffer_inferer(Inputs(), Outputs(), Attrs())); - if (no_buffer_ins->empty()) no_buffer_ins = nullptr; - } - } - - for (size_t i = 0; i < input_defs.size(); ++i) { - auto& in_def = input_defs.at(i); - if (ctx->inputs.find(input_names[i]) == ctx->inputs.end()) { - continue; - } - auto& ins_vector = ctx->inputs.at(input_names[i]); - auto& name_vec = name_map.at(input_names[i]); - bool should_skip_input = - no_buffer_ins && no_buffer_ins->count(input_names[i]) > 0; - - for (size_t offset = 0; offset < ins_vector.size(); ++offset) { - // Only tensor can be tranfer to another device. - auto* var = ins_vector[offset]; - if (var == nullptr || !VarIsTensor(*var)) { - continue; - } - auto* tensor_in = GetLoDTensorOrSelectedRowsValueFromVar(*var); - - // When no_buffer_ins then checking of Tensor::holder_ is - // not a thread safe. And for infershape scenario checks - // to be omitted are not really needed - if (should_skip_input == true) { - // TODO(YuanRisheng) : There need to supplement MKLDNN code later - continue; - } - - if (!tensor_in->IsInitialized()) { - continue; - } - - if (in_def.backend == phi::Backend::ALL_BACKEND) { - continue; - } - - auto tensor_backend = phi::TransToPhiBackend(tensor_in->place()); - if (in_def.backend == tensor_backend || - (in_def.backend == phi::Backend::GPUDNN && - tensor_backend == phi::Backend::GPU)) { - continue; - } - - auto expected_place = phi::TransToPhiPlace(in_def.backend); - VLOG(3) << "phi Transform Variable " << input_names[i] << " from " - << tensor_in->place() << " to " << expected_place; - - if (!new_scope) { - new_scope = &scope.NewScope(); - } - // For inference, if a gpu model has an op which could only run on CPU, - // each result of different input will be the same with the first one. - // The reason is that if a gpu tensor is the input of a cpu kernel, - // we will create a new cpu tensor in new scope. - // However, if enable_cache_runtime_context_, we get the cpu tensor each - // time, not the gpu tensor. Thus, we set pre_scope_ = nullptr - // to trigger `new RuntimeContext()` in RunImpl(). - if (enable_cache_runtime_context_) { - pre_scope_ = nullptr; - } - - // Create new var with the same name in transfer scopes - auto* trans_var = new_scope->Var(name_vec[offset]); - ins_vector[offset] = trans_var; - - // Do transfer - Tensor out; - framework::TensorCopySync(*tensor_in, expected_place, &out); - SetTensorToVariable(*var, out, trans_var); - - need_prepare_phi_data_ = true; - } - } - - return new_scope; -} - void OperatorWithKernel::BuildPhiKernelContext( const RuntimeContext& ctx, platform::DeviceContext* dev_ctx, diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 1b7bd433dd104dab11d41c664e1bcff6c75591fa..c3827f56c7197b684428ee337711595116aa8981 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -646,14 +646,6 @@ class OperatorWithKernel : public OperatorBase { phi::KernelKey ChoosePhiKernel(const ExecutionContext& ctx) const; void ChooseKernel(const ExecutionContext& ctx) const; - /** - * Transfer data place for phi kernel - * Is this really needed? - */ - Scope* PreparePhiData(const Scope& scope, - const phi::Kernel& pt_kernel, - const phi::KernelSignature& pt_kernel_signature, - RuntimeContext* ctx) const; void BuildPhiKernelContext(const RuntimeContext& ctx, platform::DeviceContext* dev_ctx, diff --git a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc index 674f55efb5feb8ee5977f32c1ba60697e22abb6f..5b965573deefa6546fdf5d2c5786aa5e76284397 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc @@ -89,31 +89,37 @@ class TestCinnLaunchOp : public ::testing::Test { void TearDown() override { CinnCompiler::GetInstance()->Clear(); } }; -TEST_F(TestCinnLaunchOp, TestRunInstructionByPE) { - // CPU +TEST_F(TestCinnLaunchOp, TestRunCPUInstructionByPE) { RunAndCheck(platform::CPUPlace()); // the second run on the same place is to check the cache logic RunAndCheck(platform::CPUPlace()); +} + #ifdef PADDLE_WITH_CUDA - // GPU +TEST_F(TestCinnLaunchOp, TestRunGPUInstructionByPE) { RunAndCheck(platform::CUDAPlace()); RunAndCheck(platform::CUDAPlace()); -#endif } +#endif -TEST_F(TestCinnLaunchOp, TestRunInstructionByCinnProgram) { +TEST_F(TestCinnLaunchOp, TestRunCPUInstructionByCinnProgram) { // set FLAGS_enable_pe_launch_cinn=false to switch to use // default scheduler of CINN to execute the compiled program FLAGS_enable_pe_launch_cinn = false; RunAndCheck(platform::CPUPlace()); RunAndCheck(platform::CPUPlace()); +} + #ifdef PADDLE_WITH_CUDA - // GPU +TEST_F(TestCinnLaunchOp, TestRunGPUInstructionByCinnProgram) { + // set FLAGS_enable_pe_launch_cinn=false to switch to use + // default scheduler of CINN to execute the compiled program + FLAGS_enable_pe_launch_cinn = false; RunAndCheck(platform::CUDAPlace()); RunAndCheck(platform::CUDAPlace()); -#endif } +#endif TEST_F(TestCinnLaunchOp, TestRunWithAutoTuneEnabled) { FLAGS_enable_cinn_auto_tune = true;