// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "paddle/fluid/imperative/prepared_operator.h" #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/framework/data_type_transform.h" #include "paddle/fluid/framework/details/nan_inf_utils.h" #include "paddle/fluid/imperative/infer_shape_context.h" #include "paddle/fluid/imperative/tracer.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/common/scalar.h" #include "paddle/utils/small_vector.h" #ifdef PADDLE_WITH_XPU #include "paddle/fluid/platform/device/xpu/xpu_op_list.h" #endif #include "paddle/fluid/framework/library_type.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/profiler/event_tracing.h" DECLARE_bool(check_nan_inf); DECLARE_bool(benchmark); DECLARE_bool(run_kp_kernel); namespace paddle { namespace imperative { const std::shared_ptr& GetVariableWrapper( const std::shared_ptr& var) { return var->SharedVar(); } const std::shared_ptr& GetVariableWrapper( const std::shared_ptr& var) { return var; } const framework::Tensor* GetTensorFromVar(const framework::Variable& var) { if (var.IsType()) { return &(var.Get()); } else if (var.IsType()) { return &(var.Get().value()); } else { return nullptr; } } template void HandleComplexGradToRealGrad(const NameVarMap& outs) { for (auto& pair : outs) { for (auto& var : pair.second) { if (var == nullptr) { continue; } if (var->ForwardDataType() == static_cast(-1)) { VLOG(6) << "Var (" << var->Name() << ")'s forward data type is not set."; continue; } if (!framework::IsComplexType(var->DataType()) || framework::IsComplexType(var->ForwardDataType())) { continue; } const auto* tensor = GetTensorFromVar(var->Var()); if (tensor && tensor->IsInitialized()) { VLOG(6) << "Transform " << framework::DataTypeToString(var->DataType()) << " var `" << var->Name() << "` to " << framework::DataTypeToString(var->ForwardDataType()) << " real var in dynamic graph."; framework::Tensor out; framework::TransComplexToReal(var->ForwardDataType(), var->DataType(), *tensor, &out); SetTensorToVariable(var->Var(), out, var->MutableVar()); } } } } template <> void HandleComplexGradToRealGrad( const NameVarMap& outs) { // TODO(jiabin): Support Complex here. } void TestHandleComplexGradToRealGradEager( const NameVarMap& outs) { HandleComplexGradToRealGrad(outs); } PreparedOp::PreparedOp(const framework::OperatorBase& op, const framework::RuntimeContext& ctx, const framework::OpKernelType& kernel_type, const framework::OperatorWithKernel::OpKernelFunc& func, platform::DeviceContext* dev_ctx) : op_(op), ctx_(ctx), kernel_type_(kernel_type), func_(func), dev_ctx_(dev_ctx) {} PreparedOp::PreparedOp(const framework::OperatorBase& op, const framework::RuntimeContext& ctx, const framework::OpKernelType& kernel_type, const framework::KernelSignature& kernel_signature, const phi::Kernel& pt_kernel, platform::DeviceContext* dev_ctx) : op_(op), ctx_(ctx), kernel_type_(kernel_type), func_(nullptr), dev_ctx_(dev_ctx), run_phi_kernel_(true), pt_kernel_signature_(kernel_signature), pt_kernel_(pt_kernel) {} template PreparedOp PrepareImpl(const NameVarMap& ins, const NameVarMap& outs, const framework::OperatorWithKernel& op, const platform::Place& place, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto* dev_ctx = pool.Get(place); framework::RuntimeContext ctx({}, {}); #ifdef PADDLE_WITH_MKLDNN // MKLDNN variant of code reads attributes in some of GetKernelTypeForVar and // GetKernelType functions, so we need to copy the attributes there. // Const qualifier of Attrs had to be discarded to overwrite it. if (FLAGS_use_mkldnn) { auto& mutable_op_attrs = const_cast(op.Attrs()); mutable_op_attrs = default_attrs; for (auto& attr : attrs) { mutable_op_attrs[attr.first] = attr.second; } } #endif // NOTE(zhiqiu): for kernels on given device, for example NPU, the order to // choose is: // phi npu kernel > fluid npu kernel > phi cpu kernel > fluid cpu kernel // 1. get expected kernel key auto dygraph_exe_ctx = DygraphExecutionContext( op, framework::Scope(), *dev_ctx, ctx, ins, outs, attrs, default_attrs); auto expected_kernel_key = op.GetExpectedKernelType(dygraph_exe_ctx); framework::KernelSignature pt_kernel_signature; phi::KernelKey pt_kernel_key; std::string pt_kernel_name; #if defined(PADDLE_WITH_XPU) bool is_xpu_unsupport = paddle::platform::is_xpu_place(expected_kernel_key.place_) && !paddle::platform::is_xpu_support_op(op.Type(), expected_kernel_key) || paddle::platform::is_in_xpu_black_list(op.Type()); #endif if (phi::KernelFactory::Instance().HasCompatiblePhiKernel(op.Type())) { pt_kernel_signature = op.GetExpectedPhiKernelArgs(dygraph_exe_ctx); VLOG(6) << pt_kernel_signature; pt_kernel_name = pt_kernel_signature.name; // modify the expected_kernel_key for KP in phi #ifdef PADDLE_WITH_XPU_KP if (paddle::platform::is_xpu_place(expected_kernel_key.place_)) { bool use_xpu_kp_kernel_rt = FLAGS_run_kp_kernel && paddle::platform::is_xpu_kp_support_op( op.Type(), expected_kernel_key); bool use_xpu_kp_kernel_debug = paddle::platform::is_in_xpu_kpwhite_list(op.Type()); if (use_xpu_kp_kernel_rt) { VLOG(3) << "phi xpu_kp using rt mode "; } if (use_xpu_kp_kernel_debug) { VLOG(3) << "phi xpu_kp using debug mode "; } bool is_xpu_kp_support = (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug); if (is_xpu_kp_support) { auto expected_kernel_key_library_type = expected_kernel_key.library_type_; expected_kernel_key.library_type_ = paddle::framework::LibraryType::kKP; VLOG(3) << "modifing XPU KP kernel: " << op.Type() << ", using_kernel_key:" << expected_kernel_key; phi::KernelKey try_pt_kernel_key = TransOpKernelTypeToPhiKernelKey(expected_kernel_key); if (!phi::KernelFactory::Instance().IsSelectKernelValid( pt_kernel_name, try_pt_kernel_key)) { expected_kernel_key.library_type_ = expected_kernel_key_library_type; VLOG(3) << "modify XPU KP kernel: " << op.Type() << " is failed " << expected_kernel_key; } } } #endif pt_kernel_key = TransOpKernelTypeToPhiKernelKey(expected_kernel_key); auto pt_kernel = phi::KernelFactory::Instance().SelectKernel(pt_kernel_name, pt_kernel_key); if (pt_kernel.IsValid() #if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP) && !is_xpu_unsupport #endif ) { VLOG(6) << "Dynamic mode PrepareImpl - kernel name: " << pt_kernel_name << " | kernel key: " << pt_kernel_key << " | kernel: " << pt_kernel; if (expected_kernel_key.place_ != place) { dev_ctx = pool.Get(expected_kernel_key.place_); } // TODO(chenweihang): using CPUKernel when miss device kernel case return PreparedOp(op, ctx, expected_kernel_key, pt_kernel_signature, pt_kernel, dev_ctx); } else { VLOG(6) << "Dynamic mode ChoosePhiKernel - kernel `" << pt_kernel_name << "` not found."; } } // 2. check if op[type] has kernel registered. auto& all_op_kernels = op.AllOpKernels(); auto kernels_iter = all_op_kernels.find(op.Type()); #ifdef PADDLE_WITH_XPU_KP bool use_xpu_kp_kernel_rt = paddle::platform::is_xpu_place(expected_kernel_key.place_) && FLAGS_run_kp_kernel && paddle::platform::is_xpu_kp_support_op(op.Type(), expected_kernel_key); bool use_xpu_kp_kernel_debug = paddle::platform::is_xpu_place(expected_kernel_key.place_) && paddle::platform::is_in_xpu_kpwhite_list(op.Type()); bool is_xpu_kp_support = (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug); if (is_xpu_kp_support) { expected_kernel_key.library_type_ = paddle::framework::LibraryType::kKP; } #endif if ((kernels_iter == all_op_kernels.end() || kernels_iter->second.find(expected_kernel_key) == kernels_iter->second.end()) #if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP) || is_xpu_unsupport #endif ) { if (phi::KernelFactory::Instance().HasCompatiblePhiKernel(op.Type())) { auto pt_cpu_kernel_key = FallBackToCpu(expected_kernel_key, pt_kernel_key, op); auto pt_cpu_kernel = phi::KernelFactory::Instance().SelectKernel( pt_kernel_name, pt_cpu_kernel_key); if (pt_cpu_kernel.IsValid()) { VLOG(6) << "Dynamic mode PrepareImpl - kernel name: " << pt_kernel_name << " | kernel key: " << pt_cpu_kernel_key << " | kernel: " << pt_cpu_kernel; auto* cpu_ctx = pool.Get(paddle::platform::CPUPlace()); return PreparedOp(op, ctx, expected_kernel_key, pt_kernel_signature, pt_cpu_kernel, cpu_ctx); } } } PADDLE_ENFORCE_NE( kernels_iter, all_op_kernels.end(), platform::errors::NotFound( "There are no kernels which are registered in the %s operator.", op.Type())); auto& kernels = kernels_iter->second; auto kernel_iter = kernels.find(expected_kernel_key); #if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP) if (paddle::platform::is_xpu_place(expected_kernel_key.place_) && (kernel_iter == kernels.end() || is_xpu_unsupport)) { VLOG(3) << "missing XPU kernel: " << op.Type() << ", expected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; expected_kernel_key.place_ = platform::CPUPlace(); kernel_iter = kernels.find(expected_kernel_key); } #endif #ifdef PADDLE_WITH_XPU_KP if (paddle::platform::is_xpu_place(expected_kernel_key.place_)) { if (use_xpu_kp_kernel_rt) { VLOG(3) << "xpu_kp using rt mode "; } if (use_xpu_kp_kernel_debug) { VLOG(3) << "xpu_kp using debug mode "; } if (is_xpu_kp_support) { expected_kernel_key.library_type_ = paddle::framework::LibraryType::kKP; kernel_iter = kernels.find(expected_kernel_key); VLOG(3) << "using XPU KP kernel: " << op.Type() << ", using_kernel_key:" << expected_kernel_key; } if (!is_xpu_kp_support && (kernel_iter == kernels.end() || is_xpu_unsupport)) { VLOG(3) << "missing XPU kernel: " << op.Type() << ", expected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; expected_kernel_key.place_ = platform::CPUPlace(); kernel_iter = kernels.find(expected_kernel_key); } } #endif #ifdef PADDLE_WITH_ASCEND_CL if (kernel_iter == kernels.end() && paddle::platform::is_npu_place(expected_kernel_key.place_)) { VLOG(3) << "missing NPU kernel: " << op.Type() << ", expected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; expected_kernel_key.place_ = platform::CPUPlace(); kernel_iter = kernels.find(expected_kernel_key); } #endif #ifdef PADDLE_WITH_MLU if (kernel_iter == kernels.end() && paddle::platform::is_mlu_place(expected_kernel_key.place_)) { VLOG(3) << "missing MLU kernel: " << op.Type() << ", expected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; expected_kernel_key.place_ = platform::CPUPlace(); kernel_iter = kernels.find(expected_kernel_key); } #endif #ifdef PADDLE_WITH_CUSTOM_DEVICE if (kernel_iter == kernels.end() && paddle::platform::is_custom_place(expected_kernel_key.place_)) { VLOG(3) << "missing " << place.GetDeviceType() << " kernel: " << op.Type() << ", expected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; expected_kernel_key.place_ = platform::CPUPlace(); kernel_iter = kernels.find(expected_kernel_key); } #endif // TODO(jiabin): Add operator.cc's line 1000 part back when we need that // case PADDLE_ENFORCE_NE(kernel_iter, kernels.end(), platform::errors::NotFound( "Operator %s does not have kernel for %s.", op.Type(), KernelTypeToString(expected_kernel_key))); if (!(expected_kernel_key.place_ == place)) { dev_ctx = pool.Get(expected_kernel_key.place_); } return PreparedOp(op, ctx, expected_kernel_key, kernel_iter->second, dev_ctx); } PreparedOp PreparedOp::Prepare(const NameVarMap& ins, const NameVarMap& outs, const framework::OperatorWithKernel& op, const platform::Place& place, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { return PrepareImpl(ins, outs, op, place, attrs, default_attrs); } PreparedOp PreparedOp::Prepare(const NameVarMap& ins, const NameVarMap& outs, const framework::OperatorWithKernel& op, const platform::Place& place, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { return PrepareImpl(ins, outs, op, place, attrs, default_attrs); } PreparedOp PreparedOp::Prepare(const NameVarMap& ins, const NameVarMap& outs, const framework::OperatorWithKernel& op, const platform::Place& place, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { return PrepareImpl(ins, outs, op, place, attrs, default_attrs); } template static void PreparedOpRunImpl( const framework::OperatorBase& op, const framework::RuntimeContext& ctx, const framework::OpKernelType& kernel_type, const framework::OperatorWithKernel::OpKernelFunc& func, platform::DeviceContext* dev_ctx, const NameVarMap& ins, const NameVarMap& outs, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { // TODO(zjl): remove scope in dygraph framework::Scope scope; { platform::RecordEvent record_event(op.Type() + "::infer_shape", platform::TracerEventType::OperatorInner, 1, platform::EventRole::kInnerOp); DygraphInferShapeContext infer_shape_ctx( &ins, &outs, &attrs, &default_attrs, op.Type(), &kernel_type); op.Info().infer_shape_(&infer_shape_ctx); } { platform::RecordEvent record_event(op.Type() + "::compute", platform::TracerEventType::OperatorInner, 1, platform::EventRole::kInnerOp); func(DygraphExecutionContext(op, scope, *dev_ctx, ctx, ins, outs, attrs, default_attrs)); } if (FLAGS_check_nan_inf) { framework::details::CheckOpHasNanOrInfInDygraph( op.Type(), outs, dev_ctx->GetPlace()); } if (FLAGS_benchmark) { dev_ctx->Wait(); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); VLOG(4) << "Operator(" << op.Type() << "): context wait and get last error"; #endif } /** * [ Why need handle complex gradient to real gradient? ] * * After the introduction of complex number calculations, Ops that support * complex number calculations generally support type promotion, such as * x(float32) + y(complex64) = out(complex64), then the type of the grad * tensor should be dout(complex64), dx(float32), dy (complex64). * * But because the dout is complex64, the dx is also complex64 after * grad op kernel executed, we need to recognize this situation and * convert dx to float32 type. HandleComplexGradToRealGrad does this thing. */ if (framework::IsComplexType(kernel_type.data_type_)) { HandleComplexGradToRealGrad(outs); } } template static void PreparedOpRunPtImpl( const framework::OperatorBase& op, const framework::OpKernelType& kernel_type, const framework::KernelSignature& pt_kernel_signature, const phi::Kernel& pt_kernel, platform::DeviceContext* dev_ctx, const NameVarMap& ins, const NameVarMap& outs, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { { platform::RecordEvent record_event(op.Type() + "::infer_shape", platform::TracerEventType::OperatorInner, 1, platform::EventRole::kInnerOp); DygraphInferShapeContext infer_shape_ctx( &ins, &outs, &attrs, &default_attrs, op.Type(), &kernel_type); op.Info().infer_shape_(&infer_shape_ctx); } { platform::RecordEvent record_event(op.Type() + "::compute", platform::TracerEventType::OperatorInner, 1, platform::EventRole::kInnerOp); PreparePhiData(pt_kernel, pt_kernel_signature, ins); phi::KernelContext pt_kernel_context; BuildDygraphPhiKernelContext(pt_kernel_signature, pt_kernel, ins, outs, attrs, default_attrs, dev_ctx, &pt_kernel_context); pt_kernel(&pt_kernel_context); } if (FLAGS_benchmark) { dev_ctx->Wait(); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); VLOG(4) << "Operator(" << op.Type() << "): context wait and get last error"; #endif } // TODO(chenweihang): add debug flags later if (framework::IsComplexType(kernel_type.data_type_)) { HandleComplexGradToRealGrad(outs); } } void PreparedOp::Run(const NameVarMap& ins, const NameVarMap& outs, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { if (run_phi_kernel_) { PreparedOpRunPtImpl(op_, kernel_type_, pt_kernel_signature_, pt_kernel_, dev_ctx_, ins, outs, attrs, default_attrs); } else { PreparedOpRunImpl(op_, ctx_, kernel_type_, func_, dev_ctx_, ins, outs, attrs, default_attrs); } } void PreparedOp::Run(const NameVarMap& ins, const NameVarMap& outs, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { if (run_phi_kernel_) { PreparedOpRunPtImpl( op_, kernel_type_, pt_kernel_signature_, pt_kernel_, dev_ctx_, ins, outs, attrs, default_attrs); } else { PreparedOpRunImpl(op_, ctx_, kernel_type_, func_, dev_ctx_, ins, outs, attrs, default_attrs); } } void PreparedOp::Run(const NameVarMap& ins, const NameVarMap& outs, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { if (run_phi_kernel_) { PreparedOpRunPtImpl( op_, kernel_type_, pt_kernel_signature_, pt_kernel_, dev_ctx_, ins, outs, attrs, default_attrs); } else { PreparedOpRunImpl(op_, ctx_, kernel_type_, func_, dev_ctx_, ins, outs, attrs, default_attrs); } } } // namespace imperative } // namespace paddle