// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/details/nan_inf_utils_detail.h"

#include "paddle/fluid/framework/details/nan_inf_utils.h"
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/phi/common/amp_type_traits.h"

#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
#endif

#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/phi/kernels/funcs/eigen/extensions.h"

DECLARE_int32(check_nan_inf_level);

namespace paddle {
namespace framework {
namespace details {

static std::once_flag white_list_init_flag;

static int op_role_nan_inf_white_list = 0;

static constexpr int FORWARD = 0x10000;

// lazy init
static const std::unordered_map<std::string, int>& role_str2int() {
  /* In op_proto_maker.h
   * framework::OpRole::kForward      = 0x0000,
   * framework::OpRole::kBackward     = 0x0001,
   * framework::OpRole::kOptimize     = 0x0002,
   * framework::OpRole::kRPC          = 0x0004,
   * framework::OpRole::kDist         = 0x0008,
   * framework::OpRole::kLRSched      = 0x0010,
   * framework::OpRole::kLoss         = 0x0100,
   * framework::OpRole::kNotSpecified = 0x1000,
   */
  static const std::unordered_map<std::string, int> _role_str2int = {
      {"forward", FORWARD}, /* kForward=0, can't filter */
      {"backward", static_cast<int>(framework::OpRole::kBackward)},
      {"optimize", static_cast<int>(framework::OpRole::kOptimize)},
      {"rpc", static_cast<int>(framework::OpRole::kRPC)},
      {"dist", static_cast<int>(framework::OpRole::kDist)},
      {"lrsched", static_cast<int>(framework::OpRole::kLRSched)},
      {"loss", static_cast<int>(framework::OpRole::kLoss)},
      {"default", static_cast<int>(framework::OpRole::kNotSpecified)},
  };
  return _role_str2int;
}

static std::unordered_set<std::string>& op_type_nan_inf_white_list() {
  static std::unordered_set<std::string> _op_type_nan_inf_white_list = {
      "coalesce_tensor", /* This op allocates tensors and may not initialize
                            their memory. */
  };
  return _op_type_nan_inf_white_list;
}

static std::unordered_map<std::string, std::vector<std::string>>&
op_var_nan_inf_white_list() {
  static std::unordered_map<std::string, std::vector<std::string>>
      _op_var_nan_inf_white_list = {
          /* The encoded & gathered vars consist of idx and val, so they
             cannot be judged directly. */
          {"dgc", {"__dgc_encoded__", "__dgc_gather__"}},
      };
  return _op_var_nan_inf_white_list;
}

static void InitWhiteListFormEnv() {
  // op_type_skip and op_var_skip may be NULL, so the static containers must
  // be initialized here to prevent thread competition.
  // NOTE: role_str2int() needn't do this, since it is only used in this
  // function.
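  // Calling the accessors once here eagerly constructs their static
  // containers under the call_once guard, before any concurrent reads.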
  op_type_nan_inf_white_list();
  op_var_nan_inf_white_list();

  // export PADDLE_INF_NAN_SKIP_OP="op0,op1,op2"
  // export PADDLE_INF_NAN_SKIP_ROLE="role1,role2,role3"
  // export PADDLE_INF_NAN_SKIP_VAR="op0:var0,op0:var1,op1:var0"
  const char* op_type_skip = std::getenv("PADDLE_INF_NAN_SKIP_OP");
  const char* op_role_skip = std::getenv("PADDLE_INF_NAN_SKIP_ROLE");
  const char* op_var_skip = std::getenv("PADDLE_INF_NAN_SKIP_VAR");

  if (op_type_skip) {
    std::stringstream ss(op_type_skip);
    std::string op_type;
    while (std::getline(ss, op_type, ',')) {
      op_type_nan_inf_white_list().emplace(op_type);
    }
  }

  if (op_role_skip) {
    std::stringstream ss(op_role_skip);
    std::string op_role;
    while (std::getline(ss, op_role, ',')) {
      PADDLE_ENFORCE_EQ(role_str2int().find(op_role) != role_str2int().end(),
                        true,
                        platform::errors::InvalidArgument(
                            "Skip role must be one of "
                            "{forward,backward,optimize,rpc,dist,lrsched,loss,"
                            "default}, instead of %s",
                            op_role));
      op_role_nan_inf_white_list |= role_str2int().at(op_role);
    }
  }

  if (op_var_skip) {
    std::stringstream ss(op_var_skip);
    std::string op_var;
    while (std::getline(ss, op_var, ',')) {
      auto pos = op_var.find(":");
      PADDLE_ENFORCE_EQ(
          pos != std::string::npos,
          true,
          platform::errors::InvalidArgument(
              "Skip var format must be op:var, instead of %s", op_var));
      std::string op = op_var.substr(0, pos);
      std::string var = op_var.substr(pos + 1);

      op_var_nan_inf_white_list()[op].emplace_back(var);
    }
  }
}

template <
    typename T,
    std::enable_if_t<!std::is_same<T, phi::dtype::complex<float>>::value &&
                         !std::is_same<T, phi::dtype::complex<double>>::value,
                     bool> = true>
static void CheckNanInfCpuImpl(const T* value_ptr,
                               const int64_t numel,
                               const std::string& cpu_hint_str) {
  using MT = typename phi::dtype::template MPTypeTrait<T>::Type;

#ifdef _OPENMP
  // Use at most 4 threads to collect the NaN and Inf information.
  int num_threads = std::max(omp_get_num_threads(), 1);
  num_threads = std::min(num_threads, 4);
#else
  int num_threads = 1;
#endif

  std::vector<int64_t> thread_num_nan(num_threads, 0);
  std::vector<int64_t> thread_num_inf(num_threads, 0);
  std::vector<MT> thread_min_value(num_threads, static_cast<MT>(value_ptr[0]));
  std::vector<MT> thread_max_value(num_threads, static_cast<MT>(value_ptr[0]));
  std::vector<MT> thread_mean_value(num_threads, static_cast<MT>(0));

#ifdef _OPENMP
#pragma omp parallel num_threads(num_threads)
#endif
  {
#ifdef _OPENMP
    int64_t tid = omp_get_thread_num();
    int64_t chunk_size = (numel + num_threads - 1) / num_threads;
    int64_t begin = tid * chunk_size;
    int64_t end = chunk_size + begin > numel ? numel : chunk_size + begin;
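    // Each OpenMP thread scans a contiguous chunk [begin, end); the last
    // chunk is clamped above so it never runs past numel.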
#else
    int64_t tid = 0;
    int64_t begin = 0;
    int64_t end = numel;
#endif
    for (int64_t i = begin; i < end; ++i) {
      MT value = static_cast<MT>(value_ptr[i]);

      thread_min_value[tid] = std::min(thread_min_value[tid], value);
      thread_max_value[tid] = std::max(thread_max_value[tid], value);
      thread_mean_value[tid] += value / static_cast<MT>(numel);

      if (std::isnan(value)) {
        thread_num_nan[tid] += 1;
      } else if (std::isinf(value)) {
        thread_num_inf[tid] += 1;
      }
    }
  }

  int64_t num_nan = 0;
  int64_t num_inf = 0;
  MT min_value = thread_min_value[0];
  MT max_value = thread_max_value[0];
  MT mean_value = static_cast<MT>(0);
  for (int i = 0; i < num_threads; ++i) {
    num_nan += thread_num_nan[i];
    num_inf += thread_num_inf[i];
    min_value = std::min(thread_min_value[i], min_value);
    max_value = std::max(thread_max_value[i], max_value);
    mean_value += thread_mean_value[i];
  }

  PrintForDifferentLevel<T, MT>(cpu_hint_str.c_str(),
                                numel,
                                num_nan,
                                num_inf,
                                max_value,
                                min_value,
                                mean_value,
                                FLAGS_check_nan_inf_level);
}

template <
    typename T,
    std::enable_if_t<std::is_same<T, phi::dtype::complex<float>>::value ||
                         std::is_same<T, phi::dtype::complex<double>>::value,
                     bool> = true>
void CheckNanInfCpuImpl(const T* value_ptr,
                        const int64_t numel,
                        const std::string& cpu_hint_str) {
  using RealType = typename T::value_type;

  RealType real_sum = 0.0f, imag_sum = 0.0f;

#ifdef _OPENMP
#pragma omp parallel for reduction(+ : real_sum) reduction(+ : imag_sum)
#endif
  for (int64_t i = 0; i < numel; ++i) {
    T value = value_ptr[i];
    // x - x is 0 for finite x but NaN for NaN or Inf, so any anomalous
    // component poisons the corresponding sum.
    real_sum += (value.real - value.real);
    imag_sum += (value.imag - value.imag);
  }

  if (std::isnan(real_sum) || std::isinf(real_sum) || std::isnan(imag_sum) ||
      std::isinf(imag_sum)) {
    // Hot fix for a compile failure with gcc 4.8; detailed NaN/Inf info still
    // needs to be printed here later.
    PADDLE_THROW(platform::errors::PreconditionNotMet(
        "There are NAN or INF in %s.", cpu_hint_str));
  }
}

template <>
template <typename T>
void TensorCheckerVisitor<phi::CPUContext>::apply(
    typename std::enable_if<
        std::is_floating_point<T>::value ||
        std::is_same<T, ::paddle::platform::complex<float>>::value ||
        std::is_same<T, ::paddle::platform::complex<double>>::value>::type*)
    const {
  std::string cpu_hint_str =
      GetCpuHintString<T>(op_type, var_name, tensor.place());
  CheckNanInfCpuImpl(tensor.data<T>(), tensor.numel(), cpu_hint_str);
}

template <>
void tensor_check<phi::CPUContext>(const std::string& op_type,
                                   const std::string& var_name,
                                   const phi::DenseTensor& tensor,
                                   const platform::Place& place) {
  TensorCheckerVisitor<phi::CPUContext> vistor(
      op_type, var_name, tensor, place);
  VisitDataType(framework::TransToProtoVarType(tensor.dtype()), vistor);
}

void CheckVarHasNanOrInf(const std::string& op_type,
                         const std::string& var_name,
                         const framework::Variable* var,
                         const platform::Place& place) {
  PADDLE_ENFORCE_NOT_NULL(
      var,
      platform::errors::NotFound(
          "Cannot find var: `%s` in op `%s`.", var_name, op_type));

  const phi::DenseTensor* tensor{nullptr};
  if (var->IsType<phi::DenseTensor>()) {
    tensor = &var->Get<phi::DenseTensor>();
  } else if (var->IsType<phi::SelectedRows>()) {
    tensor = &var->Get<phi::SelectedRows>().value();
  } else {
    VLOG(10) << var_name << " var_name need not to check";
    return;
  }

  if (tensor->memory_size() == 0) {
    VLOG(10) << var_name << " var_name need not to check, size == 0";
    return;
  }

  VLOG(10) << "begin check " << op_type << " var_name:" << var_name
           << ", place:" << tensor->place() << ", numel:" << tensor->numel();

  if (platform::is_gpu_place(tensor->place())) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    tensor_check<phi::GPUContext>(op_type, var_name, *tensor, place);
#else
    PADDLE_THROW(platform::errors::PreconditionNotMet(
        "phi::DenseTensor[%s] use gpu place. PaddlePaddle must compile "
        "with GPU.",
        var_name));
#endif
    return;
  } else if (platform::is_xpu_place(tensor->place())) {
#ifdef PADDLE_WITH_XPU
    if (framework::TransToProtoVarType(tensor->dtype()) !=
        proto::VarType::FP32) {
      return;
    }

    float* cpu_data = new float[tensor->numel()];
    memory::Copy(platform::CPUPlace(),
                 static_cast<void*>(cpu_data),
                 tensor->place(),
                 static_cast<const void*>(tensor->data<float>()),
                 tensor->numel() * sizeof(float));

    bool flag = false;
    for (int i = 0; i < tensor->numel(); i++) {
      if (isnan(cpu_data[i]) || isinf(cpu_data[i])) {
        flag = true;
        break;
      }
    }
    delete[] cpu_data;
    PADDLE_ENFORCE_NE(
        flag,
        true,
        platform::errors::Fatal(
            "Operator %s output phi::DenseTensor %s contains Inf.",
            op_type,
            var_name));
#else
    PADDLE_THROW(platform::errors::PreconditionNotMet(
        "phi::DenseTensor[%s] use xpu place. PaddlePaddle must compile "
        "with XPU.",
        var_name));
#endif
    return;
  } else if (platform::is_npu_place(tensor->place())) {
#ifdef PADDLE_WITH_ASCEND_CL
    if (framework::TransToProtoVarType(tensor->dtype()) !=
        proto::VarType::FP32) {
      return;
    }

    phi::DenseTensor cpu_tensor;
    cpu_tensor.Resize(tensor->dims());
    float* cpu_data = static_cast<float*>(
        cpu_tensor.mutable_data(platform::CPUPlace(), tensor->dtype()));
    framework::TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor);

    bool flag = false;
    for (int i = 0; i < cpu_tensor.numel(); i++) {
      if (isnan(cpu_data[i]) || isinf(cpu_data[i])) {
        flag = true;
        break;
      }
    }
    PADDLE_ENFORCE_NE(
        flag,
        true,
        platform::errors::Fatal(
            "Operator %s output phi::DenseTensor %s contains Inf.",
            op_type,
            var_name));
#else
    PADDLE_THROW(platform::errors::PreconditionNotMet(
        "phi::DenseTensor[%s] use npu place. PaddlePaddle must compile "
        "with NPU.",
        var_name));
#endif
    return;
  }
  tensor_check<phi::CPUContext>(op_type, var_name, *tensor, place);
}

void CheckVarHasNanOrInf(const std::string& op_type,
                         const framework::Scope& scope,
                         const std::string& var_name,
                         const platform::Place& place) {
  auto* var = scope.FindVar(var_name);
  CheckVarHasNanOrInf(op_type, var_name, var, place);
}

bool IsSkipOp(const framework::OperatorBase& op) {
  if (op_type_nan_inf_white_list().count(op.Type()) != 0) return true;

  int op_role = 0;
  if (op.HasAttr(framework::OpProtoAndCheckerMaker::OpRoleAttrName())) {
    op_role = op.template Attr<int>(
        framework::OpProtoAndCheckerMaker::OpRoleAttrName());
  }

  // kForward = 0, so it cannot be used directly as a filter bit.
  if (op_role == static_cast<int>(framework::OpRole::kForward)) {
    op_role = FORWARD;
  }
  if (op_role_nan_inf_white_list & op_role) return true;

  return false;
}

#ifdef PADDLE_WITH_ASCEND_CL
using NpuOpRunner = paddle::operators::NpuOpRunner;

constexpr int FLOAT_STATUS_SIZE = 8;

static phi::DenseTensor& npu_float_status() {
  static phi::DenseTensor float_status;
  return float_status;
}

void NPUAllocAndClearFloatStatus(const framework::OperatorBase& op,
                                 const framework::Scope& scope,
                                 const platform::Place& place) {
  if (!platform::is_npu_place(place)) return;

  std::call_once(white_list_init_flag, InitWhiteListFormEnv);
  if (IsSkipOp(op)) return;

  auto* dev_ctx = reinterpret_cast<platform::NPUDeviceContext*>(
      platform::DeviceContextPool::Instance().Get(place));
  auto stream = dev_ctx->stream();

  auto& flag = npu_float_status();
  flag.mutable_data<float>({FLOAT_STATUS_SIZE}, place);
  NpuOpRunner("NPUAllocFloatStatus", {}, {flag}).Run(stream);

  phi::DenseTensor tmp;
  tmp.mutable_data<float>({FLOAT_STATUS_SIZE}, place);
  NpuOpRunner("NPUClearFloatStatus", {tmp}, {flag}).Run(stream);
}

void PrintNpuVarInfo(const std::string& op_type,
                     const std::string& var_name,
                     const framework::Variable* var,
                     const platform::Place& place) {
  const phi::DenseTensor* tensor{nullptr};
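  // Only dense tensors (or the value() of selected rows) are printed below;
  // other variable types are skipped.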
  if (var->IsType<phi::DenseTensor>()) {
    tensor = &var->Get<phi::DenseTensor>();
  } else if (var->IsType<phi::SelectedRows>()) {
    tensor = &var->Get<phi::SelectedRows>().value();
  } else {
    VLOG(10) << var_name << " var_name need not to check";
    return;
  }

  if ((framework::TransToProtoVarType(tensor->dtype()) !=
       proto::VarType::FP32) &&
      (framework::TransToProtoVarType(tensor->dtype()) !=
       proto::VarType::FP16)) {
    return;
  }

  if (tensor->memory_size() == 0) {
    VLOG(10) << var_name << " var_name need not to check, size == 0";
    return;
  }

  VLOG(10) << "begin check " << op_type << " var_name:" << var_name
           << ", place:" << tensor->place() << ", numel:" << tensor->numel();

  phi::DenseTensor cpu_tensor;
  cpu_tensor.Resize(tensor->dims());
  cpu_tensor.mutable_data(platform::CPUPlace(), tensor->dtype());
  framework::TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor);

  LOG(WARNING) << "print [" << var_name << "] tensor info:";
  // Use an env-based strategy to control this in the future; -1 = print all.
  int print_num = 3;
  if (framework::TransToProtoVarType(tensor->dtype()) ==
      proto::VarType::FP32) {
    const float* value = cpu_tensor.data<float>();
    PrintNanInf(value, tensor->numel(), print_num, op_type, var_name, false);
  } else if (framework::TransToProtoVarType(tensor->dtype()) ==
             proto::VarType::FP16) {
    const paddle::platform::float16* value =
        cpu_tensor.data<paddle::platform::float16>();
    PrintNanInf(value, tensor->numel(), print_num, op_type, var_name, false);
  }
}

void PrintNPUOpValueInfo(const framework::OperatorBase& op,
                         const framework::Scope& scope,
                         const platform::Place& place) {
  LOG(WARNING) << "There are `nan` or `inf` in operator (" << op.Type()
               << "), here we print some tensor value info of this op.";

  for (auto& vname : op.InputVars()) {
    auto* var = scope.FindVar(vname);
    if (var == nullptr) continue;
    PrintNpuVarInfo(op.Type(), vname, var, place);
  }

  for (auto& vname : op.OutputVars(true)) {
    auto* var = scope.FindVar(vname);
    if (var == nullptr) continue;
    PrintNpuVarInfo(op.Type(), vname, var, place);
  }
}

static void NPUCheckOpHasNanOrInf(const framework::OperatorBase& op,
                                  const framework::Scope& scope,
                                  const platform::Place& place) {
  if (!platform::is_npu_place(place)) return;

  auto* dev_ctx = reinterpret_cast<platform::NPUDeviceContext*>(
      platform::DeviceContextPool::Instance().Get(place));
  auto stream = dev_ctx->stream();

  auto& flag = npu_float_status();
  phi::DenseTensor tmp;
  tmp.mutable_data<float>({FLOAT_STATUS_SIZE}, place);
  // NPUGetFloatStatus updates the data of its input in place; tmp is only a
  // placeholder.
  NpuOpRunner("NPUGetFloatStatus", {flag}, {tmp}).Run(stream);

  phi::DenseTensor cpu_tensor;
  auto cpu_place = platform::CPUPlace();
  float* cpu_data = static_cast<float*>(
      cpu_tensor.mutable_data<float>({FLOAT_STATUS_SIZE}, cpu_place));
  framework::TensorCopySync(flag, cpu_place, &cpu_tensor);

  float sum = 0.0;
  for (int i = 0; i < FLOAT_STATUS_SIZE; ++i) {
    sum += cpu_data[i];
  }

  if (sum >= 1.0) PrintNPUOpValueInfo(op, scope, place);

  PADDLE_ENFORCE_LT(sum,
                    1.0,
                    platform::errors::PreconditionNotMet(
                        "Operator %s contains Nan/Inf.", op.Type()));
}
#endif

void CheckOpHasNanOrInf(const framework::OperatorBase& op,
                        const framework::Scope& exec_scope,
                        const platform::Place& place) {
  std::call_once(white_list_init_flag, InitWhiteListFormEnv);

  if (IsSkipOp(op)) return;

#ifdef PADDLE_WITH_ASCEND_CL
  if (platform::is_npu_place(place)) {
    NPUCheckOpHasNanOrInf(op, exec_scope, place);
    return;
  }
#endif

  if (op_var_nan_inf_white_list().count(op.Type()) == 0) {
    // NOTE: vname may be destructed at the end of this function.
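    // No per-variable white list entry for this op type: check every output.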
    for (auto& vname : op.OutputVars(true)) {
      auto* var = exec_scope.FindVar(vname);
      if (var == nullptr) continue;
      CheckVarHasNanOrInf(op.Type(), exec_scope, vname, place);
    }
  } else {
    for (auto& vname : op.OutputVars(true)) {
      bool need_check = true;
      for (auto& white_vname : op_var_nan_inf_white_list().at(op.Type())) {
        if (vname.find(white_vname) != std::string::npos) {
          need_check = false;
          break;
        }
      }
      if (!need_check) continue;
      auto* var = exec_scope.FindVar(vname);
      if (var == nullptr) continue;
      CheckVarHasNanOrInf(op.Type(), exec_scope, vname, place);
    }
  }
}

}  // namespace details
}  // namespace framework
}  // namespace paddle