// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/details/nan_inf_utils.h"
#include "paddle/fluid/framework/details/nan_inf_utils_detail.h"

#include <cmath>
#include <cstdlib>
#include <mutex>
#include <sstream>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>

#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/selected_rows.h"

namespace paddle {
namespace framework {
namespace details {

static std::once_flag white_list_init_flag;

static int op_role_nan_inf_white_list = 0;

static constexpr int FORWARD = 0x10000;

// lazy init
static const std::unordered_map<std::string, int>& role_str2int() {
  /* In op_proto_maker.h
   * framework::OpRole::kForward      = 0x0000,
   * framework::OpRole::kBackward     = 0x0001,
   * framework::OpRole::kOptimize     = 0x0002,
   * framework::OpRole::kRPC          = 0x0004,
   * framework::OpRole::kDist         = 0x0008,
   * framework::OpRole::kLRSched      = 0x0010,
   * framework::OpRole::kLoss         = 0x0100,
   * framework::OpRole::kNotSpecified = 0x1000,
   */
  static const std::unordered_map<std::string, int> _role_str2int = {
      {"forward", FORWARD}, /* kForward=0, can't filter */
      {"backward", static_cast<int>(framework::OpRole::kBackward)},
      {"optimize", static_cast<int>(framework::OpRole::kOptimize)},
      {"rpc", static_cast<int>(framework::OpRole::kRPC)},
      {"dist", static_cast<int>(framework::OpRole::kDist)},
      {"lrsched", static_cast<int>(framework::OpRole::kLRSched)},
      {"loss", static_cast<int>(framework::OpRole::kLoss)},
      {"default", static_cast<int>(framework::OpRole::kNotSpecified)},
  };
  return _role_str2int;
}

static std::unordered_set<std::string>& op_type_nan_inf_white_list() {
  static std::unordered_set<std::string> _op_type_nan_inf_white_list = {
      "coalesce_tensor", /* This op allocates a tensor and may not init the space */
  };
  return _op_type_nan_inf_white_list;
}

static std::unordered_map<std::string, std::vector<std::string>>&
op_var_nan_inf_white_list() {
  static std::unordered_map<std::string, std::vector<std::string>>
      _op_var_nan_inf_white_list = {
          /* encoded & gather vars consist of idx & val, can't be judged directly */
          {"dgc", {"__dgc_encoded__", "__dgc_gather__"}},
      };
  return _op_var_nan_inf_white_list;
}
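// Worked example of the role mask accumulated below (values taken from the
// OpRole table above; the env value is only an example):
//   export PADDLE_INF_NAN_SKIP_ROLE="backward,optimize"
// makes InitWhiteListFormEnv() set
//   op_role_nan_inf_white_list = kBackward | kOptimize = 0x0001 | 0x0002 = 0x0003,
// and IsSkipOp() later skips every op whose OpRole attribute shares a bit with
// this mask. "forward" maps to the synthetic FORWARD bit (0x10000) because
// kForward itself is 0 and a zero value cannot be tested with bitwise AND.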
static void InitWhiteListFormEnv() {
  // op_type_skip and op_var_skip may be NULL.
  // So the static values need to be initialized here, to prevent thread
  // competition.
  // NOTE: role_str2int needn't do this, for it is only used in this func.
  op_type_nan_inf_white_list();
  op_var_nan_inf_white_list();

  // export PADDLE_INF_NAN_SKIP_OP="op0,op1,op2"
  // export PADDLE_INF_NAN_SKIP_ROLE="role1,role2,role3"
  // export PADDLE_INF_NAN_SKIP_VAR="op0:var0,op0:var1,op1:var0"
  const char* op_type_skip = std::getenv("PADDLE_INF_NAN_SKIP_OP");
  const char* op_role_skip = std::getenv("PADDLE_INF_NAN_SKIP_ROLE");
  const char* op_var_skip = std::getenv("PADDLE_INF_NAN_SKIP_VAR");

  if (op_type_skip != NULL) {
    std::stringstream ss(op_type_skip);
    std::string op_type;
    while (std::getline(ss, op_type, ',')) {
      op_type_nan_inf_white_list().emplace(op_type);
    }
  }

  if (op_role_skip != NULL) {
    std::stringstream ss(op_role_skip);
    std::string op_role;
    while (std::getline(ss, op_role, ',')) {
      PADDLE_ENFORCE_EQ(role_str2int().find(op_role) != role_str2int().end(),
                        true,
                        platform::errors::InvalidArgument(
                            "Skip role must be one of "
                            "{forward,backward,optimize,rpc,dist,lrsched,loss,"
                            "default}, instead of %s",
                            op_role));
      op_role_nan_inf_white_list |= role_str2int().at(op_role);
    }
  }

  if (op_var_skip != NULL) {
    std::stringstream ss(op_var_skip);
    std::string op_var;
    while (std::getline(ss, op_var, ',')) {
      auto pos = op_var.find(":");
      PADDLE_ENFORCE_EQ(
          pos != std::string::npos, true,
          platform::errors::InvalidArgument(
              "Skip var format must be op:var, instead of %s", op_var));

      std::string op = op_var.substr(0, pos);
      std::string var = op_var.substr(pos + 1);

      op_var_nan_inf_white_list()[op].emplace_back(var);
    }
  }
}

template <typename T>
static void PrintNanInf(const T* value, const size_t numel, int print_num,
                        const std::string& op_type,
                        const std::string& var_name) {
  size_t nan_count, inf_count, num_count;
  nan_count = inf_count = num_count = 0;

  // CPU: print num values
  for (size_t i = 0; i < numel; ++i) {
    size_t count = 0;
    if (std::isnan(value[i])) {
      count = nan_count++;
    } else if (std::isinf(value[i])) {
      count = inf_count++;
    } else {
      count = num_count++;
    }

    if (count < static_cast<size_t>(print_num)) {
      printf("numel:%lu index:%lu value:%f\n", static_cast<uint64_t>(numel),
             static_cast<uint64_t>(i), static_cast<float>(value[i]));
    }
  }

  bool has_nan_inf = true;
  printf("In cpu, there has %lu,%lu,%lu nan,inf,num\n",
         static_cast<uint64_t>(nan_count), static_cast<uint64_t>(inf_count),
         static_cast<uint64_t>(num_count));
  PADDLE_ENFORCE_EQ(has_nan_inf, false,
                    platform::errors::PreconditionNotMet(
                        "===ERROR: in [op=%s] [tensor=%s] find nan or inf===",
                        op_type, var_name));
}

// OpenMP 4.0: reduction with fp16
#if defined _OPENMP && _OPENMP >= 201307
// For more detail, see page 180 of
// https://www.openmp.org/wp-content/uploads/OpenMP4.0.0.pdf
#pragma omp declare reduction(+ : paddle::platform::float16 : omp_out += omp_in)
#pragma omp declare reduction(+ : paddle::platform::bfloat16 : omp_out += omp_in)
#endif

template <typename T>
static void CheckNanInf(const T* value, const size_t numel, int print_num,
                        const std::string& op_type,
                        const std::string& var_name) {
  T sum = static_cast<T>(0.0);
#if defined _OPENMP && _OPENMP >= 201307
#pragma omp parallel for simd reduction(+ : sum)
#elif defined _OPENMP
#pragma omp parallel for reduction(+ : sum)
#endif
  for (size_t i = 0; i < numel; ++i) {
    sum += (value[i] - value[i]);
  }

  if (std::isnan(sum) || std::isinf(sum)) {
    PrintNanInf(value, numel, print_num, op_type, var_name);
  }
}
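// Why "sum += (value[i] - value[i])" above detects bad values: for any finite
// x, x - x == 0, so the reduction stays finite; under IEEE 754, however,
// Inf - Inf == NaN and NaN - NaN == NaN, and NaN propagates through the sum.
// A single bad element therefore turns the whole reduction into NaN and the
// isnan/isinf check fires. A tiny illustration (not part of this file):
//   float v[3] = {1.f, INFINITY, 2.f};
//   // (1-1) + (INF-INF) + (2-2) = 0 + NaN + 0 = NaN  -> PrintNanInf is called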
#if defined _OPENMP && _OPENMP >= 201307
// OpenMP 4.0 does not need the fp16 specializations below
#elif defined _OPENMP
template <>
void CheckNanInf<paddle::platform::float16>(
    const paddle::platform::float16* value, const size_t numel, int print_num,
    const std::string& op_type, const std::string& var_name) {
  float sum = 0.0f;
#pragma omp parallel for reduction(+ : sum)
  for (size_t i = 0; i < numel; ++i) {
    sum += static_cast<float>(value[i] - value[i]);
  }

  if (std::isnan(sum) || std::isinf(sum)) {
    PrintNanInf(value, numel, print_num, op_type, var_name);
  }
}

template <>
void CheckNanInf<paddle::platform::bfloat16>(
    const paddle::platform::bfloat16* value, const size_t numel, int print_num,
    const std::string& op_type, const std::string& var_name) {
  float sum = 0.0f;
#pragma omp parallel for reduction(+ : sum)
  for (size_t i = 0; i < numel; ++i) {
    sum += static_cast<float>(value[i] - value[i]);
  }

  if (std::isnan(sum) || std::isinf(sum)) {
    PrintNanInf(value, numel, print_num, op_type, var_name);
  }
}
#endif

template <>
template <typename T>
void TensorCheckerVisitor<platform::CPUDeviceContext>::apply(
    typename std::enable_if<std::is_floating_point<T>::value>::type*) const {
  // use env strategy to control this in the future, -1 = print_all.
  int print_num = 3;
  CheckNanInf(tensor_.data<T>(), tensor_.numel(), print_num, op_type_,
              var_name_);
}

template <>
void tensor_check<platform::CPUDeviceContext>(const std::string& op_type,
                                              const std::string& var_name,
                                              const framework::Tensor& tensor,
                                              const platform::Place& place) {
  TensorCheckerVisitor<platform::CPUDeviceContext> visitor(op_type, var_name,
                                                           tensor, place);
  VisitDataType(tensor.type(), visitor);
}

void CheckVarHasNanOrInf(const std::string& op_type,
                         const framework::Scope& scope,
                         const std::string& var_name,
                         const platform::Place& place) {
  auto* var = scope.FindVar(var_name);
  PADDLE_ENFORCE_NOT_NULL(
      var, platform::errors::NotFound("In op=%s, can't find var:%s", op_type,
                                      var_name));

  const Tensor* tensor{nullptr};
  if (var->IsType<framework::LoDTensor>()) {
    tensor = &var->Get<framework::LoDTensor>();
  } else if (var->IsType<framework::SelectedRows>()) {
    tensor = &var->Get<framework::SelectedRows>().value();
  } else {
    VLOG(10) << var_name << " var_name need not to check";
    return;
  }

  if (tensor->memory_size() == 0) {
    VLOG(10) << var_name << " var_name need not to check, size == 0";
    return;
  }

  VLOG(10) << "begin check " << op_type << " var_name:" << var_name
           << ", place:" << tensor->place() << ", numel:" << tensor->numel();
  if (platform::is_gpu_place(tensor->place())) {
#ifdef PADDLE_WITH_CUDA
    tensor_check<platform::CUDADeviceContext>(op_type, var_name, *tensor,
                                              place);
#else
    PADDLE_THROW(platform::errors::PreconditionNotMet(
        "Tensor[%s] use gpu place. PaddlePaddle must compile with GPU.",
        var_name));
#endif
    return;
  }
  tensor_check<platform::CPUDeviceContext>(op_type, var_name, *tensor, place);
}

bool IsSkipOp(const framework::OperatorBase& op) {
  if (op_type_nan_inf_white_list().count(op.Type()) != 0) return true;

  int op_role = op.template Attr<int>(
      framework::OpProtoAndCheckerMaker::OpRoleAttrName());

  // kForward=0, can't filter
  if (op_role == static_cast<int>(framework::OpRole::kForward)) {
    op_role = FORWARD;
  }
  if (op_role_nan_inf_white_list & op_role) return true;

  return false;
}

void CheckOpHasNanOrInf(const framework::OperatorBase& op,
                        const framework::Scope& exec_scope,
                        const platform::Place& place) {
  std::call_once(white_list_init_flag, InitWhiteListFormEnv);

  if (IsSkipOp(op)) return;

  if (op_var_nan_inf_white_list().count(op.Type()) == 0) {
    // NOTE: vname may be destructed at the end of this func.
    for (auto& vname : op.OutputVars(true)) {
      auto* var = exec_scope.FindVar(vname);
      if (var == nullptr) continue;
      CheckVarHasNanOrInf(op.Type(), exec_scope, vname, place);
    }
  } else {
    for (auto& vname : op.OutputVars(true)) {
      bool need_check = true;
      for (auto& white_vname : op_var_nan_inf_white_list().at(op.Type())) {
        if (vname.find(white_vname) != std::string::npos) {
          need_check = false;
          break;
        }
      }
      if (!need_check) continue;
      auto* var = exec_scope.FindVar(vname);
      if (var == nullptr) continue;
      CheckVarHasNanOrInf(op.Type(), exec_scope, vname, place);
    }
  }
}

}  // namespace details
}  // namespace framework
}  // namespace paddle
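// Usage note (a sketch of the expected call site, not part of this file):
// the executor is assumed to invoke CheckOpHasNanOrInf() right after each
// operator finishes, guarded by a debug flag, e.g.
//
//   if (FLAGS_check_nan_inf) {
//     framework::details::CheckOpHasNanOrInf(*op, exec_scope, place);
//   }
//
// The flag name and the exact call location belong to the surrounding
// framework code and are assumptions here.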