// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/details/nan_inf_utils_detail.h"

#include "paddle/fluid/framework/details/nan_inf_utils.h"
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/phi/common/amp_type_traits.h"

#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/phi/core/flags.h"
#include "paddle/phi/kernels/funcs/eigen/extensions.h"

namespace paddle {
namespace framework {
namespace details {
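// Runtime settings for the NaN/Inf debug tools: an optional output path for
// the debug logs and a limit on how many stack frames are reported.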
struct DebugTools {
  DebugTools() = default;
  std::string path = "";
  int stack_limit = 1;
};
static DebugTools debug_nan_inf;

void SetNanInfDebugPath(const std::string& nan_inf_path) {
  debug_nan_inf.path = nan_inf_path;
  VLOG(4) << "Set the log's path of debug tools : " << nan_inf_path;
}

std::string GetNanPath() {
  if (debug_nan_inf.path.empty()) {
    return "";
  }
  return debug_nan_inf.path + "/";
}

void SetNanInfStackLimit(const int& stack_limit) {
  debug_nan_inf.stack_limit = stack_limit;
  VLOG(4) << "Set the stack limit of debug tools : " << stack_limit;
}

int GetNanInfStackLimit() { return debug_nan_inf.stack_limit; }
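
// Minimal usage sketch (illustrative only; real callers configure this
// elsewhere, and the path below is hypothetical):
//   SetNanInfDebugPath("/tmp/nan_inf_debug");  // dump findings under this dir
//   SetNanInfStackLimit(10);                   // report up to 10 stack frames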

static std::once_flag white_list_init_flag;

static int op_role_nan_inf_white_list = 0;

static constexpr int FORWARD = 0x10000;
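// Note: framework::OpRole::kForward is 0x0000 and cannot be tested with a
// bitwise AND, so forward ops are remapped to the spare bit above before
// being matched against the role white list (see IsSkipOp below).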

// lazy init
static const std::unordered_map<std::string, int>& role_str2int() {
  /* In op_proto_maker.h
   * framework::OpRole::kForward      = 0x0000,
   * framework::OpRole::kBackward     = 0x0001,
   * framework::OpRole::kOptimize     = 0x0002,
   * framework::OpRole::kRPC          = 0x0004,
   * framework::OpRole::kDist         = 0x0008,
   * framework::OpRole::kLRSched      = 0x0010,
   * framework::OpRole::kLoss         = 0x0100,
   * framework::OpRole::kNotSpecified = 0x1000,
   */
  static const std::unordered_map<std::string, int> _role_str2int = {
      {"forward", FORWARD}, /* kForward=0, can't filter */
      {"backward", static_cast<int>(framework::OpRole::kBackward)},
      {"optimize", static_cast<int>(framework::OpRole::kOptimize)},
      {"rpc", static_cast<int>(framework::OpRole::kRPC)},
      {"dist", static_cast<int>(framework::OpRole::kDist)},
      {"lrsched", static_cast<int>(framework::OpRole::kLRSched)},
      {"loss", static_cast<int>(framework::OpRole::kLoss)},
      {"default", static_cast<int>(framework::OpRole::kNotSpecified)},
  };
  return _role_str2int;
}

static std::unordered_set<std::string>& op_type_nan_inf_white_list() {
  static std::unordered_set<std::string> _op_type_nan_inf_white_list = {
      "coalesce_tensor", /* This Op will alloc tensor, and may not init space */
  };
  return _op_type_nan_inf_white_list;
}

static std::unordered_map<std::string, std::vector<std::string>>&
op_var_nan_inf_white_list() {
  static std::unordered_map<std::string, std::vector<std::string>>
      _op_var_nan_inf_white_list = {
          /* The encoded & gather vars mix indices and values, so they
             cannot be checked directly. */
          {"dgc", {"__dgc_encoded__", "__dgc_gather__"}},
      };
  return _op_var_nan_inf_white_list;
}

static void InitWhiteListFormEnv() {
  // op_type_skip and op_var_skip may be NULL, so make sure the static
  // containers are initialized here first to prevent races between threads.
  // NOTE: role_str2int needs no such treatment because it is only used in
  // this function.
  op_type_nan_inf_white_list();
  op_var_nan_inf_white_list();

  // export PADDLE_INF_NAN_SKIP_OP="op0,op1,op2"
  // export PADDLE_INF_NAN_SKIP_ROLE="role1,role2,role3"
  // export PADDLE_INF_NAN_SKIP_VAR="op0:var0,op0:var1,op1:var0"
  const char* op_type_skip = std::getenv("PADDLE_INF_NAN_SKIP_OP");
  const char* op_role_skip = std::getenv("PADDLE_INF_NAN_SKIP_ROLE");
  const char* op_var_skip = std::getenv("PADDLE_INF_NAN_SKIP_VAR");

  if (op_type_skip) {
    std::stringstream ss(op_type_skip);
    std::string op_type;
    while (std::getline(ss, op_type, ',')) {
      op_type_nan_inf_white_list().emplace(op_type);
    }
  }

  if (op_role_skip) {
    std::stringstream ss(op_role_skip);
    std::string op_role;
    while (std::getline(ss, op_role, ',')) {
      PADDLE_ENFORCE_EQ(role_str2int().find(op_role) != role_str2int().end(),
                        true,
                        platform::errors::InvalidArgument(
                            "Skip role must be one of "
                            "{forward,backward,optimize,rpc,dist,lrsched,loss,"
                            "default}, instead of %s",
                            op_role));
      op_role_nan_inf_white_list |= role_str2int().at(op_role);
    }
  }

  if (op_var_skip) {
    std::stringstream ss(op_var_skip);
    std::string op_var;
    while (std::getline(ss, op_var, ',')) {
      auto pos = op_var.find(':');
      PADDLE_ENFORCE_EQ(
          pos != std::string::npos,
          true,
          platform::errors::InvalidArgument(
              "Skip var format must be op:var, instead of %s", op_var));
      std::string op = op_var.substr(0, pos);
      std::string var = op_var.substr(pos + 1);

      op_var_nan_inf_white_list()[op].emplace_back(var);
    }
  }
}

void CheckVarHasNanOrInf(const std::string& op_type,
                         const std::string& var_name,
                         const framework::Variable* var,
                         const platform::Place& place) {
  PADDLE_ENFORCE_NOT_NULL(
      var,
      platform::errors::NotFound(
          "Cannot find var: `%s` in op `%s`.", var_name, op_type));

  const phi::DenseTensor* tensor{nullptr};
  if (var->IsType<phi::DenseTensor>()) {
    tensor = &var->Get<phi::DenseTensor>();
  } else if (var->IsType<phi::SelectedRows>()) {
    tensor = &var->Get<phi::SelectedRows>().value();
  } else {
    VLOG(10) << var_name << " var_name need not to check";
    return;
  }

  if (tensor->memory_size() == 0) {
    VLOG(10) << var_name << " var_name need not to check, size == 0";
    return;
  }

  VLOG(10) << "begin check " << op_type << " var_name:" << var_name
           << ", place:" << tensor->place() << ", numel:" << tensor->numel();

  if (platform::is_gpu_place(tensor->place())) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    tensor_check<phi::GPUContext>(op_type, var_name, *tensor, place);
#else
    PADDLE_THROW(platform::errors::PreconditionNotMet(
        "phi::DenseTensor[%s] use gpu place. PaddlePaddle must compile "
        "with GPU.",
        var_name));
#endif
    return;
  } else if (platform::is_xpu_place(tensor->place())) {
#ifdef PADDLE_WITH_XPU
    if (framework::TransToProtoVarType(tensor->dtype()) !=
        proto::VarType::FP32) {
      return;
    }
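
    // Only FP32 tensors are checked on the XPU path: copy the data back to
    // the host and scan it element by element for NaN/Inf.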

    float* cpu_data = new float[tensor->numel()];
    memory::Copy(platform::CPUPlace(),
                 static_cast<void*>(cpu_data),
                 tensor->place(),
                 static_cast<const void*>(tensor->data<float>()),
                 tensor->numel() * sizeof(float));
    bool flag = false;
    for (int64_t i = 0; i < tensor->numel(); i++) {
      if (isnan(cpu_data[i]) || isinf(cpu_data[i])) {
        flag = true;
        break;
      }
    }
    delete[] cpu_data;
    PADDLE_ENFORCE_NE(
        flag,
        true,
        platform::errors::Fatal(
            "Operator %s output phi::DenseTensor %s contains Inf.",
            op_type,
            var_name));
#else
    PADDLE_THROW(platform::errors::PreconditionNotMet(
        "phi::DenseTensor[%s] use xpu place. PaddlePaddle must compile "
        "with XPU.",
        var_name));
#endif
    return;
  }
  tensor_check<phi::CPUContext>(op_type, var_name, *tensor, place);
}

void CheckVarHasNanOrInf(const std::string& op_type,
                         const framework::Scope& scope,
                         const std::string& var_name,
                         const platform::Place& place) {
  auto* var = scope.FindVar(var_name);
  CheckVarHasNanOrInf(op_type, var_name, var, place);
}

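// An op is skipped when its type appears in the op-type white list or when
// its role matches one of the roles configured via PADDLE_INF_NAN_SKIP_ROLE.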
bool IsSkipOp(const framework::OperatorBase& op) {
  if (op_type_nan_inf_white_list().count(op.Type()) != 0) return true;

  int op_role = 0;
  if (op.HasAttr(framework::OpProtoAndCheckerMaker::OpRoleAttrName())) {
    op_role = op.template Attr<int>(
        framework::OpProtoAndCheckerMaker::OpRoleAttrName());
  }

  // kForward=0, can't filter
  if (op_role == static_cast<int>(framework::OpRole::kForward)) {
    op_role = FORWARD;
  }
  if (op_role_nan_inf_white_list & op_role) return true;

  return false;
}

void CheckOpHasNanOrInf(const framework::OperatorBase& op,
                        const framework::Scope& exec_scope,
                        const platform::Place& place) {
  std::call_once(white_list_init_flag, InitWhiteListFormEnv);

  if (IsSkipOp(op)) return;

  if (op_var_nan_inf_white_list().count(op.Type()) == 0) {
      // NOTE: vname may be destroyed at the end of this function.
    for (auto& vname : op.OutputVars(true)) {
      auto* var = exec_scope.FindVar(vname);
      if (var == nullptr) continue;
      CheckVarHasNanOrInf(op.Type(), exec_scope, vname, place);
    }
  } else {
    for (auto& vname : op.OutputVars(true)) {
      bool need_check = true;
      for (auto& white_vname : op_var_nan_inf_white_list().at(op.Type())) {
        if (vname.find(white_vname) != std::string::npos) {
          need_check = false;
          break;
        }
      }
      if (!need_check) continue;
      auto* var = exec_scope.FindVar(vname);
      if (var == nullptr) continue;
      CheckVarHasNanOrInf(op.Type(), exec_scope, vname, place);
    }
  }
}
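
// Illustrative call site (a sketch, not part of this file): executors
// typically guard the check behind a runtime flag (e.g. FLAGS_check_nan_inf):
//   if (FLAGS_check_nan_inf) {
//     framework::details::CheckOpHasNanOrInf(*op, exec_scope, place);
//   }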

}  // namespace details
}  // namespace framework
}  // namespace paddle