Unverified commit df4a978c, authored by Chen Weihang, committed by GitHub

[Debug] Add nan & inf check FLAG for dygraph (#32635)

* add nan/inf check for dygraph

* add unittest for dygraph

* revert erroneous change
Parent commit: 60c9f97c
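The flag is read once at startup, so in a dygraph script it must be set before `paddle` is first imported, exactly as the new test below does. A minimal usage sketch (editor's illustration, not part of the commit; the exception type and message can vary by device):

import os

# Must be set before `import paddle`; the flag is parsed at startup.
os.environ["FLAGS_check_nan_inf"] = "1"

import paddle

x = paddle.to_tensor([0.0, 1.0])
try:
    # 0.0 / 0.0 yields nan, so the per-op output check should fire here.
    y = x / paddle.to_tensor([0.0, 1.0])
except RuntimeError as e:
    print(e)  # expected to report `nan` or `inf` in the op's output tensor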
paddle/fluid/framework/details/nan_inf_utils.h
@@ -19,6 +19,7 @@
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/imperative/type_defs.h"
 #include "paddle/fluid/platform/place.h"

 namespace paddle {
@@ -30,9 +31,28 @@ void CheckVarHasNanOrInf(const std::string& op_type,
                          const std::string& var_name,
                          const platform::Place& place);

+void CheckVarHasNanOrInf(const std::string& op_type,
+                         const std::string& var_name,
+                         const framework::Variable* var,
+                         const platform::Place& place);
+
 void CheckOpHasNanOrInf(const framework::OperatorBase& op,
                         const framework::Scope& scope,
                         const platform::Place& place);

+template <typename VarType>
+void CheckOpHasNanOrInfInDygraph(const std::string& op_type,
+                                 const imperative::NameVarMap<VarType>& op_outs,
+                                 platform::Place place) {
+  for (const auto& pair : op_outs) {
+    for (const auto& ivar : pair.second) {
+      auto* var = ivar->MutableVar();
+      if (var == nullptr) continue;
+      CheckVarHasNanOrInf(op_type, ivar->Name(), var, place);
+    }
+  }
+}
+
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
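Restated as a rough Python/numpy analogue (editor's sketch, not Paddle API; the real template walks a vector of variables per output slot, while this flattens each slot to one array):

import numpy as np

def check_op_outputs(op_type, op_outs):
    # Visit every named output of an op and fail fast on the first
    # tensor containing nan or inf, naming both the tensor and the op.
    for var_name, tensor in op_outs.items():
        if tensor is None:
            continue
        if not np.isfinite(tensor).all():
            raise RuntimeError(
                "There are `nan` or `inf` in tensor (%s) of op (%s)"
                % (var_name, op_type))

check_op_outputs("relu", {"Out": np.array([0.0, 1.0])})  # passes silently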
paddle/fluid/framework/details/nan_inf_utils_detail.cc
@@ -297,13 +297,12 @@ void tensor_check<platform::CPUDeviceContext>(const std::string& op_type,
 }

 void CheckVarHasNanOrInf(const std::string& op_type,
-                         const framework::Scope& scope,
                          const std::string& var_name,
+                         const framework::Variable* var,
                          const platform::Place& place) {
-  auto* var = scope.FindVar(var_name);
   PADDLE_ENFORCE_NOT_NULL(
-      var, platform::errors::NotFound("In op=%s, can't find var:%s", op_type,
-                                      var_name));
+      var, platform::errors::NotFound("Cannot find var: `%s` in op `%s`.",
+                                      var_name, op_type));

   const Tensor* tensor{nullptr};
   if (var->IsType<framework::LoDTensor>()) {
@@ -393,6 +392,14 @@ void CheckVarHasNanOrInf(const std::string& op_type,
   tensor_check<platform::CPUDeviceContext>(op_type, var_name, *tensor, place);
 }

+void CheckVarHasNanOrInf(const std::string& op_type,
+                         const framework::Scope& scope,
+                         const std::string& var_name,
+                         const platform::Place& place) {
+  auto* var = scope.FindVar(var_name);
+  CheckVarHasNanOrInf(op_type, var_name, var, place);
+}
+
 bool IsSkipOp(const framework::OperatorBase& op) {
   if (op_type_nan_inf_white_list().count(op.Type()) != 0) return true;
......
paddle/fluid/imperative/CMakeLists.txt
 cc_library(imperative_flag SRCS flags.cc DEPS gflags)
-cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform)
+cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform nan_inf_utils)
 cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_flag variable_helper op_registry)
 add_subdirectory(jit)
 cc_library(amp SRCS amp_auto_cast.cc DEPS layer )
......
paddle/fluid/imperative/prepared_operator.cc
@@ -15,8 +15,11 @@
 #include "paddle/fluid/imperative/prepared_operator.h"

 #include "paddle/fluid/framework/data_type_transform.h"
+#include "paddle/fluid/framework/details/nan_inf_utils.h"
 #include "paddle/fluid/imperative/infer_shape_context.h"

+DECLARE_bool(check_nan_inf);
+
 namespace paddle {
 namespace imperative {
@@ -175,6 +178,11 @@ static void PreparedOpRunImpl(
   func(DygraphExecutionContext<VarType>(op, scope, *dev_ctx, ctx, ins, outs,
                                         attrs));

+  if (FLAGS_check_nan_inf) {
+    framework::details::CheckOpHasNanOrInfInDygraph<VarType>(
+        op.Type(), outs, dev_ctx->GetPlace());
+  }
+
   /**
    * [ Why need handle complex gradient to real gradient? ]
    *
......
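Running the check immediately after the kernel functor returns is the point of this hook: one op's inf output typically decays into nan a step later, hiding the kernel that produced it. A small numpy illustration of that decay (editor's note, not Paddle code):

import numpy as np

x = np.array([np.inf, 1.0], dtype=np.float32)
# Softmax over an inf input: exp(inf)/sum is inf/inf -> nan, exp(1)/inf -> 0,
# so the producing op's inf is unrecognizable one op downstream.
probs = np.exp(x) / np.exp(x).sum()
print(probs)  # [nan, 0.] plus an invalid-value RuntimeWarning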
python/paddle/fluid/tests/unittests/check_nan_inf_base_dygraph.py (new file)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import unicode_literals
from __future__ import print_function

import os
import sys
import time

import numpy as np

os.environ[str("FLAGS_check_nan_inf")] = str("1")
os.environ[str("GLOG_vmodule")] = str("nan_inf_utils_detail=10")

import paddle
import paddle.nn as nn

np.random.seed(0)


def generator():
    batch_size = 5
    for i in range(5):
        curr_train_x = np.random.randint(
            batch_size, size=(batch_size, 3)).astype("float32")
        if i >= 2:
            curr_train_x[0, :] = np.nan
            curr_train_x[-1, :] = np.inf
        res = []
        for i in range(batch_size):
            y = i % 3
            res.append([y])
        y_label = np.array(res).astype('int64')
        yield [curr_train_x, y_label]


class TestLayer(nn.Layer):
    def __init__(self):
        super(TestLayer, self).__init__()
        self.linear1 = nn.Linear(3, 400)
        self.linear2 = nn.Linear(400, 400)
        self.linear3 = nn.Linear(400, 3)

    def forward(self, x):
        x = self.linear1(x)
        x = nn.functional.sigmoid(x)
        x = self.linear2(x)
        x = nn.functional.sigmoid(x)
        x = self.linear3(x)
        x = nn.functional.softmax(x)
        return x


def check(use_cuda):
    paddle.set_device('gpu' if use_cuda else 'cpu')

    net = TestLayer()
    sgd = paddle.optimizer.SGD(learning_rate=0.05,
                               parameters=net.parameters())

    for step, (x, y) in enumerate(generator()):
        x = paddle.to_tensor(x)
        y = paddle.to_tensor(y)

        zero = paddle.zeros(shape=[1], dtype='int64')
        fp16_zero = paddle.cast(zero, dtype='float16')

        y = y + zero

        y_pred = net(x)

        cost = nn.functional.cross_entropy(y_pred, y, use_softmax=False)
        avg_cost = paddle.mean(cost)

        acc_top1 = paddle.metric.accuracy(input=y_pred, label=y, k=1)

        print('iter={:.0f}, cost={}, acc1={}'.format(
            step, avg_cost.numpy(), acc_top1.numpy()))

        avg_cost.backward()
        sgd.step()
        sgd.clear_grad()


if __name__ == '__main__':
    if paddle.is_compiled_with_cuda():
        try:
            check(use_cuda=True)
            assert False
        except Exception as e:
            print(e)
            print(type(e))
            # Note: an enforce triggered inside a CUDA kernel may not be
            # caught by Paddle itself, in which case the exception type
            # seen here is RuntimeError rather than OSError.
            assert type(e) == OSError or type(e) == RuntimeError
    try:
        check(use_cuda=False)
        assert False
    except Exception as e:
        print(e)
        print(type(e))
        assert type(e) == RuntimeError
python/paddle/fluid/tests/unittests/test_nan_inf.py
@@ -29,11 +29,10 @@ class TestNanInf(unittest.TestCase):
         self._python_interp = sys.executable
         if os.getenv('WITH_COVERAGE', 'OFF') == 'ON':
             self._python_interp += " -m coverage run --branch -p"
-        self._python_interp += " check_nan_inf_base.py"

         self.env = os.environ.copy()

-    def test_nan_inf(self):
+    def check_nan_inf(self):
         cmd = self._python_interp
         proc = subprocess.Popen(
@@ -53,6 +52,14 @@
         assert (out + err
                 ).find('There are `nan` or `inf` in tensor'.encode()) != -1

+    def test_nan_inf_in_static_mode(self):
+        self._python_interp += " check_nan_inf_base.py"
+        self.check_nan_inf()
+
+    def test_nan_inf_in_dynamic_mode(self):
+        self._python_interp += " check_nan_inf_base_dygraph.py"
+        self.check_nan_inf()
+

 class TestNanInfEnv(TestNanInf):
     def setUp(self):
......
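The refactor keeps one shared check_nan_inf helper and adds a per-mode test entry point on top of it. To reproduce one case by hand, a hedged sketch mirroring what TestNanInf does (assumes it runs from the directory containing the script):

import subprocess
import sys

# Run the dygraph repro script in a subprocess and confirm the checker's
# message shows up in its combined output, as the unittest asserts.
proc = subprocess.run(
    [sys.executable, "check_nan_inf_base_dygraph.py"],
    stdout=subprocess.PIPE, stderr=subprocess.PIPE)
assert b"There are `nan` or `inf` in tensor" in proc.stdout + proc.stderr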