From d2fa26f64a123ced42cc5a79f8b06d178ce637bf Mon Sep 17 00:00:00 2001 From: niuliling123 <51102941+niuliling123@users.noreply.github.com> Date: Mon, 22 May 2023 10:56:48 +0800 Subject: [PATCH] Print python trace back when debugmode = CHECK_NAN_INF_AND_ABORT and backward has nan/inf (#52808) --- .../interpreter/interpreter_util.cc | 33 ++++++++++-- .../framework/new_executor/interpretercore.cc | 32 ++++++++++-- paddle/fluid/framework/operator.cc | 26 +++++++++- .../check_nan_inf_backward_static_stack.py | 52 +++++++++++++++++++ .../fluid/tests/unittests/test_nan_inf.py | 10 +++- 5 files changed, 142 insertions(+), 11 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/check_nan_inf_backward_static_stack.py diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc index 5b3f0f3738b..0de9a11cbba 100644 --- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc +++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc @@ -773,10 +773,6 @@ void BuildOpFuncList(const platform::Place& place, } // for debug nan/inf - if (FLAGS_check_nan_inf) { - VLOG(4) << "Check nan/inf"; - framework::details::CheckOpHasNanOrInf(*op, *runtime_scope, place); - } vec_func_list->emplace_back(op_func_node); @@ -848,6 +844,35 @@ void BuildOpFuncList(const platform::Place& place, std::rethrow_exception(std::current_exception()); } + if (FLAGS_check_nan_inf) { + VLOG(4) << "Check nan/inf"; + try { + framework::details::CheckOpHasNanOrInf(*op, *local_scope, place); + } catch (...) 
{ + const std::vector<std::string>* callstack = nullptr; + auto attrs = op->Attrs(); + auto iter = + attrs.find(OpProtoAndCheckerMaker::OpCreationCallstackAttrName()); + if (iter != attrs.end()) { + callstack = &PADDLE_GET_CONST(std::vector<std::string>, iter->second); + if (callstack->empty()) callstack = nullptr; + } + std::ostringstream sout; + if (callstack) { + if (FLAGS_call_stack_level > 1) { + sout << "\n\n Compile Traceback (most recent call last):"; + } else { + sout << "In user code:\n"; + } + for (auto& line : *callstack) { + sout << "\n " << line; + } + } + std::cout << sout.str() << std::endl; + std::rethrow_exception(std::current_exception()); + } + } + VLOG(4) << "End run " << place << " " << op_func_node.operator_base_->DebugStringEx(local_scope); diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 494a5afa7ed..abebc2f54f0 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -983,10 +983,34 @@ void InterpreterCore::RunOperator(const Instruction& instr_node) { // for debug nan/inf if (op_with_kernel != nullptr && FLAGS_check_nan_inf) { VLOG(4) << "Check nan/inf"; - framework::details::CheckOpHasNanOrInf( - *op, - *local_scope, - place); // TODO(xiongkun03) change it to inner scope. + try { + framework::details::CheckOpHasNanOrInf( + *op, + *local_scope, + place); // TODO(xiongkun03) change it to inner scope. + } catch (...) 
{ + const std::vector<std::string>* callstack = nullptr; + auto attrs = op->Attrs(); + auto iter = + attrs.find(OpProtoAndCheckerMaker::OpCreationCallstackAttrName()); + if (iter != attrs.end()) { + callstack = &PADDLE_GET_CONST(std::vector<std::string>, iter->second); + if (callstack->empty()) callstack = nullptr; + } + std::ostringstream sout; + if (callstack) { + if (FLAGS_call_stack_level > 1) { + sout << "\n\n Compile Traceback (most recent call last):"; + } else { + sout << "In user code:\n"; + } + for (auto& line : *callstack) { + sout << "\n " << line; + } + } + std::cout << sout.str() << std::endl; + std::rethrow_exception(std::current_exception()); + } } } diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 992f475fd69..18d9b3e6ff4 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -2044,7 +2044,31 @@ void OperatorWithKernel::RunImpl(const Scope& scope, } if (FLAGS_check_nan_inf) { - framework::details::CheckOpHasNanOrInf(*this, exec_scope, place); + try { + framework::details::CheckOpHasNanOrInf(*this, exec_scope, place); + } catch (...) 
{ + const std::vector<std::string>* callstack = nullptr; + auto attrs = Attrs(); + auto iter = + attrs.find(OpProtoAndCheckerMaker::OpCreationCallstackAttrName()); + if (iter != attrs.end()) { + callstack = &PADDLE_GET_CONST(std::vector<std::string>, iter->second); + if (callstack->empty()) callstack = nullptr; + } + std::ostringstream sout; + if (callstack) { + if (FLAGS_call_stack_level > 1) { + sout << "\n\n Compile Traceback (most recent call last):"; + } else { + sout << "In user code:\n"; + } + for (auto& line : *callstack) { + sout << "\n " << line; + } + } + std::cout << sout.str() << std::endl; + std::rethrow_exception(std::current_exception()); + } } // To solve issue #15032, have a discussion with @Luotao for cpu inference, diff --git a/python/paddle/fluid/tests/unittests/check_nan_inf_backward_static_stack.py b/python/paddle/fluid/tests/unittests/check_nan_inf_backward_static_stack.py new file mode 100644 index 00000000000..4ec58623f88 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/check_nan_inf_backward_static_stack.py @@ -0,0 +1,52 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle + + +# 定义静态图模型 +def static_model(x, y): + z = paddle.pow(x, y) + return z + + +def main(): + # 开启静态图模式 + paddle.enable_static() + paddle.set_flags({"FLAGS_check_nan_inf": 1, "FLAGS_check_nan_inf_level": 0}) + # 定义输入变量和模型输出变量 + x_static = paddle.static.data(name='x_static', shape=[3], dtype='float32') + y_static = paddle.static.data(name='y_static', shape=[3], dtype='float32') + x_static.stop_gradient = False + z_static = static_model(x_static, y_static) + + # 计算梯度 + grads_static = paddle.static.gradients(z_static, x_static, y_static) + + # 创建 Executor 对象 + exe_static = paddle.static.Executor(paddle.CPUPlace()) + + # 编译计算图 + exe_static.run(paddle.static.default_startup_program()) + + # 执行前向计算和反向传播 + grads_val_static = exe_static.run( + paddle.static.default_main_program(), + feed={'x_static': [1, 0, 3], 'y_static': [0, 0, 0]}, + fetch_list=[grads_static], + ) + + +if __name__ == "__main__": + main() diff --git a/python/paddle/fluid/tests/unittests/test_nan_inf.py b/python/paddle/fluid/tests/unittests/test_nan_inf.py index 4a60902768f..851c46c3b89 100644 --- a/python/paddle/fluid/tests/unittests/test_nan_inf.py +++ b/python/paddle/fluid/tests/unittests/test_nan_inf.py @@ -148,8 +148,8 @@ class TestNanInfCheckResult(unittest.TestCase): if paddle.fluid.core.is_compiled_with_cuda(): _check_num_nan_inf(use_cuda=True) - def test_check_stack(self): - self._python_interp += " check_nan_inf_backward_stack.py" + def check_stack(self, file_name): + self._python_interp += file_name cmd = self._python_interp proc = subprocess.Popen( cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE, ) out, err = proc.communicate() @@ -167,6 +167,12 @@ # in python3, type(out+err) is 'bytes', need use encode assert (out + err).find(b' z = paddle.pow(x, y)') != -1 + def test_check_stack(self): + self.check_stack(" check_nan_inf_backward_stack.py") + + def test_static_check_stack(self): + self.check_stack(" check_nan_inf_backward_static_stack.py") + def check_nan_inf_level(self, use_cuda, dtype): 
shape = [8, 8] x_np, y_np = self.generate_inputs(shape, dtype) -- GitLab