From 660f781b7793510354379178c2b4d1bf9e5f4df2 Mon Sep 17 00:00:00 2001 From: niuliling123 <51102941+niuliling123@users.noreply.github.com> Date: Tue, 18 Apr 2023 19:49:00 +0800 Subject: [PATCH] Print the forward's stack when backward op has nan/inf and FLAGS_check_nan_inf_level = 0 (#52639) --- .../eager_manual/forwards/add_n_fwd_func.cc | 9 +++++ .../forwards/conv2d_fwd_function.cc | 8 +++++ .../forwards/sync_batch_norm_fwd_func.cc | 8 +++++ paddle/fluid/eager/api/utils/global_utils.h | 6 ++++ .../generator/eager_gen.py | 34 ++++++++++++++++--- .../generator/python_c_gen.py | 6 ++-- paddle/fluid/eager/grad_node_info.h | 6 ++++ paddle/fluid/imperative/tracer.cc | 1 + paddle/fluid/imperative/tracer.h | 4 ++- paddle/fluid/pybind/eager_utils.cc | 16 +++++++++ paddle/fluid/pybind/eager_utils.h | 1 + .../unittests/check_nan_inf_backward_stack.py | 29 ++++++++++++++++ .../fluid/tests/unittests/test_nan_inf.py | 26 ++++++++++++++ 13 files changed, 147 insertions(+), 7 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/check_nan_inf_backward_stack.py diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/add_n_fwd_func.cc b/paddle/fluid/eager/api/manual/eager_manual/forwards/add_n_fwd_func.cc index ea7dcd8a5f0..0f1714b74b6 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/forwards/add_n_fwd_func.cc +++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/add_n_fwd_func.cc @@ -55,9 +55,12 @@ paddle::Tensor add_n_ad_func(const std::vector& x) { VLOG(3) << "Final State Running: " << "add_n_ad_func"; auto api_result = paddle::experimental::add_n(x); + + std::string forward_trace = ""; // Check NaN and Inf if needed if (FLAGS_check_nan_inf) { egr::CheckTensorHasNanOrInf("add_n", api_result); + forward_trace = egr::Controller::Instance().GetPythonStack(); } // Get Outputs @@ -83,6 +86,12 @@ paddle::Tensor add_n_ad_func(const std::vector& x) { // Node Construction auto grad_node = std::shared_ptr(new AddNGradNodeFinal(1, 1)); + + // Set forward's 
stack + if (FLAGS_check_nan_inf) { + grad_node->SetForwardTrace(forward_trace); + } + // SetAttributes if needed // Set TensorWrappers for Forward Inputs if needed diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc b/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc index 84938933f9e..af878d42095 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc +++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc @@ -110,9 +110,11 @@ paddle::Tensor conv2d_ad_func(const paddle::Tensor& input, dilations, groups, data_format); + std::string forward_trace = ""; // Check NaN and Inf if needed if (FLAGS_check_nan_inf) { egr::CheckTensorHasNanOrInf("conv2d", api_result); + forward_trace = egr::Controller::Instance().GetPythonStack(); } // Get Outputs @@ -138,6 +140,12 @@ paddle::Tensor conv2d_ad_func(const paddle::Tensor& input, // Node Construction auto grad_node = std::shared_ptr(new Conv2dGradNodeFinal(1, 2)); + + // Set forward's stack + if (FLAGS_check_nan_inf) { + grad_node->SetForwardTrace(forward_trace); + } + // SetAttributes if needed grad_node->SetAttributestrides(strides); grad_node->SetAttributepaddings(paddings); diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/sync_batch_norm_fwd_func.cc b/paddle/fluid/eager/api/manual/eager_manual/forwards/sync_batch_norm_fwd_func.cc index 86922267bb7..2ff86a73f2b 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/forwards/sync_batch_norm_fwd_func.cc +++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/sync_batch_norm_fwd_func.cc @@ -172,9 +172,11 @@ sync_batch_norm__ad_func(const paddle::Tensor& x, data_layout, use_global_stats, trainable_statistics); + std::string forward_trace = ""; // Check NaN and Inf if needed if (FLAGS_check_nan_inf) { egr::CheckTensorHasNanOrInf("sync_batch_norm_", api_result); + forward_trace = egr::Controller::Instance().GetPythonStack(); } // Get Outputs @@ 
-226,6 +228,12 @@ sync_batch_norm__ad_func(const paddle::Tensor& x, // Node Construction auto grad_node = std::shared_ptr(new SyncBatchNormGradNode(6, 5)); + + // Set forward's stack + if (FLAGS_check_nan_inf) { + grad_node->SetForwardTrace(forward_trace); + } + egr::Controller::Instance().PushBackForceSequentialNodes(grad_node.get()); // SetAttributes if needed grad_node->SetAttributemomentum(momentum); diff --git a/paddle/fluid/eager/api/utils/global_utils.h b/paddle/fluid/eager/api/utils/global_utils.h index bd5c9637264..ba79a63648d 100644 --- a/paddle/fluid/eager/api/utils/global_utils.h +++ b/paddle/fluid/eager/api/utils/global_utils.h @@ -74,6 +74,12 @@ class Controller { void EnableLayoutAutoTune() { tracer_->EnableLayoutAutoTune(); } + void SetPythonStack(std::string stack_str) { + tracer_->SetPythonStack(stack_str); + } + + std::string GetPythonStack() { return tracer_->GetPythonStack(); } + bool HasGrad() const { return tracer_->HasGrad(); } void SetHasGrad(bool has_grad) { tracer_->SetHasGrad(has_grad); } std::string GenerateUniqueName(std::string key = "eager_in_tmp") { diff --git a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py index 4e105d138b7..79bf46c01f3 100644 --- a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py @@ -297,6 +297,10 @@ FORWARD_BODY_TEMPLATE = """ if(require_any_grad) {{ // Node Construction {} + // Set for forward trace + if (FLAGS_check_nan_inf) {{ + {} + }} // SetAttributes if needed {} // Set TensorWrappers for Forward Inputs if needed @@ -485,7 +489,25 @@ CHECK_BACKWARD_INPLACE_TEMPLATE = """ }} }}""" -CHECK_NAN_AND_INF_TEMPLATE = """ if (FLAGS_check_nan_inf) {{ egr::CheckTensorHasNanOrInf("{}", {}); }} +CHECK_NAN_AND_INF_TEMPLATE_FORWARD = """ + std::string forward_trace =""; + if (FLAGS_check_nan_inf) {{ + egr::CheckTensorHasNanOrInf("{}", {}); + forward_trace = 
egr::Controller::Instance().GetPythonStack(); + }} +""" + +CHECK_NAN_AND_INF_TEMPLATE_BACKWARD = """ + if (FLAGS_check_nan_inf) {{ + try{{ + egr::CheckTensorHasNanOrInf("{}", {}); + }} catch(...) {{ + LOG(WARNING) << "There are nan/inf in ({})"; + auto forward_trace = GetForwardTrace(); + std::cout<SetForwardTrace(forward_trace);" + ) if not for_backward: self.node_creation_str = FORWARD_BODY_TEMPLATE.format( node_creation_event_str, pass_stop_gradient_args_str, node_construction_str, + set_forward_trace, set_attributes_str, set_input_tensor_wrappers_str, set_grad_out_meta_str, @@ -1427,7 +1453,7 @@ class DygraphForwardFunctionGenerator(DygraphFunctionGeneratorBase): ) # Check Nan and Inf - check_nan_inf_str = CHECK_NAN_AND_INF_TEMPLATE.format( + check_nan_inf_str = CHECK_NAN_AND_INF_TEMPLATE_FORWARD.format( function_name, "api_result" ) @@ -2322,8 +2348,8 @@ class DygraphNodeGenerator(DygraphFunctionGeneratorBase): {indent}{grad_api_namespace}{backward_api_name}({grad_api_args_str});""" # Check Nan and Inf - check_nan_inf_str = CHECK_NAN_AND_INF_TEMPLATE.format( - backward_api_name, "returns" + check_nan_inf_str = CHECK_NAN_AND_INF_TEMPLATE_BACKWARD.format( + backward_api_name, "returns", backward_api_name ) # Prepare for Node Creation if Necessary diff --git a/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py index 16cd6f0ffd7..d02b6243130 100644 --- a/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py @@ -121,7 +121,10 @@ static PyObject * eager_api_{}(PyObject *self, PyObject *args, PyObject *kwargs) NOAMP_DYGRAPH_FUNCTION_TEMPLATE = "decltype({}({})) out = {}({});" -FUNCTION_SET_DEVICE_TEMPLATE = """{} if (paddle::platform::is_gpu_place(place)) {{ +FUNCTION_SET_DEVICE_TEMPLATE = """{} + LOG(INFO)<<"this is SetPythonStack"; + SetPythonStack(); + if (paddle::platform::is_gpu_place(place)) {{ #if 
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) phi::backends::gpu::SetDeviceId(place.device); VLOG(4) <<"CurrentDeviceId: " << phi::backends::gpu::GetCurrentDeviceId() << " from " << (int)place.device; @@ -170,7 +173,6 @@ PYTHON_C_WRAPPER_TEMPLATE = """ #include "paddle/fluid/pybind/eager.h" #include "paddle/fluid/eager/amp_utils.h" #include "paddle/fluid/eager/eager_amp_auto_cast.h" - namespace paddle {{ namespace pybind {{ diff --git a/paddle/fluid/eager/grad_node_info.h b/paddle/fluid/eager/grad_node_info.h index 8d73092ddf9..19012ea6445 100644 --- a/paddle/fluid/eager/grad_node_info.h +++ b/paddle/fluid/eager/grad_node_info.h @@ -292,6 +292,10 @@ class GradNodeBase { is_tensor_wrappers_cleared_ = is_tensor_wrappers_cleared; } + void SetForwardTrace(std::string trace) { forward_trace_ = trace; } + + std::string GetForwardTrace() { return forward_trace_; } + private: // bwd_out_meta_ is used to record Grad output info for backward paddle::small_vector, kSlotSmallVectorSize> @@ -317,6 +321,8 @@ class GradNodeBase { bool need_complex_to_real_ = false; bool is_tensor_wrappers_cleared_ = false; + // The trace of forward function + std::string forward_trace_ = ""; }; } // namespace egr diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index fa0f09d7277..af39832b4f5 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -37,6 +37,7 @@ DECLARE_string(tracer_mkldnn_ops_off); namespace paddle { namespace imperative { +thread_local std::string Tracer::python_stack_ = ""; thread_local bool Tracer::enable_program_desc_tracing_ = false; diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index 943505955e8..7355cec776e 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -199,7 +199,8 @@ class Tracer { use_layout_autotune_ = false; return false; } - + void SetPythonStack(std::string stack_str) { python_stack_ = stack_str; } + std::string 
GetPythonStack() { return python_stack_; } phi::KernelSignature GetExpectedKernelSignature( const std::string& type, const NameTensorMap& ins, @@ -215,6 +216,7 @@ class Tracer { std::unique_ptr generator_; platform::Place expected_place_; GarbageCollectorMap gcs_; + static thread_local std::string python_stack_; static thread_local bool enable_program_desc_tracing_; static thread_local bool use_layout_autotune_; static thread_local bool has_grad_; diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index b7ecd196ca2..a35ef64e66c 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -38,6 +38,7 @@ limitations under the License. */ #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/dense_tensor.h" +DECLARE_bool(check_nan_inf); namespace paddle { namespace pybind { @@ -215,6 +216,21 @@ std::shared_ptr CastPyArg2VarBase(PyObject* obj, return py::cast>(obj); } +void SetPythonStack() { + if (FLAGS_check_nan_inf) { + pybind11::gil_scoped_acquire gil; + PyObject* mod = PyImport_ImportModule("traceback"); + PyObject* traceback_list = PyObject_CallMethod(mod, "format_stack", ""); + std::string str = ""; + for (Py_ssize_t i = 0; i < PyList_Size(traceback_list); i++) { + PyObject* line = PyList_GetItem(traceback_list, i); + str += py::str(PyUnicode_AsUTF8(line)); + } + std::string last = str + egr::Controller::Instance().GetPythonStack(); + egr::Controller::Instance().SetPythonStack(last); + } +} + std::shared_ptr CastPyArg2JitFunction(PyObject* obj, ssize_t arg_pos) { if (PyObject_IsInstance(obj, diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h index dcf71ec0819..8f83e8f880f 100644 --- a/paddle/fluid/pybind/eager_utils.h +++ b/paddle/fluid/pybind/eager_utils.h @@ -78,6 +78,7 @@ std::vector CastPyArg2VectorOfString(PyObject* obj, ssize_t arg_pos); std::shared_ptr CastPyArg2JitFunction(PyObject* obj, ssize_t 
arg_pos); +void SetPythonStack(); PyObject* ToPyObject(int value); PyObject* ToPyObject(uint32_t value); diff --git a/python/paddle/fluid/tests/unittests/check_nan_inf_backward_stack.py b/python/paddle/fluid/tests/unittests/check_nan_inf_backward_stack.py new file mode 100644 index 00000000000..26bd197fb81 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/check_nan_inf_backward_stack.py @@ -0,0 +1,29 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import paddle + + +def main(): + paddle.set_flags({"FLAGS_check_nan_inf": 1, "FLAGS_check_nan_inf_level": 0}) + cpu_place = paddle.CPUPlace() + x = paddle.to_tensor([1, 0.0, 3], stop_gradient=False, place=cpu_place) + y = paddle.to_tensor([0.2, 0.0, 0.5], place=cpu_place) + z = paddle.pow(x, y) + paddle.autograd.backward([z]) + + +if __name__ == "__main__": + main() diff --git a/python/paddle/fluid/tests/unittests/test_nan_inf.py b/python/paddle/fluid/tests/unittests/test_nan_inf.py index 0aebff97e25..08bea2afa65 100644 --- a/python/paddle/fluid/tests/unittests/test_nan_inf.py +++ b/python/paddle/fluid/tests/unittests/test_nan_inf.py @@ -78,6 +78,13 @@ class TestCheckSkipEnv(TestNanInf): class TestNanInfCheckResult(unittest.TestCase): + def setUp(self): + self._python_interp = sys.executable + if os.getenv('WITH_COVERAGE', 'OFF') == 'ON': + self._python_interp += " -m coverage run --branch -p" + + self.env = os.environ.copy() + def generate_inputs(self, shape, dtype="float32"): data = np.random.random(size=shape).astype(dtype) # [-10, 10) @@ -141,6 +148,25 @@ class TestNanInfCheckResult(unittest.TestCase): if paddle.fluid.core.is_compiled_with_cuda(): _check_num_nan_inf(use_cuda=True) + def test_check_stack(self): + self._python_interp += " check_nan_inf_backward_stack.py" + cmd = self._python_interp + proc = subprocess.Popen( + cmd.split(" "), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + env=self.env, + ) + + out, err = proc.communicate() + returncode = proc.returncode + + print(out) + print(err) + + # In Python 3, out + err is bytes, so the search pattern must be a bytes literal + assert (out + err).find(b' z = paddle.pow(x, y)') != -1 + def check_nan_inf_level(self, use_cuda, dtype): shape = [8, 8] x_np, y_np = self.generate_inputs(shape, dtype) -- GitLab