From 1e7efd81bca172bf52de73c7d1083165c7a361f3 Mon Sep 17 00:00:00 2001 From: niuliling123 <51102941+niuliling123@users.noreply.github.com> Date: Mon, 24 Apr 2023 15:03:23 +0800 Subject: [PATCH] [cherry-pick] Add debugging api and python stack (#53217) Print the forward's stack when backward op has nan/inf and FLAGS_check_nan_inf_level = 0 Delete temp param in eager_gen --- .../eager_manual/forwards/add_n_fwd_func.cc | 7 +++++ .../forwards/conv2d_fwd_function.cc | 6 ++++ .../forwards/sync_batch_norm_fwd_func.cc | 6 ++++ paddle/fluid/eager/api/utils/global_utils.h | 6 ++++ .../generator/eager_gen.py | 28 +++++++++++++++--- .../generator/python_c_gen.py | 6 ++-- paddle/fluid/eager/grad_node_info.h | 6 ++++ paddle/fluid/imperative/tracer.cc | 1 + paddle/fluid/imperative/tracer.h | 4 ++- paddle/fluid/pybind/eager_utils.cc | 17 +++++++++++ paddle/fluid/pybind/eager_utils.h | 1 + .../unittests/check_nan_inf_backward_stack.py | 29 +++++++++++++++++++ .../fluid/tests/unittests/test_nan_inf.py | 26 +++++++++++++++++ 13 files changed, 136 insertions(+), 7 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/check_nan_inf_backward_stack.py diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/add_n_fwd_func.cc b/paddle/fluid/eager/api/manual/eager_manual/forwards/add_n_fwd_func.cc index ea7dcd8a5f0..5804d375409 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/forwards/add_n_fwd_func.cc +++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/add_n_fwd_func.cc @@ -55,6 +55,7 @@ paddle::Tensor add_n_ad_func(const std::vector& x) { VLOG(3) << "Final State Running: " << "add_n_ad_func"; auto api_result = paddle::experimental::add_n(x); + // Check NaN and Inf if needed if (FLAGS_check_nan_inf) { egr::CheckTensorHasNanOrInf("add_n", api_result); @@ -83,6 +84,12 @@ paddle::Tensor add_n_ad_func(const std::vector& x) { // Node Construction auto grad_node = std::shared_ptr(new AddNGradNodeFinal(1, 1)); + + // Set forward's stack + if (FLAGS_check_nan_inf) { + grad_node->SetForwardTrace(egr::Controller::Instance().GetPythonStack()); + } + // SetAttributes if needed // Set TensorWrappers for Forward Inputs if needed diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc b/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc index 84938933f9e..39fa77d3ae9 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc +++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc @@ -138,6 +138,12 @@ paddle::Tensor conv2d_ad_func(const paddle::Tensor& input, // Node Construction auto grad_node = std::shared_ptr(new Conv2dGradNodeFinal(1, 2)); + + // Set forward's stack + if (FLAGS_check_nan_inf) { + grad_node->SetForwardTrace(egr::Controller::Instance().GetPythonStack()); + } + // SetAttributes if needed grad_node->SetAttributestrides(strides); grad_node->SetAttributepaddings(paddings); diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/sync_batch_norm_fwd_func.cc b/paddle/fluid/eager/api/manual/eager_manual/forwards/sync_batch_norm_fwd_func.cc index 86922267bb7..5bd9571dbf1 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/forwards/sync_batch_norm_fwd_func.cc +++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/sync_batch_norm_fwd_func.cc @@ -226,6 +226,12 @@ sync_batch_norm__ad_func(const paddle::Tensor& x, // Node Construction auto grad_node = std::shared_ptr(new SyncBatchNormGradNode(6, 5)); + + // Set forward's stack + if (FLAGS_check_nan_inf) { + grad_node->SetForwardTrace(egr::Controller::Instance().GetPythonStack()); + } + egr::Controller::Instance().PushBackForceSequentialNodes(grad_node.get()); // SetAttributes if needed grad_node->SetAttributemomentum(momentum); diff --git a/paddle/fluid/eager/api/utils/global_utils.h b/paddle/fluid/eager/api/utils/global_utils.h index bd5c9637264..ba79a63648d 100644 --- a/paddle/fluid/eager/api/utils/global_utils.h +++ b/paddle/fluid/eager/api/utils/global_utils.h @@ -74,6 +74,12 @@ class Controller { void EnableLayoutAutoTune() { tracer_->EnableLayoutAutoTune(); } + void SetPythonStack(std::string stack_str) { + tracer_->SetPythonStack(stack_str); + } + + std::string GetPythonStack() { return tracer_->GetPythonStack(); } + bool HasGrad() const { return tracer_->HasGrad(); } void SetHasGrad(bool has_grad) { tracer_->SetHasGrad(has_grad); } std::string GenerateUniqueName(std::string key = "eager_in_tmp") { diff --git a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py index 4e105d138b7..aa4afa965b8 100644 --- a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py @@ -297,6 +297,10 @@ FORWARD_BODY_TEMPLATE = """ if(require_any_grad) {{ // Node Construction {} + // Set for forward trace + if (FLAGS_check_nan_inf) {{ + grad_node->SetForwardTrace(egr::Controller::Instance().GetPythonStack()); + }} // SetAttributes if needed {} // Set TensorWrappers for Forward Inputs if needed @@ -485,7 +489,23 @@ CHECK_BACKWARD_INPLACE_TEMPLATE = """ }} }}""" -CHECK_NAN_AND_INF_TEMPLATE = """ if (FLAGS_check_nan_inf) {{ egr::CheckTensorHasNanOrInf("{}", {}); }} +CHECK_NAN_AND_INF_TEMPLATE_FORWARD = """ + if (FLAGS_check_nan_inf) {{ + egr::CheckTensorHasNanOrInf("{}", {}); + }} +""" + +CHECK_NAN_AND_INF_TEMPLATE_BACKWARD = """ + if (FLAGS_check_nan_inf) {{ + try{{ + egr::CheckTensorHasNanOrInf("{}", {}); + }} catch(...) {{ + LOG(WARNING) << "There are nan/inf in ({})"; + auto forward_trace = GetForwardTrace(); + std::cout<, kSlotSmallVectorSize> @@ -317,6 +321,8 @@ class GradNodeBase { bool need_complex_to_real_ = false; bool is_tensor_wrappers_cleared_ = false; + // The trace of forward function + std::string forward_trace_ = ""; }; } // namespace egr diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index fa0f09d7277..af39832b4f5 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -37,6 +37,7 @@ DECLARE_string(tracer_mkldnn_ops_off); namespace paddle { namespace imperative { +thread_local std::string Tracer::python_stack_ = ""; thread_local bool Tracer::enable_program_desc_tracing_ = false; diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index 943505955e8..7355cec776e 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -199,7 +199,8 @@ class Tracer { use_layout_autotune_ = false; return false; } - + void SetPythonStack(std::string stack_str) { python_stack_ = stack_str; } + std::string GetPythonStack() { return python_stack_; } phi::KernelSignature GetExpectedKernelSignature( const std::string& type, const NameTensorMap& ins, @@ -215,6 +216,7 @@ class Tracer { std::unique_ptr generator_; platform::Place expected_place_; GarbageCollectorMap gcs_; + static thread_local std::string python_stack_; static thread_local bool enable_program_desc_tracing_; static thread_local bool use_layout_autotune_; static thread_local bool has_grad_; diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index b7ecd196ca2..0312ad8d960 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -38,6 +38,7 @@ limitations under the License. */ #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/dense_tensor.h" +DECLARE_bool(check_nan_inf); namespace paddle { namespace pybind { @@ -215,6 +216,22 @@ std::shared_ptr CastPyArg2VarBase(PyObject* obj, return py::cast>(obj); } +void SetPythonStack() { + if (FLAGS_check_nan_inf) { + VLOG(4) << "this is SetPythonStack"; + pybind11::gil_scoped_acquire gil; + PyObject* mod = PyImport_ImportModule("traceback"); + PyObject* traceback_list = PyObject_CallMethod(mod, "format_stack", ""); + std::string str = ""; + for (Py_ssize_t i = 0; i < PyList_Size(traceback_list); i++) { + PyObject* line = PyList_GetItem(traceback_list, i); + str += py::str(PyUnicode_AsUTF8(line)); + } + std::string last = str + egr::Controller::Instance().GetPythonStack(); + egr::Controller::Instance().SetPythonStack(last); + } +} + std::shared_ptr CastPyArg2JitFunction(PyObject* obj, ssize_t arg_pos) { if (PyObject_IsInstance(obj, diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h index dcf71ec0819..8f83e8f880f 100644 --- a/paddle/fluid/pybind/eager_utils.h +++ b/paddle/fluid/pybind/eager_utils.h @@ -78,6 +78,7 @@ std::vector CastPyArg2VectorOfString(PyObject* obj, ssize_t arg_pos); std::shared_ptr CastPyArg2JitFunction(PyObject* obj, ssize_t arg_pos); +void SetPythonStack(); PyObject* ToPyObject(int value); PyObject* ToPyObject(uint32_t value); diff --git a/python/paddle/fluid/tests/unittests/check_nan_inf_backward_stack.py b/python/paddle/fluid/tests/unittests/check_nan_inf_backward_stack.py new file mode 100644 index 00000000000..26bd197fb81 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/check_nan_inf_backward_stack.py @@ -0,0 +1,29 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import paddle + + +def main(): + paddle.set_flags({"FLAGS_check_nan_inf": 1, "FLAGS_check_nan_inf_level": 0}) + cpu_place = paddle.CPUPlace() + x = paddle.to_tensor([1, 0.0, 3], stop_gradient=False, place=cpu_place) + y = paddle.to_tensor([0.2, 0.0, 0.5], place=cpu_place) + z = paddle.pow(x, y) + paddle.autograd.backward([z]) + + +if __name__ == "__main__": + main() diff --git a/python/paddle/fluid/tests/unittests/test_nan_inf.py b/python/paddle/fluid/tests/unittests/test_nan_inf.py index 0aebff97e25..08bea2afa65 100644 --- a/python/paddle/fluid/tests/unittests/test_nan_inf.py +++ b/python/paddle/fluid/tests/unittests/test_nan_inf.py @@ -78,6 +78,13 @@ class TestCheckSkipEnv(TestNanInf): class TestNanInfCheckResult(unittest.TestCase): + def setUp(self): + self._python_interp = sys.executable + if os.getenv('WITH_COVERAGE', 'OFF') == 'ON': + self._python_interp += " -m coverage run --branch -p" + + self.env = os.environ.copy() + def generate_inputs(self, shape, dtype="float32"): data = np.random.random(size=shape).astype(dtype) # [-10, 10) @@ -141,6 +148,25 @@ class TestNanInfCheckResult(unittest.TestCase): if paddle.fluid.core.is_compiled_with_cuda(): _check_num_nan_inf(use_cuda=True) + def test_check_stack(self): + self._python_interp += " check_nan_inf_backward_stack.py" + cmd = self._python_interp + proc = subprocess.Popen( + cmd.split(" "), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + env=self.env, + ) + + out, err = proc.communicate() + returncode = proc.returncode + + print(out) + print(err) + + # in python3, type(out+err) is 'bytes', need use encode + assert (out + err).find(b' z = paddle.pow(x, y)') != -1 + def check_nan_inf_level(self, use_cuda, dtype): shape = [8, 8] x_np, y_np = self.generate_inputs(shape, dtype) -- GitLab