diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc
index 30046b2d1d44e8c89f033aebc7619ed6a5980c99..dd7791af854471fa29a69b0a488d19e58a472300 100644
--- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc
+++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc
@@ -30,6 +30,23 @@ DECLARE_int32(check_nan_inf_level);
 namespace paddle {
 namespace framework {
 namespace details {
+struct DebugTools {
+  DebugTools() {}
+  std::string path = "";
+};
+static DebugTools debug_nan_inf;
+
+void SetNanInfDebugPath(const std::string& nan_inf_path) {
+  debug_nan_inf.path = nan_inf_path;
+  VLOG(4) << "Set the log's path of debug tools : " << nan_inf_path;
+}
+
+std::string GetNanPath() {
+  if (debug_nan_inf.path.empty()) {
+    return "";
+  }
+  return debug_nan_inf.path + "/";
+}
 
 static std::once_flag white_list_init_flag;
 
@@ -134,112 +151,6 @@ static void InitWhiteListFormEnv() {
   }
 }
 
-template <
-    typename T,
-    std::enable_if_t<!std::is_same<T, phi::dtype::complex<float>>::value &&
-                         !std::is_same<T, phi::dtype::complex<double>>::value,
-                     bool> = true>
-static void CheckNanInfCpuImpl(const T* value_ptr,
-                               const int64_t numel,
-                               const std::string& cpu_hint_str) {
-  using MT = typename phi::dtype::template MPTypeTrait<T>::Type;
-
-#ifdef _OPENMP
-  // Use maximum 4 threads to collect the nan and inf information.
-  int num_threads = std::max(omp_get_num_threads(), 1);
-  num_threads = std::min(num_threads, 4);
-#else
-  int num_threads = 1;
-#endif
-
-  std::vector<int64_t> thread_num_nan(num_threads, 0);
-  std::vector<int64_t> thread_num_inf(num_threads, 0);
-  std::vector<MT> thread_min_value(num_threads, static_cast<MT>(value_ptr[0]));
-  std::vector<MT> thread_max_value(num_threads, static_cast<MT>(value_ptr[0]));
-  std::vector<MT> thread_mean_value(num_threads, static_cast<MT>(0));
-
-#ifdef _OPENMP
-#pragma omp parallel num_threads(num_threads)
-#endif
-  {
-#ifdef _OPENMP
-    int64_t tid = omp_get_thread_num();
-    int64_t chunk_size = (numel + num_threads - 1) / num_threads;
-    int64_t begin = tid * chunk_size;
-    int64_t end = chunk_size + begin > numel ? numel : chunk_size + begin;
-#else
-    int64_t tid = 0;
-    int64_t begin = 0;
-    int64_t end = numel;
-#endif
-    for (int64_t i = begin; i < end; ++i) {
-      MT value = static_cast<MT>(value_ptr[i]);
-
-      thread_min_value[tid] = std::min(thread_min_value[tid], value);
-      thread_max_value[tid] = std::max(thread_max_value[tid], value);
-      thread_mean_value[tid] += value / static_cast<MT>(numel);
-
-      if (std::isnan(value)) {
-        thread_num_nan[tid] += 1;
-      } else if (std::isinf(value)) {
-        thread_num_inf[tid] += 1;
-      }
-    }
-  }
-
-  int64_t num_nan = 0;
-  int64_t num_inf = 0;
-  MT min_value = thread_min_value[0];
-  MT max_value = thread_max_value[0];
-  MT mean_value = static_cast<MT>(0);
-  for (int i = 0; i < num_threads; ++i) {
-    num_nan += thread_num_nan[i];
-    num_inf += thread_num_inf[i];
-    min_value = std::min(thread_min_value[i], min_value);
-    max_value = std::max(thread_max_value[i], max_value);
-    mean_value += thread_mean_value[i];
-  }
-
-  PrintForDifferentLevel<T, MT>(cpu_hint_str.c_str(),
-                                numel,
-                                num_nan,
-                                num_inf,
-                                max_value,
-                                min_value,
-                                mean_value,
-                                FLAGS_check_nan_inf_level);
-}
-
-template <
-    typename T,
-    std::enable_if_t<std::is_same<T, phi::dtype::complex<float>>::value ||
-                         std::is_same<T, phi::dtype::complex<double>>::value,
-                     bool> = true>
-void CheckNanInfCpuImpl(const T* value_ptr,
-                        const int64_t numel,
-                        const std::string& cpu_hint_str) {
-  using RealType = typename T::value_type;
-
-  RealType real_sum = 0.0f, imag_sum = 0.0f;
-
-#ifdef _OPENMP
-#pragma omp parallel for reduction(+ : real_sum) reduction(+ : imag_sum)
-#endif
-  for (int64_t i = 0; i < numel; ++i) {
-    T value = value_ptr[i];
-    real_sum += (value.real - value.real);
-    imag_sum += (value.imag - value.imag);
-  }
-
-  if (std::isnan(real_sum) || std::isinf(real_sum) || std::isnan(imag_sum) ||
-      std::isinf(imag_sum)) {
-    // hot fix for compile failed in gcc4.8
-    // here also need print detail info of nan or inf later
-    PADDLE_THROW(platform::errors::PreconditionNotMet(
-        "There are NAN or INF in %s.", cpu_hint_str));
-  }
-}
-
 template <>
 template <typename T>
 void TensorCheckerVisitor<phi::CPUContext>::apply(
diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cu b/paddle/fluid/framework/details/nan_inf_utils_detail.cu
index 8754a33b663db0b1d3a3539e9d399bf31faf62c5..514e5aa3e5d5610d11b0604c569dff0d877bc8c6 100644
--- a/paddle/fluid/framework/details/nan_inf_utils_detail.cu
+++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cu
@@ -322,18 +322,26 @@ __global__ void FindGlobalMaxMinAndPrint(const int64_t* block_num_nan_ptr,
 }
 
 template <typename T>
-static char* GetGpuHintStringPtr(const phi::GPUContext& ctx,
-                                 const std::string& op_type,
+inline std::string GetHintString(const std::string& op_type,
                                  const std::string& var_name,
-                                 int dev_id) {
+                                 const phi::Place& place,
+                                 int dev_id = -1) {
+  std::string op_var = GetCpuHintString<T>(op_type, var_name, place, dev_id);
   PADDLE_ENFORCE_EQ(
       (dev_id >= 0 && dev_id < multi_op_var2gpu_str_mutex().size()),
       true,
       platform::errors::OutOfRange("GPU dev_id must >=0 and < dev_count=%d",
                                    multi_op_var2gpu_str_mutex().size()));
+  return op_var;
+}
+template <typename T>
+static char* GetGpuHintStringPtr(const phi::GPUContext& ctx,
+                                 const std::string& op_type,
+                                 const std::string& var_name,
+                                 int dev_id) {
 
   std::string op_var =
-      GetCpuHintString(op_type, var_name, ctx.GetPlace(), dev_id);
+      GetHintString<T>(op_type, var_name, ctx.GetPlace(), dev_id);
   char* gpu_str_ptr = nullptr;
 
   {
@@ -396,6 +404,24 @@ void TensorCheckerVisitor<phi::GPUContext>::apply(
   auto* dev_ctx = reinterpret_cast<phi::GPUContext*>(
       platform::DeviceContextPool::Instance().Get(tensor.place()));
   int dev_id = tensor.place().device;
+  // Write log to file
+  auto file_path = GetNanPath();
+  if (file_path.size() > 0) {
+    phi::DenseTensor cpu_tensor;
+    platform::CPUPlace cpu_place;
+    cpu_tensor.Resize(tensor.dims());
+    // 1. copy from gpu to cpu
+    paddle::framework::TensorCopySync(tensor, cpu_place, &cpu_tensor);
+    auto* dev_ctx = reinterpret_cast<phi::GPUContext*>(
+        platform::DeviceContextPool::Instance().Get(tensor.place()));
+    const std::string debug_info =
+        GetHintString<T>(op_type, var_name, place, dev_id);
+    // 2. write log to file
+    CheckNanInfCpuImpl(cpu_tensor.data<T>(), tensor.numel(), debug_info, "gpu");
+    return;
+  }
+
+  // Write log to window
   char* gpu_str_ptr =
       GetGpuHintStringPtr<T>(*dev_ctx, op_type, var_name, dev_id);
 
diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.h b/paddle/fluid/framework/details/nan_inf_utils_detail.h
index 0adf23fd029218446108f55c2e8e3c98b2204fd1..fee2a52b428b2be97db67d43b816efe068fae862 100644
--- a/paddle/fluid/framework/details/nan_inf_utils_detail.h
+++ b/paddle/fluid/framework/details/nan_inf_utils_detail.h
@@ -13,17 +13,33 @@
 // limitations under the License.
 
 #pragma once
-
+#include <fstream>
+#include <iostream>
 #include <string>
-
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/complex.h"
 #include "paddle/fluid/platform/place.h"
+#include "paddle/phi/common/amp_type_traits.h"
+#include "paddle/phi/kernels/funcs/eigen/extensions.h"
+
+#ifdef _WIN32
+#include <direct.h>
+#include <io.h>
+#define MKDIR(path) _mkdir(path)
+#else
+#include <sys/stat.h>
+#define MKDIR(path) mkdir(path, S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH)
+#endif
+DECLARE_int32(check_nan_inf_level);
 
 namespace paddle {
 namespace framework {
 namespace details {
 
+void SetNanInfDebugPath(const std::string& nan_inf_path);
+
+std::string GetNanPath();
+
 template <typename T,
           typename MT,
           std::enable_if_t<std::is_same<T, float>::value, bool> = true>
@@ -93,6 +109,49 @@ HOSTDEVICE void PrintForDifferentLevel(const char* debug_info,
   }
 }
 
+template <typename T, typename MT>
+void PrintForDifferentLevelFile(const char* debug_info,
+                                int64_t numel,
+                                int64_t num_nan,
+                                int64_t num_inf,
+                                MT max_value,
+                                MT min_value,
+                                MT mean_value,
+                                int check_nan_inf_level,
+                                const std::string& log_name) {
+  int dev_id = 0;
+#ifdef PADDLE_WITH_HIP
+  hipGetDevice(&dev_id);
+#elif PADDLE_WITH_CUDA
+  cudaGetDevice(&dev_id);
+#endif
+  auto file_path = GetNanPath();
+  MKDIR(file_path.c_str());
+  std::string file_name = "worker_" + log_name + "." + std::to_string(dev_id);
+  std::string path = file_path + file_name;
+  std::ofstream outfile(path, std::ios::app);
+  if (!outfile.is_open()) {
+    return;
+  }
+
+  if (num_nan > 0 || num_inf > 0) {
+    outfile << "[PRECISION] [ERROR] in " << debug_info
+            << ", numel=" << static_cast<long long>(numel)      // NOLINT
+            << ", num_nan=" << static_cast<long long>(num_nan)  // NOLINT
+            << ", num_inf=" << static_cast<long long>(num_inf)  // NOLINT
+            << ", max=" << static_cast<float>(max_value)
+            << ", min=" << static_cast<float>(min_value)
+            << ", mean=" << static_cast<float>(mean_value) << std::endl;
+  } else if (NeedPrint<T, MT>(max_value, min_value, check_nan_inf_level)) {
+    outfile << "[PRECISION] in " << debug_info
+            << ", numel=" << static_cast<long long>(numel)  // NOLINT
+            << ", max=" << static_cast<float>(max_value)
+            << ", min=" << static_cast<float>(min_value)
+            << ", mean=" << static_cast<float>(mean_value) << std::endl;
+  }
+  outfile.close();
+}
+
 template <typename T>
 inline std::string GetCpuHintString(const std::string& op_type,
                                     const std::string& var_name,
@@ -120,6 +179,130 @@ inline std::string GetCpuHintString(const std::string& op_type,
   return ss.str();
 }
 
+template <
+    typename T,
+    std::enable_if_t<!std::is_same<T, phi::dtype::complex<float>>::value &&
+                         !std::is_same<T, phi::dtype::complex<double>>::value,
+                     bool> = true>
+static void CheckNanInfCpuImpl(const T* value_ptr,
+                               const int64_t numel,
+                               const std::string& cpu_hint_str,
+                               const std::string log_name = "cpu") {
+  using MT = typename phi::dtype::template MPTypeTrait<T>::Type;
+
+#ifdef _OPENMP
+  // Use maximum 4 threads to collect the nan and inf information.
+  int num_threads = std::max(omp_get_num_threads(), 1);
+  num_threads = std::min(num_threads, 4);
+#else
+  int num_threads = 1;
+#endif
+
+  std::vector<int64_t> thread_num_nan(num_threads, 0);
+  std::vector<int64_t> thread_num_inf(num_threads, 0);
+  std::vector<MT> thread_min_value(num_threads, static_cast<MT>(value_ptr[0]));
+  std::vector<MT> thread_max_value(num_threads, static_cast<MT>(value_ptr[0]));
+  std::vector<MT> thread_mean_value(num_threads, static_cast<MT>(0));
+
+#ifdef _OPENMP
+#pragma omp parallel num_threads(num_threads)
+#endif
+  {
+#ifdef _OPENMP
+    int64_t tid = omp_get_thread_num();
+    int64_t chunk_size = (numel + num_threads - 1) / num_threads;
+    int64_t begin = tid * chunk_size;
+    int64_t end = chunk_size + begin > numel ? numel : chunk_size + begin;
+#else
+    int64_t tid = 0;
+    int64_t begin = 0;
+    int64_t end = numel;
+#endif
+    for (int64_t i = begin; i < end; ++i) {
+      MT value = static_cast<MT>(value_ptr[i]);
+
+      thread_min_value[tid] = std::min(thread_min_value[tid], value);
+      thread_max_value[tid] = std::max(thread_max_value[tid], value);
+      thread_mean_value[tid] += value / static_cast<MT>(numel);
+
+      if (std::isnan(value)) {
+        thread_num_nan[tid] += 1;
+      } else if (std::isinf(value)) {
+        thread_num_inf[tid] += 1;
+      }
+    }
+  }
+
+  int64_t num_nan = 0;
+  int64_t num_inf = 0;
+  MT min_value = thread_min_value[0];
+  MT max_value = thread_max_value[0];
+  MT mean_value = static_cast<MT>(0);
+  for (int i = 0; i < num_threads; ++i) {
+    num_nan += thread_num_nan[i];
+    num_inf += thread_num_inf[i];
+    min_value = std::min(thread_min_value[i], min_value);
+    max_value = std::max(thread_max_value[i], max_value);
+    mean_value += thread_mean_value[i];
+  }
+  auto file_path = GetNanPath();
+  // Write log to file
+  if (file_path.size() > 0) {
+    VLOG(4) << "[FLAGS_check_nan_inf_level=" << FLAGS_check_nan_inf_level
+            << "]. Write log to " << file_path;
+    PrintForDifferentLevelFile<T, MT>(cpu_hint_str.c_str(),
+                                      numel,
+                                      num_nan,
+                                      num_inf,
+                                      max_value,
+                                      min_value,
+                                      mean_value,
+                                      FLAGS_check_nan_inf_level,
+                                      log_name);
+    return;
+  }
+
+  PrintForDifferentLevel<T, MT>(cpu_hint_str.c_str(),
+                                numel,
+                                num_nan,
+                                num_inf,
+                                max_value,
+                                min_value,
+                                mean_value,
+                                FLAGS_check_nan_inf_level);
+}
+
+template <
+    typename T,
+    std::enable_if_t<std::is_same<T, phi::dtype::complex<float>>::value ||
+                         std::is_same<T, phi::dtype::complex<double>>::value,
+                     bool> = true>
+void CheckNanInfCpuImpl(const T* value_ptr,
+                        const int64_t numel,
+                        const std::string& cpu_hint_str,
+                        const std::string log_name = "cpu") {
+  using RealType = typename T::value_type;
+
+  RealType real_sum = 0.0f, imag_sum = 0.0f;
+
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+ : real_sum) reduction(+ : imag_sum)
+#endif
+  for (int64_t i = 0; i < numel; ++i) {
+    T value = value_ptr[i];
+    real_sum += (value.real - value.real);
+    imag_sum += (value.imag - value.imag);
+  }
+
+  if (std::isnan(real_sum) || std::isinf(real_sum) || std::isnan(imag_sum) ||
+      std::isinf(imag_sum)) {
+    // hot fix for compile failed in gcc4.8
+    // here also need print detail info of nan or inf later
+    PADDLE_THROW(platform::errors::PreconditionNotMet(
+        "There are NAN or INF in %s.", cpu_hint_str));
+  }
+}
+
 template <typename DeviceContext>
 struct TensorCheckerVisitor {
   TensorCheckerVisitor(const std::string& o,
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 556965770738913780806f134a2b7a9e4289939d..020a926b4739e5e5a63b53056b6200c4de360b88 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -34,6 +34,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/custom_operator.h"
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/framework/data_type_transform.h"
+#include "paddle/fluid/framework/details/nan_inf_utils_detail.h"
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/executor_cache.h"
 #include "paddle/fluid/framework/executor_gc_helper.h"
@@ -2671,6 +2672,9 @@ All parameter, weight, gradient are variables in Paddle.
 
   m.def("use_layout_autotune",
         [] { return egr::Controller::Instance().UseLayoutAutoTune(); });
 
+  // Add the api for nan op debug
+  m.def("set_nan_inf_debug_path",
+        &paddle::framework::details::SetNanInfDebugPath);
   BindFleetWrapper(&m);
   BindIO(&m);
diff --git a/python/paddle/fluid/tests/unittests/test_nan_inf_dir.py b/python/paddle/fluid/tests/unittests/test_nan_inf_dir.py
new file mode 100644
index 0000000000000000000000000000000000000000..49882d192f9f646ad2550ab4c97acdd8c63c04e4
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_nan_inf_dir.py
@@ -0,0 +1,108 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import sys
+import unittest
+
+import numpy as np
+
+import paddle
+
+
+class TestNanInfDirCheckResult(unittest.TestCase):
+    def generate_inputs(self, shape, dtype="float32"):
+        data = np.random.random(size=shape).astype(dtype)
+        # [-10, 10)
+        x = (data * 20 - 10) * np.random.randint(
+            low=0, high=2, size=shape
+        ).astype(dtype)
+        y = np.random.randint(low=0, high=2, size=shape).astype(dtype)
+        return x, y
+
+    def get_reference_num_nan_inf(self, x):
+        out = np.log(x)
+        num_nan = np.sum(np.isnan(out))
+        num_inf = np.sum(np.isinf(out))
+        print("[reference] num_nan={}, num_inf={}".format(num_nan, num_inf))
+        return num_nan, num_inf
+
+    def get_num_nan_inf(
+        self, x_np, use_cuda=True, add_assert=False, pt="nan_inf_log_dir"
+    ):
+        num_nan = 0
+        num_inf = 0
+        if add_assert:
+            if use_cuda:
+                paddle.device.set_device("gpu:0")
+            else:
+                paddle.device.set_device("cpu")
+        x = paddle.to_tensor(x_np)
+        out = paddle.log(x)
+        sys.stdout.flush()
+        if not use_cuda:
+            os.path.exists(pt)
+            num_nan = 0
+            num_inf = 0
+            for root, dirs, files in os.walk(pt):
+                for file_name in files:
+                    if file_name.startswith('worker_cpu'):
+                        file_path = os.path.join(root, file_name)
+                        with open(file_path, "rb") as fp:
+                            for e in fp:
+                                err_str_list = (
+                                    str(e)
+                                    .replace("(", " ")
+                                    .replace(")", " ")
+                                    .replace(",", " ")
+                                    .split(" ")
+                                )
+                                for err_str in err_str_list:
+                                    if "num_nan" in err_str:
+                                        num_nan = int(err_str.split("=")[1])
+                                    elif "num_inf" in err_str:
+                                        num_inf = int(err_str.split("=")[1])
+            print(
+                "[paddle] num_nan={}, num_inf={}".format(num_nan, num_inf)
+            )
+        return num_nan, num_inf
+
+    def test_num_nan_inf(self):
+        path = "nan_inf_log_dir"
+        paddle.fluid.core.set_nan_inf_debug_path(path)
+
+        def _check_num_nan_inf(use_cuda):
+            shape = [32, 32]
+            x_np, _ = self.generate_inputs(shape)
+            num_nan_np, num_inf_np = self.get_reference_num_nan_inf(x_np)
+            add_assert = (num_nan_np + num_inf_np) > 0
+            num_nan, num_inf = self.get_num_nan_inf(
+                x_np, use_cuda, add_assert, path
+            )
+            if not use_cuda:
+                assert num_nan == num_nan_np and num_inf == num_inf_np
+
+        paddle.set_flags(
+            {"FLAGS_check_nan_inf": 1, "FLAGS_check_nan_inf_level": 3}
+        )
+        _check_num_nan_inf(use_cuda=False)
+        if paddle.fluid.core.is_compiled_with_cuda():
+            _check_num_nan_inf(use_cuda=True)
+        x = paddle.to_tensor([2, 3, 4], 'float32')
+        y = paddle.to_tensor([1, 5, 2], 'float32')
+        z = paddle.add(x, y)
+
+
+if __name__ == '__main__':
+    unittest.main()
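
For reference, a minimal usage sketch of the debug-path API introduced by this patch, mirroring the new unit test. The directory name and the sample tensor are arbitrary choices for illustration; per the changes above, FLAGS_check_nan_inf_level=3 makes the checker log statistics without raising, and the log files are named "worker_<log_name>.<device_id>" (e.g. worker_cpu.0 or worker_gpu.0) under the configured directory.

    import numpy as np
    import paddle

    # Added by this patch: redirect per-op NaN/Inf statistics from the console
    # to files under the given directory.
    paddle.fluid.core.set_nan_inf_debug_path("nan_inf_log_dir")
    paddle.set_flags({"FLAGS_check_nan_inf": 1, "FLAGS_check_nan_inf_level": 3})

    x = paddle.to_tensor(np.array([1.0, 0.0, -1.0], dtype="float32"))
    out = paddle.log(x)  # log(0) and log(-1) yield -inf/nan, recorded in the log dir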