未验证 提交 751305ec 编写于 作者: L Leo Chen 提交者: GitHub

Add flags to control call stack of error message (#25997)

* add flags_call_stack_level

* update

* refine code
上级 fd2947ba
...@@ -33,6 +33,7 @@ limitations under the License. */ ...@@ -33,6 +33,7 @@ limitations under the License. */
#include <curand.h> #include <curand.h>
#include <thrust/system/cuda/error.h> #include <thrust/system/cuda/error.h>
#include <thrust/system_error.h> #include <thrust/system_error.h>
#include "paddle/fluid/platform/cuda_error.pb.h" #include "paddle/fluid/platform/cuda_error.pb.h"
#endif // PADDLE_WITH_CUDA #endif // PADDLE_WITH_CUDA
...@@ -69,6 +70,8 @@ limitations under the License. */ ...@@ -69,6 +70,8 @@ limitations under the License. */
#include "paddle/fluid/framework/type_defs.h" #include "paddle/fluid/framework/type_defs.h"
#include "paddle/fluid/imperative/type_defs.h" #include "paddle/fluid/imperative/type_defs.h"
DECLARE_int32(call_stack_level);
namespace paddle { namespace paddle {
namespace platform { namespace platform {
...@@ -226,9 +229,7 @@ inline std::string SimplifyDemangleStr(std::string str) { ...@@ -226,9 +229,7 @@ inline std::string SimplifyDemangleStr(std::string str) {
return str; return str;
} }
template <typename StrType> inline std::string GetCurrentTraceBackString() {
inline std::string GetTraceBackString(StrType&& what, const char* file,
int line) {
static constexpr int TRACE_STACK_LIMIT = 100; static constexpr int TRACE_STACK_LIMIT = 100;
std::ostringstream sout; std::ostringstream sout;
...@@ -256,6 +257,13 @@ inline std::string GetTraceBackString(StrType&& what, const char* file, ...@@ -256,6 +257,13 @@ inline std::string GetTraceBackString(StrType&& what, const char* file,
#else #else
sout << "Windows not support stack backtrace yet.\n"; sout << "Windows not support stack backtrace yet.\n";
#endif #endif
return sout.str();
}
template <typename StrType>
inline std::string GetErrorSumaryString(StrType&& what, const char* file,
int line) {
std::ostringstream sout;
sout << "\n----------------------\nError Message " sout << "\n----------------------\nError Message "
"Summary:\n----------------------\n"; "Summary:\n----------------------\n";
sout << string::Sprintf("%s at (%s:%d)", std::forward<StrType>(what), file, sout << string::Sprintf("%s at (%s:%d)", std::forward<StrType>(what), file,
...@@ -264,6 +272,17 @@ inline std::string GetTraceBackString(StrType&& what, const char* file, ...@@ -264,6 +272,17 @@ inline std::string GetTraceBackString(StrType&& what, const char* file,
return sout.str(); return sout.str();
} }
// Builds the full error text for an exception: optionally the C++ call
// stack, followed by the error-message summary.
//
// `what` : the error message (forwarding reference; forwarded through so an
//          rvalue message is moved, not copied).
// `file` / `line` : source location where the error was raised.
//
// FLAGS_call_stack_level > 1 means the C++ call stack should be included;
// otherwise only the summary is returned. (The Python-stack part of the
// flag's contract is handled on the Python side.)
template <typename StrType>
inline std::string GetTraceBackString(StrType&& what, const char* file,
                                      int line) {
  if (FLAGS_call_stack_level > 1) {
    // FLAGS_call_stack_level>1 means showing c++ call stack
    return GetCurrentTraceBackString() +
           GetErrorSumaryString(std::forward<StrType>(what), file, line);
  } else {
    return GetErrorSumaryString(std::forward<StrType>(what), file, line);
  }
}
inline bool is_error(bool stat) { return !stat; } inline bool is_error(bool stat) { return !stat; }
inline void throw_on_error(bool stat, const std::string& msg) { inline void throw_on_error(bool stat, const std::string& msg) {
...@@ -427,7 +446,7 @@ struct EnforceNotMet : public std::exception { ...@@ -427,7 +446,7 @@ struct EnforceNotMet : public std::exception {
* *
* Examples: * Examples:
* GET_DATA_SAFELY(ctx.Input<LoDTensor>("X"), "Input", "X", "Mul"); * GET_DATA_SAFELY(ctx.Input<LoDTensor>("X"), "Input", "X", "Mul");
*/ */
#define GET_DATA_SAFELY(__PTR, __ROLE, __NAME, __OP_TYPE) \ #define GET_DATA_SAFELY(__PTR, __ROLE, __NAME, __OP_TYPE) \
(([&]() -> std::add_lvalue_reference<decltype(*(__PTR))>::type { \ (([&]() -> std::add_lvalue_reference<decltype(*(__PTR))>::type { \
auto* __ptr = (__PTR); \ auto* __ptr = (__PTR); \
...@@ -463,7 +482,7 @@ struct EnforceNotMet : public std::exception { ...@@ -463,7 +482,7 @@ struct EnforceNotMet : public std::exception {
* *
* Examples: * Examples:
* OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Mul"); * OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Mul");
*/ */
#define OP_INOUT_CHECK(__EXPR, __ROLE, __NAME, __OP_TYPE) \ #define OP_INOUT_CHECK(__EXPR, __ROLE, __NAME, __OP_TYPE) \
do { \ do { \
PADDLE_ENFORCE_EQ(__EXPR, true, paddle::platform::errors::NotFound( \ PADDLE_ENFORCE_EQ(__EXPR, true, paddle::platform::errors::NotFound( \
...@@ -491,7 +510,7 @@ struct EnforceNotMet : public std::exception { ...@@ -491,7 +510,7 @@ struct EnforceNotMet : public std::exception {
* Note: GCC 4.8 cannot select right overloaded function here, so need * Note: GCC 4.8 cannot select right overloaded function here, so need
 * to define different functions and macros here, after we upgrade * to define different functions and macros here, after we upgrade
* CI gcc version, we can only define one BOOST_GET macro. * CI gcc version, we can only define one BOOST_GET macro.
*/ */
namespace details { namespace details {
#define DEFINE_SAFE_BOOST_GET(__InputType, __OutputType, __OutputTypePtr, \ #define DEFINE_SAFE_BOOST_GET(__InputType, __OutputType, __OutputTypePtr, \
......
...@@ -483,3 +483,28 @@ DEFINE_double(local_exe_sub_scope_limit, 256.0, // MBytes ...@@ -483,3 +483,28 @@ DEFINE_double(local_exe_sub_scope_limit, 256.0, // MBytes
* Note: * Note:
*/ */
DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run"); DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run");
/**
* Debug related FLAG
* Name: FLAGS_call_stack_level
* Since Version: 2.0.0
* Value Range: int, default=2
* Example:
 * Note: Used to debug. Determine the call stack to print when error or
 * exception happens.
* If FLAGS_call_stack_level == 0, only the error message summary will be shown.
* If FLAGS_call_stack_level == 1, the python stack and error message summary
* will be shown.
* If FLAGS_call_stack_level == 2, the python stack, c++ stack, and error
* message summary will be shown.
*/
// Gflags definition for FLAGS_call_stack_level (see the doc comment above
// for the full value-range description). Help-string segments carry trailing
// spaces so the concatenated message reads as separate sentences.
DEFINE_int32(
    call_stack_level, 2,
    "Determine the call stack to print when error or exception happens. "
    // TODO(zhiqiu): implement logic of FLAGS_call_stack_level==0
    // "If FLAGS_call_stack_level == 0, only the error message summary will be "
    // "shown. "
    "If FLAGS_call_stack_level == 1, the python stack and error message "
    "summary will be shown. "
    "If FLAGS_call_stack_level == 2, the python stack, c++ stack, and "
    "error message summary will be shown.");
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/pybind/global_value_getter_setter.h" #include "paddle/fluid/pybind/global_value_getter_setter.h"
#include <cctype> #include <cctype>
#include <functional> #include <functional>
#include <string> #include <string>
...@@ -20,6 +21,7 @@ ...@@ -20,6 +21,7 @@
#include <unordered_set> #include <unordered_set>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "paddle/fluid/framework/python_headers.h" #include "paddle/fluid/framework/python_headers.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
...@@ -35,6 +37,7 @@ DECLARE_bool(cpu_deterministic); ...@@ -35,6 +37,7 @@ DECLARE_bool(cpu_deterministic);
DECLARE_bool(enable_rpc_profiler); DECLARE_bool(enable_rpc_profiler);
DECLARE_int32(multiple_of_cupti_buffer_size); DECLARE_int32(multiple_of_cupti_buffer_size);
DECLARE_bool(reader_queue_speed_test_mode); DECLARE_bool(reader_queue_speed_test_mode);
DECLARE_int32(call_stack_level);
// device management // device management
DECLARE_int32(paddle_num_threads); DECLARE_int32(paddle_num_threads);
// executor // executor
...@@ -337,14 +340,15 @@ static void RegisterGlobalVarGetterSetter() { ...@@ -337,14 +340,15 @@ static void RegisterGlobalVarGetterSetter() {
REGISTER_PUBLIC_GLOBAL_VAR( REGISTER_PUBLIC_GLOBAL_VAR(
FLAGS_eager_delete_tensor_gb, FLAGS_enable_parallel_graph, FLAGS_eager_delete_tensor_gb, FLAGS_enable_parallel_graph,
FLAGS_allocator_strategy, FLAGS_use_system_allocator, FLAGS_check_nan_inf, FLAGS_allocator_strategy, FLAGS_use_system_allocator, FLAGS_check_nan_inf,
FLAGS_cpu_deterministic, FLAGS_enable_rpc_profiler, FLAGS_call_stack_level, FLAGS_cpu_deterministic,
FLAGS_multiple_of_cupti_buffer_size, FLAGS_reader_queue_speed_test_mode, FLAGS_enable_rpc_profiler, FLAGS_multiple_of_cupti_buffer_size,
FLAGS_pe_profile_fname, FLAGS_print_sub_graph_dir, FLAGS_reader_queue_speed_test_mode, FLAGS_pe_profile_fname,
FLAGS_fraction_of_cpu_memory_to_use, FLAGS_fuse_parameter_groups_size, FLAGS_print_sub_graph_dir, FLAGS_fraction_of_cpu_memory_to_use,
FLAGS_fuse_parameter_memory_size, FLAGS_init_allocated_mem, FLAGS_fuse_parameter_groups_size, FLAGS_fuse_parameter_memory_size,
FLAGS_initial_cpu_memory_in_mb, FLAGS_memory_fraction_of_eager_deletion, FLAGS_init_allocated_mem, FLAGS_initial_cpu_memory_in_mb,
FLAGS_use_pinned_memory, FLAGS_benchmark, FLAGS_inner_op_parallelism, FLAGS_memory_fraction_of_eager_deletion, FLAGS_use_pinned_memory,
FLAGS_tracer_profile_fname, FLAGS_paddle_num_threads); FLAGS_benchmark, FLAGS_inner_op_parallelism, FLAGS_tracer_profile_fname,
FLAGS_paddle_num_threads);
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
REGISTER_PUBLIC_GLOBAL_VAR( REGISTER_PUBLIC_GLOBAL_VAR(
......
...@@ -166,17 +166,34 @@ def __bootstrap__(): ...@@ -166,17 +166,34 @@ def __bootstrap__():
os.environ['OMP_NUM_THREADS'] = str(num_threads) os.environ['OMP_NUM_THREADS'] = str(num_threads)
sysstr = platform.system() sysstr = platform.system()
read_env_flags = [ read_env_flags = [
'check_nan_inf', 'fast_check_nan_inf', 'benchmark', 'check_nan_inf',
'eager_delete_scope', 'fraction_of_cpu_memory_to_use', 'fast_check_nan_inf',
'initial_cpu_memory_in_mb', 'init_allocated_mem', 'paddle_num_threads', 'benchmark',
'dist_threadpool_size', 'eager_delete_tensor_gb', 'eager_delete_scope',
'fast_eager_deletion_mode', 'memory_fraction_of_eager_deletion', 'fraction_of_cpu_memory_to_use',
'allocator_strategy', 'reader_queue_speed_test_mode', 'initial_cpu_memory_in_mb',
'print_sub_graph_dir', 'pe_profile_fname', 'inner_op_parallelism', 'init_allocated_mem',
'enable_parallel_graph', 'fuse_parameter_groups_size', 'paddle_num_threads',
'multiple_of_cupti_buffer_size', 'fuse_parameter_memory_size', 'dist_threadpool_size',
'tracer_profile_fname', 'dygraph_debug', 'use_system_allocator', 'eager_delete_tensor_gb',
'enable_unused_var_check', 'free_idle_chunk', 'free_when_no_cache_hit' 'fast_eager_deletion_mode',
'memory_fraction_of_eager_deletion',
'allocator_strategy',
'reader_queue_speed_test_mode',
'print_sub_graph_dir',
'pe_profile_fname',
'inner_op_parallelism',
'enable_parallel_graph',
'fuse_parameter_groups_size',
'multiple_of_cupti_buffer_size',
'fuse_parameter_memory_size',
'tracer_profile_fname',
'dygraph_debug',
'use_system_allocator',
'enable_unused_var_check',
'free_idle_chunk',
'free_when_no_cache_hit',
'call_stack_level',
] ]
if 'Darwin' not in sysstr: if 'Darwin' not in sysstr:
read_env_flags.append('use_pinned_memory') read_env_flags.append('use_pinned_memory')
...@@ -208,12 +225,19 @@ def __bootstrap__(): ...@@ -208,12 +225,19 @@ def __bootstrap__():
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
read_env_flags += [ read_env_flags += [
'fraction_of_gpu_memory_to_use', 'initial_gpu_memory_in_mb', 'fraction_of_gpu_memory_to_use',
'reallocate_gpu_memory_in_mb', 'cudnn_deterministic', 'initial_gpu_memory_in_mb',
'enable_cublas_tensor_op_math', 'conv_workspace_size_limit', 'reallocate_gpu_memory_in_mb',
'cudnn_exhaustive_search', 'selected_gpus', 'sync_nccl_allreduce', 'cudnn_deterministic',
'cudnn_batchnorm_spatial_persistent', 'gpu_allocator_retry_time', 'enable_cublas_tensor_op_math',
'local_exe_sub_scope_limit', 'gpu_memory_limit_mb' 'conv_workspace_size_limit',
'cudnn_exhaustive_search',
'selected_gpus',
'sync_nccl_allreduce',
'cudnn_batchnorm_spatial_persistent',
'gpu_allocator_retry_time',
'local_exe_sub_scope_limit',
'gpu_memory_limit_mb',
] ]
core.init_gflags(["--tryfromenv=" + ",".join(read_env_flags)]) core.init_gflags(["--tryfromenv=" + ",".join(read_env_flags)])
core.init_glog(sys.argv[0]) core.init_glog(sys.argv[0])
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册