diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 8d4623468b98d40ce52f88218a7927803ef494ca..fcd98fb52d1f0236ec58c6ecd1a4269de7f804b5 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -35,6 +35,9 @@ DEFINE_bool(check_nan_inf, false, "Checking whether operator produce NAN/INF or not. It will be " "extremely slow so please use this flag wisely."); DEFINE_int32(inner_op_parallelism, 0, "number of threads for inner op"); +DEFINE_bool(fast_check_nan_inf, false, + "Fast checking NAN/INF after each operation. It will be a little " + "bit slow, much faster than check_nan_inf"); namespace paddle { namespace framework { @@ -947,6 +950,25 @@ void OperatorWithKernel::RunImpl(const Scope& scope, dev_ctx->Wait(); } + if (FLAGS_fast_check_nan_inf) { + for (auto& vname : OutputVars(true)) { + // only check inserted vars, + // please see executor.py for details of fast_check_nan_inf + if (vname.rfind("debug_var") == 0) { + VLOG(3) << "debugging nan/inf in var " << vname; + + auto* var = exec_scope.FindVar(vname); + if (var == nullptr) continue; + if (var->IsType<framework::LoDTensor>()) { + CheckTensorNANOrInf(type_, vname, var->Get<framework::LoDTensor>()); + } else if (var->IsType<framework::SelectedRows>()) { + CheckTensorNANOrInf(type_, vname, + var->Get<framework::SelectedRows>().value()); + } + } + } + } + if (FLAGS_check_nan_inf) { for (auto& vname : OutputVars(true)) { auto* var = exec_scope.FindVar(vname); diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 304643ea9a10ab017dad14030e2f402aeeb4e8a9..35e8ffcda804b3e7c2a74e10440517a8bf6ba5fe 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -152,15 +152,15 @@ def __bootstrap__(): os.environ['OMP_NUM_THREADS'] = str(num_threads) sysstr = platform.system() read_env_flags = [ - 'check_nan_inf', 'benchmark', 'eager_delete_scope', - 'initial_cpu_memory_in_mb', 'init_allocated_mem', 'free_idle_memory', - 'paddle_num_threads', "dist_threadpool_size", 'eager_delete_tensor_gb', - 
'fast_eager_deletion_mode', 'memory_fraction_of_eager_deletion', - 'allocator_strategy', 'reader_queue_speed_test_mode', - 'print_sub_graph_dir', 'pe_profile_fname', 'inner_op_parallelism', - 'enable_parallel_graph', 'fuse_parameter_groups_size', - 'multiple_of_cupti_buffer_size', 'fuse_parameter_memory_size', - 'tracer_profile_fname', 'dygraph_debug' + 'check_nan_inf', 'fast_check_nan_inf', 'benchmark', + 'eager_delete_scope', 'initial_cpu_memory_in_mb', 'init_allocated_mem', + 'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size", + 'eager_delete_tensor_gb', 'fast_eager_deletion_mode', + 'memory_fraction_of_eager_deletion', 'allocator_strategy', + 'reader_queue_speed_test_mode', 'print_sub_graph_dir', + 'pe_profile_fname', 'inner_op_parallelism', 'enable_parallel_graph', + 'fuse_parameter_groups_size', 'multiple_of_cupti_buffer_size', + 'fuse_parameter_memory_size', 'tracer_profile_fname', 'dygraph_debug' ] if 'Darwin' not in sysstr: read_env_flags.append('use_pinned_memory') diff --git a/python/paddle/fluid/debugger.py b/python/paddle/fluid/debugger.py index 63060a77d1abdfd4060648bfabe25709afcfeb8d..ef07dcebcabfe8aa3c0e3366597e40583a57db7c 100644 --- a/python/paddle/fluid/debugger.py +++ b/python/paddle/fluid/debugger.py @@ -16,10 +16,17 @@ from __future__ import print_function import sys import six +import random +import os import re from .graphviz import GraphPreviewGenerator from .proto import framework_pb2 from google.protobuf import text_format +from . import unique_name +from .framework import Program, default_main_program, Variable +from . import core +from . import io +from .layer_helper import LayerHelper _vartype2str_ = [ "UNK", @@ -273,3 +280,88 @@ def draw_block_graphviz(block, highlights=None, path="./temp.dot"): add_op_link_var(opn, var, True) graph(path, show=False) + + +def prepare_fast_nan_inf_debug(_program): + """ + Given a program to run, insert a (reduce) sum op for every var in that program. 
+ Instead of checking all vars originally defined in the program, + only those inserted ops will be checked in the c++ end, to detect if they contain NAN or INF. + Therefore, the speed of nan/inf checking could be improved. + Please set ``FLAGS_fast_check_nan_inf`` to enable the fast nan/inf check feature. + """ + + helper = LayerHelper('reduce_sum', **locals()) + + if _program is None: + _program = default_main_program() + + for _block in _program.blocks: + # fetch vars in the current block + _vars_in_prog = [] + for _var_name in _block.vars: + _vars_in_prog.append((_var_name, _block.vars[_var_name])) + + # append sum_op in the current block + for _var_name, _var in _vars_in_prog: + + try: + + if _var.dtype == -1: + continue + + ## create a var for holding sum output + _output_var = _block.create_var( + name=unique_name.generate("debug_var_" + _var_name), + dtype=_var.dtype, + type=core.VarDesc.VarType.LOD_TENSOR, + persistable=False, + stop_gradient=True) + + ## create a sum op, input each existing var in the block + _block.append_op( + type='sum', + outputs={'Out': _output_var}, + inputs={'X': [_var]}) + except Exception as e: + pass + + +def run_fast_nan_inf_debug(executor, + program=None, + feed=None, + fetch_list=None, + feed_var_name='feed', + fetch_var_name='fetch', + scope=None, + return_numpy=True, + use_program_cache=False, + dump_core=True): + """ + Run a program by the given executor. Catch the exception of NAN and INF, and save persistables into the dumped core. 
+ """ + + assert (executor is not None) + + try: + output = executor.run(program=program, + feed=feed, + fetch_list=fetch_list, + feed_var_name=feed_var_name, + fetch_var_name=fetch_var_name, + scope=scope, + return_numpy=return_numpy, + use_program_cache=use_program_cache) + + return output + + except Exception as e: + + print("catch an exception:") + print(e) + + core_filename = "core" + str(int(random.random() * 10000)) + ".pdckpt" + io.save_persistables( + executor, "./", main_program=program, filename=core_filename) + + print("dumping a core into ./%s" % core_filename)