Unverified commit e9c7e218, authored by pkpk, committed by GitHub

Nan debugger init (#18401)

test=develop
Parent f72ced88
@@ -35,6 +35,9 @@ DEFINE_bool(check_nan_inf, false,
            "Checking whether operator produce NAN/INF or not. It will be "
            "extremely slow so please use this flag wisely.");
DEFINE_int32(inner_op_parallelism, 0, "number of threads for inner op");
DEFINE_bool(fast_check_nan_inf, false,
            "Fast checking NAN/INF after each operation. It will be a little "
            "bit slow, but much faster than check_nan_inf.");
namespace paddle {
namespace framework {

@@ -947,6 +950,25 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
    dev_ctx->Wait();
  }
  if (FLAGS_fast_check_nan_inf) {
    for (auto& vname : OutputVars(true)) {
      // only check inserted vars,
      // please see executor.py for details of fast_check_nan_inf
      if (vname.rfind("debug_var") == 0) {
        VLOG(3) << "debugging nan/inf in var " << vname;

        auto* var = exec_scope.FindVar(vname);
        if (var == nullptr) continue;
        if (var->IsType<framework::LoDTensor>()) {
          CheckTensorNANOrInf(type_, vname, var->Get<framework::LoDTensor>());
        } else if (var->IsType<framework::SelectedRows>()) {
          CheckTensorNANOrInf(type_, vname,
                              var->Get<framework::SelectedRows>().value());
        }
      }
    }
  }

  if (FLAGS_check_nan_inf) {
    for (auto& vname : OutputVars(true)) {
      auto* var = exec_scope.FindVar(vname);
...
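The fast check above only inspects output variables whose names start with "debug_var", i.e. the outputs of the sum ops inserted by prepare_fast_nan_inf_debug (see the Python changes below). The test that CheckTensorNANOrInf performs on each such output is conceptually equivalent to the following NumPy sketch (contains_nan_or_inf is an illustrative name, not a Paddle API):

```python
import numpy as np


def contains_nan_or_inf(array):
    """Return True if any element of the array is NAN or INF (illustrative only)."""
    return bool(np.isnan(array).any() or np.isinf(array).any())


# Example: a tensor poisoned by a division by zero holds INF.
x = np.array([1.0, 2.0, 0.0])
print(contains_nan_or_inf(1.0 / x))  # True
```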
@@ -152,15 +152,15 @@ def __bootstrap__():
        os.environ['OMP_NUM_THREADS'] = str(num_threads)
    sysstr = platform.system()
    read_env_flags = [
        'check_nan_inf', 'fast_check_nan_inf', 'benchmark',
        'eager_delete_scope', 'initial_cpu_memory_in_mb', 'init_allocated_mem',
        'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size",
        'eager_delete_tensor_gb', 'fast_eager_deletion_mode',
        'memory_fraction_of_eager_deletion', 'allocator_strategy',
        'reader_queue_speed_test_mode', 'print_sub_graph_dir',
        'pe_profile_fname', 'inner_op_parallelism', 'enable_parallel_graph',
        'fuse_parameter_groups_size', 'multiple_of_cupti_buffer_size',
        'fuse_parameter_memory_size', 'tracer_profile_fname', 'dygraph_debug'
    ]
    if 'Darwin' not in sysstr:
        read_env_flags.append('use_pinned_memory')
...
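With the flag registered in read_env_flags, it is picked up from the environment during __bootstrap__(). A minimal way to enable it from a training script (assuming the usual FLAGS_* environment-variable convention; the variable must be set before paddle.fluid is imported):

```python
import os

# Enable the fast NAN/INF check; must happen before importing paddle.fluid,
# because the flag is read from the environment during __bootstrap__().
os.environ['FLAGS_fast_check_nan_inf'] = '1'

import paddle.fluid as fluid
```

Equivalently, the flag can be set in the shell environment, e.g. `FLAGS_fast_check_nan_inf=1 python train.py`.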
@@ -16,10 +16,17 @@ from __future__ import print_function
import sys
import six
import random
import os
import re
from .graphviz import GraphPreviewGenerator
from .proto import framework_pb2
from google.protobuf import text_format
from . import unique_name
from .framework import Program, default_main_program, Variable
from . import core
from . import io
from .layer_helper import LayerHelper

_vartype2str_ = [
    "UNK",

@@ -273,3 +280,88 @@ def draw_block_graphviz(block, highlights=None, path="./temp.dot"):
            add_op_link_var(opn, var, True)
    graph(path, show=False)


def prepare_fast_nan_inf_debug(_program):
    """
    Given a program to run, insert a (reduce) sum op for every var in that program.
    Instead of checking all vars originally defined in the program,
    only the outputs of these inserted ops are checked on the C++ side to detect NAN or INF.
    Therefore, the speed of nan/inf checking is improved.

    Please set ``FLAGS_fast_check_nan_inf`` to enable the fast nan/inf check.
    """
    helper = LayerHelper('reduce_sum', **locals())

    if _program is None:
        _program = default_main_program()

    for _block in _program.blocks:
        # fetch vars in the current block
        _vars_in_prog = []
        for _var_name in _block.vars:
            _vars_in_prog.append((_var_name, _block.vars[_var_name]))

        # append a sum op for each var in the current block
        for _var_name, _var in _vars_in_prog:
            try:
                if _var.dtype == -1:
                    continue

                # create a var to hold the sum output
                _output_var = _block.create_var(
                    name=unique_name.generate("debug_var_" + _var_name),
                    dtype=_var.dtype,
                    type=core.VarDesc.VarType.LOD_TENSOR,
                    persistable=False,
                    stop_gradient=True)

                # create a sum op that takes the existing var as input
                _block.append_op(
                    type='sum',
                    outputs={'Out': _output_var},
                    inputs={'X': [_var]})
            except Exception as e:
                # skip vars for which the sum op cannot be created
                pass


def run_fast_nan_inf_debug(executor,
                           program=None,
                           feed=None,
                           fetch_list=None,
                           feed_var_name='feed',
                           fetch_var_name='fetch',
                           scope=None,
                           return_numpy=True,
                           use_program_cache=False,
                           dump_core=True):
    """
    Run a program with the given executor. Catch any NAN/INF exception and
    save the persistables into a dumped core file.
    """
    assert (executor is not None)

    try:
        output = executor.run(program=program,
                              feed=feed,
                              fetch_list=fetch_list,
                              feed_var_name=feed_var_name,
                              fetch_var_name=fetch_var_name,
                              scope=scope,
                              return_numpy=return_numpy,
                              use_program_cache=use_program_cache)
        return output
    except Exception as e:
        print("caught an exception:")
        print(e)

        core_filename = "core" + str(int(random.random() * 10000)) + ".pdckpt"
        io.save_persistables(
            executor, "./", main_program=program, filename=core_filename)
        print("dumping a core into ./%s" % core_filename)