Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
e9c7e218
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
e9c7e218
编写于
7月 03, 2019
作者:
P
pkpk
提交者:
GitHub
7月 03, 2019
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Nan debugger init (#18401)
test=develop
上级
f72ced88
变更
3
隐藏空白更改
内联
并排
Showing
3 changed file
with
123 addition
and
9 deletion
+123
-9
paddle/fluid/framework/operator.cc
paddle/fluid/framework/operator.cc
+22
-0
python/paddle/fluid/__init__.py
python/paddle/fluid/__init__.py
+9
-9
python/paddle/fluid/debugger.py
python/paddle/fluid/debugger.py
+92
-0
未找到文件。
paddle/fluid/framework/operator.cc
浏览文件 @
e9c7e218
...
...
@@ -35,6 +35,9 @@ DEFINE_bool(check_nan_inf, false,
"Checking whether operator produce NAN/INF or not. It will be "
"extremely slow so please use this flag wisely."
);
// Number of threads for an operator's inner computation; defaults to 0.
DEFINE_int32(inner_op_parallelism, 0, "number of threads for inner op");
// Enables the lightweight NAN/INF check in OperatorWithKernel::RunImpl, which
// only inspects output variables whose names begin with "debug_var".
// The first literal ends with a space so the concatenated help text reads
// "a little bit slow" rather than "a littlebit slow".
DEFINE_bool(fast_check_nan_inf, false,
            "Fast checking NAN/INF after each operation. It will be a little "
            "bit slow, much faster than check_nan_inf");
namespace
paddle
{
namespace
framework
{
...
...
@@ -947,6 +950,25 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
dev_ctx
->
Wait
();
}
if
(
FLAGS_fast_check_nan_inf
)
{
for
(
auto
&
vname
:
OutputVars
(
true
))
{
// only check inserted vars,
// please see prepare_fast_nan_inf_debug in debugger.py for details of fast_check_nan_inf
if
(
vname
.
rfind
(
"debug_var"
)
==
0
)
{
VLOG
(
3
)
<<
"debugging nan/inf in var "
<<
vname
;
auto
*
var
=
exec_scope
.
FindVar
(
vname
);
if
(
var
==
nullptr
)
continue
;
if
(
var
->
IsType
<
framework
::
LoDTensor
>
())
{
CheckTensorNANOrInf
(
type_
,
vname
,
var
->
Get
<
framework
::
LoDTensor
>
());
}
else
if
(
var
->
IsType
<
framework
::
SelectedRows
>
())
{
CheckTensorNANOrInf
(
type_
,
vname
,
var
->
Get
<
framework
::
SelectedRows
>
().
value
());
}
}
}
}
if
(
FLAGS_check_nan_inf
)
{
for
(
auto
&
vname
:
OutputVars
(
true
))
{
auto
*
var
=
exec_scope
.
FindVar
(
vname
);
...
...
python/paddle/fluid/__init__.py
浏览文件 @
e9c7e218
...
...
@@ -152,15 +152,15 @@ def __bootstrap__():
os
.
environ
[
'OMP_NUM_THREADS'
]
=
str
(
num_threads
)
sysstr
=
platform
.
system
()
read_env_flags
=
[
'check_nan_inf'
,
'
benchmark'
,
'eager_delete_scope
'
,
'
initial_cpu_memory_in_mb'
,
'init_allocated_mem'
,
'free_idle_memory
'
,
'
paddle_num_threads'
,
"dist_threadpool_size"
,
'eager_delete_tensor_gb'
,
'
fast_eager_deletion_mode'
,
'memory_fraction_of_eager_deletion
'
,
'
allocator_strategy'
,
'reader_queue_speed_test_mode
'
,
'
print_sub_graph_dir'
,
'pe_profile_fname'
,
'inner_op_parallelism
'
,
'
enable_parallel_graph'
,
'fuse_parameter_groups_size
'
,
'
multiple_of_cupti_buffer_size'
,
'fuse_parameter_memory
_size'
,
'tracer_profile_fname'
,
'dygraph_debug'
'check_nan_inf'
,
'
fast_check_nan_inf'
,
'benchmark
'
,
'
eager_delete_scope'
,
'initial_cpu_memory_in_mb'
,
'init_allocated_mem
'
,
'
free_idle_memory'
,
'paddle_num_threads'
,
"dist_threadpool_size"
,
'
eager_delete_tensor_gb'
,
'fast_eager_deletion_mode
'
,
'
memory_fraction_of_eager_deletion'
,
'allocator_strategy
'
,
'
reader_queue_speed_test_mode'
,
'print_sub_graph_dir
'
,
'
pe_profile_fname'
,
'inner_op_parallelism'
,
'enable_parallel_graph
'
,
'
fuse_parameter_groups_size'
,
'multiple_of_cupti_buffer
_size'
,
'
fuse_parameter_memory_size'
,
'
tracer_profile_fname'
,
'dygraph_debug'
]
if
'Darwin'
not
in
sysstr
:
read_env_flags
.
append
(
'use_pinned_memory'
)
...
...
python/paddle/fluid/debugger.py
浏览文件 @
e9c7e218
...
...
@@ -16,10 +16,17 @@ from __future__ import print_function
import
sys
import
six
import
random
import
os
import
re
from
.graphviz
import
GraphPreviewGenerator
from
.proto
import
framework_pb2
from
google.protobuf
import
text_format
from
.
import
unique_name
from
.framework
import
Program
,
default_main_program
,
Variable
from
.
import
core
from
.
import
io
from
.layer_helper
import
LayerHelper
_vartype2str_
=
[
"UNK"
,
...
...
@@ -273,3 +280,88 @@ def draw_block_graphviz(block, highlights=None, path="./temp.dot"):
add_op_link_var
(
opn
,
var
,
True
)
graph
(
path
,
show
=
False
)
def prepare_fast_nan_inf_debug(_program):
    """
    Insert a ``sum`` op for every var of a program to enable fast NAN/INF checks.

    For each block of the program, every variable present in the block when
    it is visited gets a companion output variable, named with the
    ``debug_var_`` prefix, and a ``sum`` op whose only input is the original
    variable. Instead of checking all vars originally defined in the program,
    only the inserted vars are checked on the C++ side to detect NAN or INF.
    Therefore, the speed of NAN/INF checking can be improved.

    Please set ``FLAGS_fast_check_nan_inf`` to enable the fast NAN/INF feature.

    Args:
        _program: the program to instrument; it is modified in place. When it
            is None, ``default_main_program()`` is used instead.

    Returns:
        None.
    """
    import warnings

    # NOTE(review): the helper object is never used afterwards. The
    # construction is kept in case LayerHelper has side effects that later
    # code relies on -- confirm and remove if it has none.
    LayerHelper('reduce_sum', _program=_program)

    if _program is None:
        _program = default_main_program()

    for _block in _program.blocks:
        # Snapshot the vars of the current block first: the loop below creates
        # new vars in this very block, and those must not be visited.
        _vars_in_prog = [(_name, _block.vars[_name]) for _name in _block.vars]

        # append a sum op for each snapshotted var
        for _var_name, _var in _vars_in_prog:
            try:
                if _var.dtype == -1:
                    continue

                # create a var for holding the sum output
                _output_var = _block.create_var(
                    name=unique_name.generate("debug_var_" + _var_name),
                    dtype=_var.dtype,
                    type=core.VarDesc.VarType.LOD_TENSOR,
                    persistable=False,
                    stop_gradient=True)

                # create a sum op that takes the existing var as its input
                _block.append_op(
                    type='sum',
                    outputs={'Out': _output_var},
                    inputs={'X': [_var]})
            except Exception as e:
                # Instrumentation stays best-effort (a var that cannot be
                # handled is skipped), but report it so that the user knows
                # this var is not covered by the fast NAN/INF check.
                warnings.warn(
                    "prepare_fast_nan_inf_debug: skipped var '%s': %s" %
                    (_var_name, e))
def run_fast_nan_inf_debug(executor,
                           program=None,
                           feed=None,
                           fetch_list=None,
                           feed_var_name='feed',
                           fetch_var_name='fetch',
                           scope=None,
                           return_numpy=True,
                           use_program_cache=False,
                           dump_core=True):
    """
    Run a program with the given executor and dump persistables on failure.

    Any exception raised by the run (such as one reported by the NAN/INF
    check) is caught and printed, not re-raised. When ``dump_core`` is true,
    the persistables are then saved into a single "core" file in the current
    directory.

    Args:
        executor: object whose ``run`` method executes the program; must not
            be None.
        program, feed, fetch_list, feed_var_name, fetch_var_name, scope,
        return_numpy, use_program_cache: forwarded unchanged, as keyword
            arguments, to ``executor.run``.
        dump_core: whether to save the persistables when the run raises.
            Defaults to True.

    Returns:
        Whatever ``executor.run`` returns on success; None when the run
        raised an exception.
    """
    assert (executor is not None)

    try:
        return executor.run(
            program=program,
            feed=feed,
            fetch_list=fetch_list,
            feed_var_name=feed_var_name,
            fetch_var_name=fetch_var_name,
            scope=scope,
            return_numpy=return_numpy,
            use_program_cache=use_program_cache)
    except Exception as e:
        print("catch an exception:")
        print(e)

        # Honor the caller's choice: dump_core used to be accepted but
        # ignored, so a dump was written even when it was set to False.
        if dump_core:
            core_filename = "core" + str(int(random.random() *
                                             10000)) + ".pdckpt"
            io.save_persistables(
                executor, "./", main_program=program, filename=core_filename)
            print("dumping a core into ./%s" % core_filename)
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录