Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
magicwindyyd
mindspore
提交
8f6ed032
M
mindspore
项目概览
magicwindyyd
/
mindspore
与 Fork 源项目一致
Fork自
MindSpore / mindspore
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
M
mindspore
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
8f6ed032
编写于
8月 18, 2020
作者:
M
mindspore-ci-bot
提交者:
Gitee
8月 18, 2020
浏览文件
操作
浏览文件
下载
差异文件
!4428 Operation Overflow Watchpoint for D-Chip debugger
Merge pull request !4428 from AdelShafiei/opoverflow2
上级
bf2ef95f
4834a337
变更
8
隐藏空白更改
内联
并排
Showing
8 changed file
with
239 addition
and
40 deletion
+239
-40
mindspore/ccsrc/debug/data_dump_parser.cc
mindspore/ccsrc/debug/data_dump_parser.cc
+20
-0
mindspore/ccsrc/debug/data_dump_parser.h
mindspore/ccsrc/debug/data_dump_parser.h
+1
-0
mindspore/ccsrc/debug/debug_services.cc
mindspore/ccsrc/debug/debug_services.cc
+56
-30
mindspore/ccsrc/debug/debug_services.h
mindspore/ccsrc/debug/debug_services.h
+9
-3
mindspore/ccsrc/debug/debugger/debug_grpc.proto
mindspore/ccsrc/debug/debugger/debug_grpc.proto
+8
-0
mindspore/ccsrc/debug/debugger/debugger.cc
mindspore/ccsrc/debug/debugger/debugger.cc
+117
-5
mindspore/ccsrc/debug/debugger/debugger.h
mindspore/ccsrc/debug/debugger/debugger.h
+13
-2
mindspore/ccsrc/runtime/device/ascend/dump/data_dumper.cc
mindspore/ccsrc/runtime/device/ascend/dump/data_dumper.cc
+15
-0
未找到文件。
mindspore/ccsrc/debug/data_dump_parser.cc
浏览文件 @
8f6ed032
...
...
@@ -209,4 +209,24 @@ void DataDumpParser::CheckOpDebugMode(uint32_t op_debug_mode) const {
MS_LOG
(
EXCEPTION
)
<<
"[DataDump] op_debug_mode in config json file should be [0-3]"
;
}
}
std
::
string
DataDumpParser
::
GetOpOverflowBinPath
(
uint32_t
graph_id
,
uint32_t
device_id
)
const
{
std
::
string
bin_path
=
"/var/log/npu/ide_daemon/dump"
;
const
char
*
dump_data_path
=
std
::
getenv
(
"DATA_DUMP_PATH"
);
bin_path
.
append
(
dump_data_path
);
bin_path
.
append
(
"_"
);
bin_path
.
append
(
std
::
to_string
(
device_id
));
bin_path
.
append
(
"/"
);
bin_path
.
append
(
net_name_
);
bin_path
.
append
(
"_"
);
bin_path
.
append
(
std
::
to_string
(
graph_id
));
bin_path
.
append
(
"/"
);
bin_path
.
append
(
std
::
to_string
(
dump_mode_
));
bin_path
.
append
(
"/"
);
bin_path
.
append
(
std
::
to_string
(
dump_step_
));
bin_path
.
append
(
"/"
);
return
bin_path
;
}
}
// namespace mindspore
mindspore/ccsrc/debug/data_dump_parser.h
浏览文件 @
8f6ed032
...
...
@@ -42,6 +42,7 @@ class DataDumpParser {
uint32_t
dump_step
()
const
{
return
dump_step_
;
}
void
MatchKernel
(
const
std
::
string
&
kernel_name
);
void
PrintUnusedKernel
();
std
::
string
GetOpOverflowBinPath
(
uint32_t
graph_id
,
uint32_t
device_id
)
const
;
private:
DataDumpParser
()
=
default
;
...
...
mindspore/ccsrc/debug/debug_services.cc
浏览文件 @
8f6ed032
...
...
@@ -50,6 +50,8 @@ void DebugServices::AddWatchpoint(unsigned int id, unsigned int watch_condition,
}
else
if
(
watch_condition
==
1
)
{
watchpoint_item
.
conditions
.
inf
.
enabled
=
true
;
watchpoint_item
.
conditions
.
neg_inf
.
enabled
=
true
;
}
else
if
(
watch_condition
==
2
)
{
watchpoint_item
.
conditions
.
overflow
.
enabled
=
true
;
}
watchpoint_item
.
check_node_list
=
check_node_list
;
...
...
@@ -63,8 +65,8 @@ void DebugServices::RemoveWatchpoint(unsigned int id) {
}
void
DebugServices
::
CheckWatchpoints
(
std
::
vector
<
std
::
string
>
*
name
,
std
::
vector
<
std
::
string
>
*
slot
,
std
::
vector
<
char
*>
*
data_ptr
,
std
::
vector
<
unsigned
int
>
*
data_size
,
std
::
vector
<
int
>
*
condition
,
std
::
vector
<
unsigned
int
>
*
wacthpoint_id
)
{
std
::
vector
<
int
>
*
condition
,
std
::
vector
<
unsigned
int
>
*
watchpoint_id
,
const
std
::
vector
<
std
::
string
>
&
op_overflows
)
{
std
::
lock_guard
<
std
::
mutex
>
lg
(
lock_
);
std
::
vector
<
std
::
shared_ptr
<
TensorData
>>
tensor_list
=
tensor_loader_
->
GetTensor
();
...
...
@@ -74,6 +76,7 @@ void DebugServices::CheckWatchpoints(std::vector<std::string> *name, std::vector
for
(
std
::
size_t
i
=
0
;
i
<
tensor_list
.
size
();
i
++
)
{
current_tensor_name
=
tensor_list
[
i
]
->
GetName
();
std
::
string
tensor_slot
=
std
::
to_string
(
tensor_list
[
i
]
->
GetSlot
());
mindspore
::
tensor
::
TensorPtr
tensor_ptr
=
tensor_list
[
i
]
->
GetTensor
();
int
tensor_data_type
=
tensor_ptr
->
data_type_c
();
...
...
@@ -106,10 +109,23 @@ void DebugServices::CheckWatchpoints(std::vector<std::string> *name, std::vector
}
}
}
std
::
vector
<
unsigned
int
>
hit_encountered
;
// check if no watchpoints are valid for the current tensor
if
(
watchpoints_to_check_table
.
empty
())
{
continue
;
// handle watchpoint conditions that do not require per element checks
for
(
auto
it_w_table_check
=
watchpoints_to_check_table
.
begin
();
it_w_table_check
!=
watchpoints_to_check_table
.
end
();
++
it_w_table_check
)
{
if
(
it_w_table_check
->
second
.
conditions
.
overflow
.
enabled
)
{
std
::
string
name_no_slot
=
current_tensor_name
.
substr
(
0
,
current_tensor_name
.
find_first_of
(
":"
));
if
(
std
::
find
(
op_overflows
.
begin
(),
op_overflows
.
end
(),
name_no_slot
)
!=
op_overflows
.
end
())
{
hit_encountered
.
push_back
(
it_w_table_check
->
second
.
id
);
}
}
}
if
(
hit_encountered
.
size
())
{
HandleWatchpointHits
(
hit_encountered
,
name
,
slot
,
condition
,
watchpoint_id
,
current_tensor_name
,
&
watchpoints_to_check_table
,
tensor_slot
);
hit_encountered
.
clear
();
}
// need to add support for float16 and float64, and other types when we support conditions beyond inf and nan
...
...
@@ -117,11 +133,14 @@ void DebugServices::CheckWatchpoints(std::vector<std::string> *name, std::vector
continue
;
}
// check if no watchpoints are remaining
if
(
watchpoints_to_check_table
.
empty
())
{
continue
;
}
float
*
start_addr
=
reinterpret_cast
<
float
*>
(
tensor_ptr
->
data_c
());
unsigned
int
num_elements
=
(
tensor_ptr
->
data
().
nbytes
())
/
sizeof
(
float
);
std
::
unordered_map
<
unsigned
int
,
watchpoint_t
>::
iterator
it_w_table_check
;
std
::
vector
<
unsigned
int
>
hit_encountered
;
for
(
unsigned
int
index
=
0
;
index
<
num_elements
;
index
++
)
{
float
x
=
start_addr
[
index
];
...
...
@@ -134,33 +153,12 @@ void DebugServices::CheckWatchpoints(std::vector<std::string> *name, std::vector
}
else
if
(
it_w_table_check
->
second
.
conditions
.
nan
.
enabled
&&
isnan
(
x
))
{
hit_encountered
.
push_back
(
it_w_table_check
->
second
.
id
);
}
++
it_w_table_check
;
}
if
(
hit_encountered
.
size
())
{
for
(
auto
it_hit_id
=
hit_encountered
.
begin
();
it_hit_id
!=
hit_encountered
.
end
();
++
it_hit_id
)
{
std
::
string
name_no_slot
=
current_tensor_name
.
substr
(
0
,
current_tensor_name
.
find_first_of
(
":"
));
name
->
push_back
(
name_no_slot
);
slot
->
push_back
(
std
::
to_string
(
tensor_list
[
i
]
->
GetSlot
()));
data_ptr
->
push_back
(
reinterpret_cast
<
char
*>
(
tensor_ptr
->
data_c
()));
data_size
->
push_back
(
tensor_ptr
->
data
().
nbytes
());
int
condition_item
=
-
1
;
if
(
watchpoint_table
[
*
it_hit_id
].
conditions
.
nan
.
enabled
)
{
condition_item
=
0
;
}
else
if
(
watchpoint_table
[
*
it_hit_id
].
conditions
.
inf
.
enabled
||
watchpoint_table
[
*
it_hit_id
].
conditions
.
neg_inf
.
enabled
)
{
condition_item
=
1
;
}
condition
->
push_back
(
condition_item
);
wacthpoint_id
->
push_back
(
*
it_hit_id
);
watchpoints_to_check_table
.
erase
(
*
it_hit_id
);
}
HandleWatchpointHits
(
hit_encountered
,
name
,
slot
,
condition
,
watchpoint_id
,
current_tensor_name
,
&
watchpoints_to_check_table
,
tensor_slot
);
hit_encountered
.
clear
();
}
...
...
@@ -171,6 +169,34 @@ void DebugServices::CheckWatchpoints(std::vector<std::string> *name, std::vector
}
}
// Records every watchpoint hit collected for one tensor and retires those watchpoints for that tensor.
//
// hit_encountered:            ids of the watchpoints that were hit.
// name/slot/condition/watchpoint_id: parallel output vectors; one entry is appended to each of them for every
//                             hit id that is still present in watchpoint_table.
// current_tensor_name:        tensor name; everything from the first ':' onwards is dropped before it is reported.
// watchpoints_to_check_table: watchpoints still pending for this tensor; every hit id is erased from it, whether
//                             or not it was reported.
// tensor_slot:                slot string reported alongside the name.
//
// Reported condition codes: 0 = nan, 1 = inf or neg_inf, 2 = overflow, -1 = none of these enabled. When several
// conditions are enabled on the same watchpoint, the first one in that order wins.
void DebugServices::HandleWatchpointHits(const std::vector<unsigned int> &hit_encountered,
                                         std::vector<std::string> *name, std::vector<std::string> *slot,
                                         std::vector<int> *condition, std::vector<unsigned int> *watchpoint_id,
                                         std::string current_tensor_name,
                                         std::unordered_map<unsigned int, watchpoint_t> *watchpoints_to_check_table,
                                         std::string tensor_slot) {
  for (const unsigned int hit_id : hit_encountered) {
    const auto registered = watchpoint_table.find(hit_id);
    if (registered != watchpoint_table.end()) {
      const auto &flags = registered->second.conditions;

      int condition_code = -1;
      if (flags.nan.enabled) {
        condition_code = 0;
      } else if (flags.inf.enabled || flags.neg_inf.enabled) {
        condition_code = 1;
      } else if (flags.overflow.enabled) {
        condition_code = 2;
      }

      name->push_back(current_tensor_name.substr(0, current_tensor_name.find_first_of(":")));
      slot->push_back(tensor_slot);
      condition->push_back(condition_code);
      watchpoint_id->push_back(hit_id);
    }
    // A watchpoint that already fired is not evaluated again for this tensor.
    watchpoints_to_check_table->erase(hit_id);
  }
}
void
DebugServices
::
CheckSingleWatchpoint
(
std
::
shared_ptr
<
TensorData
>
watchtensor
,
std
::
string
*
name
,
std
::
string
*
slot
,
char
**
data_ptr
,
unsigned
int
*
data_size
,
int
*
condition
,
unsigned
int
*
wacthpoint_id
)
{
...
...
mindspore/ccsrc/debug/debug_services.h
浏览文件 @
8f6ed032
...
...
@@ -51,6 +51,7 @@ class DebugServices {
condition_no_param_t
inf
;
condition_no_param_t
neg_inf
;
condition_no_param_t
nan
;
condition_no_param_t
overflow
;
condition_with_param_t
max_below
;
condition_with_param_t
max_above
;
condition_with_param_t
min_below
;
...
...
@@ -74,9 +75,8 @@ class DebugServices {
void
RemoveWatchpoint
(
unsigned
int
id
);
void
CheckWatchpoints
(
std
::
vector
<
std
::
string
>
*
name
,
std
::
vector
<
std
::
string
>
*
slot
,
std
::
vector
<
char
*>
*
data_ptr
,
std
::
vector
<
unsigned
int
>
*
data_size
,
std
::
vector
<
int
>
*
condition
,
std
::
vector
<
unsigned
int
>
*
wacthpoint_id
);
void
CheckWatchpoints
(
std
::
vector
<
std
::
string
>
*
name
,
std
::
vector
<
std
::
string
>
*
slot
,
std
::
vector
<
int
>
*
condition
,
std
::
vector
<
unsigned
int
>
*
watchpoint_id
,
const
std
::
vector
<
std
::
string
>
&
op_overflows
);
void
CheckSingleWatchpoint
(
std
::
shared_ptr
<
TensorData
>
watchnode
,
std
::
string
*
name
,
std
::
string
*
slot
,
char
**
data_ptr
,
unsigned
int
*
data_size
,
int
*
condition
,
unsigned
int
*
wacthpoint_id
);
...
...
@@ -97,6 +97,12 @@ class DebugServices {
std
::
unordered_map
<
unsigned
int
,
watchpoint_t
>
watchpoint_table
;
TensorLoader
*
tensor_loader_
;
void
HandleWatchpointHits
(
const
std
::
vector
<
unsigned
int
>
&
hit_encountered
,
std
::
vector
<
std
::
string
>
*
name
,
std
::
vector
<
std
::
string
>
*
slot
,
std
::
vector
<
int
>
*
condition
,
std
::
vector
<
unsigned
int
>
*
watchpoint_id
,
std
::
string
current_tensor_name
,
std
::
unordered_map
<
unsigned
int
,
watchpoint_t
>
*
watchpoints_to_check_table
,
std
::
string
tensor_slot
);
};
}
// namespace mindspore
...
...
mindspore/ccsrc/debug/debugger/debug_grpc.proto
浏览文件 @
8f6ed032
...
...
@@ -79,8 +79,16 @@ message WatchCondition {
// Kind of check a watchpoint performs. The numeric values are part of the wire format and must not be renumbered.
enum
Condition
{
nan
=
0
;
// tensor contains a NaN element
inf
=
1
;
// tensor contains a positive or negative infinity
overflow
=
2
;
// the operation producing the tensor reported an overflow
ge
=
3
;
// greater than or equal to
gt
=
4
;
// greater than
le
=
5
;
// less than or equal to
lt
=
6
;
// less than
between
=
7
;
// between
}
Condition
condition
=
1
;
repeated
float
value
=
2
;
// for between condition, there will be two values
repeated
bool
include
=
3
;
// for between condition, define the value is included or not
}
message
WatchNode
{
...
...
mindspore/ccsrc/debug/debugger/debugger.cc
浏览文件 @
8f6ed032
...
...
@@ -14,11 +14,18 @@
* limitations under the License.
*/
#include <dirent.h>
#include <stdio.h>
#include <algorithm>
#include <cstring>
#include <exception>
#include <fstream>
#include <iostream>
#include <map>
#include <tuple>
#include <utility>
#include <vector>
#include "debug/debugger/debugger.h"
#include "debug/data_dump_parser.h"
#include "pipeline/jit/pipeline.h"
#include "backend/session/anf_runtime_algorithm.h"
#include "runtime/device/kernel_runtime_manager.h"
...
...
@@ -49,7 +56,9 @@ Debugger::Debugger()
node_name_
(
""
),
cur_name_
(
""
),
is_dataset_graph_
(
false
),
partial_memory_
(
false
)
{}
partial_memory_
(
false
),
last_overflow_bin_
(
0
),
overflow_bin_path_
(
""
)
{}
void
Debugger
::
Init
(
const
uint32_t
device_id
,
const
std
::
string
device_target
)
{
// access lock for public method
...
...
@@ -133,6 +142,35 @@ void Debugger::EnableDebugger() {
"usage for large models."
;
}
if
(
device_target_
==
kAscendDevice
)
{
// set operation overflow info
overflow_bin_path_
=
DataDumpParser
::
GetInstance
().
GetOpOverflowBinPath
(
graph_ptr_
->
graph_id
(),
device_id_
);
// new overflow dump files will have a timestamp greater than last_overflow_bin_
last_overflow_bin_
=
0
;
DIR
*
d
;
d
=
opendir
(
overflow_bin_path_
.
c_str
());
if
(
d
)
{
struct
dirent
*
dir
;
while
((
dir
=
readdir
(
d
))
!=
NULL
)
{
if
(
dir
->
d_type
==
DT_REG
)
{
std
::
string
file_path
=
overflow_bin_path_
;
file_path
.
append
(
dir
->
d_name
);
std
::
size_t
found
=
file_path
.
find_last_of
(
"."
);
if
(
found
==
std
::
string
::
npos
)
{
continue
;
}
std
::
string
overflow_time
=
file_path
.
substr
(
found
+
1
);
if
(
stod
(
overflow_time
)
<=
last_overflow_bin_
)
{
MS_LOG
(
INFO
)
<<
"Old op overflow bin folder"
<<
file_path
;
continue
;
}
last_overflow_bin_
=
stod
(
overflow_time
);
}
}
MS_LOG
(
INFO
)
<<
"last op overflow bin folder"
<<
last_overflow_bin_
;
}
}
// initialize grpc client
if
(
debugger_enabled_
)
{
grpc_client_
=
std
::
make_unique
<
GrpcClient
>
(
host
,
port
);
...
...
@@ -154,6 +192,9 @@ void Debugger::Reset() {
graph_ptr_
=
nullptr
;
grpc_client_
=
nullptr
;
debug_services_
=
nullptr
;
last_overflow_bin_
=
0
;
overflow_bin_path_
=
""
;
stream_task_to_opname_
.
clear
();
}
void
Debugger
::
PreExecute
(
const
KernelGraphPtr
&
graph_ptr
)
{
...
...
@@ -200,6 +241,7 @@ void Debugger::PostExecuteNode() {
if
(
debugger_enabled_
&&
!
is_dataset_graph_
)
{
auto
watchpoint_table
=
debug_services_
->
GetWatchpointTable
();
auto
is_watchpoint
=
debug_services_
->
IsWatchPoint
(
cur_name_
,
watchpoint_table
);
// if kernel is watchpoint,and get hit. suspend.
if
(
is_watchpoint
)
{
auto
hits
=
CheckSingleWatchpoint
(
cur_name_
);
...
...
@@ -225,6 +267,10 @@ void Debugger::PostDebugOp() {
}
}
std
::
map
<
std
::
pair
<
uint32_t
,
uint32_t
>
,
std
::
string
>
&
Debugger
::
GetStreamTaskToOpnameMap
()
{
return
stream_task_to_opname_
;
}
void
Debugger
::
CheckGraphPtr
(
const
KernelGraphPtr
&
graph_ptr
)
{
if
(
graph_ptr_
!=
graph_ptr
)
{
MS_LOG
(
INFO
)
<<
"Debugger got new graph: "
<<
graph_ptr
->
graph_id
();
...
...
@@ -476,15 +522,15 @@ void Debugger::Exit() {
std
::
exit
(
EXIT_FAILURE
);
}
std
::
list
<
WatchpointHit
>
Debugger
::
CheckWatchpoints
()
const
{
std
::
list
<
WatchpointHit
>
Debugger
::
CheckWatchpoints
()
{
std
::
vector
<
std
::
string
>
name
;
std
::
vector
<
std
::
string
>
slot
;
std
::
vector
<
char
*>
data_ptr
;
std
::
vector
<
unsigned
int
>
data_size
;
std
::
vector
<
int
>
condition
;
std
::
vector
<
unsigned
int
>
watchpoint_id
;
std
::
vector
<
std
::
string
>
overflow_ops
;
debug_services_
->
CheckWatchpoints
(
&
name
,
&
slot
,
&
data_ptr
,
&
data_size
,
&
condition
,
&
watchpoint_id
);
overflow_ops
=
CheckOpOverflow
();
debug_services_
->
CheckWatchpoints
(
&
name
,
&
slot
,
&
condition
,
&
watchpoint_id
,
overflow_ops
);
std
::
list
<
WatchpointHit
>
hits
;
for
(
unsigned
int
i
=
0
;
i
<
name
.
size
();
i
++
)
{
WatchpointHit
hit
;
...
...
@@ -658,4 +704,70 @@ void Debugger::SetStepNum(int32_t cur_num_step) {
int32_t
Debugger
::
step_num
()
const
{
return
num_step_
;
}
// Decodes the first eight bytes of `buffer` as an unsigned 64-bit little-endian integer
// (buffer[0] is the least significant byte).
//
// buffer: raw bytes; any bytes after the eighth are ignored. When fewer than eight bytes are supplied, the
//         missing high-order bytes are treated as zero instead of reading out of bounds.
// Returns the decoded value.
//
// Each byte goes through unsigned char before widening: plain char may be signed, in which case a byte >= 0x80
// would be sign-extended and set every higher bit of the result.
uint64_t BytestoInt64(const std::vector<char> &buffer) {
  constexpr std::size_t kNumBytes = sizeof(uint64_t);
  const std::size_t available = buffer.size() < kNumBytes ? buffer.size() : kNumBytes;

  uint64_t ret = 0;
  for (std::size_t i = 0; i < available; ++i) {
    const uint64_t byte = static_cast<unsigned char>(buffer[i]);
    ret |= byte << (8 * i);
  }
  return ret;
}
#define BUF_SIZ 256
std
::
vector
<
std
::
string
>
Debugger
::
CheckOpOverflow
()
{
std
::
vector
<
double
>
bin_list
;
std
::
vector
<
std
::
string
>
op_names
;
DIR
*
d
;
struct
dirent
*
dir
;
d
=
opendir
(
overflow_bin_path_
.
c_str
());
if
(
d
)
{
while
((
dir
=
readdir
(
d
))
!=
NULL
)
{
if
(
dir
->
d_type
==
DT_REG
)
{
std
::
string
file_path
=
overflow_bin_path_
;
file_path
.
append
(
dir
->
d_name
);
std
::
string
file_name
=
dir
->
d_name
;
std
::
size_t
found
=
file_name
.
find_last_of
(
"."
);
if
(
found
==
std
::
string
::
npos
)
{
continue
;
}
std
::
string
overflow_time
=
file_name
.
substr
(
found
+
1
);
if
(
stod
(
overflow_time
)
<=
last_overflow_bin_
)
{
MS_LOG
(
INFO
)
<<
"File already processed "
<<
file_name
;
continue
;
}
bin_list
.
push_back
(
stod
(
overflow_time
));
std
::
fstream
infile
;
infile
.
open
(
file_path
.
c_str
(),
std
::
ios
::
binary
|
std
::
ios
::
in
);
infile
.
seekg
(
313
,
std
::
ios
::
beg
);
std
::
vector
<
char
>
buffer
;
buffer
.
resize
(
BUF_SIZ
);
infile
.
read
(
buffer
.
data
(),
BUF_SIZ
);
uint64_t
stream_id
=
BytestoInt64
(
std
::
vector
<
char
>
(
buffer
.
begin
()
+
8
,
buffer
.
end
()));
uint64_t
task_id
=
BytestoInt64
(
std
::
vector
<
char
>
(
buffer
.
begin
()
+
16
,
buffer
.
end
()));
MS_LOG
(
INFO
)
<<
"Overflow stream_id "
<<
stream_id
<<
", task_id "
<<
task_id
<<
"."
;
auto
op
=
debugger_
->
stream_task_to_opname_
.
find
(
std
::
make_pair
(
stream_id
,
task_id
));
if
(
op
!=
debugger_
->
stream_task_to_opname_
.
end
())
{
MS_LOG
(
ERROR
)
<<
"Overflow detected on node "
<<
op
->
second
<<
std
::
endl
;
op_names
.
push_back
(
op
->
second
);
}
else
{
MS_LOG
(
INFO
)
<<
"No overflow is detected "
<<
std
::
endl
;
}
infile
.
close
();
}
}
}
else
{
MS_LOG
(
INFO
)
<<
"OverFlow bin directory does not exist!"
;
}
closedir
(
d
);
MS_LOG
(
ERROR
)
<<
"These operation overflows are detected "
<<
op_names
;
for
(
auto
&
i
:
bin_list
)
{
if
(
i
>
last_overflow_bin_
)
{
last_overflow_bin_
=
i
;
}
}
return
op_names
;
}
}
// namespace mindspore
mindspore/ccsrc/debug/debugger/debugger.h
浏览文件 @
8f6ed032
...
...
@@ -19,6 +19,9 @@
#include <list>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include <map>
#include "backend/session/kernel_graph.h"
#include "debug/debugger/grpc_client.h"
#include "debug/debug_services.h"
...
...
@@ -90,6 +93,8 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
int32_t
step_num
()
const
;
std
::
map
<
std
::
pair
<
uint32_t
,
uint32_t
>
,
std
::
string
>
&
GetStreamTaskToOpnameMap
();
private:
// private constructor for singleton
Debugger
();
...
...
@@ -130,12 +135,15 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
// analyze tensors and check watchpoint conditions
// return names of tensors and what condition they hit
std
::
list
<
WatchpointHit
>
CheckWatchpoints
()
const
;
std
::
list
<
WatchpointHit
>
CheckWatchpoints
();
std
::
list
<
WatchpointHit
>
CheckSingleWatchpoint
(
std
::
string
watchnode
)
const
;
// send watchpoints that hit and enter command wait loop
void
SendWatchpointsAndSuspend
(
const
std
::
list
<
WatchpointHit
>
&
points
);
// Find if any operation overflow happened and return their names
std
::
vector
<
std
::
string
>
CheckOpOverflow
();
// class members
std
::
unique_ptr
<
GrpcClient
>
grpc_client_
;
std
::
unique_ptr
<
DebugServices
>
debug_services_
;
...
...
@@ -150,7 +158,9 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
bool
is_dataset_graph_
;
bool
partial_memory_
;
std
::
mutex
access_lock_
;
std
::
map
<
std
::
pair
<
uint32_t
,
uint32_t
>
,
std
::
string
>
stream_task_to_opname_
;
double
last_overflow_bin_
;
std
::
string
overflow_bin_path_
;
// singleton
static
std
::
mutex
instance_lock_
;
static
std
::
shared_ptr
<
Debugger
>
debugger_
;
...
...
@@ -180,5 +190,6 @@ ProtoVector<TensorProto> GetTensors(const EventReply &reply);
// get the full name of a tensor, which is the name used in TensorLoader
std
::
string
GetTensorFullName
(
const
TensorProto
&
tensor
);
uint64_t
BytestoInt64
(
const
std
::
vector
<
char
>
&
buffer
);
}
// namespace mindspore
#endif // MINDSPORE_CCSRC_DEBUG_DEBUGGER_DEBUGGER_H_
mindspore/ccsrc/runtime/device/ascend/dump/data_dumper.cc
浏览文件 @
8f6ed032
...
...
@@ -27,6 +27,9 @@
#include "proto/op_mapping_info.pb.h"
#include "utils/ms_context.h"
#include "debug/data_dump_parser.h"
#ifdef ENABLE_DEBUGGER
#include "debug/debugger/debugger.h"
#endif
static
constexpr
uint32_t
kAicpuLoadFlag
=
1
;
static
constexpr
uint32_t
kAicpuUnloadFlag
=
0
;
...
...
@@ -90,6 +93,18 @@ void DataDumper::LoadDumpInfo() {
load_flag_
=
true
;
// graph id may changed in Unload
graph_id_
=
kernel_graph_
->
graph_id
();
#ifdef ENABLE_DEBUGGER
auto
debugger
=
mindspore
::
Debugger
::
GetInstance
();
MS_EXCEPTION_IF_NULL
(
debugger
);
std
::
map
<
std
::
pair
<
uint32_t
,
uint32_t
>
,
std
::
string
>
&
stream_task_to_opname
=
debugger
->
GetStreamTaskToOpnameMap
();
// extract stream id, task id and opname from runtime_info_map for overflow detection
std
::
transform
(
runtime_info_map_
.
begin
(),
runtime_info_map_
.
end
(),
std
::
inserter
(
stream_task_to_opname
,
stream_task_to_opname
.
end
()),
[](
const
std
::
pair
<
std
::
string
,
std
::
shared_ptr
<
RuntimeInfo
>>
&
p
)
->
std
::
pair
<
std
::
pair
<
uint32_t
,
uint32_t
>
,
std
::
string
>
{
return
{{
std
::
get
<
1
>
(
*
p
.
second
),
std
::
get
<
0
>
(
*
p
.
second
)},
p
.
first
};
});
#endif
MS_LOG
(
INFO
)
<<
"[DataDump] LoadDumpInfo end"
;
}
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录