提交 38b4413f 编写于 作者: M mindspore-ci-bot 提交者: Gitee

!5386 Fix GPU watchpoints check when multiple conditions set on same node

Merge pull request !5386 from HarshvardhanGupta/fix-gpu-wp-skip
......@@ -66,11 +66,9 @@ void DebugServices::RemoveWatchpoint(unsigned int id) {
void DebugServices::CheckWatchpoints(std::vector<std::string> *name, std::vector<std::string> *slot,
std::vector<int> *condition, std::vector<unsigned int> *watchpoint_id,
const std::vector<std::string> &op_overflows) {
const std::vector<std::string> &op_overflows,
const std::vector<std::shared_ptr<TensorData>> &tensor_list) {
std::lock_guard<std::mutex> lg(lock_);
std::vector<std::shared_ptr<TensorData>> tensor_list = tensor_loader_->GetTensor();
std::string current_tensor_name;
std::unordered_map<unsigned int, watchpoint_t> watchpoints_to_check_table;
const size_t location = 0;
......@@ -198,61 +196,6 @@ void DebugServices::HandleWatchpointHits(const std::vector<unsigned int> &hit_en
}
}
void DebugServices::CheckSingleWatchpoint(std::shared_ptr<TensorData> watchtensor, std::string *name, std::string *slot,
char **data_ptr, unsigned int *data_size, int *condition,
unsigned int *wacthpoint_id) {
std::lock_guard<std::mutex> lg(lock_);
std::string current_watchtensor_name;
current_watchtensor_name = watchtensor->GetName();
mindspore::tensor::TensorPtr tensor_ptr = watchtensor->GetTensor();
int tensor_data_type = tensor_ptr->data_type_c();
watchpoint_t watchpoint_to_check;
for (auto w_table_item : watchpoint_table) {
auto check_node_list = std::get<1>(w_table_item).check_node_list;
for (auto check_node : check_node_list) {
std::string w_name = std::get<0>(check_node);
bool w_type = std::get<1>(check_node);
// get current the full info including condition, id..., for current watchtensor
std::string current_node_name = current_watchtensor_name.substr(0, current_watchtensor_name.find_first_of(":"));
if ((w_type == true && (current_watchtensor_name.find(w_name) != string::npos || w_name == "*")) ||
(w_type == false && current_node_name == w_name)) {
watchpoint_to_check = w_table_item.second;
// need to add support for float16 and float64, and other types when we support conditions beyond inf and nan
if (tensor_data_type != kNumberTypeFloat && tensor_data_type != kNumberTypeFloat32) {
return;
}
break;
}
}
}
float *start_addr = reinterpret_cast<float *>(tensor_ptr->data_c());
unsigned int num_elements = (tensor_ptr->data().nbytes()) / sizeof(float);
for (unsigned int index = 0; index < num_elements; index++) {
float x = start_addr[index];
if (((watchpoint_to_check.conditions.inf.enabled || watchpoint_to_check.conditions.neg_inf.enabled) && isinf(x)) ||
(watchpoint_to_check.conditions.nan.enabled && isnan(x))) {
std::string name_no_slot = current_watchtensor_name.substr(0, current_watchtensor_name.find_first_of(":"));
*name = name_no_slot;
*slot = std::to_string(watchtensor->GetSlot());
*data_ptr = reinterpret_cast<char *>(tensor_ptr->data_c());
*data_size = tensor_ptr->data().nbytes();
int condition_item = -1;
if (watchpoint_to_check.conditions.nan.enabled) {
condition_item = 0;
} else if (watchpoint_to_check.conditions.inf.enabled || watchpoint_to_check.conditions.neg_inf.enabled) {
condition_item = 1;
}
*condition = condition_item;
*wacthpoint_id = watchpoint_to_check.id;
}
}
}
void DebugServices::ReadNodesTensors(std::vector<std::string> name, std::vector<std::string> *ret_name,
std::vector<char *> *data_ptr, std::vector<unsigned int> *data_size,
std::vector<TypePtr> *dtype, std::vector<std::vector<int>> *shape) {
......
......@@ -76,10 +76,8 @@ class DebugServices {
void RemoveWatchpoint(unsigned int id);
void CheckWatchpoints(std::vector<std::string> *name, std::vector<std::string> *slot, std::vector<int> *condition,
std::vector<unsigned int> *watchpoint_id, const std::vector<std::string> &op_overflows);
void CheckSingleWatchpoint(std::shared_ptr<TensorData> watchnode, std::string *name, std::string *slot,
char **data_ptr, unsigned int *data_size, int *condition, unsigned int *wacthpoint_id);
std::vector<unsigned int> *watchpoint_id, const std::vector<std::string> &op_overflows,
const std::vector<std::shared_ptr<TensorData>> &tensor_list);
void ReadNodesTensors(std::vector<std::string> name, std::vector<std::string> *ret_name,
std::vector<char *> *data_ptr, std::vector<unsigned int> *data_size,
......
......@@ -254,7 +254,7 @@ void Debugger::PostExecuteNode() {
// if the kernel is a watchpoint and it gets hit, suspend.
if (is_watchpoint) {
auto hits = CheckSingleWatchpoint(cur_name_);
auto hits = CheckWatchpoints(cur_name_);
if (!hits.empty()) {
SendWatchpointsAndSuspend(hits);
}
......@@ -547,7 +547,7 @@ void Debugger::Exit() {
std::exit(EXIT_FAILURE);
}
std::list<WatchpointHit> Debugger::CheckWatchpoints() {
std::list<WatchpointHit> Debugger::CheckWatchpoints(const std::string &watchnode) {
std::vector<std::string> name;
std::vector<std::string> slot;
std::vector<int> condition;
......@@ -556,7 +556,15 @@ std::list<WatchpointHit> Debugger::CheckWatchpoints() {
#ifdef ENABLE_D
overflow_ops = CheckOpOverflow();
#endif
debug_services_->CheckWatchpoints(&name, &slot, &condition, &watchpoint_id, overflow_ops);
auto tensor_loader = debug_services_->tensor_loader();
std::vector<std::shared_ptr<TensorData>> tensor_list;
if (watchnode.empty()) {
tensor_list = tensor_loader->GetTensor();
} else {
tensor_list = tensor_loader->GetNodeTensorMap(watchnode);
}
debug_services_->CheckWatchpoints(&name, &slot, &condition, &watchpoint_id, overflow_ops, tensor_list);
std::list<WatchpointHit> hits;
for (unsigned int i = 0; i < name.size(); i++) {
WatchpointHit hit;
......@@ -576,35 +584,6 @@ std::list<WatchpointHit> Debugger::CheckWatchpoints() {
return hits;
}
std::list<WatchpointHit> Debugger::CheckSingleWatchpoint(std::string watchnode) const {
auto tensor_loader = debug_services_->tensor_loader();
auto tensors = tensor_loader->GetNodeTensorMap(watchnode);
std::list<WatchpointHit> hits;
for (std::vector<std::shared_ptr<TensorData>>::iterator it = tensors.begin(); it != tensors.end(); ++it) {
auto cur_tensor = *it;
std::string name = "";
std::string slot = "";
char *data_ptr = nullptr;
unsigned int data_size = 0;
int condition = -1;
unsigned int watchpoint_id = -1;
WatchpointHit hit;
debug_services_->CheckSingleWatchpoint(cur_tensor, &name, &slot, &data_ptr, &data_size, &condition, &watchpoint_id);
if (name != "") {
hit.set_id(watchpoint_id);
// here TensorProto act as a tensor indicator, not sending tensor content
TensorProto *tensor_item = hit.mutable_tensor();
tensor_item->set_node_name(name);
tensor_item->set_slot(slot);
tensor_item->set_finished(true);
WatchCondition *condition_item = hit.mutable_watch_condition();
condition_item->set_condition(debugger::WatchCondition_Condition(condition));
hits.push_back(hit);
}
}
return hits;
}
void Debugger::SendWatchpointsAndSuspend(const std::list<WatchpointHit> &points) {
// send info about watchpoint
if (!points.empty()) {
......
......@@ -137,8 +137,7 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
// analyze tensors and check watchpoint conditions
// return names of tensors and what condition they hit
std::list<WatchpointHit> CheckWatchpoints();
std::list<WatchpointHit> CheckSingleWatchpoint(std::string watchnode) const;
std::list<WatchpointHit> CheckWatchpoints(const std::string &watchnode = std::string());
// send watchpoints that hit and enter command wait loop
void SendWatchpointsAndSuspend(const std::list<WatchpointHit> &points);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册