!5386 Fix GPU watchpoints check when multiple conditions set on same node

Merge pull request !5386 from HarshvardhanGupta/fix-gpu-wp-skip

!5386 Fix GPU watchpoints check when multiple conditions set on same node
Merge pull request !5386 from HarshvardhanGupta/fix-gpu-wp-skip
38b4413f · mindspore-ci-bot · Gitee · 767c4c7f · 3226e840 · 38b4413f
4 changed files
--- a/mindspore/ccsrc/debug/debug_services.cc
+++ b/mindspore/ccsrc/debug/debug_services.cc
@@ -66,11 +66,9 @@ void DebugServices::RemoveWatchpoint(unsigned int id) {

 void DebugServices::CheckWatchpoints(std::vector<std::string> *name, std::vector<std::string> *slot,
                                     std::vector<int> *condition, std::vector<unsigned int> *watchpoint_id,
-                                     const std::vector<std::string> &op_overflows) {
+                                     const std::vector<std::string> &op_overflows,
+                                     const std::vector<std::shared_ptr<TensorData>> &tensor_list) {
  std::lock_guard<std::mutex> lg(lock_);
-
-  std::vector<std::shared_ptr<TensorData>> tensor_list = tensor_loader_->GetTensor();
-
  std::string current_tensor_name;
  std::unordered_map<unsigned int, watchpoint_t> watchpoints_to_check_table;
  const size_t location = 0;
@@ -198,61 +196,6 @@ void DebugServices::HandleWatchpointHits(const std::vector<unsigned int> &hit_en
  }
 }

-void DebugServices::CheckSingleWatchpoint(std::shared_ptr<TensorData> watchtensor, std::string *name, std::string *slot,
-                                          char **data_ptr, unsigned int *data_size, int *condition,
-                                          unsigned int *wacthpoint_id) {
-  std::lock_guard<std::mutex> lg(lock_);
-
-  std::string current_watchtensor_name;
-  current_watchtensor_name = watchtensor->GetName();
-  mindspore::tensor::TensorPtr tensor_ptr = watchtensor->GetTensor();
-  int tensor_data_type = tensor_ptr->data_type_c();
-  watchpoint_t watchpoint_to_check;
-
-  for (auto w_table_item : watchpoint_table) {
-    auto check_node_list = std::get<1>(w_table_item).check_node_list;
-    for (auto check_node : check_node_list) {
-      std::string w_name = std::get<0>(check_node);
-      bool w_type = std::get<1>(check_node);
-      // get current the full info including condition, id..., for current watchtensor
-      std::string current_node_name = current_watchtensor_name.substr(0, current_watchtensor_name.find_first_of(":"));
-      if ((w_type == true && (current_watchtensor_name.find(w_name) != string::npos || w_name == "*")) ||
-          (w_type == false && current_node_name == w_name)) {
-        watchpoint_to_check = w_table_item.second;
-        // need to add support for float16 and float64, and other types when we support conditions beyond inf and nan
-        if (tensor_data_type != kNumberTypeFloat && tensor_data_type != kNumberTypeFloat32) {
-          return;
-        }
-        break;
-      }
-    }
-  }
-
-  float *start_addr = reinterpret_cast<float *>(tensor_ptr->data_c());
-  unsigned int num_elements = (tensor_ptr->data().nbytes()) / sizeof(float);
-
-  for (unsigned int index = 0; index < num_elements; index++) {
-    float x = start_addr[index];
-    if (((watchpoint_to_check.conditions.inf.enabled || watchpoint_to_check.conditions.neg_inf.enabled) && isinf(x)) ||
-        (watchpoint_to_check.conditions.nan.enabled && isnan(x))) {
-      std::string name_no_slot = current_watchtensor_name.substr(0, current_watchtensor_name.find_first_of(":"));
-      *name = name_no_slot;
-      *slot = std::to_string(watchtensor->GetSlot());
-      *data_ptr = reinterpret_cast<char *>(tensor_ptr->data_c());
-      *data_size = tensor_ptr->data().nbytes();
-      int condition_item = -1;
-      if (watchpoint_to_check.conditions.nan.enabled) {
-        condition_item = 0;
-      } else if (watchpoint_to_check.conditions.inf.enabled || watchpoint_to_check.conditions.neg_inf.enabled) {
-        condition_item = 1;
-      }
-      *condition = condition_item;
-
-      *wacthpoint_id = watchpoint_to_check.id;
-    }
-  }
-}
-
 void DebugServices::ReadNodesTensors(std::vector<std::string> name, std::vector<std::string> *ret_name,
                                     std::vector<char *> *data_ptr, std::vector<unsigned int> *data_size,
                                     std::vector<TypePtr> *dtype, std::vector<std::vector<int>> *shape) {

--- a/mindspore/ccsrc/debug/debug_services.h
+++ b/mindspore/ccsrc/debug/debug_services.h
@@ -76,10 +76,8 @@ class DebugServices {
  void RemoveWatchpoint(unsigned int id);

  void CheckWatchpoints(std::vector<std::string> *name, std::vector<std::string> *slot, std::vector<int> *condition,
-                        std::vector<unsigned int> *watchpoint_id, const std::vector<std::string> &op_overflows);
-
-  void CheckSingleWatchpoint(std::shared_ptr<TensorData> watchnode, std::string *name, std::string *slot,
-                             char **data_ptr, unsigned int *data_size, int *condition, unsigned int *wacthpoint_id);
+                        std::vector<unsigned int> *watchpoint_id, const std::vector<std::string> &op_overflows,
+                        const std::vector<std::shared_ptr<TensorData>> &tensor_list);

  void ReadNodesTensors(std::vector<std::string> name, std::vector<std::string> *ret_name,
                        std::vector<char *> *data_ptr, std::vector<unsigned int> *data_size,

--- a/mindspore/ccsrc/debug/debugger/debugger.cc
+++ b/mindspore/ccsrc/debug/debugger/debugger.cc
@@ -254,7 +254,7 @@ void Debugger::PostExecuteNode() {

    // if kernel is watchpoint,and get hit. suspend.
    if (is_watchpoint) {
-      auto hits = CheckSingleWatchpoint(cur_name_);
+      auto hits = CheckWatchpoints(cur_name_);
      if (!hits.empty()) {
        SendWatchpointsAndSuspend(hits);
      }
@@ -547,7 +547,7 @@ void Debugger::Exit() {
  std::exit(EXIT_FAILURE);
 }

-std::list<WatchpointHit> Debugger::CheckWatchpoints() {
+std::list<WatchpointHit> Debugger::CheckWatchpoints(const std::string &watchnode) {
  std::vector<std::string> name;
  std::vector<std::string> slot;
  std::vector<int> condition;
@@ -556,7 +556,15 @@ std::list<WatchpointHit> Debugger::CheckWatchpoints() {
 #ifdef ENABLE_D
  overflow_ops = CheckOpOverflow();
 #endif
-  debug_services_->CheckWatchpoints(&name, &slot, &condition, &watchpoint_id, overflow_ops);
+  auto tensor_loader = debug_services_->tensor_loader();
+  std::vector<std::shared_ptr<TensorData>> tensor_list;
+  if (watchnode.empty()) {
+    tensor_list = tensor_loader->GetTensor();
+  } else {
+    tensor_list = tensor_loader->GetNodeTensorMap(watchnode);
+  }
+
+  debug_services_->CheckWatchpoints(&name, &slot, &condition, &watchpoint_id, overflow_ops, tensor_list);
  std::list<WatchpointHit> hits;
  for (unsigned int i = 0; i < name.size(); i++) {
    WatchpointHit hit;
@@ -576,35 +584,6 @@ std::list<WatchpointHit> Debugger::CheckWatchpoints() {
  return hits;
 }

-std::list<WatchpointHit> Debugger::CheckSingleWatchpoint(std::string watchnode) const {
-  auto tensor_loader = debug_services_->tensor_loader();
-  auto tensors = tensor_loader->GetNodeTensorMap(watchnode);
-  std::list<WatchpointHit> hits;
-  for (std::vector<std::shared_ptr<TensorData>>::iterator it = tensors.begin(); it != tensors.end(); ++it) {
-    auto cur_tensor = *it;
-    std::string name = "";
-    std::string slot = "";
-    char *data_ptr = nullptr;
-    unsigned int data_size = 0;
-    int condition = -1;
-    unsigned int watchpoint_id = -1;
-    WatchpointHit hit;
-    debug_services_->CheckSingleWatchpoint(cur_tensor, &name, &slot, &data_ptr, &data_size, &condition, &watchpoint_id);
-    if (name != "") {
-      hit.set_id(watchpoint_id);
-      // here TensorProto act as a tensor indicator, not sending tensor content
-      TensorProto *tensor_item = hit.mutable_tensor();
-      tensor_item->set_node_name(name);
-      tensor_item->set_slot(slot);
-      tensor_item->set_finished(true);
-      WatchCondition *condition_item = hit.mutable_watch_condition();
-      condition_item->set_condition(debugger::WatchCondition_Condition(condition));
-      hits.push_back(hit);
-    }
-  }
-  return hits;
-}
-
 void Debugger::SendWatchpointsAndSuspend(const std::list<WatchpointHit> &points) {
  // send info about watchpoint
  if (!points.empty()) {

--- a/mindspore/ccsrc/debug/debugger/debugger.h
+++ b/mindspore/ccsrc/debug/debugger/debugger.h
@@ -137,8 +137,7 @@ class Debugger : public std::enable_shared_from_this<Debugger> {

  // analyze tensors and check watchpoint conditions
  // return names of tensors and what condition they hit
-  std::list<WatchpointHit> CheckWatchpoints();
-  std::list<WatchpointHit> CheckSingleWatchpoint(std::string watchnode) const;
+  std::list<WatchpointHit> CheckWatchpoints(const std::string &watchnode = std::string());

  // send watchpoints that hit and enter command wait loop
  void SendWatchpointsAndSuspend(const std::list<WatchpointHit> &points);