Commit bb2507b2 authored by dingminghui, committed by jackzhang235

feat(graph): support measuring hardware time

Parent ada6790e
@@ -23,6 +23,12 @@
 #include "lite/core/tensor.h"
 #include "lite/kernels/mlu/bridges/tensor.h"
+#define PRINT_HW_TIME false
+
+#if PRINT_HW_TIME
+#include <iostream>  // for the std::cout timing summary in ~Graph()
+#include <mutex>
+#endif
 namespace paddle {
 namespace lite {
 namespace subgraph {
@@ -32,7 +38,13 @@ namespace mlu {
 // to the MLU IR graph
 class Graph {
  public:
-  Graph() { CNML_CALL(cnmlCreateFusionOp(&fusion_op_)); }
+  Graph() {
+    CNML_CALL(cnmlCreateFusionOp(&fusion_op_));
+#if PRINT_HW_TIME
+    CNRT_CALL(cnrtCreateNotifier(&notifier_start_));
+    CNRT_CALL(cnrtCreateNotifier(&notifier_end_));
+#endif
+  }
 
   ~Graph() {
     FreeConstData();
@@ -40,6 +52,16 @@ class Graph {
     for (auto op : ops_) {
       CNML_CALL(cnmlDestroyBaseOp(&op));
     }
+#if PRINT_HW_TIME
+    CNRT_CALL(cnrtDestroyNotifier(&notifier_start_));
+    CNRT_CALL(cnrtDestroyNotifier(&notifier_end_));
+    double total_time = 0;
+    for (auto& f : time_log_) {
+      total_time += f;
+    }
+    // Guard against division by zero when Compute() was never called.
+    if (!time_log_.empty()) {
+      std::cout << "cnml hardware time for " << time_log_.size()
+                << " processes: " << total_time / time_log_.size() << std::endl;
+    }
+#endif
   }
 
   // Data node
@@ -90,6 +112,10 @@ class Graph {
   }
 
   void Compute(cnrtInvokeFuncParam_t forward_param, cnrtQueue_t que) {
+#if PRINT_HW_TIME
+    thread_local float hw_time;
+    CNRT_CALL(cnrtPlaceNotifier(notifier_start_, que));
+#endif
     CNML_CALL(cnmlComputeFusionOpForward_V3(fusion_op_,
                                             input_addrs_.data(),
                                             input_addrs_.size(),
@@ -97,7 +123,18 @@
                                             output_addrs_.size(),
                                             &forward_param,
                                             que));
+#if PRINT_HW_TIME
+    CNRT_CALL(cnrtPlaceNotifier(notifier_end_, que));
+#endif
     CNRT_CALL(cnrtSyncQueue(que));
+#if PRINT_HW_TIME
+    CNRT_CALL(cnrtNotifierDuration(notifier_start_, notifier_end_, &hw_time));
+    hw_time /= 1000.0f;  // cnrtNotifierDuration reports microseconds; convert to ms
+    DLOG(INFO) << "cnml hardware time " << hw_time << "ms";
+    std::lock_guard<std::mutex> lk(time_mut_);
+    time_log_.push_back(hw_time);
+#endif
   }
 
   template <typename T>
@@ -203,6 +240,11 @@ class Graph {
   std::vector<cnmlBaseOp_t> ops_;
   cnmlFusionOp_t fusion_op_;
   std::vector<void*> const_data_storage_;
+#if PRINT_HW_TIME
+  cnrtNotifier_t notifier_start_{}, notifier_end_{};
+  std::mutex time_mut_;
+  std::vector<float> time_log_;
+#endif
 };
 
 }  // namespace mlu
......
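For reference, the timing added here is the standard CNRT notifier idiom: place a start notifier on the queue, enqueue the device work, place an end notifier, drain the queue, then read the elapsed hardware time. Below is a minimal sketch of that pattern factored into an RAII helper. HwTimer and its methods are hypothetical names for illustration, not part of this commit; only the cnrt* calls that appear in the diff are used, and the CNRT_CALL error-checking macro is omitted.

#include <mutex>
#include <vector>

#include "cnrt.h"  // CNRT header, assumed to declare cnrtNotifier_t / cnrtQueue_t

// Hypothetical helper wrapping the notifier pattern used by Graph.
class HwTimer {
 public:
  HwTimer() {
    cnrtCreateNotifier(&start_);
    cnrtCreateNotifier(&end_);
  }
  ~HwTimer() {
    cnrtDestroyNotifier(&start_);
    cnrtDestroyNotifier(&end_);
  }

  // Mark the point on the queue where the timed work begins.
  void Begin(cnrtQueue_t que) { cnrtPlaceNotifier(start_, que); }

  // Mark the end, sync so both notifiers have fired, and record the
  // elapsed hardware time in milliseconds.
  void End(cnrtQueue_t que) {
    cnrtPlaceNotifier(end_, que);
    cnrtSyncQueue(que);
    float us = 0.0f;
    cnrtNotifierDuration(start_, end_, &us);  // reported in microseconds
    std::lock_guard<std::mutex> lk(mut_);
    log_.push_back(us / 1000.0f);
  }

  // Average over all recorded runs, mirroring the ~Graph() summary.
  float AverageMs() const {
    std::lock_guard<std::mutex> lk(mut_);
    if (log_.empty()) return 0.0f;
    double total = 0;
    for (float ms : log_) total += ms;
    return static_cast<float>(total / log_.size());
  }

 private:
  cnrtNotifier_t start_{}, end_{};
  mutable std::mutex mut_;
  std::vector<float> log_;
};

Calling Begin(que) before cnmlComputeFusionOpForward_V3 and End(que) after it reproduces what Graph::Compute does under PRINT_HW_TIME; the mutex-guarded log matches the commit's time_log_ aggregation across threads.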