feat(subgraph): execute cnml compute forward asynchronously

d6ad8937 · dingminghui · 26c8b551 · d6ad8937 · d6ad8937
隐藏空白更改
内联并排

Showing with 5 additions and 3 deletions

lite/kernels/mlu/bridges/graph.h +0 -3

lite/kernels/mlu/io_copy_compute.cc +5 -0

未找到文件。
--- a/lite/kernels/mlu/bridges/graph.h
+++ b/lite/kernels/mlu/bridges/graph.h
@@ -125,10 +125,7 @@ class Graph {
                                            que));
 #if PRINT_HW_TIME
    CNRT_CALL(cnrtPlaceNotifier(notifier_end_, que));
-#endif
    CNRT_CALL(cnrtSyncQueue(que));
-#if PRINT_HW_TIME
    CNRT_CALL(cnrtNotifierDuration(notifier_start_, notifier_end_, &hw_time));
    hw_time /= 1000.0f;
    DLOG(INFO) << "cnml hardware time " << hw_time << "ms" << std::endl;

--- a/lite/kernels/mlu/io_copy_compute.cc
+++ b/lite/kernels/mlu/io_copy_compute.cc
@@ -79,6 +79,11 @@ class IoCopyMluToHostCompute
    CHECK(param.x->target() == TARGET(kMLU));
    auto mem_size = param.x->memory_size();
    auto* data = param.y->mutable_data(TARGET(kHost), mem_size);
+    // sync queue to ensure process done
+    auto& mlu_context = this->ctx_->template As<MLUContext>();
+    CNRT_CALL(cnrtSyncQueue(mlu_context.exec_queue()));
    CopyToHostSync(data, param.x->raw_data(), mem_size);
  }