diff --git a/lite/kernels/mlu/bridges/graph.h b/lite/kernels/mlu/bridges/graph.h index b846d15af06c683ad685b04da5588f7ecedd0d38..c5bc236dbfdab4db89aa0fba68fb6c9702fcfbcd 100644 --- a/lite/kernels/mlu/bridges/graph.h +++ b/lite/kernels/mlu/bridges/graph.h @@ -125,10 +125,7 @@ class Graph { que)); #if PRINT_HW_TIME CNRT_CALL(cnrtPlaceNotifier(notifier_end_, que)); -#endif - CNRT_CALL(cnrtSyncQueue(que)); -#if PRINT_HW_TIME CNRT_CALL(cnrtNotifierDuration(notifier_start_, notifier_end_, &hw_time)); hw_time /= 1000.0f; DLOG(INFO) << "cnml hardware time " << hw_time << "ms" << std::endl; diff --git a/lite/kernels/mlu/io_copy_compute.cc b/lite/kernels/mlu/io_copy_compute.cc index 02e4d8b28e81e88201b895a4b8fbe9e93d3f17f9..7520d50034f43440ee2700317de08cb0bdf2901d 100644 --- a/lite/kernels/mlu/io_copy_compute.cc +++ b/lite/kernels/mlu/io_copy_compute.cc @@ -79,6 +79,11 @@ class IoCopyMluToHostCompute CHECK(param.x->target() == TARGET(kMLU)); auto mem_size = param.x->memory_size(); auto* data = param.y->mutable_data(TARGET(kHost), mem_size); + + // sync queue to ensure process done + auto& mlu_context = this->ctx_->template As(); + CNRT_CALL(cnrtSyncQueue(mlu_context.exec_queue())); + CopyToHostSync(data, param.x->raw_data(), mem_size); }