diff --git a/lite/kernels/mlu/bridges/graph.h b/lite/kernels/mlu/bridges/graph.h
index b846d15af06c683ad685b04da5588f7ecedd0d38..c5bc236dbfdab4db89aa0fba68fb6c9702fcfbcd 100644
--- a/lite/kernels/mlu/bridges/graph.h
+++ b/lite/kernels/mlu/bridges/graph.h
@@ -125,10 +125,7 @@ class Graph {
                                             que));
 #if PRINT_HW_TIME
     CNRT_CALL(cnrtPlaceNotifier(notifier_end_, que));
-#endif
-
     CNRT_CALL(cnrtSyncQueue(que));
-#if PRINT_HW_TIME
     CNRT_CALL(cnrtNotifierDuration(notifier_start_, notifier_end_, &hw_time));
     hw_time /= 1000.0f;
     DLOG(INFO) << "cnml hardware time " << hw_time << "ms" << std::endl;
diff --git a/lite/kernels/mlu/io_copy_compute.cc b/lite/kernels/mlu/io_copy_compute.cc
index 02e4d8b28e81e88201b895a4b8fbe9e93d3f17f9..7520d50034f43440ee2700317de08cb0bdf2901d 100644
--- a/lite/kernels/mlu/io_copy_compute.cc
+++ b/lite/kernels/mlu/io_copy_compute.cc
@@ -79,6 +79,11 @@ class IoCopyMluToHostCompute
     CHECK(param.x->target() == TARGET(kMLU));
     auto mem_size = param.x->memory_size();
     auto* data = param.y->mutable_data(TARGET(kHost), mem_size);
+
+    // Sync the exec queue so work queued on it finishes before the host copy.
+    auto& mlu_context = this->ctx_->template As<MLUContext>();
+    CNRT_CALL(cnrtSyncQueue(mlu_context.exec_queue()));
+
     CopyToHostSync(data, param.x->raw_data(), mem_size);
   }