diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc
index 03323e3da7bd2634be1d9f07d9e8fcf21cfdf437..26c09eb8eb9db6f30e5d2e3ce332a8a62975fa40 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -66,6 +66,7 @@ void FetchOpHandle::RunImpl() {
     if (platform::is_gpu_place(var->place_)) {
 #ifdef PADDLE_WITH_CUDA
       TensorCopy(t, cpu, *dev_ctx_[t.place()], &tensors_[i]);
+      dev_ctx_[t.place()]->Wait();
 #endif
     } else {
       tensors_[i].ShareDataWith(t);
diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc
index 07a4b8921753a0b3cb8e62397e0a0ab2385a094a..63affb705424f847d531fc8c2f20d132c92784d8 100644
--- a/paddle/fluid/framework/details/op_handle_base.cc
+++ b/paddle/fluid/framework/details/op_handle_base.cc
@@ -33,9 +33,6 @@ std::string OpHandleBase::DebugString() const {
 
 OpHandleBase::~OpHandleBase() {
 #ifdef PADDLE_WITH_CUDA
-  for (auto &ctx : dev_ctx_) {
-    ctx.second->Wait();
-  }
   for (auto &ev : events_) {
     PADDLE_ENFORCE(cudaEventDestroy(ev.second));
   }