diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc
index 35ebe48ba682f135b7f85edb3b2999db7c29e51a..c1a6d0221baa1238adb4efa46b1f071cf2efbaf7 100644
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -543,8 +543,14 @@ void OperatorWithKernel::Run(const Scope& scope,
 
   auto kernel_iter = kernels.find(expected_kernel_key);
 
-  kernel_iter->second->Compute(ExecutionContext(
-      *this, new_scope, *pool.Get(expected_kernel_key.place_)));
+  auto* new_dev_ctx = pool.Get(expected_kernel_key.place_);
+  kernel_iter->second->Compute(
+      ExecutionContext(*this, new_scope, *new_dev_ctx));
+
+  /* For profiling/benchmark only */
+  if (FLAGS_op_sync) {
+    new_dev_ctx->Wait();
+  }
 }
 
 proto::DataType OperatorWithKernel::IndicateDataType(
diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc
index 7037551d7544d6fea54e2f4bf887309b7dc5a52e..9d3147362ab18cc37dbf75b85c71cf9a307a16cf 100644
--- a/paddle/platform/gpu_info.cc
+++ b/paddle/platform/gpu_info.cc
@@ -22,6 +22,10 @@ DEFINE_double(fraction_of_gpu_memory_to_use, 0.92,
               "Default use 92% of GPU memory for PaddlePaddle,"
               "reserve the rest for page tables, etc");
 
+DEFINE_bool(op_sync, false,
+            "CUDA devices execute asynchronously by default; set to True "
+            "to force each op to run in synchronous mode.");
+
 namespace paddle {
 namespace platform {
 
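
For context: the patch names the device context returned by `pool.Get(...)` so that, when the new `FLAGS_op_sync` flag is set, a `Wait()` can be issued right after each op's `Compute` call. Because CUDA launches are asynchronous, draining the device queue after every op is what makes per-op wall-clock timings meaningful during profiling or benchmarking. Below is a minimal, self-contained sketch of the same define-a-flag / wait-after-launch pattern using gflags; the `DeviceContext` and `RunOneOp` names here are hypothetical stand-ins for illustration, not Paddle's actual types.

```cpp
// Sketch only: define a boolean gflag and use it to optionally block
// after asynchronous device work, mirroring the pattern in this patch.
#include <cstdio>
#include <gflags/gflags.h>

DEFINE_bool(op_sync, false,
            "CUDA devices execute asynchronously by default; set to True "
            "to force each op to run in synchronous mode.");
// In another translation unit the flag would be made visible with
// DECLARE_bool(op_sync); before reading FLAGS_op_sync.

// Hypothetical stand-in for paddle::platform::DeviceContext.
struct DeviceContext {
  // Block until all work queued on this device has finished.
  void Wait() const {}
};

void RunOneOp(const DeviceContext& dev_ctx) {
  // ... enqueue the op's kernel asynchronously on dev_ctx ...
  if (FLAGS_op_sync) {
    // For profiling/benchmark only: drain the device queue after each op
    // so the time spent in this op is attributed to this op.
    dev_ctx.Wait();
  }
}

int main(int argc, char* argv[]) {
  gflags::ParseCommandLineFlags(&argc, &argv, /*remove_flags=*/true);
  DeviceContext ctx;
  RunOneOp(ctx);
  std::printf("op_sync=%d\n", static_cast<int>(FLAGS_op_sync));
  return 0;
}
```

As with any gflags boolean, a binary built this way would accept `--op_sync=true` on its command line; how PaddlePaddle forwards the flag from its Python front end is outside the scope of this diff.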