diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 8f45cd0fa6ea148633b3de4dcaed8d01849beb91..c31464bf20acc9042d7085bc963f785dd7c95448 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -21,6 +21,7 @@ #include "paddle/fluid/platform/xpu/xpu_op_list.h" #endif DECLARE_bool(check_nan_inf); +DECLARE_bool(benchmark); namespace paddle { namespace imperative { @@ -208,6 +209,19 @@ static void PreparedOpRunImpl( op.Type(), outs, dev_ctx->GetPlace()); } + /* For profiling/benchmark only: when FLAGS_benchmark is set, wait on dev_ctx and, in CUDA/HIP builds, enforce that the last device error reports success. */ + if (FLAGS_benchmark) { + dev_ctx->Wait(); +#if defined(PADDLE_WITH_CUDA) + PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetLastError()); + VLOG(4) << "Operator(" << op.Type() << "): context wait and get last error"; +#endif +#if defined(PADDLE_WITH_HIP) + PADDLE_ENFORCE_CUDA_SUCCESS(hipGetLastError()); + VLOG(4) << "Operator(" << op.Type() << "): context wait and get last error"; +#endif + } + /** * [ Why need handle complex gradient to real gradient? ] *