diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 21d9fd259c829593a3bd5b119d9d4397cee67f48..1a2e6a5f8676d493c5476fd1d7b6b4bcf1c33aed 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -132,9 +132,13 @@ struct ScaleLossGradOpHandle : public OpHandle { scope_(scope), place_(place) { PADDLE_ENFORCE(cudaEventCreateWithFlags(&ev_, cudaEventDisableTiming)); + VLOG(3) << "Create " << ev_; } - ~ScaleLossGradOpHandle() { PADDLE_ENFORCE(cudaEventDestroy(ev_)); } + ~ScaleLossGradOpHandle() { + VLOG(3) << "Destroy " << ev_; + PADDLE_ENFORCE(cudaEventDestroy(ev_)); + } void Run() override { std::string var_name = static_cast(this->outputs_[0])->name_; @@ -146,20 +150,13 @@ struct ScaleLossGradOpHandle : public OpHandle { if (platform::is_cpu_place(place_)) { *tmp = coeff_; } else { - VLOG(3) << "Scale loss on place" << place_; auto stream = static_cast(this->dev_ctx_[place_]) ->stream(); cudaSetDevice(boost::get(place_).device); - VLOG(3) << "1"; - PADDLE_ENFORCE(cudaGetLastError()); - VLOG(3) << "2"; memory::Copy(boost::get(place_), tmp, platform::CPUPlace(), &coeff_, sizeof(float), stream); - PADDLE_ENFORCE(cudaDeviceSynchronize()); - VLOG(3) << "3"; PADDLE_ENFORCE(cudaEventRecord(ev_, stream)); - VLOG(3) << "4"; } }