diff --git a/paddle/operators/accuracy_op.cu b/paddle/operators/accuracy_op.cu index b575c682f0d30678a72a33040cce6cc799da26cb..d2dcab4e548b99c6beecfaa570ac31804fd07d82 100644 --- a/paddle/operators/accuracy_op.cu +++ b/paddle/operators/accuracy_op.cu @@ -16,6 +16,7 @@ limitations under the License. */ #include #include "paddle/operators/accuracy_op.h" #include "paddle/platform/cuda_helper.h" +#include "paddle/platform/gpu_info.h" namespace paddle { namespace operators { @@ -73,26 +74,28 @@ class AccuracyOpCUDAKernel : public framework::OpKernel { int num_samples = static_cast(inference->dims()[0]); size_t infer_width = inference->dims()[1]; - PADDLE_ENFORCE(cudaMemset(accuracy_data, 0, sizeof(float))); - // cudaMemset((void**)&correct_data, 0, sizeof(float)); + auto stream = ctx.cuda_device_context().stream(); + platform::GpuMemsetAsync(accuracy_data, 0, sizeof(float), stream); if (num_samples == 0) { return; } - cudaMemcpy(total_data, &num_samples, sizeof(int), cudaMemcpyHostToDevice); + platform::GpuMemcpyAsync(total_data, &num_samples, sizeof(int), + cudaMemcpyHostToDevice, stream); - AccuracyCudaKernel<<< - 1, PADDLE_CUDA_NUM_THREADS, 0, ctx.cuda_device_context().stream()>>>( + AccuracyCudaKernel< + PADDLE_CUDA_NUM_THREADS><<<1, PADDLE_CUDA_NUM_THREADS, 0, stream>>>( num_samples, infer_width, indices_data, label_data, correct_data, accuracy_data); int d_num_samples, d_num_correct; float d_accuracy; - cudaMemcpy(&d_num_correct, correct_data, sizeof(int), - cudaMemcpyDeviceToHost); - cudaMemcpy(&d_num_samples, total_data, sizeof(int), cudaMemcpyDeviceToHost); - cudaMemcpy(&d_accuracy, accuracy_data, sizeof(float), - cudaMemcpyDeviceToHost); + platform::GpuMemcpyAsync(&d_num_correct, correct_data, sizeof(int), + cudaMemcpyDeviceToHost, stream); + platform::GpuMemcpyAsync(&d_num_samples, total_data, sizeof(int), + cudaMemcpyDeviceToHost, stream); + platform::GpuMemcpyAsync(&d_accuracy, accuracy_data, sizeof(float), + cudaMemcpyDeviceToHost, stream); } }; diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc index f3455a8733862c91eaece629b6684d446672336c..36b216d872138d49bfd5ab6e3499d15d49ebd0ca 100644 --- a/paddle/platform/gpu_info.cc +++ b/paddle/platform/gpu_info.cc @@ -109,5 +109,10 @@ void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device, cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream), "cudaMemcpyPeerAsync failed in paddle::platform::GpuMemcpyPeer"); } + +void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream) { + PADDLE_ENFORCE(cudaMemsetAsync(dst, value, count, stream), + "cudaMemsetAsync failed in paddle::platform::GpuMemsetAsync"); +} } // namespace platform } // namespace paddle diff --git a/paddle/platform/gpu_info.h b/paddle/platform/gpu_info.h index 37665b97d764fbcfe0964127d230b1d28d90b687..db961f3838af73855312d4cf6a80e2355306e08f 100644 --- a/paddle/platform/gpu_info.h +++ b/paddle/platform/gpu_info.h @@ -60,6 +60,9 @@ void GpuMemcpySync(void *dst, const void *src, size_t count, void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device, size_t count, cudaStream_t stream); +//! Set memory dst with value count size asynchronously +void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream); + } // namespace platform } // namespace paddle