âfix bug in cuda_aggregate

ISSUE=4608831 git-svn-id: https://svn.baidu.com/idl/trunk/paddle@1498 1ad973e4-5ce8-4261-8a94-b56d1f490c56

âfix bug in cuda_aggregate
ISSUE=4608831 git-svn-id: https://svn.baidu.com/idl/trunk/paddle@1498 1ad973e4-5ce8-4261-8a94-b56d1f490c56
8ae1b325 · liaogang · 24acf8d5 · 8ae1b325 · 8ae1b325 · 8ae1b325
4 changed file
--- a/paddle/cuda/include/hl_cuda.h
+++ b/paddle/cuda/include/hl_cuda.h
@@ -321,13 +321,14 @@ extern const char* hl_get_device_error_string(size_t err);
 extern int hl_get_device_last_error();

 /**
- * @brief   hppl query event.
+ * @brief   check cuda event is ready
 *
 * @param[in]  event        cuda event to query.
- * @param[out]  isNotReady  this work under device has not yet been
- *                          completed, vice versa.
+ *
+ * @return     true    cuda event is ready.
+ *             false   cuda event is not ready.
 */
-extern void hl_cuda_event_query(hl_event_t event, bool& isNotReady);
+extern bool hl_cuda_event_is_ready(hl_event_t event);

 /**
 * @brief   hppl device synchronization.

--- a/paddle/cuda/include/stub/hl_cuda_stub.h
+++ b/paddle/cuda/include/stub/hl_cuda_stub.h
@@ -89,7 +89,7 @@ inline const char* hl_get_device_error_string() { return NULL; }

 inline const char* hl_get_device_error_string(size_t err) { return NULL; }

-inline void hl_cuda_event_query(hl_event_t event, bool& isNotReady) {}
+inline bool hl_cuda_event_is_ready(hl_event_t event) { return true; }

 inline void hl_device_synchronize() {}


--- a/paddle/cuda/src/hl_cuda_aggregate.cu
+++ b/paddle/cuda/src/hl_cuda_aggregate.cu
@@ -261,11 +261,7 @@ void hl_vector_sum(real *A_d, real *C_h, int dimM) {

  struct _hl_event_st hl_event_st  = {.cu_event = t_resource.event};
  hl_event_t hl_event = &hl_event_st;
-
-  bool isNotReady = false;
-  do {
-    hl_cuda_event_query(hl_event, isNotReady);
-  } while (isNotReady == cudaErrorNotReady);
+  while (!hl_cuda_event_is_ready(hl_event)) {}

  KeVectorSum<128><<< grid, threads, 0, STREAM_DEFAULT >>>
           (A_d, t_resource.gpu_mem, dimM);
@@ -275,7 +271,10 @@ void hl_vector_sum(real *A_d, real *C_h, int dimM) {
  hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT);
  hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event);

-  CHECK_SYNC("hl_vector_sum failed");
+  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
+  cudaError_t err = (cudaError_t)hl_get_device_last_error();
+  CHECK_EQ(cudaSuccess, err)
+    << "CUDA error: " << hl_get_device_error_string((size_t)err);
 }

 template <int blockSize>
@@ -317,11 +316,7 @@ void hl_vector_abs_sum(real *A_d, real *C_h, int dimM) {

  struct _hl_event_st hl_event_st  = {.cu_event = t_resource.event};
  hl_event_t hl_event = &hl_event_st;
-
-  bool isNotReady = false;
-  do {
-    hl_cuda_event_query(hl_event, isNotReady);
-  } while (isNotReady == cudaErrorNotReady);
+  while (!hl_cuda_event_is_ready(hl_event)) {}

  KeVectorAbsSum<128><<< grid, threads, 0, STREAM_DEFAULT >>>
           (A_d, t_resource.gpu_mem, dimM);
@@ -331,5 +326,8 @@ void hl_vector_abs_sum(real *A_d, real *C_h, int dimM) {
  hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT);
  hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event);

-  CHECK_SYNC("hl_vector_abs_sum failed");
+  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
+  cudaError_t err = (cudaError_t)hl_get_device_last_error();
+  CHECK_EQ(cudaSuccess, err)
+    << "CUDA error: " << hl_get_device_error_string((size_t)err);
 }
--- a/paddle/cuda/src/hl_cuda_device.cc
+++ b/paddle/cuda/src/hl_cuda_device.cc
@@ -751,11 +751,12 @@ void hl_set_device_flags_block() {
             cudaDeviceScheduleBlockingSync));
 }

-void hl_cuda_event_query(hl_event_t event, bool& isNotReady) {
+bool hl_cuda_event_is_ready(hl_event_t event) {
  cudaError_t err = dynload::cudaEventQuery(event->cu_event);
  CHECK(cudaSuccess == err || cudaErrorNotReady == err);

  if (cudaErrorNotReady == err) {
-    isNotReady = true;
+    return false;
  }
+  return true;
 }