diff --git a/dnn/src/cuda/convolution3d/backward_filter/inplace_matmul_impl.cu b/dnn/src/cuda/convolution3d/backward_filter/inplace_matmul_impl.cu index 9188ff7bb6c71e813d83fb547753703e3feaad0c..0f06a3fe5fa3886c61a7f7ae7befd324aae31dff 100644 --- a/dnn/src/cuda/convolution3d/backward_filter/inplace_matmul_impl.cu +++ b/dnn/src/cuda/convolution3d/backward_filter/inplace_matmul_impl.cu @@ -355,7 +355,8 @@ void convolution3d::exec_inplace_matmul_bwd_filter( } else { BX = BY = 16; } - cudaMemset(grad, 0, OC * IC * FD * FH * FW * sizeof(float)); + cuda_check( + cudaMemsetAsync(grad, 0, OC * IC * FD * FH * FW * sizeof(float), stream)); dim3 blocks(DIVUP(n, 4 * BX), DIVUP(m, 4 * BY), N); dim3 threads(BX, BY); #define DISPATCH_BX_BY(BX, BY) \ diff --git a/dnn/src/cuda/padding/padding.cu b/dnn/src/cuda/padding/padding.cu index 5b4678cdfb296fcda0cc2f57bcfa1ce98470f24e..3e1147ede50e33f582d522df96143367bef97ce0 100644 --- a/dnn/src/cuda/padding/padding.cu +++ b/dnn/src/cuda/padding/padding.cu @@ -224,7 +224,7 @@ void padding_backward_proxy( params.offsets[i * 2 + 1] = offsets[i * 2 + 1]; } - cudaMemset(dst.raw_ptr(), 0, dst.layout.access_bytes()); + cuda_check(cudaMemsetAsync(dst.raw_ptr(), 0, dst.layout.access_bytes(), stream)); void (*bwd_kern)(const size_t, const size_t, const T* const, T* const, ShapeParams);