diff --git a/paddle/cuda/include/hl_device_functions.cuh b/paddle/cuda/include/hl_device_functions.cuh
index 27e3f450c5c1c987f4b3fbc2eb1e52a1696c7320..88d950d6c17132d1d9969d0f3766395377e2de96 100755
--- a/paddle/cuda/include/hl_device_functions.cuh
+++ b/paddle/cuda/include/hl_device_functions.cuh
@@ -16,28 +16,37 @@ limitations under the License. */
 #ifndef HL_DEVICE_FUNCTIONS_CUH_
 #define HL_DEVICE_FUNCTIONS_CUH_
 
-namespace hppl {
-
-static __inline__ __device__ double atomicAdd(double* address, double val) {
-  // NOLINTNEXTLINE
-  unsigned long long int* address_as_ull = (unsigned long long int*)address;
-  unsigned long long int old = *address_as_ull, assumed;  // NOLINT
-
-  do {
-    assumed = old;
-    old = atomicCAS(address_as_ull,
-                    assumed,
-                    __double_as_longlong(val +
-                    __longlong_as_double(assumed)));
-  } while (assumed != old);
-
-  return __longlong_as_double(old);
-}
+namespace paddle {
+
+template <class T>
+inline __device__ T paddleAtomicAdd(T* address, T val);
 
-}  // namespace hppl
+template <>
+inline __device__ float paddleAtomicAdd(float* address, float val) {
+  return atomicAdd(address, val);
+}
 
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
-using hppl::atomicAdd;
+template <>
+inline __device__ double paddleAtomicAdd(double* address, double val) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
+  return atomicAdd(address, val);
+#else
+  // NOLINTNEXTLINE
+  unsigned long long int* address_as_ull = (unsigned long long int*)address;
+  unsigned long long int old = *address_as_ull, assumed;  // NOLINT
+
+  do {
+    assumed = old;
+    old = atomicCAS(address_as_ull,
+                    assumed,
+                    __double_as_longlong(val +
+                    __longlong_as_double(assumed)));
+  } while (assumed != old);
+
+  return __longlong_as_double(old);
 #endif
+}
+}  // namespace paddle
+
 
 #endif /* HL_DEVICE_FUNCTIONS_CUH_ */
diff --git a/paddle/cuda/include/hl_gpu_lstm.cuh b/paddle/cuda/include/hl_gpu_lstm.cuh
index 2ca33f2b13a1f7c0863275be2993583996f70e50..07806e11c18a2b47d79237587a0e882d7bf2a1d2 100644
--- a/paddle/cuda/include/hl_gpu_lstm.cuh
+++ b/paddle/cuda/include/hl_gpu_lstm.cuh
@@ -192,10 +192,10 @@ __global__ void KeLstmBackward(Op op,
 
   if (isBatch) {
     if (value.prevStateValue) {
-      if (grad.checkIgGrad) atomicAdd(grad.checkIgGrad+frameIdx, rCheckIGrad);
-      if (grad.checkFgGrad) atomicAdd(grad.checkFgGrad+frameIdx, rCheckFGrad);
+      if (grad.checkIgGrad) paddle::paddleAtomicAdd(grad.checkIgGrad+frameIdx, rCheckIGrad);
+      if (grad.checkFgGrad) paddle::paddleAtomicAdd(grad.checkFgGrad+frameIdx, rCheckFGrad);
     }
-    if (grad.checkOgGrad) atomicAdd(grad.checkOgGrad+frameIdx, rCheckOGrad);
+    if (grad.checkOgGrad) paddle::paddleAtomicAdd(grad.checkOgGrad+frameIdx, rCheckOGrad);
   } else {
     if (value.prevStateValue) {
       if (grad.checkIgGrad) grad.checkIgGrad[frameIdx] += rCheckIGrad;
diff --git a/paddle/cuda/src/hl_cuda_lstm.cu b/paddle/cuda/src/hl_cuda_lstm.cu
index 64699c9f6d45040a05c9d8a1371f1afa4cfb43e3..cf009620bf69d05397c5e03de3f7f2856bf4ff6b 100644
--- a/paddle/cuda/src/hl_cuda_lstm.cu
+++ b/paddle/cuda/src/hl_cuda_lstm.cu
@@ -564,11 +564,11 @@ __global__ void KeLstmBackward(real *gateValue,
 
   /* TODO: Temporary save & merger in another kernel */
   if (frameIdy == 1) {
-    if (checkIgGrad) atomicAdd(checkIgGrad+frameIdx, rCheckGrad);
+    if (checkIgGrad) paddle::paddleAtomicAdd(checkIgGrad+frameIdx, rCheckGrad);
   } else if (frameIdy == 2) {
-    if (checkFgGrad) atomicAdd(checkFgGrad+frameIdx, rCheckGrad);
+    if (checkFgGrad) paddle::paddleAtomicAdd(checkFgGrad+frameIdx, rCheckGrad);
   } else if (frameIdy == 3) {
-    if (checkOgGrad) atomicAdd(checkOgGrad+frameIdx, rCheckGrad);
+    if (checkOgGrad) paddle::paddleAtomicAdd(checkOgGrad+frameIdx, rCheckGrad);
   }
 }
 
diff --git a/paddle/cuda/src/hl_cuda_matrix.cu b/paddle/cuda/src/hl_cuda_matrix.cu
index ecc44944e4fa19b064fb0aa09d81e2143e5bc85d..38e4f16217c2a4779ca9b1fd76df6276819645ff 100644
--- a/paddle/cuda/src/hl_cuda_matrix.cu
+++ b/paddle/cuda/src/hl_cuda_matrix.cu
@@ -623,7 +623,7 @@ __global__ void KeCosSimDerivative(real* grad,
         prevGradY[index] +=
             scale * grad[ty] * prevOutX[index] * reciprocal;
       } else {
-        atomicAdd(prevGradY + index,
+        paddle::paddleAtomicAdd(prevGradY + index,
             scale * grad[ty] * prevOutX[index] * reciprocal);
       }
     }
@@ -640,7 +640,7 @@ __global__ void KeCosSimDerivative(real* grad,
             (prevOutX[index] * reciprocalXY -
              prevOutY[index] * reciprocalSquareSumY);
       } else {
-        atomicAdd(prevGradY + index, output[ty] * grad[ty] *
+        paddle::paddleAtomicAdd(prevGradY + index, output[ty] * grad[ty] *
             (prevOutX[index] * reciprocalXY -
              prevOutY[index] * reciprocalSquareSumY));
       }
diff --git a/paddle/cuda/src/hl_cuda_sequence.cu b/paddle/cuda/src/hl_cuda_sequence.cu
index f88a2682fd0604c4c33cb6dd7d49d63e058bebc1..e028880156e5b191d032253bc62739c9e5ab34fc 100644
--- a/paddle/cuda/src/hl_cuda_sequence.cu
+++ b/paddle/cuda/src/hl_cuda_sequence.cu
@@ -362,7 +362,7 @@ __global__ void KeMatrixAddRows(real* output,
       if (AddRow == 0) {
         outputData[i] += tableData[i];
       } else {
-        atomicAdd(&tableData[i], outputData[i]);
+        paddle::paddleAtomicAdd(&tableData[i], outputData[i]);
       }
     }
   }
diff --git a/paddle/cuda/src/hl_cuda_sparse.cuh b/paddle/cuda/src/hl_cuda_sparse.cuh
index becb6c66492c1b208e303af959352f1a246349c2..db5c9ce979885a173c8caadc8f6b47836f1771b5 100644
--- a/paddle/cuda/src/hl_cuda_sparse.cuh
+++ b/paddle/cuda/src/hl_cuda_sparse.cuh
@@ -280,7 +280,7 @@ __global__ void KeSMatrixCscMulDense(real *C_d,
     if (index_n_t < dimN) {
       real tmp;
       tmp = alpha*a_r*b_r[n];
-      atomicAdd(C_d_r, tmp);
+      paddle::paddleAtomicAdd(C_d_r, tmp);
       C_d_r += CU_CSC_MUL_DENSE_THREAD_X;
       index_n_t += CU_CSC_MUL_DENSE_THREAD_X;
     }
@@ -328,7 +328,7 @@ __global__ void KeSMatrixCscMulDense(real *C_d,
     if (index_n_t < dimN) {
       real tmp;
       tmp = alpha*a_r*b_r[n];
-      atomicAdd(C_d_r, tmp);
+      paddle::paddleAtomicAdd(C_d_r, tmp);
       C_d_r += CU_CSC_MUL_DENSE_THREAD_X;
       index_n_t += CU_CSC_MUL_DENSE_THREAD_X;
     }
@@ -629,7 +629,7 @@ __global__ void KeSMatrixDenseMulCsr(real *C_d,
   for (int n=0; n < CU_DM_CSR_N; n++) {
     if (index_m_t++ < dimM) {
       tmp = alpha * b_r * a_r[n];
-      atomicAdd(C_d_r, tmp);
+      paddle::paddleAtomicAdd(C_d_r, tmp);
       C_d_r += dimN;
     }
   }
@@ -660,7 +660,7 @@ __global__ void KeSMatrixDenseMulCsr(real *C_d,
   for (int n=0; n < CU_DM_CSR_N; n++) {
     if (index_m_t++ < dimM) {
       tmp = alpha * b_r * a_r[n];
-      atomicAdd(C_d_r, tmp);
+      paddle::paddleAtomicAdd(C_d_r, tmp);
       C_d_r += dimN;
     }
   }
@@ -912,7 +912,7 @@ __global__ void KeSMatrixCsrColumnSum(real* a_val, real* csr_val,
   for (int idx = gid; idx < dimNNZ; idx += gridDim.x * blockDim.x) {
     int colIdx = csr_col[idx];
     real val = csr_val[idx];
-    atomicAdd(a_val + colIdx, val);
+    paddle::paddleAtomicAdd(a_val + colIdx, val);
   }
 }
 
diff --git a/paddle/cuda/src/hl_table_apply.cu b/paddle/cuda/src/hl_table_apply.cu
index 05335c5f835fc52d48753d34cca1037e371cddf2..52ee4610edf670dc339e0ece66d58153c0164499 100644
--- a/paddle/cuda/src/hl_table_apply.cu
+++ b/paddle/cuda/src/hl_table_apply.cu
@@ -35,7 +35,7 @@ __global__ void KeMatrixAddRows(real* output, int ldo,
   real *tab = table + tableId * ldt;
   for (int i = idx; i < dim; i += blockDimX) {
     if (AddRow) {
-      atomicAdd(&tab[i], out[i]);
+      paddle::paddleAtomicAdd(&tab[i], out[i]);
     } else {
       out[i] += tab[i];
     }
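
For reference, a minimal usage sketch that is not part of the patch: a hypothetical kernel (the name KeAccumulate and its launch configuration are invented for illustration) that accumulates per-thread contributions through the new wrapper. paddleAtomicAdd<float> forwards to the hardware atomicAdd, while paddleAtomicAdd<double> uses the native double-precision instruction on compute capability 6.0 and above and falls back to the atomicCAS loop shown in hl_device_functions.cuh otherwise, so the same kernel compiles for both real types.

// Hypothetical example, assuming hl_device_functions.cuh is on the include path.
#include "hl_device_functions.cuh"

// Accumulate values[0..n) into *sum; safe for concurrent threads because
// paddle::paddleAtomicAdd dispatches to an atomic add for the element type.
template <class T>
__global__ void KeAccumulate(T* sum, const T* values, int n) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < n) {
    paddle::paddleAtomicAdd(sum, values[idx]);
  }
}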