Commit 2d13462a authored by liaogang

Fix incompatible CUDA atomicAdd operation

Starting with CUDA 8.0, targets of compute capability 6.0 and above ship a native atomicAdd(double*, double). Paddle's hand-rolled double overload, injected at global scope via "using hppl::atomicAdd", could then collide with the built-in depending on the target architecture. This commit replaces it with an explicitly named template, paddle::paddleAtomicAdd, and updates all call sites: the float specialization forwards to the native atomicAdd, while the double specialization uses the native instruction on sm_60+ and a compare-and-swap loop elsewhere.

Parent: 4e37b226
@@ -16,28 +16,37 @@ limitations under the License. */
 #ifndef HL_DEVICE_FUNCTIONS_CUH_
 #define HL_DEVICE_FUNCTIONS_CUH_
 
-namespace hppl {
+namespace paddle {
 
-static __inline__ __device__ double atomicAdd(double* address, double val) {
-  // NOLINTNEXTLINE
-  unsigned long long int* address_as_ull = (unsigned long long int*)address;
-  unsigned long long int old = *address_as_ull, assumed;  // NOLINT
-
-  do {
-    assumed = old;
-    old = atomicCAS(address_as_ull,
-                    assumed,
-                    __double_as_longlong(val +
-                    __longlong_as_double(assumed)));
-  } while (assumed != old);
-
-  return __longlong_as_double(old);
-}
+template <class T>
+inline __device__ T paddleAtomicAdd(T* address, T val);
 
-}  // namespace hppl
+template <>
+inline __device__ float paddleAtomicAdd(float* address, float val) {
+  return atomicAdd(address, val);
+}
 
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
-using hppl::atomicAdd;
+template <>
+inline __device__ double paddleAtomicAdd(double* address, double val) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
+  return atomicAdd(address, val);
+#else
+  // NOLINTNEXTLINE
+  unsigned long long int* address_as_ull = (unsigned long long int*)address;
+  unsigned long long int old = *address_as_ull, assumed;  // NOLINT
+
+  do {
+    assumed = old;
+    old = atomicCAS(address_as_ull,
+                    assumed,
+                    __double_as_longlong(val +
+                    __longlong_as_double(assumed)));
+  } while (assumed != old);
+
+  return __longlong_as_double(old);
 #endif
+}
+
+}  // namespace paddle
 
 #endif /* HL_DEVICE_FUNCTIONS_CUH_ */
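
For reference, a minimal sketch of how the new entry point is called from device code. The include path is assumed from the header's include guard, and the toy kernel and launch configuration are illustrative, not part of this patch:

#include "hl_device_functions.cuh"  // assumed path, named after the include guard above
#include <cstdio>

// Illustrative kernel: every thread folds one element into a single double
// accumulator. Written against plain atomicAdd, this would only compile for
// sm_60+ targets; paddle::paddleAtomicAdd transparently falls back to the
// CAS loop on older architectures.
__global__ void sumAll(double* sum, const double* data, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    paddle::paddleAtomicAdd(sum, data[i]);
  }
}

int main() {
  const int n = 1024;
  double *sum = nullptr, *data = nullptr;
  cudaMallocManaged(&sum, sizeof(double));
  cudaMallocManaged(&data, n * sizeof(double));
  *sum = 0.0;
  for (int i = 0; i < n; ++i) data[i] = 1.0;
  sumAll<<<(n + 255) / 256, 256>>>(sum, data, n);
  cudaDeviceSynchronize();
  printf("sum = %.1f (expect %d)\n", *sum, n);
  cudaFree(data);
  cudaFree(sum);
  return 0;
}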
@@ -192,10 +192,10 @@ __global__ void KeLstmBackward(Op op,
   if (isBatch) {
     if (value.prevStateValue) {
-      if (grad.checkIgGrad) atomicAdd(grad.checkIgGrad+frameIdx, rCheckIGrad);
-      if (grad.checkFgGrad) atomicAdd(grad.checkFgGrad+frameIdx, rCheckFGrad);
+      if (grad.checkIgGrad) paddle::paddleAtomicAdd(grad.checkIgGrad+frameIdx, rCheckIGrad);
+      if (grad.checkFgGrad) paddle::paddleAtomicAdd(grad.checkFgGrad+frameIdx, rCheckFGrad);
     }
-    if (grad.checkOgGrad) atomicAdd(grad.checkOgGrad+frameIdx, rCheckOGrad);
+    if (grad.checkOgGrad) paddle::paddleAtomicAdd(grad.checkOgGrad+frameIdx, rCheckOGrad);
   } else {
     if (value.prevStateValue) {
       if (grad.checkIgGrad) grad.checkIgGrad[frameIdx] += rCheckIGrad;
...
@@ -564,11 +564,11 @@ __global__ void KeLstmBackward(real *gateValue,
   /* TODO: Temporary save & merger in another kernel */
   if (frameIdy == 1) {
-    if (checkIgGrad) atomicAdd(checkIgGrad+frameIdx, rCheckGrad);
+    if (checkIgGrad) paddle::paddleAtomicAdd(checkIgGrad+frameIdx, rCheckGrad);
   } else if (frameIdy == 2) {
-    if (checkFgGrad) atomicAdd(checkFgGrad+frameIdx, rCheckGrad);
+    if (checkFgGrad) paddle::paddleAtomicAdd(checkFgGrad+frameIdx, rCheckGrad);
   } else if (frameIdy == 3) {
-    if (checkOgGrad) atomicAdd(checkOgGrad+frameIdx, rCheckGrad);
+    if (checkOgGrad) paddle::paddleAtomicAdd(checkOgGrad+frameIdx, rCheckGrad);
   }
 }
...
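
The two LSTM hunks above follow one pattern: in batch mode, many blocks fold per-frame gradients into the same check-weight slot, so the accumulation has to be one atomic read-modify-write. A hypothetical contrast (kernel names and the float element type are made up for brevity, assuming the header above is included):

// Hypothetical, not from this patch: a plain += on a shared slot lets
// concurrent threads read the same old value and overwrite each other,
// losing updates...
__global__ void racyAccumulate(float* slot, const float* grad) {
  *slot += grad[threadIdx.x];
}

// ...whereas the templated wrapper serializes each read-modify-write.
__global__ void safeAccumulate(float* slot, const float* grad) {
  paddle::paddleAtomicAdd(slot, grad[threadIdx.x]);
}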
@@ -623,7 +623,7 @@ __global__ void KeCosSimDerivative(real* grad,
       prevGradY[index] +=
         scale * grad[ty] * prevOutX[index] * reciprocal;
     } else {
-      atomicAdd(prevGradY + index,
+      paddle::paddleAtomicAdd(prevGradY + index,
         scale * grad[ty] * prevOutX[index] * reciprocal);
     }
   }
@@ -640,7 +640,7 @@ __global__ void KeCosSimDerivative(real* grad,
         (prevOutX[index] * reciprocalXY -
          prevOutY[index] * reciprocalSquareSumY);
     } else {
-      atomicAdd(prevGradY + index, output[ty] * grad[ty] *
+      paddle::paddleAtomicAdd(prevGradY + index, output[ty] * grad[ty] *
         (prevOutX[index] * reciprocalXY -
          prevOutY[index] * reciprocalSquareSumY));
     }
...
@@ -362,7 +362,7 @@ __global__ void KeMatrixAddRows(real* output,
     if (AddRow == 0) {
       outputData[i] += tableData[i];
     } else {
-      atomicAdd(&tableData[i], outputData[i]);
+      paddle::paddleAtomicAdd(&tableData[i], outputData[i]);
     }
   }
 }
...
@@ -280,7 +280,7 @@ __global__ void KeSMatrixCscMulDense(real *C_d,
   if (index_n_t < dimN) {
     real tmp;
     tmp = alpha*a_r*b_r[n];
-    atomicAdd(C_d_r, tmp);
+    paddle::paddleAtomicAdd(C_d_r, tmp);
     C_d_r += CU_CSC_MUL_DENSE_THREAD_X;
     index_n_t += CU_CSC_MUL_DENSE_THREAD_X;
   }
@@ -328,7 +328,7 @@ __global__ void KeSMatrixCscMulDense(real *C_d,
   if (index_n_t < dimN) {
     real tmp;
     tmp = alpha*a_r*b_r[n];
-    atomicAdd(C_d_r, tmp);
+    paddle::paddleAtomicAdd(C_d_r, tmp);
     C_d_r += CU_CSC_MUL_DENSE_THREAD_X;
     index_n_t += CU_CSC_MUL_DENSE_THREAD_X;
   }
@@ -629,7 +629,7 @@ __global__ void KeSMatrixDenseMulCsr(real *C_d,
   for (int n=0; n < CU_DM_CSR_N; n++) {
     if (index_m_t++ < dimM) {
       tmp = alpha * b_r * a_r[n];
-      atomicAdd(C_d_r, tmp);
+      paddle::paddleAtomicAdd(C_d_r, tmp);
       C_d_r += dimN;
     }
   }
@@ -660,7 +660,7 @@ __global__ void KeSMatrixDenseMulCsr(real *C_d,
   for (int n=0; n < CU_DM_CSR_N; n++) {
     if (index_m_t++ < dimM) {
       tmp = alpha * b_r * a_r[n];
-      atomicAdd(C_d_r, tmp);
+      paddle::paddleAtomicAdd(C_d_r, tmp);
       C_d_r += dimN;
     }
   }
@@ -912,7 +912,7 @@ __global__ void KeSMatrixCsrColumnSum(real* a_val, real* csr_val,
   for (int idx = gid; idx < dimNNZ; idx += gridDim.x * blockDim.x) {
     int colIdx = csr_col[idx];
     real val = csr_val[idx];
-    atomicAdd(a_val + colIdx, val);
+    paddle::paddleAtomicAdd(a_val + colIdx, val);
   }
 }
...
@@ -35,7 +35,7 @@ __global__ void KeMatrixAddRows(real* output, int ldo,
   real *tab = table + tableId * ldt;
   for (int i = idx; i < dim; i += blockDimX) {
     if (AddRow) {
-      atomicAdd(&tab[i], out[i]);
+      paddle::paddleAtomicAdd(&tab[i], out[i]);
     } else {
       out[i] += tab[i];
     }
...
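
Finally, for readers puzzling over the pre-sm_60 double path in the header: it is the standard compare-and-swap emulation of a floating-point atomic add, operating on the value's 64-bit pattern. A host-side analogy in plain C++, where std::atomic stands in for the device's atomicCAS (an explanatory sketch, not code from this patch):

#include <atomic>

// Read the current value, propose old + val, and retry until no other
// thread slipped in between the read and the swap. Like the device
// version, this returns the value that was in the slot before our add.
double atomicAddAnalogy(std::atomic<double>& slot, double val) {
  double assumed = slot.load();
  while (!slot.compare_exchange_weak(assumed, assumed + val)) {
    // on failure, compare_exchange_weak reloads `assumed`; just retry
  }
  return assumed;
}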