Commit 2d13462a authored by liaogang

Fix incompatibility with the CUDA atomicAdd operation

Parent 4e37b226
@@ -16,28 +16,37 @@ limitations under the License. */
 #ifndef HL_DEVICE_FUNCTIONS_CUH_
 #define HL_DEVICE_FUNCTIONS_CUH_

-namespace hppl {
-
-static __inline__ __device__ double atomicAdd(double* address, double val) {
-  // NOLINTNEXTLINE
-  unsigned long long int* address_as_ull = (unsigned long long int*)address;
-  unsigned long long int old = *address_as_ull, assumed;  // NOLINT
-
-  do {
-    assumed = old;
-    old = atomicCAS(address_as_ull,
-                    assumed,
-                    __double_as_longlong(val +
-                                         __longlong_as_double(assumed)));
-  } while (assumed != old);
-
-  return __longlong_as_double(old);
-}
-
-}  // namespace hppl
-
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
-using hppl::atomicAdd;
-#endif
+namespace paddle {
+
+template <class T>
+inline __device__ T paddleAtomicAdd(T* address, T val);
+
+template <>
+inline __device__ float paddleAtomicAdd(float* address, float val) {
+  return atomicAdd(address, val);
+}
+
+template <>
+inline __device__ double paddleAtomicAdd(double* address, double val) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
+  return atomicAdd(address, val);
+#else
+  // NOLINTNEXTLINE
+  unsigned long long int* address_as_ull = (unsigned long long int*)address;
+  unsigned long long int old = *address_as_ull, assumed;  // NOLINT
+
+  do {
+    assumed = old;
+    old = atomicCAS(address_as_ull,
+                    assumed,
+                    __double_as_longlong(val +
+                                         __longlong_as_double(assumed)));
+  } while (assumed != old);
+
+  return __longlong_as_double(old);
+#endif
+}
+
+}  // namespace paddle

 #endif /* HL_DEVICE_FUNCTIONS_CUH_ */
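
For context: devices of compute capability 6.0 and later provide a native atomicAdd(double*, double), which collides with the hand-rolled hppl::atomicAdd overload removed above. The paddle::paddleAtomicAdd template keeps a single spelling at every call site: float always maps to the hardware atomicAdd, while double uses the native instruction on sm_60+ and falls back to the atomicCAS loop on older GPUs. Below is a minimal usage sketch, not part of the commit; the kernel name KeAccumulate is hypothetical and the header is assumed to be includable as "hl_device_functions.cuh".

// Minimal usage sketch (illustrative): accumulate per-element values into a
// single double with paddle::paddleAtomicAdd. On compute capability >= 6.0
// this resolves to the native double atomicAdd; on older devices it uses the
// atomicCAS emulation defined above.
#include "hl_device_functions.cuh"

__global__ void KeAccumulate(double* sum, const double* data, int n) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < n) {
    paddle::paddleAtomicAdd(sum, data[idx]);
  }
}
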
@@ -192,10 +192,10 @@ __global__ void KeLstmBackward(Op op,
   if (isBatch) {
     if (value.prevStateValue) {
-      if (grad.checkIgGrad) atomicAdd(grad.checkIgGrad+frameIdx, rCheckIGrad);
-      if (grad.checkFgGrad) atomicAdd(grad.checkFgGrad+frameIdx, rCheckFGrad);
+      if (grad.checkIgGrad) paddle::paddleAtomicAdd(grad.checkIgGrad+frameIdx, rCheckIGrad);
+      if (grad.checkFgGrad) paddle::paddleAtomicAdd(grad.checkFgGrad+frameIdx, rCheckFGrad);
     }
-    if (grad.checkOgGrad) atomicAdd(grad.checkOgGrad+frameIdx, rCheckOGrad);
+    if (grad.checkOgGrad) paddle::paddleAtomicAdd(grad.checkOgGrad+frameIdx, rCheckOGrad);
   } else {
     if (value.prevStateValue) {
       if (grad.checkIgGrad) grad.checkIgGrad[frameIdx] += rCheckIGrad;

@@ -564,11 +564,11 @@ __global__ void KeLstmBackward(real *gateValue,
   /* TODO: Temporary save & merger in another kernel */
   if (frameIdy == 1) {
-    if (checkIgGrad) atomicAdd(checkIgGrad+frameIdx, rCheckGrad);
+    if (checkIgGrad) paddle::paddleAtomicAdd(checkIgGrad+frameIdx, rCheckGrad);
   } else if (frameIdy == 2) {
-    if (checkFgGrad) atomicAdd(checkFgGrad+frameIdx, rCheckGrad);
+    if (checkFgGrad) paddle::paddleAtomicAdd(checkFgGrad+frameIdx, rCheckGrad);
   } else if (frameIdy == 3) {
-    if (checkOgGrad) atomicAdd(checkOgGrad+frameIdx, rCheckGrad);
+    if (checkOgGrad) paddle::paddleAtomicAdd(checkOgGrad+frameIdx, rCheckGrad);
   }
 }

@@ -623,7 +623,7 @@ __global__ void KeCosSimDerivative(real* grad,
         prevGradY[index] +=
             scale * grad[ty] * prevOutX[index] * reciprocal;
       } else {
-        atomicAdd(prevGradY + index,
+        paddle::paddleAtomicAdd(prevGradY + index,
             scale * grad[ty] * prevOutX[index] * reciprocal);
       }
     }

@@ -640,7 +640,7 @@ __global__ void KeCosSimDerivative(real* grad,
             (prevOutX[index] * reciprocalXY -
              prevOutY[index] * reciprocalSquareSumY);
       } else {
-        atomicAdd(prevGradY + index, output[ty] * grad[ty] *
+        paddle::paddleAtomicAdd(prevGradY + index, output[ty] * grad[ty] *
             (prevOutX[index] * reciprocalXY -
              prevOutY[index] * reciprocalSquareSumY));
       }

@@ -362,7 +362,7 @@ __global__ void KeMatrixAddRows(real* output,
     if (AddRow == 0) {
       outputData[i] += tableData[i];
     } else {
-      atomicAdd(&tableData[i], outputData[i]);
+      paddle::paddleAtomicAdd(&tableData[i], outputData[i]);
     }
   }
 }

@@ -280,7 +280,7 @@ __global__ void KeSMatrixCscMulDense(real *C_d,
     if (index_n_t < dimN) {
       real tmp;
       tmp = alpha*a_r*b_r[n];
-      atomicAdd(C_d_r, tmp);
+      paddle::paddleAtomicAdd(C_d_r, tmp);
       C_d_r += CU_CSC_MUL_DENSE_THREAD_X;
       index_n_t += CU_CSC_MUL_DENSE_THREAD_X;
     }
@@ -328,7 +328,7 @@ __global__ void KeSMatrixCscMulDense(real *C_d,
     if (index_n_t < dimN) {
       real tmp;
       tmp = alpha*a_r*b_r[n];
-      atomicAdd(C_d_r, tmp);
+      paddle::paddleAtomicAdd(C_d_r, tmp);
       C_d_r += CU_CSC_MUL_DENSE_THREAD_X;
       index_n_t += CU_CSC_MUL_DENSE_THREAD_X;
     }
@@ -629,7 +629,7 @@ __global__ void KeSMatrixDenseMulCsr(real *C_d,
     for (int n=0; n < CU_DM_CSR_N; n++) {
       if (index_m_t++ < dimM) {
         tmp = alpha * b_r * a_r[n];
-        atomicAdd(C_d_r, tmp);
+        paddle::paddleAtomicAdd(C_d_r, tmp);
         C_d_r += dimN;
       }
     }
@@ -660,7 +660,7 @@ __global__ void KeSMatrixDenseMulCsr(real *C_d,
     for (int n=0; n < CU_DM_CSR_N; n++) {
       if (index_m_t++ < dimM) {
         tmp = alpha * b_r * a_r[n];
-        atomicAdd(C_d_r, tmp);
+        paddle::paddleAtomicAdd(C_d_r, tmp);
         C_d_r += dimN;
       }
     }
@@ -912,7 +912,7 @@ __global__ void KeSMatrixCsrColumnSum(real* a_val, real* csr_val,
   for (int idx = gid; idx < dimNNZ; idx += gridDim.x * blockDim.x) {
     int colIdx = csr_col[idx];
     real val = csr_val[idx];
-    atomicAdd(a_val + colIdx, val);
+    paddle::paddleAtomicAdd(a_val + colIdx, val);
   }
 }

@@ -35,7 +35,7 @@ __global__ void KeMatrixAddRows(real* output, int ldo,
   real *tab = table + tableId * ldt;
   for (int i = idx; i < dim; i += blockDimX) {
     if (AddRow) {
-      atomicAdd(&tab[i], out[i]);
+      paddle::paddleAtomicAdd(&tab[i], out[i]);
     } else {
       out[i] += tab[i];
     }