From d6038c22696e23dfc181643694e84f888e8001ae Mon Sep 17 00:00:00 2001
From: Li Min <11663212+limin2021@users.noreply.github.com>
Date: Thu, 24 Feb 2022 10:21:33 +0800
Subject: [PATCH] optimize performance of lookup_table_v2_op (#39856)

* optimize block config and fp16 atomicAdd perf for lookup_table_v2_grad.
---
 paddle/fluid/operators/lookup_table_v2_op.cu  | 45 ++++++----
 .../platform/device/gpu/gpu_primitives.h      | 88 +++++++++++++++++++
 2 files changed, 115 insertions(+), 18 deletions(-)

diff --git a/paddle/fluid/operators/lookup_table_v2_op.cu b/paddle/fluid/operators/lookup_table_v2_op.cu
index 4539f7091b..d40b264378 100644
--- a/paddle/fluid/operators/lookup_table_v2_op.cu
+++ b/paddle/fluid/operators/lookup_table_v2_op.cu
@@ -21,19 +21,18 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-template <typename T, typename IdT, int BlockDimX, int BlockDimY, int GridDimX,
-          bool PaddingFlag>
+template <typename T, typename IdT, bool PaddingFlag>
 __global__ void LookupTableV2(T *output, const T *table, const IdT *ids,
                               const int64_t N, const int64_t K, const int64_t D,
                               const int64_t padding_idx) {
   int idx = threadIdx.x;
-  int idy = blockIdx.x + threadIdx.y * GridDimX;
+  int idy = blockIdx.x + threadIdx.y * gridDim.x;
 
   while (idy < K) {
     auto id = static_cast<int64_t>(ids[idy]);
     T *out = output + idy * D;
     const T *tab = table + id * D;
-    for (int i = idx; i < D; i += BlockDimX) {
+    for (int i = idx; i < D; i += blockDim.x) {
       if (PaddingFlag) {
         if (id == padding_idx)
           out[i] = static_cast<T>(0);
@@ -43,25 +42,29 @@ __global__ void LookupTableV2(T *output, const T *table, const IdT *ids,
         out[i] = tab[i];
       }
     }
-    idy += BlockDimY * GridDimX;
+    idy += blockDim.y * gridDim.x;
   }
 }
 
-template <typename T, typename IdT, int BlockDimX, int BlockDimY, int GridDimX>
+template <typename T, typename IdT>
 __global__ void LookupTableV2Grad(T *table, const T *output, const IdT *ids,
                                   const int64_t N, const int64_t K,
                                   const int64_t D) {
   int idx = threadIdx.x;
-  int idy = blockIdx.x + threadIdx.y * GridDimX;
+  int idy = blockIdx.x + threadIdx.y * gridDim.x;
 
   while (idy < K) {
     auto id = static_cast<int64_t>(ids[idy]);
     const T *out = output + idy * D;
     T *tab = table + id * D;
-    for (int i = idx; i < D; i += BlockDimX) {
+#ifdef PADDLE_WITH_CUDA
+    paddle::platform::VectorizedAtomicAddPerBlock(D, idx, blockDim.x, out, tab);
+#else
+    for (int i = idx; i < D; i += blockDim.x) {
       paddle::platform::CudaAtomicAdd(&tab[i], out[i]);
     }
-    idy += BlockDimY * GridDimX;
+#endif
+    idy += blockDim.y * gridDim.x;
   }
 }
 
@@ -81,8 +84,9 @@ struct LookupTableV2CUDAFunctor {
     size_t D = table_t->dims()[1];
     size_t K = ids_t_->numel();
 
+    const int gridx = 2 * context_.cuda_device_context().GetSMCount();
     dim3 threads(256, 4);
-    dim3 grids(80, 1);
+    dim3 grids(gridx, 1);
 
     const auto *table = table_t->template data<T>();
     const auto *ids = ids_t_->template data<IdT>();
@@ -90,10 +94,10 @@ struct LookupTableV2CUDAFunctor {
     auto stream = context_.cuda_device_context().stream();
 
     if (padding_idx == -1) {
-      LookupTableV2<T, IdT, 256, 4, 80, false><<<grids, threads, 0, stream>>>(
+      LookupTableV2<T, IdT, false><<<grids, threads, 0, stream>>>(
          output, table, ids, N, K, D, padding_idx);
     } else {
-      LookupTableV2<T, IdT, 256, 4, 80, true><<<grids, threads, 0, stream>>>(
+      LookupTableV2<T, IdT, true><<<grids, threads, 0, stream>>>(
          output, table, ids, N, K, D, padding_idx);
     }
   }
@@ -193,17 +197,22 @@ struct LookupTableV2GradCUDAFunctor {
       int D = d_table_t->dims()[1];
       int K = ids_t_->numel();
 
-      dim3 threads(128, 8);
-      dim3 grids(8, 1);
       const T *d_output = d_output_t->template data<T>();
       const auto *ids = ids_t_->template data<IdT>();
       T *d_table = d_table_t->mutable_data<T>(context_.GetPlace());
 
-      auto t = framework::EigenVector<T>::Flatten(*d_table_t);
-      t.device(*dev_ctx.eigen_device()) = t.constant(static_cast<T>(0));
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          hipMemsetAsync(d_table, 0, N * D * sizeof(T), dev_ctx.stream()));
+#else
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          cudaMemsetAsync(d_table, 0, N * D * sizeof(T), dev_ctx.stream()));
+#endif
 
-      LookupTableV2Grad<T, IdT, 128, 8,
-                        8><<<grids, threads, 0, dev_ctx.stream()>>>(
+      const int gridx = 2 * dev_ctx.GetSMCount();
+      dim3 threads(128, 8);
+      dim3 grids(gridx, 1);
+      LookupTableV2Grad<T, IdT><<<grids, threads, 0, dev_ctx.stream()>>>(
          d_table, d_output, ids, N, K, D);
     }
   }
diff --git a/paddle/fluid/platform/device/gpu/gpu_primitives.h b/paddle/fluid/platform/device/gpu/gpu_primitives.h
index 3e070da546..8616e969f6 100644
--- a/paddle/fluid/platform/device/gpu/gpu_primitives.h
+++ b/paddle/fluid/platform/device/gpu/gpu_primitives.h
@@ -147,6 +147,94 @@ CUDA_ATOMIC_WRAPPER(Add, float16) {
   }
 }
 #endif
+
+// atomicAdd on a single half is slow, while atomicAdd on a half2 is fast.
+// So for fp16, pack two values into a half2 and use "atomicAdd(half2 *)".
+template <typename T, typename std::enable_if<std::is_same<
+                          platform::float16, T>::value>::type * = nullptr>
+__device__ __forceinline__ void fastAtomicAdd(T *tensor, size_t index,
+                                              const size_t numel, T value) {
+#if ((CUDA_VERSION < 10000) || \
+     (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700)))
+  CudaAtomicAdd(reinterpret_cast<platform::float16 *>(tensor) + index,
+                static_cast<platform::float16>(value));
+#else
+  // Check whether the target address is aligned to __half2 (4 bytes).
+  __half *target_addr = reinterpret_cast<__half *>(tensor + index);
+  bool aligned_half2 =
+      (reinterpret_cast<std::uintptr_t>(target_addr) % sizeof(__half2) == 0);
+
+  if (aligned_half2 && index < (numel - 1)) {
+    __half2 value2;
+    value2.x = *reinterpret_cast<__half *>(&value);
+    value2.y = __int2half_rz(0);
+    atomicAdd(reinterpret_cast<__half2 *>(target_addr), value2);
+
+  } else if (!aligned_half2 && index > 0) {
+    __half2 value2;
+    value2.x = __int2half_rz(0);
+    value2.y = *reinterpret_cast<__half *>(&value);
+    atomicAdd(reinterpret_cast<__half2 *>(target_addr - 1), value2);
+
+  } else {
+    atomicAdd(reinterpret_cast<__half *>(tensor) + index,
+              *reinterpret_cast<__half *>(&value));
+  }
+#endif
+}
+
+template <typename T, typename std::enable_if<!std::is_same<
+                          platform::float16, T>::value>::type * = nullptr>
+__device__ __forceinline__ void fastAtomicAdd(T *arr, size_t index,
+                                              const size_t numel, T value) {
+  CudaAtomicAdd(arr + index, value);
+}
+
+#ifdef PADDLE_WITH_CUDA
+/*
+ * One thread block deals with elementwise atomicAdd for a vector of len.
+ * @in: [x1, x2, x3, ...]
+ * @out:[y1+x1, y2+x2, y3+x3, ...]
+ * */
+template <typename T, typename std::enable_if<!std::is_same<
+                          platform::float16, T>::value>::type * = nullptr>
+__device__ __forceinline__ void VectorizedAtomicAddPerBlock(
+    const int64_t len, int tid, int threads_per_block, const T *in, T *out) {
+  for (int i = tid; i < len; i += threads_per_block) {
+    CudaAtomicAdd(&out[i], in[i]);
+  }
+}
+
+// Note: assumes that len is even. If len is odd, call fastAtomicAdd directly.
+template <typename T, typename std::enable_if<std::is_same<
+                          platform::float16, T>::value>::type * = nullptr>
+__device__ __forceinline__ void VectorizedAtomicAddPerBlock(
+    const int64_t len, int tid, int threads_per_block, const T *in, T *out) {
+  int i = 0;
+  int loops = len / 2 * 2;
+
+  bool aligned_half2 =
+      (reinterpret_cast<std::uintptr_t>(out) % sizeof(__half2) == 0);
+
+  if (aligned_half2) {
+    for (i = tid * 2; i < loops; i += threads_per_block * 2) {
+      __half2 value2;
+      T value_1 = in[i];
+      T value_2 = in[i + 1];
+      value2.x = *reinterpret_cast<__half *>(&value_1);
+      value2.y = *reinterpret_cast<__half *>(&value_2);
+      atomicAdd(reinterpret_cast<__half2 *>(&out[i]), value2);
+    }
+    for (; i < len; i += threads_per_block) {
+      fastAtomicAdd(out, i, len, in[i]);
+    }
+  } else {
+    for (int i = tid; i < len; i += threads_per_block) {
+      fastAtomicAdd(out, i, len, in[i]);
+    }
+  }
+}
+#endif
 #endif
 
 CUDA_ATOMIC_WRAPPER(Add, complex<float>) {
--
GitLab
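
Reviewer note (illustration only, not part of the patch): the core of the fp16 fast path above is that atomicAdd on a __half2 updates two adjacent fp16 values with a single 4-byte atomic, roughly halving the number of atomics LookupTableV2Grad issues when the embedding width D is even and the destination row is 4-byte aligned. The standalone sketch below shows that idea outside Paddle; the kernel name ScatterAddRows, the host driver, and the problem sizes are invented for the example, it uses raw __half instead of platform::float16, and it assumes a device of compute capability 7.0+ (build with: nvcc -arch=sm_70 half2_scatter_add.cu).

// half2_scatter_add.cu -- minimal sketch of the __half2 atomicAdd trick.
#include <cuda_fp16.h>

#include <cstdint>
#include <cstdio>
#include <vector>

// Each block accumulates one source row (width D) into dst row ids[blockIdx.x].
// Colliding ids force atomic accumulation, as in the embedding gradient.
__global__ void ScatterAddRows(__half *dst, const __half *src, const int *ids,
                               int D) {
  const __half *in = src + blockIdx.x * D;
  __half *out = dst + ids[blockIdx.x] * D;

  bool aligned = (reinterpret_cast<std::uintptr_t>(out) % sizeof(__half2) == 0);
  if (aligned && (D % 2 == 0)) {
    // Vectorized path: one atomicAdd(__half2 *) covers two fp16 elements.
    for (int i = 2 * threadIdx.x; i < D; i += 2 * blockDim.x) {
      __half2 v = __halves2half2(in[i], in[i + 1]);
      atomicAdd(reinterpret_cast<__half2 *>(out + i), v);
    }
  } else {
    // Scalar fallback: fp16 atomicAdd needs compute capability 7.0+.
    for (int i = threadIdx.x; i < D; i += blockDim.x) {
      atomicAdd(out + i, in[i]);
    }
  }
}

int main() {
  const int K = 1024, N = 16, D = 64;  // K source rows scattered into N rows.
  std::vector<__half> h_src(K * D, __float2half(1.0f));
  std::vector<__half> h_dst(N * D, __float2half(0.0f));
  std::vector<int> h_ids(K);
  for (int i = 0; i < K; ++i) h_ids[i] = i % N;  // many collisions -> atomics.

  __half *d_src, *d_dst;
  int *d_ids;
  cudaMalloc(&d_src, h_src.size() * sizeof(__half));
  cudaMalloc(&d_dst, h_dst.size() * sizeof(__half));
  cudaMalloc(&d_ids, h_ids.size() * sizeof(int));
  cudaMemcpy(d_src, h_src.data(), h_src.size() * sizeof(__half),
             cudaMemcpyHostToDevice);
  cudaMemcpy(d_dst, h_dst.data(), h_dst.size() * sizeof(__half),
             cudaMemcpyHostToDevice);
  cudaMemcpy(d_ids, h_ids.data(), h_ids.size() * sizeof(int),
             cudaMemcpyHostToDevice);

  ScatterAddRows<<<K, 128>>>(d_dst, d_src, d_ids, D);
  cudaMemcpy(h_dst.data(), d_dst, h_dst.size() * sizeof(__half),
             cudaMemcpyDeviceToHost);
  // Each destination element receives K / N increments of 1.0.
  printf("dst[0] = %f (expected %d)\n", __half2float(h_dst[0]), K / N);

  cudaFree(d_src);
  cudaFree(d_dst);
  cudaFree(d_ids);
  return 0;
}

The other half of the patch is the launch configuration: instead of the hard-coded grids of 80 (forward) and 8 (backward) blocks, the grid is sized as 2 * GetSMCount(), so the number of resident blocks scales with the device rather than being tuned for one particular GPU.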