Unverified commit d6038c22, authored by Li Min, committed by GitHub

optimize performance of lookup_table_v2_op (#39856)

* optimize block config and fp16 atomicAdd perf for lookup_table_v2_grad.
Parent 76a6b88d
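For context, the "block config" half of this change replaces hard-coded grid sizes (80 blocks for the forward kernel, 8 for the gradient kernel) with a grid of 2 * GetSMCount() blocks, so the launch scales with the device. The standalone sketch below only illustrates that launch pattern with the plain CUDA runtime API; the kernel name GridStrideAddOne and the buffer sizes are made up for the example and are not part of the patch.

#include <cstdio>
#include <cuda_runtime.h>

// A grid-stride kernel is correct for any grid size; the grid size only
// affects how well the SMs are kept busy, which is why the patch derives
// it from the SM count instead of a fixed template constant.
__global__ void GridStrideAddOne(float *data, int n) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x) {
    data[i] += 1.f;
  }
}

int main() {
  int device = 0, sm_count = 0;
  cudaGetDevice(&device);
  cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device);

  // Mirror the patch: roughly two blocks per SM instead of a fixed grid.
  dim3 threads(256, 1);
  dim3 grids(2 * sm_count, 1);

  const int n = 1 << 20;
  float *d = nullptr;
  cudaMalloc(&d, n * sizeof(float));
  cudaMemset(d, 0, n * sizeof(float));
  GridStrideAddOne<<<grids, threads>>>(d, n);
  cudaDeviceSynchronize();
  printf("launched %u blocks on %d SMs\n", grids.x, sm_count);
  cudaFree(d);
  return 0;
}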
@@ -21,19 +21,18 @@ limitations under the License. */
namespace paddle {
namespace operators {
- template <typename T, typename IdT, int BlockDimX, int BlockDimY, int GridDimX,
- bool PaddingFlag>
+ template <typename T, typename IdT, bool PaddingFlag>
__global__ void LookupTableV2(T *output, const T *table, const IdT *ids,
const int64_t N, const int64_t K, const int64_t D,
const int64_t padding_idx) {
int idx = threadIdx.x;
- int idy = blockIdx.x + threadIdx.y * GridDimX;
+ int idy = blockIdx.x + threadIdx.y * gridDim.x;
while (idy < K) {
auto id = static_cast<int64_t>(ids[idy]);
T *out = output + idy * D;
const T *tab = table + id * D;
- for (int i = idx; i < D; i += BlockDimX) {
+ for (int i = idx; i < D; i += blockDim.x) {
if (PaddingFlag) {
if (id == padding_idx)
out[i] = static_cast<T>(0);
@@ -43,25 +42,29 @@ __global__ void LookupTableV2(T *output, const T *table, const IdT *ids,
out[i] = tab[i];
}
}
- idy += BlockDimY * GridDimX;
+ idy += blockDim.y * gridDim.x;
}
}
- template <typename T, typename IdT, int BlockDimX, int BlockDimY, int GridDimX>
+ template <typename T, typename IdT>
__global__ void LookupTableV2Grad(T *table, const T *output, const IdT *ids,
const int64_t N, const int64_t K,
const int64_t D) {
int idx = threadIdx.x;
- int idy = blockIdx.x + threadIdx.y * GridDimX;
+ int idy = blockIdx.x + threadIdx.y * gridDim.x;
while (idy < K) {
auto id = static_cast<int64_t>(ids[idy]);
const T *out = output + idy * D;
T *tab = table + id * D;
- for (int i = idx; i < D; i += BlockDimX) {
+ #ifdef PADDLE_WITH_CUDA
+ paddle::platform::VectorizedAtomicAddPerBlock(D, idx, blockDim.x, out, tab);
+ #else
+ for (int i = idx; i < D; i += blockDim.x) {
paddle::platform::CudaAtomicAdd(&tab[i], out[i]);
}
- idy += BlockDimY * GridDimX;
+ #endif
+ idy += blockDim.y * gridDim.x;
}
}
@@ -81,8 +84,9 @@ struct LookupTableV2CUDAFunctor {
size_t D = table_t->dims()[1];
size_t K = ids_t_->numel();
+ const int gridx = 2 * context_.cuda_device_context().GetSMCount();
dim3 threads(256, 4);
- dim3 grids(80, 1);
+ dim3 grids(gridx, 1);
const auto *table = table_t->template data<T>();
const auto *ids = ids_t_->template data<IdT>();
@@ -90,10 +94,10 @@ struct LookupTableV2CUDAFunctor {
auto stream = context_.cuda_device_context().stream();
if (padding_idx == -1) {
- LookupTableV2<T, IdT, 256, 4, 80, false><<<grids, threads, 0, stream>>>(
+ LookupTableV2<T, IdT, false><<<grids, threads, 0, stream>>>(
output, table, ids, N, K, D, padding_idx);
} else {
- LookupTableV2<T, IdT, 256, 4, 80, true><<<grids, threads, 0, stream>>>(
+ LookupTableV2<T, IdT, true><<<grids, threads, 0, stream>>>(
output, table, ids, N, K, D, padding_idx);
}
}
@@ -193,17 +197,22 @@ struct LookupTableV2GradCUDAFunctor {
int D = d_table_t->dims()[1];
int K = ids_t_->numel();
- dim3 threads(128, 8);
- dim3 grids(8, 1);
const T *d_output = d_output_t->template data<T>();
const auto *ids = ids_t_->template data<IdT>();
T *d_table = d_table_t->mutable_data<T>(context_.GetPlace());
- auto t = framework::EigenVector<T>::Flatten(*d_table_t);
- t.device(*dev_ctx.eigen_device()) = t.constant(static_cast<T>(0));
+ #ifdef PADDLE_WITH_HIP
+ PADDLE_ENFORCE_GPU_SUCCESS(
+     hipMemsetAsync(d_table, 0, N * D * sizeof(T), dev_ctx.stream()));
+ #else
+ PADDLE_ENFORCE_GPU_SUCCESS(
+     cudaMemsetAsync(d_table, 0, N * D * sizeof(T), dev_ctx.stream()));
+ #endif
- LookupTableV2Grad<T, IdT, 128, 8,
-                   8><<<grids, threads, 0, dev_ctx.stream()>>>(
+ const int gridx = 2 * dev_ctx.GetSMCount();
+ dim3 threads(128, 8);
+ dim3 grids(gridx, 1);
+ LookupTableV2Grad<T, IdT><<<grids, threads, 0, dev_ctx.stream()>>>(
d_table, d_output, ids, N, K, D);
}
}
......
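To see what the rewritten gradient kernel above does, the toy program below reproduces the same access pattern for plain float: blocks walk over rows of the incoming gradient in a grid-stride fashion, and the threads of a block stride over the D columns of each row, atomically accumulating into the looked-up table row. ScatterAddRows and the rest of this program are illustrative only (not Paddle code), and managed memory is used just to keep the example short.

#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

// Simplified float-only analogue of LookupTableV2Grad: rows of `grad` are
// scattered into `table` at the positions given by `ids`, using atomics
// because several gradient rows may map to the same id.
__global__ void ScatterAddRows(float *table, const float *grad,
                               const int64_t *ids, int64_t K, int64_t D) {
  int idy = blockIdx.x + threadIdx.y * gridDim.x;
  while (idy < K) {
    int64_t id = ids[idy];
    const float *src = grad + idy * D;
    float *dst = table + id * D;
    for (int i = threadIdx.x; i < D; i += blockDim.x) {
      atomicAdd(&dst[i], src[i]);
    }
    idy += blockDim.y * gridDim.x;
  }
}

int main() {
  const int64_t N = 4, D = 8, K = 6;
  const int64_t h_ids[6] = {0, 2, 2, 1, 3, 2};
  float *table, *grad;
  int64_t *ids;
  cudaMallocManaged(&table, N * D * sizeof(float));
  cudaMallocManaged(&grad, K * D * sizeof(float));
  cudaMallocManaged(&ids, K * sizeof(int64_t));
  for (int64_t i = 0; i < N * D; ++i) table[i] = 0.f;
  for (int64_t i = 0; i < K * D; ++i) grad[i] = 1.f;
  for (int64_t i = 0; i < K; ++i) ids[i] = h_ids[i];

  dim3 threads(128, 8);
  ScatterAddRows<<<2, threads>>>(table, grad, ids, K, D);
  cudaDeviceSynchronize();
  // Row 2 receives three gradient rows, so each of its entries should be 3.
  printf("table[2][0] = %.1f\n", table[2 * D]);
  cudaFree(table);
  cudaFree(grad);
  cudaFree(ids);
  return 0;
}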
@@ -147,6 +147,94 @@ CUDA_ATOMIC_WRAPPER(Add, float16) {
}
}
#endif
+ // The performance of "atomicAdd(half* )" is poor, but "atomicAdd(half2* )"
+ // is fast, so for the fp16 type we use "atomicAdd(half2* )" to speed it up.
+ template <typename T, typename std::enable_if<std::is_same<
+ platform::float16, T>::value>::type * = nullptr>
+ __device__ __forceinline__ void fastAtomicAdd(T *tensor, size_t index,
+ const size_t numel, T value) {
+ #if ((CUDA_VERSION < 10000) || \
+ (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700)))
+ CudaAtomicAdd(reinterpret_cast<platform::float16 *>(tensor) + index,
+ static_cast<platform::float16>(value));
+ #else
+ // Check whether the address is aligned to sizeof(__half2) (4 bytes).
+ __half *target_addr = reinterpret_cast<__half *>(tensor + index);
+ bool aligned_half2 =
+ (reinterpret_cast<std::uintptr_t>(target_addr) % sizeof(__half2) == 0);
+ if (aligned_half2 && index < (numel - 1)) {
+ __half2 value2;
+ value2.x = *reinterpret_cast<__half *>(&value);
+ value2.y = __int2half_rz(0);
+ atomicAdd(reinterpret_cast<__half2 *>(target_addr), value2);
+ } else if (!aligned_half2 && index > 0) {
+ __half2 value2;
+ value2.x = __int2half_rz(0);
+ value2.y = *reinterpret_cast<__half *>(&value);
+ atomicAdd(reinterpret_cast<__half2 *>(target_addr - 1), value2);
+ } else {
+ atomicAdd(reinterpret_cast<__half *>(tensor) + index,
+ *reinterpret_cast<__half *>(&value));
+ }
+ #endif
+ }
+ template <typename T, typename std::enable_if<!std::is_same<
+ platform::float16, T>::value>::type * = nullptr>
+ __device__ __forceinline__ void fastAtomicAdd(T *arr, size_t index,
+ const size_t numel, T value) {
+ CudaAtomicAdd(arr + index, value);
+ }
+ #ifdef PADDLE_WITH_CUDA
+ /*
+  * One thread block performs the element-wise atomicAdd for a vector of len.
+  * @in: [x1, x2, x3, ...]
+  * @out:[y1+x1, y2+x2, y3+x3, ...]
+  * */
+ template <typename T, typename std::enable_if<!std::is_same<
+ platform::float16, T>::value>::type * = nullptr>
+ __device__ __forceinline__ void VectorizedAtomicAddPerBlock(
+ const int64_t len, int tid, int threads_per_block, const T *in, T *out) {
+ for (int i = tid; i < len; i += threads_per_block) {
+ CudaAtomicAdd(&out[i], in[i]);
+ }
+ }
+ // Note: the paired __half2 loop covers len / 2 * 2 elements; an odd tail
+ // element and unaligned buffers fall back to fastAtomicAdd.
+ template <typename T, typename std::enable_if<std::is_same<
+ platform::float16, T>::value>::type * = nullptr>
+ __device__ __forceinline__ void VectorizedAtomicAddPerBlock(
+ const int64_t len, int tid, int threads_per_block, const T *in, T *out) {
+ int i = 0;
+ int loops = len / 2 * 2;
+ bool aligned_half2 =
+ (reinterpret_cast<std::uintptr_t>(out) % sizeof(__half2) == 0);
+ if (aligned_half2) {
+ for (i = tid * 2; i < loops; i += threads_per_block * 2) {
+ __half2 value2;
+ T value_1 = in[i];
+ T value_2 = in[i + 1];
+ value2.x = *reinterpret_cast<__half *>(&value_1);
+ value2.y = *reinterpret_cast<__half *>(&value_2);
+ atomicAdd(reinterpret_cast<__half2 *>(&out[i]), value2);
+ }
+ for (; i < len; i += threads_per_block) {
+ fastAtomicAdd(out, i, len, in[i]);
+ }
+ } else {
+ for (int i = tid; i < len; i += threads_per_block) {
+ fastAtomicAdd(out, i, len, in[i]);
+ }
+ }
+ }
+ #endif
#endif
CUDA_ATOMIC_WRAPPER(Add, complex<float>) {
......
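To make the fastAtomicAdd trick above concrete, here is a small standalone program that adds into an fp16 buffer through a packed __half2 atomic when the target address is 4-byte aligned, and through a plain __half atomic otherwise. It is only a sketch of the idea and assumes CUDA 10+ and compute capability 7.0 or newer so that both atomics exist; PackedHalfAdd and AccumulateOnes are made-up names, not Paddle symbols.

#include <cstdint>
#include <cstdio>
#include <cuda_fp16.h>
#include <cuda_runtime.h>

// Add `value` into out[index]. If out + index is __half2-aligned and has a
// right-hand neighbour, pack the value into the low lane of a __half2 and
// add zero to the neighbour, so a single 32-bit atomic is issued; otherwise
// pair with the left neighbour or fall back to a 16-bit atomic.
__device__ void PackedHalfAdd(__half *out, size_t index, size_t numel,
                              __half value) {
  __half *addr = out + index;
  bool aligned =
      (reinterpret_cast<std::uintptr_t>(addr) % sizeof(__half2)) == 0;
  if (aligned && index + 1 < numel) {
    atomicAdd(reinterpret_cast<__half2 *>(addr),
              __halves2half2(value, __float2half(0.f)));
  } else if (!aligned && index > 0) {
    atomicAdd(reinterpret_cast<__half2 *>(addr - 1),
              __halves2half2(__float2half(0.f), value));
  } else {
    atomicAdd(addr, value);
  }
}

__global__ void AccumulateOnes(__half *out, size_t numel) {
  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < numel) {
    // Every thread adds 1.0 into the same element to force contention.
    PackedHalfAdd(out, numel / 2, numel, __float2half(1.f));
  }
}

int main() {
  const size_t numel = 1024;
  __half *buf = nullptr;
  cudaMallocManaged(&buf, numel * sizeof(__half));
  for (size_t i = 0; i < numel; ++i) buf[i] = __float2half(0.f);

  AccumulateOnes<<<4, 256>>>(buf, numel);
  cudaDeviceSynchronize();
  printf("accumulated value: %f (expect 1024)\n",
         __half2float(buf[numel / 2]));
  cudaFree(buf);
  return 0;
}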