diff --git a/paddle/fluid/operators/lookup_table_v2_op.cu b/paddle/fluid/operators/lookup_table_v2_op.cu
index 4539f7091b5780a9876adb64c7d8253d51723bdf..d40b2643785706e843dbd9812e74ca0aa134f7b5 100644
--- a/paddle/fluid/operators/lookup_table_v2_op.cu
+++ b/paddle/fluid/operators/lookup_table_v2_op.cu
@@ -21,19 +21,18 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-template <typename T, typename IdT, int BlockDimX, int BlockDimY, int GridDimX,
-          bool PaddingFlag>
+template <typename T, typename IdT, bool PaddingFlag>
 __global__ void LookupTableV2(T *output, const T *table, const IdT *ids,
                               const int64_t N, const int64_t K, const int64_t D,
                               const int64_t padding_idx) {
   int idx = threadIdx.x;
-  int idy = blockIdx.x + threadIdx.y * GridDimX;
+  int idy = blockIdx.x + threadIdx.y * gridDim.x;
 
   while (idy < K) {
     auto id = static_cast<int64_t>(ids[idy]);
     T *out = output + idy * D;
     const T *tab = table + id * D;
-    for (int i = idx; i < D; i += BlockDimX) {
+    for (int i = idx; i < D; i += blockDim.x) {
       if (PaddingFlag) {
         if (id == padding_idx)
           out[i] = static_cast<T>(0);
@@ -43,25 +42,29 @@ __global__ void LookupTableV2(T *output, const T *table, const IdT *ids,
         out[i] = tab[i];
       }
     }
-    idy += BlockDimY * GridDimX;
+    idy += blockDim.y * gridDim.x;
   }
 }
 
-template <typename T, typename IdT, int BlockDimX, int BlockDimY, int GridDimX>
+template <typename T, typename IdT>
 __global__ void LookupTableV2Grad(T *table, const T *output, const IdT *ids,
                                   const int64_t N, const int64_t K,
                                   const int64_t D) {
   int idx = threadIdx.x;
-  int idy = blockIdx.x + threadIdx.y * GridDimX;
+  int idy = blockIdx.x + threadIdx.y * gridDim.x;
 
   while (idy < K) {
     auto id = static_cast<int64_t>(ids[idy]);
     const T *out = output + idy * D;
     T *tab = table + id * D;
-    for (int i = idx; i < D; i += BlockDimX) {
+#ifdef PADDLE_WITH_CUDA
+    paddle::platform::VectorizedAtomicAddPerBlock(D, idx, blockDim.x, out, tab);
+#else
+    for (int i = idx; i < D; i += blockDim.x) {
       paddle::platform::CudaAtomicAdd(&tab[i], out[i]);
     }
-    idy += BlockDimY * GridDimX;
+#endif
+    idy += blockDim.y * gridDim.x;
   }
 }
 
@@ -81,8 +84,9 @@ struct LookupTableV2CUDAFunctor {
     size_t D = table_t->dims()[1];
     size_t K = ids_t_->numel();
 
+    const int gridx = 2 * context_.cuda_device_context().GetSMCount();
     dim3 threads(256, 4);
-    dim3 grids(80, 1);
+    dim3 grids(gridx, 1);
 
     const auto *table = table_t->template data<T>();
     const auto *ids = ids_t_->template data<IdT>();
@@ -90,10 +94,10 @@ struct LookupTableV2CUDAFunctor {
     auto stream = context_.cuda_device_context().stream();
 
     if (padding_idx == -1) {
-      LookupTableV2<T, IdT, 256, 4, 80, false><<<grids, threads, 0, stream>>>(
+      LookupTableV2<T, IdT, false><<<grids, threads, 0, stream>>>(
           output, table, ids, N, K, D, padding_idx);
     } else {
-      LookupTableV2<T, IdT, 256, 4, 80, true><<<grids, threads, 0, stream>>>(
+      LookupTableV2<T, IdT, true><<<grids, threads, 0, stream>>>(
          output, table, ids, N, K, D, padding_idx);
     }
   }
@@ -193,17 +197,22 @@ struct LookupTableV2GradCUDAFunctor {
       int D = d_table_t->dims()[1];
       int K = ids_t_->numel();
 
-      dim3 threads(128, 8);
-      dim3 grids(8, 1);
       const T *d_output = d_output_t->template data<T>();
       const auto *ids = ids_t_->template data<IdT>();
       T *d_table = d_table_t->mutable_data<T>(context_.GetPlace());
 
-      auto t = framework::EigenVector<T>::Flatten(*d_table_t);
-      t.device(*dev_ctx.eigen_device()) = t.constant(static_cast<T>(0));
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          hipMemsetAsync(d_table, 0, N * D * sizeof(T), dev_ctx.stream()));
+#else
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          cudaMemsetAsync(d_table, 0, N * D * sizeof(T), dev_ctx.stream()));
+#endif
 
-      LookupTableV2Grad<T, IdT, 128, 8,
-                        8><<<grids, threads, 0, dev_ctx.stream()>>>(
+      const int gridx = 2 * dev_ctx.GetSMCount();
+      dim3 threads(128, 8);
+      dim3 grids(gridx, 1);
+      LookupTableV2Grad<T, IdT><<<grids, threads, 0, dev_ctx.stream()>>>(
           d_table, d_output, ids, N, K, D);
     }
   }
diff --git a/paddle/fluid/platform/device/gpu/gpu_primitives.h b/paddle/fluid/platform/device/gpu/gpu_primitives.h
index 3e070da546b2ae85c40bb0e9cae05cc30d6d22c1..8616e969f69dfd469fec0372d40f6365e5038425 100644
--- a/paddle/fluid/platform/device/gpu/gpu_primitives.h
+++ b/paddle/fluid/platform/device/gpu/gpu_primitives.h
@@ -147,6 +147,94 @@ CUDA_ATOMIC_WRAPPER(Add, float16) {
   }
 }
 #endif
+
+// The performance of "atomicAdd(half* )" is bad, but for "atomicAdd(half2* )"
+// is good. So for fp16 type, we can use "atomicAdd(half2* )" to speed up.
+template <typename T, typename std::enable_if<std::is_same<
+                          platform::float16, T>::value>::type * = nullptr>
+__device__ __forceinline__ void fastAtomicAdd(T *tensor, size_t index,
+                                              const size_t numel, T value) {
+#if ((CUDA_VERSION < 10000) || \
+     (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700)))
+  CudaAtomicAdd(reinterpret_cast<platform::float16 *>(tensor) + index,
+                static_cast<platform::float16>(value));
+#else
+  // whether the address is 32-bit (__half2) aligned.
+  __half *target_addr = reinterpret_cast<__half *>(tensor + index);
+  bool aligned_half2 =
+      (reinterpret_cast<std::uintptr_t>(target_addr) % sizeof(__half2) == 0);
+
+  if (aligned_half2 && index < (numel - 1)) {
+    __half2 value2;
+    value2.x = *reinterpret_cast<__half *>(&value);
+    value2.y = __int2half_rz(0);
+    atomicAdd(reinterpret_cast<__half2 *>(target_addr), value2);
+
+  } else if (!aligned_half2 && index > 0) {
+    __half2 value2;
+    value2.x = __int2half_rz(0);
+    value2.y = *reinterpret_cast<__half *>(&value);
+    atomicAdd(reinterpret_cast<__half2 *>(target_addr - 1), value2);
+
+  } else {
+    atomicAdd(reinterpret_cast<__half *>(tensor) + index,
+              *reinterpret_cast<__half *>(&value));
+  }
+#endif
+}
+
+template <typename T, typename std::enable_if<!std::is_same<
+                          platform::float16, T>::value>::type * = nullptr>
+__device__ __forceinline__ void fastAtomicAdd(T *arr, size_t index,
+                                              const size_t numel, T value) {
+  CudaAtomicAdd(arr + index, value);
+}
+
+#ifdef PADDLE_WITH_CUDA
+/*
+ * One thread block deals with elementwise atomicAdd for a vector of len.
+ * @in: [x1, x2, x3, ...]
+ * @out: [y1+x1, y2+x2, y3+x3, ...]
+ * */
+template <typename T, typename std::enable_if<!std::is_same<
+                          platform::float16, T>::value>::type * = nullptr>
+__device__ __forceinline__ void VectorizedAtomicAddPerBlock(
+    const int64_t len, int tid, int threads_per_block, const T *in, T *out) {
+  for (int i = tid; i < len; i += threads_per_block) {
+    CudaAtomicAdd(&out[i], in[i]);
+  }
+}
+
+// Note: assume that len is even. If len is odd, call fastAtomicAdd directly.
+template <typename T, typename std::enable_if<std::is_same<
+                          platform::float16, T>::value>::type * = nullptr>
+__device__ __forceinline__ void VectorizedAtomicAddPerBlock(
+    const int64_t len, int tid, int threads_per_block, const T *in, T *out) {
+  int i = 0;
+  int loops = len / 2 * 2;
+
+  bool aligned_half2 =
+      (reinterpret_cast<std::uintptr_t>(out) % sizeof(__half2) == 0);
+
+  if (aligned_half2) {
+    for (i = tid * 2; i < loops; i += threads_per_block * 2) {
+      __half2 value2;
+      T value_1 = in[i];
+      T value_2 = in[i + 1];
+      value2.x = *reinterpret_cast<__half *>(&value_1);
+      value2.y = *reinterpret_cast<__half *>(&value_2);
+      atomicAdd(reinterpret_cast<__half2 *>(&out[i]), value2);
+    }
+    for (; i < len; i += threads_per_block) {
+      fastAtomicAdd(out, i, len, in[i]);
+    }
+  } else {
+    for (int i = tid; i < len; i += threads_per_block) {
+      fastAtomicAdd(out, i, len, in[i]);
+    }
+  }
+}
+#endif
 #endif
 
 CUDA_ATOMIC_WRAPPER(Add, complex<float>) {
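Note (not part of the patch): the gpu_primitives.h additions rely on packing two
fp16 values into a __half2 so that a single hardware atomicAdd updates two
adjacent elements. The standalone CUDA sketch below illustrates that idea on a
toy embedding-gradient scatter-add. The kernel name scatter_add_rows, the sizes
and the launch configuration are invented for the example, and it assumes an
even row width D and __half2-aligned rows; the misaligned and odd-length
fallbacks that fastAtomicAdd and VectorizedAtomicAddPerBlock provide are
omitted. It needs atomicAdd(__half2*) support (compute capability 6.0+), e.g.
build with "nvcc -arch=sm_70 fp16_scatter_add_sketch.cu".

// fp16_scatter_add_sketch.cu -- illustration only, not Paddle code.
#include <cuda_fp16.h>
#include <cstdio>
#include <vector>

// Accumulate K gradient rows of width D into the table rows selected by ids,
// two fp16 elements per atomicAdd via __half2 (assumes D is even and the rows
// are 4-byte aligned, which holds for cudaMalloc'd memory with even D).
__global__ void scatter_add_rows(__half *table, const __half *grad,
                                 const int *ids, int K, int D) {
  // One (blockIdx.x, threadIdx.y) pair owns one lookup row, as in
  // LookupTableV2Grad; threads along x stride across the row.
  int row = blockIdx.x + threadIdx.y * gridDim.x;
  while (row < K) {
    __half *dst = table + ids[row] * D;
    const __half *src = grad + row * D;
    for (int i = threadIdx.x * 2; i < D; i += blockDim.x * 2) {
      __half2 v = *reinterpret_cast<const __half2 *>(src + i);
      atomicAdd(reinterpret_cast<__half2 *>(dst + i), v);  // two fp16 at once
    }
    row += blockDim.y * gridDim.x;
  }
}

int main() {
  const int N = 8, D = 16, K = 32;  // table rows, row width, number of lookups
  std::vector<__half> h_table(N * D, __float2half(0.f));
  std::vector<__half> h_grad(K * D, __float2half(1.f));
  std::vector<int> h_ids(K);
  for (int i = 0; i < K; ++i) h_ids[i] = i % N;  // each table row hit K/N times

  __half *d_table, *d_grad;
  int *d_ids;
  cudaMalloc(&d_table, N * D * sizeof(__half));
  cudaMalloc(&d_grad, K * D * sizeof(__half));
  cudaMalloc(&d_ids, K * sizeof(int));
  cudaMemcpy(d_table, h_table.data(), N * D * sizeof(__half),
             cudaMemcpyHostToDevice);
  cudaMemcpy(d_grad, h_grad.data(), K * D * sizeof(__half),
             cudaMemcpyHostToDevice);
  cudaMemcpy(d_ids, h_ids.data(), K * sizeof(int), cudaMemcpyHostToDevice);

  dim3 threads(128, 8);
  dim3 grids(4, 1);  // the patch sizes this from 2 * the device's SM count
  scatter_add_rows<<<grids, threads>>>(d_table, d_grad, d_ids, K, D);
  cudaMemcpy(h_table.data(), d_table, N * D * sizeof(__half),
             cudaMemcpyDeviceToHost);

  // Every element should now be K / N = 4.
  printf("table[0][0] = %f\n", __half2float(h_table[0]));
  cudaFree(d_table);
  cudaFree(d_grad);
  cudaFree(d_ids);
  return 0;
}

The contention pattern matches the patch: atomics are only needed because
different lookups (different idy) can point at the same table row, and halving
the number of atomic operations for fp16 gradients is where the speedup comes
from.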