diff --git a/paddle/phi/kernels/gpu/embedding_grad_kernel.cu b/paddle/phi/kernels/gpu/embedding_grad_kernel.cu
index f11fd0191b93514ffd0773d4c5c060e0797ed1be..6694216214c315c5449200821667c92e1e35697b 100644
--- a/paddle/phi/kernels/gpu/embedding_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/embedding_grad_kernel.cu
@@ -23,6 +23,8 @@
 #include "paddle/phi/kernels/funcs/eigen/common.h"
 #include "paddle/phi/kernels/funcs/embedding_util.h"
 
+DECLARE_bool(cudnn_deterministic);  // read before the EmbeddingGrad launch
+
 namespace phi {
 
 template <typename InT, typename OutT>
@@ -101,6 +103,11 @@ struct EmbeddingGradCUDAFunctor {
       const int gridx = 2 * dev_ctx_.GetSMCount();
       dim3 threads(128, 8);
       dim3 grids(gridx, 1);
+
+      if (FLAGS_cudnn_deterministic) {  // shrink the launch to one block
+        VLOG(2) << "Run grad kernel of embedding with single block.";
+        grids.x = 1;  // NOTE(review): block stays 128x8 -- still deterministic?
+      }
       EmbeddingGrad<T, IdT><<<grids, threads, 0, dev_ctx_.stream()>>>(
           d_table, d_output, ids, N, K, D);
     }
diff --git a/paddle/phi/kernels/gpu/index_add_kernel.cu b/paddle/phi/kernels/gpu/index_add_kernel.cu
index 9783687ba5fb7cd8cb2592501f39c1731e6230fe..ff8fb1702075cbad562130b93091c03b886ce411 100644
--- a/paddle/phi/kernels/gpu/index_add_kernel.cu
+++ b/paddle/phi/kernels/gpu/index_add_kernel.cu
@@ -20,6 +20,8 @@
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/utils/data_type.h"
 
+DECLARE_bool(cudnn_deterministic);  // read in IndexAddKernel before launch
+
 namespace phi {
 
 using paddle::platform::PADDLE_CUDA_NUM_THREADS;
@@ -79,6 +81,12 @@ void IndexAddKernel(const Context& ctx,
   // todo(@limin29): inplace do not need copy.
   phi::Copy(ctx, x, ctx.GetPlace(), false, output);
 
+  if (FLAGS_cudnn_deterministic) {  // flag requests a minimal launch
+    VLOG(2) << "Run kernel of index_add with single thread.";
+    block_dim = 1;
+    grid_dim.x = 1;  // NOTE(review): assumes grid_dim.y/z are already 1
+  }
+
   if (index_type == phi::DataType::INT64) {
     const int64_t* index_data = index.data<int64_t>();
     index_add_cuda_kernel<T, int64_t>