diff --git a/paddle/phi/kernels/gpu/embedding_grad_kernel.cu b/paddle/phi/kernels/gpu/embedding_grad_kernel.cu
index f11fd0191b93514ffd0773d4c5c060e0797ed1be..6694216214c315c5449200821667c92e1e35697b 100644
--- a/paddle/phi/kernels/gpu/embedding_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/embedding_grad_kernel.cu
@@ -23,6 +23,8 @@
 #include "paddle/phi/kernels/funcs/eigen/common.h"
 #include "paddle/phi/kernels/funcs/embedding_util.h"
 
+DECLARE_bool(cudnn_deterministic);
+
 namespace phi {
 
 template
@@ -101,6 +103,13 @@ struct EmbeddingGradCUDAFunctor {
       const int gridx = 2 * dev_ctx_.GetSMCount();
       dim3 threads(128, 8);
       dim3 grids(gridx, 1);
+
+      if (FLAGS_cudnn_deterministic) {
+        // NOTE(review): only the grid shrinks to one block; the block is still
+        // 128x8 threads. Confirm the kernel accumulates deterministically.
+        VLOG(2) << "Run grad kernel of embedding with single block.";
+        grids.x = 1;
+      }
       EmbeddingGrad<<>>(
           d_table, d_output, ids, N, K, D);
     }
diff --git a/paddle/phi/kernels/gpu/index_add_kernel.cu b/paddle/phi/kernels/gpu/index_add_kernel.cu
index 9783687ba5fb7cd8cb2592501f39c1731e6230fe..ff8fb1702075cbad562130b93091c03b886ce411 100644
--- a/paddle/phi/kernels/gpu/index_add_kernel.cu
+++ b/paddle/phi/kernels/gpu/index_add_kernel.cu
@@ -20,6 +20,8 @@
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/utils/data_type.h"
 
+DECLARE_bool(cudnn_deterministic);
+
 namespace phi {
 
 using paddle::platform::PADDLE_CUDA_NUM_THREADS;
@@ -79,6 +81,12 @@ void IndexAddKernel(const Context& ctx,
   // todo(@limin29): inplace do not need copy.
   phi::Copy(ctx, x, ctx.GetPlace(), false, output);
 
+  if (FLAGS_cudnn_deterministic) {
+    VLOG(2) << "Run kernel of index_add with single thread.";
+    block_dim = 1;
+    grid_dim.x = 1;
+  }
+
   if (index_type == phi::DataType::INT64) {
     const int64_t* index_data = index.data();
     index_add_cuda_kernel