diff --git a/paddle/phi/kernels/cpu/embedding_kernel.cc b/paddle/phi/kernels/cpu/embedding_kernel.cc index cfba1787a1576e361809ab0d49273f5f70766947..0d937e6364eacc22863ac34ec7a2afe65fee317a 100644 --- a/paddle/phi/kernels/cpu/embedding_kernel.cc +++ b/paddle/phi/kernels/cpu/embedding_kernel.cc @@ -48,14 +48,8 @@ struct EmbeddingCPUFunctor { dev_ctx_.template Alloc(out_); auto* output = out_->data(); -#if defined(_OPENMP) && !defined(PADDLE_WITH_CUDA) -#pragma omp parallel for -#endif - for (int64_t i = 0; i < ids_numel; ++i) { - if (padding_idx_ != kNoPadding && ids[i] == padding_idx_) { - memset(output + i * row_width, 0, row_width * sizeof(T)); - } else { + if (padding_idx_ == kNoPadding && ids[i] != padding_idx_) { PADDLE_ENFORCE_LT( ids[i], row_number, @@ -74,6 +68,17 @@ struct EmbeddingCPUFunctor { "value.", row_number, ids[i])); + } + } + +#if defined(_OPENMP) && !defined(PADDLE_WITH_CUDA) +#pragma omp parallel for +#endif + + for (int64_t i = 0; i < ids_numel; ++i) { + if (padding_idx_ != kNoPadding && ids[i] == padding_idx_) { + memset(output + i * row_width, 0, row_width * sizeof(T)); + } else { memcpy(output + i * row_width, table + ids[i] * row_width, row_width * sizeof(T));