From f53e920d8592b79a659a796b3838c5421cf7a00c Mon Sep 17 00:00:00 2001
From: ykkk2333 <77383312+ykkk2333@users.noreply.github.com>
Date: Fri, 4 Nov 2022 11:10:49 +0800
Subject: [PATCH] fix deepfm and deep_wide bug, add embedding_sparse_grad
 kernel, test=kunlun (#47365)

---
 .../fluid/imperative/gradient_accumulator.cc  | 13 +++-
 .../fluid/platform/device/xpu/xpu2_op_list.h  |  2 +
 .../kernels/selected_rows/xpu/adam_kernel.cc  | 60 ++++++++++++----
 .../phi/kernels/xpu/embedding_grad_kernel.cc  | 72 +++++++++++++++++++
 4 files changed, 129 insertions(+), 18 deletions(-)

diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc
index 851574e1db7..f60d8ef5134 100644
--- a/paddle/fluid/imperative/gradient_accumulator.cc
+++ b/paddle/fluid/imperative/gradient_accumulator.cc
@@ -483,8 +483,16 @@ std::shared_ptr<VariableWrapper> SelectedRowsMerge(const VarType& src1,
     PADDLE_SELECTED_ROWS_ADD(phi::GPUContext, double);
   } else {
 #endif
-    PADDLE_SELECTED_ROWS_ADD(phi::CPUContext, float);
-    PADDLE_SELECTED_ROWS_ADD(phi::CPUContext, double);
+#if defined(PADDLE_WITH_XPU)
+    if (paddle::platform::is_xpu_place(place)) {
+      PADDLE_SELECTED_ROWS_ADD(phi::XPUContext, float);
+    } else {
+#endif
+      PADDLE_SELECTED_ROWS_ADD(phi::CPUContext, float);
+      PADDLE_SELECTED_ROWS_ADD(phi::CPUContext, double);
+#if defined(PADDLE_WITH_XPU)
+    }
+#endif
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   }
 #endif
@@ -858,6 +866,5 @@ void SortedGradientAccumulator::SumGrad(std::shared_ptr<VariableWrapper> var,
     dst_var->SetType(framework::proto::VarType::SELECTED_ROWS);
   }
 }
-
 }  // namespace imperative
 }  // namespace paddle
diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h
index 73898354dc1..07337dc747e 100644
--- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h
+++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h
@@ -195,6 +195,8 @@ XPUOpMap& get_kl2_ops() {
           pOpKernelType(vartype::FP16, XPUPlace()),
           pOpKernelType(vartype::FP32, XPUPlace()),
           pOpKernelType(vartype::FP64, XPUPlace())})},
+    {"embedding_sparse_grad",
+     XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
     {"equal",
      XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()),
                    pOpKernelType(vartype::INT32, XPUPlace()),
diff --git a/paddle/phi/kernels/selected_rows/xpu/adam_kernel.cc b/paddle/phi/kernels/selected_rows/xpu/adam_kernel.cc
index d94751f749d..1f8aa700541 100644
--- a/paddle/phi/kernels/selected_rows/xpu/adam_kernel.cc
+++ b/paddle/phi/kernels/selected_rows/xpu/adam_kernel.cc
@@ -50,6 +50,7 @@ void AdamDenseParamSparseGradKernel(
     DenseTensor* beta1_pow_out,
     DenseTensor* beta2_pow_out,
     DenseTensor* master_param_outs) {
+  using XPUType = typename XPUTypeTrait<T>::Type;
   float* param_ptr = nullptr;
   funcs::GetDataPointer(param, &param_ptr, dev_ctx);
 
@@ -62,16 +63,32 @@ void AdamDenseParamSparseGradKernel(
   float* lr_ptr = nullptr;
   funcs::GetDataPointer(learning_rate, &lr_ptr, dev_ctx);
 
+  xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
   float* beta1_pow_ptr = nullptr;
   const float* beta1_const_pow_ptr = nullptr;
+
   if (beta1_pow.place() == CPUPlace()) {
-    DenseTensor xpu_beta1_pow;
-    phi::Copy(dev_ctx, beta1_pow, beta1_pow.place(), false, &xpu_beta1_pow);
-    if (xpu_beta1_pow.dtype() == DataType::FLOAT16)
-      funcs::GetDataPointer(
-          xpu_beta1_pow, &beta1_pow_ptr, dev_ctx);
-    else
-      beta1_const_pow_ptr = xpu_beta1_pow.template data<float>();
+    if (beta1_pow.dtype() == DataType::FLOAT16) {
+      XPUType* beta1_pow_t =
+          RAII_GUARD.alloc_l3_or_gm<XPUType>(beta1_pow.numel());
+      paddle::memory::Copy(param.place(),
+                           beta1_pow_t,
+                           beta1_pow.place(),
+                           beta1_pow.data(),
+                           sizeof(T) * beta1_pow.numel());
+
+      int r = xpu::cast(
+          dev_ctx.x_context(), beta1_pow_t, beta1_pow_ptr, beta1_pow.numel());
+      PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
+    } else {
+      beta1_pow_ptr = RAII_GUARD.alloc_l3_or_gm<float>(beta1_pow.numel());
+      paddle::memory::Copy(param.place(),
+                           beta1_pow_ptr,
+                           beta1_pow.place(),
+                           beta1_pow.data(),
+                           sizeof(T) * beta1_pow.numel());
+    }
+
   } else {
     if (beta1_pow.dtype() == DataType::FLOAT16)
       funcs::GetDataPointer(beta1_pow, &beta1_pow_ptr, dev_ctx);
@@ -81,14 +98,28 @@ void AdamDenseParamSparseGradKernel(
 
   float* beta2_pow_ptr = nullptr;
   const float* beta2_const_pow_ptr = nullptr;
+
   if (beta2_pow.place() == CPUPlace()) {
-    DenseTensor xpu_beta2_pow;
-    phi::Copy(dev_ctx, beta2_pow, beta2_pow.place(), false, &xpu_beta2_pow);
-    if (xpu_beta2_pow.dtype() == DataType::FLOAT16)
-      funcs::GetDataPointer(
-          xpu_beta2_pow, &beta2_pow_ptr, dev_ctx);
-    else
-      beta2_const_pow_ptr = xpu_beta2_pow.template data<float>();
+    if (beta2_pow.dtype() == DataType::FLOAT16) {
+      XPUType* beta2_pow_t =
+          RAII_GUARD.alloc_l3_or_gm<XPUType>(beta2_pow.numel());
+      paddle::memory::Copy(param.place(),
+                           beta2_pow_t,
+                           beta2_pow.place(),
+                           beta2_pow.data(),
+                           sizeof(T) * beta2_pow.numel());
+
+      int r = xpu::cast(
+          dev_ctx.x_context(), beta2_pow_t, beta2_pow_ptr, beta2_pow.numel());
+      PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
+    } else {
+      beta2_pow_ptr = RAII_GUARD.alloc_l3_or_gm<float>(beta2_pow.numel());
+      paddle::memory::Copy(param.place(),
+                           beta2_pow_ptr,
+                           beta2_pow.place(),
+                           beta2_pow.data(),
+                           sizeof(T) * beta2_pow.numel());
+    }
   } else {
     if (beta2_pow.dtype() == DataType::FLOAT16)
       funcs::GetDataPointer(beta2_pow, &beta2_pow_ptr, dev_ctx);
@@ -195,7 +226,6 @@ void AdamDenseParamSparseGradKernel(
 
   int row_count = grad_merge.rows().size();
   std::vector<int> rows(row_count);
-  xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
   int* xpu_rows = RAII_GUARD.alloc_l3_or_gm<int>(row_count);
   std::vector<int64_t> merge_rows(grad_merge.rows().begin(),
                                   grad_merge.rows().end());
diff --git a/paddle/phi/kernels/xpu/embedding_grad_kernel.cc b/paddle/phi/kernels/xpu/embedding_grad_kernel.cc
index f1b9abfe82f..53b5cdb9016 100644
--- a/paddle/phi/kernels/xpu/embedding_grad_kernel.cc
+++ b/paddle/phi/kernels/xpu/embedding_grad_kernel.cc
@@ -14,6 +14,7 @@
 
 #include "paddle/phi/kernels/embedding_grad_kernel.h"
 
+#include "paddle/fluid/memory/memcpy.h"
 #include "paddle/phi/backends/xpu/enforce_xpu.h"
 #include "paddle/phi/core/kernel_registry.h"
 
@@ -60,7 +61,78 @@ void EmbeddingGradKernel(const Context& ctx,
   PADDLE_ENFORCE_XDNN_SUCCESS(r, "embedding_grad");
 }
 
+template <typename T, typename Context>
+void EmbeddingSparseGradKernel(const Context& ctx,
+                               const DenseTensor& input,
+                               const DenseTensor& weight,
+                               const DenseTensor& out_grad,
+                               int64_t padding_idx,
+                               SelectedRows* weight_grad) {
+  DDim table_dim = weight.dims();
+
+  xpu::ctx_guard RAII_GUARD(ctx.x_context());
+  std::vector<int64_t> ids(input.numel());
+  if (input.dtype() == phi::DataType::INT64) {
+    paddle::memory::Copy(CPUPlace(),
+                         ids.data(),
+                         input.place(),
+                         input.data<int64_t>(),
+                         sizeof(int64_t) * input.numel());
+
+  } else if (input.dtype() == phi::DataType::INT32) {
+    int64_t* id_t = RAII_GUARD.alloc_l3_or_gm<int64_t>(input.numel());
+    int r = xpu::cast(
+        ctx.x_context(), input.data<int>(), id_t, input.numel());
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
+    paddle::memory::Copy(CPUPlace(),
+                         ids.data(),
+                         input.place(),
+                         id_t,
+                         sizeof(int64_t) * input.numel());
+  } else {
+    PADDLE_THROW(phi::errors::Unimplemented(
+        "embedding input only supports int32 and int64"));
+  }
+
+  auto ids_num = static_cast<int64_t>(input.numel());
+  // Since paddings are not trainable and fixed in forward, the gradient of
+  // paddings makes no sense and we don't deal with it in backward.
+  auto* d_table = weight_grad;
+  auto* d_output = &out_grad;
+  d_table->set_rows(ids);
+
+  auto* d_table_value = d_table->mutable_value();
+  d_table_value->Resize({ids_num, table_dim[1]});
+
+  ctx.template Alloc<T>(d_table_value);
+
+  d_table->set_height(table_dim[0]);
+
+  auto* d_output_data = d_output->template data<T>();
+  auto* d_table_data = d_table_value->template data<T>();
+
+  auto d_output_dims = d_output->dims();
+  auto d_output_dims_2d =
+      flatten_to_2d(d_output_dims, d_output_dims.size() - 1);
+  PADDLE_ENFORCE_EQ(d_table_value->dims(),
+                    d_output_dims_2d,
+                    phi::errors::InvalidArgument(
+                        "ShapeError: The shape of lookup_table@Grad and "
+                        "output@Grad should be the same. "
+                        "But received lookup_table@Grad's shape = [%s], "
+                        "output@Grad's shape = [%s].",
+                        d_table_value->dims(),
+                        d_output_dims_2d));
+  int r = xpu::copy(
+      ctx.x_context(), d_output_data, d_table_data, d_output->numel());
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "copy");
+}
 }  // namespace phi
 
 PD_REGISTER_KERNEL(
     embedding_grad, XPU, ALL_LAYOUT, phi::EmbeddingGradKernel, float) {}
+PD_REGISTER_KERNEL(embedding_sparse_grad,
+                   XPU,
+                   ALL_LAYOUT,
+                   phi::EmbeddingSparseGradKernel,
+                   float) {}
--
GitLab
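
Usage sketch (not part of the patch): the newly registered embedding_sparse_grad
kernel is the XPU backward path taken when an embedding layer is configured to
produce a sparse (SelectedRows) weight gradient. A minimal Python sketch of how
that path gets exercised, assuming a KunLun (XPU) build of Paddle; the
vocabulary size, embedding width, and ids below are arbitrary:

    import paddle

    paddle.set_device("xpu")  # assumes Paddle was built with XPU support

    # sparse=True makes the weight gradient a SelectedRows, which on XPU
    # now dispatches to the embedding_sparse_grad kernel added above.
    emb = paddle.nn.Embedding(100, 16, sparse=True)
    ids = paddle.to_tensor([[1, 3], [0, 7]], dtype="int64")

    out = emb(ids)
    out.sum().backward()

With sparse=False the dense embedding_grad kernel is used instead. The
SelectedRowsMerge change in gradient_accumulator.cc is what allows such
SelectedRows gradients to be accumulated on an XPU place (float only).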