Unverified commit f53e920d, authored by ykkk2333, committed by GitHub

fix deepfm and deep_wide bugs, add embedding_sparse_grad kernel, test=kunlun (#47365)

Parent 9e006987
@@ -483,8 +483,16 @@ std::shared_ptr<ReturnVarType> SelectedRowsMerge(const VarType& src1,
       PADDLE_SELECTED_ROWS_ADD(phi::GPUContext, double);
     } else {
 #endif
+#if defined(PADDLE_WITH_XPU)
+      if (paddle::platform::is_xpu_place(place)) {
+        PADDLE_SELECTED_ROWS_ADD(phi::XPUContext, float);
+      } else {
+#endif
       PADDLE_SELECTED_ROWS_ADD(phi::CPUContext, float);
       PADDLE_SELECTED_ROWS_ADD(phi::CPUContext, double);
+#if defined(PADDLE_WITH_XPU)
+      }
+#endif
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     }
 #endif
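The hunk above threads a new XPU branch into the existing CUDA/HIP dispatch of SelectedRowsMerge, with the #if/#endif pairs arranged so the if/else braces stay balanced no matter which backends are compiled in. A standalone sketch of that pattern (is_gpu_place, is_xpu_place, and MergeAdd are hypothetical stand-ins for the Paddle helpers and the PADDLE_SELECTED_ROWS_ADD macro):

#include <iostream>

// Hypothetical stand-ins for paddle::platform::is_gpu_place / is_xpu_place.
bool is_gpu_place() { return false; }
bool is_xpu_place() { return true; }

// Stand-in for what PADDLE_SELECTED_ROWS_ADD instantiates per device context.
void MergeAdd(const char* ctx) { std::cout << "merge-add on " << ctx << "\n"; }

int main() {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  if (is_gpu_place()) {
    MergeAdd("GPUContext");
  } else {
#endif
#if defined(PADDLE_WITH_XPU)
    if (is_xpu_place()) {
      MergeAdd("XPUContext");  // new XPU path from this commit
    } else {
#endif
      MergeAdd("CPUContext");  // CPU fallback
#if defined(PADDLE_WITH_XPU)
    }
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  }
#endif
  return 0;
}

Note that the XPU branch only instantiates float, matching the FP32-only kernel coverage registered further down.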
@@ -858,6 +866,5 @@ void SortedGradientAccumulator::SumGrad(std::shared_ptr<VariableWrapper> var,
     dst_var->SetType(framework::proto::VarType::SELECTED_ROWS);
   }
 }
-
 } // namespace imperative
 } // namespace paddle
@@ -195,6 +195,8 @@ XPUOpMap& get_kl2_ops() {
                     pOpKernelType(vartype::FP16, XPUPlace()),
                     pOpKernelType(vartype::FP32, XPUPlace()),
                     pOpKernelType(vartype::FP64, XPUPlace())})},
+    {"embedding_sparse_grad",
+     XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
     {"equal",
      XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()),
                    pOpKernelType(vartype::INT32, XPUPlace()),
......
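This table is how the KL2 (XPU2) backend advertises kernel coverage: an op can run on XPU only if its name maps to an XPUKernelSet listing the supported dtype/place keys. A simplified, self-contained sketch of the idea (DType, KernelSet, and OpMap are stand-in types, not Paddle's real XPUOpMap/XPUKernelSet):

#include <iostream>
#include <map>
#include <set>
#include <string>

enum class DType { FP32, FP64, INT32, INT64 };
using KernelSet = std::set<DType>;          // dtypes an op supports on XPU
using OpMap = std::map<std::string, KernelSet>;

OpMap& GetKL2Ops() {
  static OpMap ops{
      // The commit registers the new sparse-grad kernel for FP32 only.
      {"embedding_sparse_grad", KernelSet{DType::FP32}},
      {"equal", KernelSet{DType::INT64, DType::INT32, DType::FP32}},
  };
  return ops;
}

int main() {
  // An op is dispatched to XPU iff its name appears in this map.
  std::cout << GetKL2Ops().count("embedding_sparse_grad") << "\n";  // prints 1
}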
@@ -50,6 +50,7 @@ void AdamDenseParamSparseGradKernel(
     DenseTensor* beta1_pow_out,
     DenseTensor* beta2_pow_out,
     DenseTensor* master_param_outs) {
+  using XPUType = typename XPUTypeTrait<T>::Type;
   float* param_ptr = nullptr;
   funcs::GetDataPointer<Context, float>(param, &param_ptr, dev_ctx);
@@ -62,16 +63,32 @@ void AdamDenseParamSparseGradKernel(
   float* lr_ptr = nullptr;
   funcs::GetDataPointer<Context, float>(learning_rate, &lr_ptr, dev_ctx);
 
+  xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
   float* beta1_pow_ptr = nullptr;
   const float* beta1_const_pow_ptr = nullptr;
   if (beta1_pow.place() == CPUPlace()) {
-    DenseTensor xpu_beta1_pow;
-    phi::Copy(dev_ctx, beta1_pow, beta1_pow.place(), false, &xpu_beta1_pow);
-    if (xpu_beta1_pow.dtype() == DataType::FLOAT16)
-      funcs::GetDataPointer<Context, float>(
-          xpu_beta1_pow, &beta1_pow_ptr, dev_ctx);
-    else
-      beta1_const_pow_ptr = xpu_beta1_pow.template data<float>();
+    if (beta1_pow.dtype() == DataType::FLOAT16) {
+      XPUType* beta1_pow_t =
+          RAII_GUARD.alloc_l3_or_gm<XPUType>(beta1_pow.numel());
+      paddle::memory::Copy(param.place(),
+                           beta1_pow_t,
+                           beta1_pow.place(),
+                           beta1_pow.data<T>(),
+                           sizeof(T) * beta1_pow.numel());
+      int r = xpu::cast<XPUType, float>(
+          dev_ctx.x_context(), beta1_pow_t, beta1_pow_ptr, beta1_pow.numel());
+      PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
+    } else {
+      beta1_pow_ptr = RAII_GUARD.alloc_l3_or_gm<float>(beta1_pow.numel());
+      paddle::memory::Copy(param.place(),
+                           beta1_pow_ptr,
+                           beta1_pow.place(),
+                           beta1_pow.data<T>(),
+                           sizeof(T) * beta1_pow.numel());
+    }
   } else {
     if (beta1_pow.dtype() == DataType::FLOAT16)
       funcs::GetDataPointer<Context, float>(beta1_pow, &beta1_pow_ptr, dev_ctx);
@@ -81,14 +98,28 @@ void AdamDenseParamSparseGradKernel(
   float* beta2_pow_ptr = nullptr;
   const float* beta2_const_pow_ptr = nullptr;
   if (beta2_pow.place() == CPUPlace()) {
-    DenseTensor xpu_beta2_pow;
-    phi::Copy(dev_ctx, beta2_pow, beta2_pow.place(), false, &xpu_beta2_pow);
-    if (xpu_beta2_pow.dtype() == DataType::FLOAT16)
-      funcs::GetDataPointer<Context, float>(
-          xpu_beta2_pow, &beta2_pow_ptr, dev_ctx);
-    else
-      beta2_const_pow_ptr = xpu_beta2_pow.template data<float>();
+    if (beta2_pow.dtype() == DataType::FLOAT16) {
+      XPUType* beta2_pow_t =
+          RAII_GUARD.alloc_l3_or_gm<XPUType>(beta2_pow.numel());
+      paddle::memory::Copy(param.place(),
+                           beta2_pow_t,
+                           beta2_pow.place(),
+                           beta2_pow.data<T>(),
+                           sizeof(T) * beta2_pow.numel());
+      int r = xpu::cast<XPUType, float>(
+          dev_ctx.x_context(), beta2_pow_t, beta2_pow_ptr, beta2_pow.numel());
+      PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
+    } else {
+      beta2_pow_ptr = RAII_GUARD.alloc_l3_or_gm<float>(beta2_pow.numel());
+      paddle::memory::Copy(param.place(),
+                           beta2_pow_ptr,
+                           beta2_pow.place(),
+                           beta2_pow.data<T>(),
+                           sizeof(T) * beta2_pow.numel());
+    }
   } else {
     if (beta2_pow.dtype() == DataType::FLOAT16)
       funcs::GetDataPointer<Context, float>(beta2_pow, &beta2_pow_ptr, dev_ctx);
@@ -195,7 +226,6 @@ void AdamDenseParamSparseGradKernel(
   int row_count = grad_merge.rows().size();
   std::vector<int> rows(row_count);
-  xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
   int* xpu_rows = RAII_GUARD.alloc_l3_or_gm<int>(row_count);
   std::vector<int64_t> merge_rows(grad_merge.rows().begin(),
                                   grad_merge.rows().end());
......
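The two beta-pow hunks replace a phi::Copy round trip with an explicit stage-and-cast: raw FP16 bytes are copied from host memory into an XPU buffer, then widened to float with xpu::cast before the Adam update reads them (the RAII_GUARD is hoisted to the top of the kernel so both hunks can share it, which is why the later declaration is removed). A sketch of that pattern built only from the helpers visible above; the StageBetaPow wrapper is hypothetical, it is not compilable outside Paddle, and it allocates the float output before calling xpu::cast, an addition this sketch makes since the hunks pass a still-null beta1_pow_ptr/beta2_pow_ptr as the cast output:

template <typename T, typename Context>
float* StageBetaPow(const Context& dev_ctx,
                    const phi::DenseTensor& beta_pow,
                    const phi::Place& device_place,
                    xpu::ctx_guard& guard) {
  using XPUType = typename XPUTypeTrait<T>::Type;
  // Device-side float buffer the optimizer update will read.
  float* out = guard.alloc_l3_or_gm<float>(beta_pow.numel());
  if (beta_pow.dtype() == phi::DataType::FLOAT16) {
    // Stage the raw FP16 bits host -> device, then widen on the device.
    XPUType* staged = guard.alloc_l3_or_gm<XPUType>(beta_pow.numel());
    paddle::memory::Copy(device_place,
                         staged,
                         beta_pow.place(),
                         beta_pow.data<T>(),
                         sizeof(T) * beta_pow.numel());
    int r = xpu::cast<XPUType, float>(
        dev_ctx.x_context(), staged, out, beta_pow.numel());
    PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
  } else {
    // Already float: a plain host -> device byte copy is enough.
    paddle::memory::Copy(device_place,
                         out,
                         beta_pow.place(),
                         beta_pow.data<T>(),
                         sizeof(T) * beta_pow.numel());
  }
  return out;
}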
@@ -14,6 +14,7 @@
 #include "paddle/phi/kernels/embedding_grad_kernel.h"
 
+#include "paddle/fluid/memory/memcpy.h"
 #include "paddle/phi/backends/xpu/enforce_xpu.h"
 #include "paddle/phi/core/kernel_registry.h"
@@ -60,7 +61,78 @@ void EmbeddingGradKernel(const Context& ctx,
   PADDLE_ENFORCE_XDNN_SUCCESS(r, "embedding_grad");
 }
+
+template <typename T, typename Context>
+void EmbeddingSparseGradKernel(const Context& ctx,
+                               const DenseTensor& input,
+                               const DenseTensor& weight,
+                               const DenseTensor& out_grad,
+                               int64_t padding_idx,
+                               SelectedRows* weight_grad) {
+  DDim table_dim = weight.dims();
+  xpu::ctx_guard RAII_GUARD(ctx.x_context());
+
+  std::vector<int64_t> ids(input.numel());
+  if (input.dtype() == phi::DataType::INT64) {
+    paddle::memory::Copy(CPUPlace(),
+                         ids.data(),
+                         input.place(),
+                         input.data<int64_t>(),
+                         sizeof(int64_t) * input.numel());
+  } else if (input.dtype() == phi::DataType::INT32) {
+    int64_t* id_t = RAII_GUARD.alloc_l3_or_gm<int64_t>(input.numel());
+    int r = xpu::cast<int32_t, int64_t>(
+        ctx.x_context(), input.data<int>(), id_t, input.numel());
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
+    paddle::memory::Copy(CPUPlace(),
+                         ids.data(),
+                         input.place(),
+                         id_t,
+                         sizeof(int64_t) * input.numel());
+  } else {
+    PADDLE_THROW(phi::errors::Unimplemented(
+        "embedding input only supports int32 and int64"));
+  }
+
+  auto ids_num = static_cast<int64_t>(input.numel());
+  // Since paddings are not trainable and fixed in forward, the gradient of
+  // paddings makes no sense and we don't deal with it in backward.
+  auto* d_table = weight_grad;
+  auto* d_output = &out_grad;
+  d_table->set_rows(ids);
+
+  auto* d_table_value = d_table->mutable_value();
+  d_table_value->Resize({ids_num, table_dim[1]});
+  ctx.template Alloc<T>(d_table_value);
+
+  d_table->set_height(table_dim[0]);
+
+  auto* d_output_data = d_output->template data<T>();
+  auto* d_table_data = d_table_value->template data<T>();
+
+  auto d_output_dims = d_output->dims();
+  auto d_output_dims_2d =
+      flatten_to_2d(d_output_dims, d_output_dims.size() - 1);
+  PADDLE_ENFORCE_EQ(d_table_value->dims(),
+                    d_output_dims_2d,
+                    phi::errors::InvalidArgument(
+                        "ShapeError: The shape of lookup_table@Grad and "
+                        "output@Grad should be same. "
+                        "But received lookup_table@Grad's shape = [%s], "
+                        "output@Grad's shape = [%s].",
+                        d_table_value->dims(),
+                        d_output_dims_2d));
+
+  int r = xpu::copy<T>(
+      ctx.x_context(), d_output_data, d_table_data, d_output->numel());
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "copy");
+}
 } // namespace phi
 
 PD_REGISTER_KERNEL(
     embedding_grad, XPU, ALL_LAYOUT, phi::EmbeddingGradKernel, float) {}
+
+PD_REGISTER_KERNEL(embedding_sparse_grad,
+                   XPU,
+                   ALL_LAYOUT,
+                   phi::EmbeddingSparseGradKernel,
+                   float) {}
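Conceptually, the new kernel emits a SelectedRows gradient instead of a dense table-sized one: the lookup ids become the row list, out_grad is copied verbatim as the value block, and duplicate ids simply appear as repeated rows for the optimizer's later merge step. A self-contained sketch of that layout (simplified stand-in types, not Paddle's real SelectedRows):

#include <cstdint>
#include <stdexcept>
#include <vector>

// Simplified stand-in for phi::SelectedRows: a row index list plus a dense
// value block of shape [rows.size(), width], with height = vocabulary size.
struct SelectedRowsSketch {
  std::vector<int64_t> rows;
  std::vector<float> value;
  int64_t height = 0;
};

SelectedRowsSketch EmbeddingSparseGrad(const std::vector<int64_t>& ids,
                                       const std::vector<float>& out_grad,
                                       int64_t vocab_size,
                                       int64_t width) {
  // Mirrors the PADDLE_ENFORCE_EQ shape check in the kernel above.
  if (static_cast<int64_t>(out_grad.size()) !=
      static_cast<int64_t>(ids.size()) * width) {
    throw std::invalid_argument("out_grad must be [ids.size(), width]");
  }
  SelectedRowsSketch grad;
  grad.rows = ids;           // d_table->set_rows(ids): duplicates allowed
  grad.value = out_grad;     // xpu::copy of out_grad into d_table_value
  grad.height = vocab_size;  // d_table->set_height(table_dim[0])
  return grad;
}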