Unverified commit c8f76337, authored by ykkk2333, committed by GitHub

fix dlrm qps problem (#49171)

* migrate shape, sgd, split, sign xpu kernels to phi, test=kunlun

* fix dlrm throughput problem, test=kunlun
Parent 3f896dce
@@ -29,6 +29,8 @@ XPUOpMap& get_kl2_ops() {
       {"adadelta", XPUKernelSet({phi::DataType::FLOAT32})},
       {"adamw", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
       {"adam", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
+      {"adam_dense_param_sparse_grad",
+       XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
       {"arg_max", XPUKernelSet({phi::DataType::FLOAT32})},
       {"argsort_grad",
        XPUKernelSet({phi::DataType::INT32,
@@ -152,7 +154,10 @@ XPUOpMap& get_kl2_ops() {
       {"elementwise_mul_grad",
        XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
       {"elementwise_mul",
-       XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
+       XPUKernelSet({phi::DataType::FLOAT32,
+                     phi::DataType::FLOAT16,
+                     phi::DataType::INT32,
+                     phi::DataType::INT64})},
       {"elementwise_pow",
        XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
       {"elementwise_sub_grad",
@@ -176,6 +181,7 @@ XPUOpMap& get_kl2_ops() {
                      phi::DataType::FLOAT16,
                      phi::DataType::FLOAT32,
                      phi::DataType::FLOAT64})},
+      {"embedding_grad", XPUKernelSet({phi::DataType::FLOAT32})},
       {"embedding_sparse_grad", XPUKernelSet({phi::DataType::FLOAT32})},
       {"equal",
        XPUKernelSet({phi::DataType::INT64,
@@ -356,6 +362,11 @@ XPUOpMap& get_kl2_ops() {
       {"mul", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
       {"mul_grad",
        XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
+      {"multiply",
+       XPUKernelSet({phi::DataType::FLOAT32,
+                     phi::DataType::FLOAT16,
+                     phi::DataType::INT32,
+                     phi::DataType::INT64})},
       {"nearest_interp_v2", XPUKernelSet({phi::DataType::FLOAT32})},
       {"nearest_interp_v2_grad", XPUKernelSet({phi::DataType::FLOAT32})},
       {"not_equal",
@@ -549,8 +560,14 @@ XPUOpMap& get_kl2_ops() {
       {"temporal_shift_grad", XPUKernelSet({phi::DataType::FLOAT32})},
       {"tril_triu",
        XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::INT32})},
+      {"tril", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::INT32})},
+      {"triu", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::INT32})},
       {"tril_triu_grad",
        XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::INT32})},
+      {"tril_grad",
+       XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::INT32})},
+      {"triu_grad",
+       XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::INT32})},
       {"tile",
        XPUKernelSet({phi::DataType::INT32,
                      phi::DataType::INT64,
......
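For context, the table edited above (`XPUOpMap` of `XPUKernelSet` entries) declares which data types each operator may run with on KL2 XPU devices; an op/dtype pair that is missing from the table cannot be dispatched to the XPU and falls back elsewhere. A minimal stand-alone sketch of that kind of capability lookup, using simplified stand-in types rather than Paddle's real ones, looks like this:

```cpp
// Illustrative only: a simplified op -> supported-dtype table and lookup,
// mirroring the role XPUOpMap/XPUKernelSet play in the file above.
#include <map>
#include <set>
#include <string>

enum class DataType { FLOAT32, FLOAT16, INT32, INT64 };
using XPUKernelSetSketch = std::set<DataType>;
using XPUOpMapSketch = std::map<std::string, XPUKernelSetSketch>;

// Returns true when the (op, dtype) pair is registered for the XPU backend.
bool SupportedOnXPU(const XPUOpMapSketch& ops,
                    const std::string& op,
                    DataType dtype) {
  auto it = ops.find(op);
  return it != ops.end() && it->second.count(dtype) > 0;
}
```

With the entries added in this hunk set, a lookup such as `SupportedOnXPU(ops, "elementwise_mul", DataType::INT64)` would now succeed instead of forcing a fallback.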
@@ -17,6 +17,7 @@
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/phi/backends/xpu/enforce_xpu.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/embedding_util.h"
 namespace phi {
@@ -80,15 +81,18 @@ void EmbeddingSparseGradKernel(const Context& ctx,
                                int64_t padding_idx,
                                SelectedRows* weight_grad) {
   DDim table_dim = weight.dims();
+  auto xpu_place = ctx.GetPlace();
   xpu::ctx_guard RAII_GUARD(ctx.x_context());
-  std::vector<int64_t> ids(input.numel());
+  std::vector<int64_t> ids;
+  DenseTensor ids_cpu;
+  ids_cpu.Resize(input.dims());
+  ctx.template HostAlloc(
+      &ids_cpu, input.dtype(), input.numel() * sizeof(int64_t));
   if (input.dtype() == phi::DataType::INT64) {
-    paddle::memory::Copy(CPUPlace(),
-                         ids.data(),
-                         input.place(),
-                         input.data<int64_t>(),
-                         sizeof(int64_t) * input.numel());
+    phi::Copy(ctx, input, CPUPlace(), false, &ids_cpu);
+    ids = CopyIdsToVector<int64_t, int64_t>(ids_cpu);
   } else if (input.dtype() == phi::DataType::INT32) {
     int64_t* id_t = RAII_GUARD.alloc_l3_or_gm<int64_t>(input.numel());
@@ -96,10 +100,11 @@ void EmbeddingSparseGradKernel(const Context& ctx,
         ctx.x_context(), input.data<int>(), id_t, input.numel());
     PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
     paddle::memory::Copy(CPUPlace(),
-                         ids.data(),
+                         ids_cpu.data(),
                          input.place(),
                          id_t,
                          sizeof(int64_t) * input.numel());
+    ids = CopyIdsToVector<int, int64_t>(ids_cpu);
   } else {
     PADDLE_THROW(phi::errors::Unimplemented(
         "embedding input only supports int32 and int64"));
@@ -115,7 +120,7 @@ void EmbeddingSparseGradKernel(const Context& ctx,
   auto* d_table_value = d_table->mutable_value();
   d_table_value->Resize({ids_num, table_dim[1]});
-  ctx.template Alloc<T>(d_table_value);
+  ctx.template HostAlloc<T>(d_table_value);
   d_table->set_height(table_dim[0]);
@@ -134,9 +139,12 @@ void EmbeddingSparseGradKernel(const Context& ctx,
                         "output@Grad's shape = [%s].",
                         d_table_value->dims(),
                         d_output_dims_2d));
-  int r = xpu::copy<T>(
-      ctx.x_context(), d_output_data, d_table_data, d_output->numel());
-  PADDLE_ENFORCE_XDNN_SUCCESS(r, "copy");
+  paddle::memory::Copy(CPUPlace(),
+                       d_table_data,
+                       xpu_place,
+                       d_output_data,
+                       d_output->numel() * sizeof(T));
 }
} // namespace phi
......
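In the reworked sparse-gradient path above, the lookup ids are staged once into a host-side `ids_cpu` tensor and turned into a `std::vector<int64_t>` with `CopyIdsToVector`, and the `SelectedRows` value tensor is host-allocated and filled with a single device-to-host `paddle::memory::Copy` instead of a device-side `xpu::copy`. A hedged sketch of what a helper like `CopyIdsToVector<InT, OutT>` is assumed to do with that host tensor (the real helper comes from `paddle/phi/kernels/funcs/embedding_util.h`; this stand-alone version is only illustrative):

```cpp
// Assumed behavior of a CopyIdsToVector-style helper: read each element of
// the host-side ids buffer and widen it to the output integer type.
// Illustrative only; not the Paddle implementation.
#include <cstdint>
#include <vector>

template <typename InT, typename OutT>
std::vector<OutT> CopyIdsToVectorSketch(const InT* data, int64_t numel) {
  std::vector<OutT> ids(static_cast<size_t>(numel));
  for (int64_t i = 0; i < numel; ++i) {
    ids[i] = static_cast<OutT>(data[i]);
  }
  return ids;
}
```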
@@ -37,42 +37,16 @@ void LogSoftmaxGradKernel(const Context& dev_ctx,
     return;
   }
-  if (out.numel() != 0) {
-    auto out_shape = phi::vectorize<int>(out.dims());
-    dev_ctx.template Alloc<T>(x_grad);
-    xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
-    T* tmp_ptr = RAII_GUARD.alloc_l3_or_gm<T>(out_grad.numel());
-    T* tmp2_ptr = RAII_GUARD.alloc_l3_or_gm<T>(out_grad.numel());
-    PADDLE_ENFORCE_NE(
-        tmp_ptr, nullptr, phi::errors::External("no enough memory in xpu"));
-    PADDLE_ENFORCE_NE(
-        tmp2_ptr, nullptr, phi::errors::External("no enough memory in xpu"));
-    int r = xpu::exp<XPUType>(dev_ctx.x_context(),
-                              reinterpret_cast<const XPUType*>(out.data<T>()),
-                              reinterpret_cast<XPUType*>(tmp_ptr),
-                              out_grad.numel());
-    PADDLE_ENFORCE_XDNN_SUCCESS(r, "exp");
-    r = xpu::reciprocal<XPUType>(dev_ctx.x_context(),
-                                 reinterpret_cast<const XPUType*>(tmp_ptr),
-                                 reinterpret_cast<XPUType*>(tmp2_ptr),
-                                 out_grad.numel());
-    PADDLE_ENFORCE_XDNN_SUCCESS(r, "reciprocal");
-    r = xpu::mul<XPUType>(dev_ctx.x_context(),
-                          reinterpret_cast<const XPUType*>(tmp2_ptr),
-                          reinterpret_cast<const XPUType*>(out_grad.data<T>()),
-                          reinterpret_cast<XPUType*>(tmp2_ptr),
-                          out_grad.numel());
-    PADDLE_ENFORCE_XDNN_SUCCESS(r, "mul");
-    r = xpu::softmax_grad<XPUType>(
-        dev_ctx.x_context(),
-        reinterpret_cast<const XPUType*>(tmp_ptr),
-        reinterpret_cast<const XPUType*>(tmp2_ptr),
-        reinterpret_cast<XPUType*>(x_grad->data<T>()),
-        out_shape,
-        axis);
-    PADDLE_ENFORCE_XDNN_SUCCESS(r, "softmax_grad");
-  }
+  auto out_shape = phi::vectorize<int>(out.dims());
+  dev_ctx.template Alloc<T>(x_grad);
+  int r = xpu::log_softmax_grad(
+      dev_ctx.x_context(),
+      reinterpret_cast<const XPUType*>(out.data<T>()),
+      reinterpret_cast<const XPUType*>(out_grad.data<T>()),
+      reinterpret_cast<XPUType*>(x_grad->data<T>()),
+      out_shape,
+      axis);
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "log_softmax_grad");
 }
} // namespace phi
......
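The backward kernel now issues one fused `xpu::log_softmax_grad` call instead of composing `exp`, `reciprocal`, `mul`, and `softmax_grad` with two scratch buffers, which removes several kernel launches and temporary allocations from the hot path. For reference, the rule such a fused kernel is expected to apply along the softmax axis is `dx[i] = dy[i] - exp(out[i]) * sum_j dy[j]`, where `out = log_softmax(x)`; a plain CPU sketch over one 1-D slice (illustrative, not the XDNN implementation):

```cpp
// Reference CPU sketch of the log-softmax backward rule for one 1-D slice:
//   dx[i] = dy[i] - exp(out[i]) * sum_j dy[j],  where out = log_softmax(x).
// Illustrative only; the fused xpu::log_softmax_grad call above is assumed
// to compute the same thing per slice along the given axis.
#include <cmath>
#include <vector>

std::vector<float> LogSoftmaxGradRef(const std::vector<float>& out,
                                     const std::vector<float>& dy) {
  float dy_sum = 0.0f;
  for (float v : dy) dy_sum += v;
  std::vector<float> dx(out.size());
  for (size_t i = 0; i < out.size(); ++i) {
    dx[i] = dy[i] - std::exp(out[i]) * dy_sum;
  }
  return dx;
}
```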
@@ -39,17 +39,13 @@ void LogSoftmaxKernel(const Context& dev_ctx,
     auto x_shape = phi::vectorize<int>(x.dims());
     dev_ctx.template Alloc<T>(out);
     if (axis < 0) axis += rank;
-    int r = xpu::softmax<XPUType>(dev_ctx.x_context(),
+    int r =
+        xpu::log_softmax<XPUType>(dev_ctx.x_context(),
                                   reinterpret_cast<const XPUType*>(x.data<T>()),
                                   reinterpret_cast<XPUType*>(out->data<T>()),
                                   x_shape,
                                   axis);
-    PADDLE_ENFORCE_XDNN_SUCCESS(r, "softmax");
-    r = xpu::log<XPUType>(dev_ctx.x_context(),
-                          reinterpret_cast<const XPUType*>(out->data<T>()),
-                          reinterpret_cast<XPUType*>(out->data<T>()),
-                          out->numel());
-    PADDLE_ENFORCE_XDNN_SUCCESS(r, "log");
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "log_softmax");
   }
 }
......
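The forward kernel likewise replaces the softmax-then-log two-call sequence with a single fused `xpu::log_softmax` call, avoiding a second elementwise pass over the output buffer. A plain CPU sketch of a numerically stable log-softmax over one 1-D slice (illustrative, not the XDNN code):

```cpp
// Reference CPU sketch of numerically stable log-softmax over a 1-D slice:
//   out[i] = x[i] - max(x) - log(sum_j exp(x[j] - max(x)))
// Illustrative only; the fused xpu::log_softmax call above is assumed to
// apply the same computation per slice along the given axis.
#include <algorithm>
#include <cmath>
#include <vector>

std::vector<float> LogSoftmaxRef(const std::vector<float>& x) {
  if (x.empty()) return {};
  const float x_max = *std::max_element(x.begin(), x.end());
  float sum = 0.0f;
  for (float v : x) sum += std::exp(v - x_max);
  const float log_sum = std::log(sum);
  std::vector<float> out(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    out[i] = x[i] - x_max - log_sum;
  }
  return out;
}
```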