未验证 提交 0b9d4c56 编写于 作者: Y ykkk2333 提交者: GitHub

migrate softmax_with_cross_entropy and topk kernels to phi, test=kunlun (#45650)

上级 3b9b4c34
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/phi/kernels/funcs/axis_utils.h"
#include "xpu/refactor/math.h"
#include "xpu/refactor/nn.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
// XPU forward kernel of the fluid `softmax_with_cross_entropy` operator.
//
// Reads the inputs "Logits" and "Label" and the attributes "axis",
// "soft_label" and "ignore_index"; writes the outputs "Softmax" and "Loss".
//
// Flow:
//   1. On XPU2 devices, with soft labels and the softmax axis being the last
//      dimension, the fused xpu::soft_softmax_with_cross_entropy is used.
//   2. Otherwise the logits are clipped to a finite range, xpu::softmax is
//      run, and the loss is produced by xpu::soft_cross_entropy or
//      xpu::hard_cross_entropy. When the axis is not the last dimension the
//      softmax (and soft labels) are viewed as {n, t, d / t} and transposed
//      to {n, d / t, t} before the cross-entropy call.
template <typename T>
class SoftmaxWithCrossEntropyXPUKernel : public framework::OpKernel<T> {
  using XPUType = typename XPUTypeTrait<T>::Type;

 public:
  void Compute(const framework::ExecutionContext& context) const override {
    PADDLE_ENFORCE_EQ(
        platform::is_xpu_place(context.GetPlace()),
        true,
        platform::errors::PreconditionNotMet("This kernel only runs on XPU."));
    const Tensor* logits = context.Input<Tensor>("Logits");
    const Tensor* labels = context.Input<Tensor>("Label");
    Tensor* softmax = context.Output<Tensor>("Softmax");
    Tensor* loss = context.Output<Tensor>("Loss");

    const int rank = logits->dims().size();
    const int axis = phi::funcs::CanonicalAxis(context.Attr<int>("axis"), rank);
    softmax->mutable_data<T>(context.GetPlace());
    loss->mutable_data<T>(context.GetPlace());

    // n and d come from the phi axis helpers; t is the extent of the softmax
    // axis itself. The transposes below treat the data as {n, t, d / t}.
    const int n = phi::funcs::SizeToAxis(axis, logits->dims());
    const int d = phi::funcs::SizeFromAxis(axis, logits->dims());
    std::vector<int> logits_dims = phi::vectorize<int>(logits->dims());
    const bool soft_label = context.Attr<bool>("soft_label");
    const int t = logits_dims[axis];
    const bool axis_is_last = (axis == rank - 1);

    auto logits_data = reinterpret_cast<const XPUType*>(logits->data<T>());
    auto softmax_data = reinterpret_cast<XPUType*>(softmax->data<T>());
    auto loss_data = reinterpret_cast<XPUType*>(loss->data<T>());

    auto& dev_ctx =
        context.template device_context<platform::XPUDeviceContext>();
    int r = XPU_SUCCESS;
    // Scratch buffers obtained from this guard are used for the rest of the
    // function.
    xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());

    // Fused path: XPU2 + soft labels + softmax over the last dimension.
    if (platform::get_xpu_version(context.GetPlace().GetDeviceId()) ==
            phi::backends::xpu::XPUVersion::XPU2 &&
        soft_label && axis_is_last) {
      auto labels_data = reinterpret_cast<const XPUType*>(labels->data<T>());
      r = xpu::soft_softmax_with_cross_entropy<XPUType>(dev_ctx.x_context(),
                                                        logits_data,
                                                        labels_data,
                                                        softmax_data,
                                                        loss_data,
                                                        n,
                                                        d);
      PADDLE_ENFORCE_XDNN_SUCCESS(r, "soft_softmax_with_cross_entropy");
      return;
    }

    // Generic path, step 1: clip the logits into a finite range.
    int len = logits->numel();
    T* clip_logits = RAII_GUARD.alloc_l3_or_gm<T>(len);
    PADDLE_ENFORCE_XDNN_NOT_NULL(clip_logits);
    XPUType* clip_logits_data = reinterpret_cast<XPUType*>(clip_logits);
    float max_val = 1e20;
    float min_val = -1e20;
    if (std::is_same<T, platform::float16>::value) {
      // 65504 is used as the clipping bound for float16 inputs.
      max_val = 65504;
      min_val = -65504;
    }
    r = xpu::clip_v2<XPUType>(dev_ctx.x_context(),
                              logits_data,
                              clip_logits_data,
                              len,
                              static_cast<XPUType>(min_val),
                              static_cast<XPUType>(max_val));
    PADDLE_ENFORCE_XDNN_SUCCESS(r, "clip_v2");

    // Step 2: softmax along `axis`, written to the "Softmax" output.
    r = xpu::softmax<XPUType>(
        dev_ctx.x_context(), clip_logits_data, softmax_data, logits_dims, axis);
    PADDLE_ENFORCE_XDNN_SUCCESS(r, "softmax");

    // Step 3: cross entropy. The cross-entropy calls receive a 2-D
    // {rows, cols} problem, so a non-last axis is moved to the back first.
    const int rows = axis_is_last ? n : n * d / t;
    const int cols = axis_is_last ? d : t;
    if (!axis_is_last) {
      XPUType* trans_softmax = RAII_GUARD.alloc_l3_or_gm<XPUType>(n * d);
      PADDLE_ENFORCE_XDNN_NOT_NULL(trans_softmax);
      r = xpu::transpose(dev_ctx.x_context(),
                         softmax_data,
                         trans_softmax,
                         {n, t, d / t},
                         {0, 2, 1});
      PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose");
      // From here on only the transposed copy is read; the "Softmax" output
      // buffer itself keeps its original layout.
      softmax_data = trans_softmax;
    }

    if (soft_label) {
      auto labels_data = reinterpret_cast<const XPUType*>(labels->data<T>());
      if (!axis_is_last) {
        XPUType* trans_label = RAII_GUARD.alloc_l3_or_gm<XPUType>(n * d);
        PADDLE_ENFORCE_XDNN_NOT_NULL(trans_label);
        r = xpu::transpose(dev_ctx.x_context(),
                           labels_data,
                           trans_label,
                           {n, t, d / t},
                           {0, 2, 1});
        PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose");
        labels_data = trans_label;
      }
      r = xpu::soft_cross_entropy<XPUType>(
          dev_ctx.x_context(), softmax_data, labels_data, loss_data, rows, cols);
      PADDLE_ENFORCE_XDNN_SUCCESS(r, "soft_cross_entropy");
    } else {
      auto ignore_index = context.Attr<int>("ignore_index");
      // Hard labels are read as int64 and narrowed to int32 for the
      // hard_cross_entropy<XPUType, int32_t> call.
      int32_t* labels_int32 =
          RAII_GUARD.alloc_l3_or_gm<int32_t>(labels->numel());
      PADDLE_ENFORCE_XDNN_NOT_NULL(labels_int32);
      r = xpu::cast_v2<int64_t, int32_t>(dev_ctx.x_context(),
                                         labels->data<int64_t>(),
                                         labels_int32,
                                         labels->numel());
      PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2");
      r = xpu::hard_cross_entropy<XPUType, int32_t>(dev_ctx.x_context(),
                                                    softmax_data,
                                                    labels_int32,
                                                    loss_data,
                                                    nullptr,
                                                    rows,
                                                    cols,
                                                    ignore_index);
      PADDLE_ENFORCE_XDNN_SUCCESS(r, "hard_cross_entropy");
    }
  }
};
// XPU backward kernel of the fluid `softmax_with_cross_entropy` operator.
//
// Reads the inputs "Loss@GRAD", "Label" and "Softmax" and the attributes
// "use_softmax", "soft_label", "ignore_index" and "axis"; writes the output
// "Logits@GRAD".
//
// When the softmax axis is the last dimension the XDNN grad primitives write
// directly into the output. Otherwise the softmax (and soft labels) are
// viewed as {n, t, d / t}, transposed to {n, d / t, t}, the gradient is
// computed in that layout and finally transposed back into the output.
template <typename T>
class SoftmaxWithCrossEntropyGradXPUKernel : public framework::OpKernel<T> {
  using XPUType = typename XPUTypeTrait<T>::Type;

 public:
  void Compute(const framework::ExecutionContext& context) const override {
    const Tensor* out_grad =
        context.Input<Tensor>(framework::GradVarName("Loss"));
    const Tensor* labels = context.Input<Tensor>("Label");
    Tensor* logit_grad =
        context.Output<Tensor>(framework::GradVarName("Logits"));
    logit_grad->mutable_data<T>(context.GetPlace());
    const Tensor* softmax = context.Input<Tensor>("Softmax");

    const bool use_softmax = context.Attr<bool>("use_softmax");
    const bool soft_label = context.Attr<bool>("soft_label");
    auto ignore_index = context.Attr<int>("ignore_index");

    const int rank = logit_grad->dims().size();
    const int axis = phi::funcs::CanonicalAxis(context.Attr<int>("axis"), rank);
    const int n = phi::funcs::SizeToAxis(axis, logit_grad->dims());
    const int d = phi::funcs::SizeFromAxis(axis, logit_grad->dims());

    auto& dev_ctx =
        context.template device_context<platform::XPUDeviceContext>();
    int r = XPU_SUCCESS;

    if (axis == rank - 1) {
      // Softmax axis is already innermost: operate on the tensors in place.
      if (soft_label) {
        r = xpu::soft_softmax_with_cross_entropy_grad<XPUType>(
            dev_ctx.x_context(),
            reinterpret_cast<const XPUType*>(out_grad->data<T>()),
            reinterpret_cast<const XPUType*>(labels->data<T>()),
            reinterpret_cast<const XPUType*>(softmax->data<T>()),
            reinterpret_cast<XPUType*>(logit_grad->data<T>()),
            use_softmax,
            n,
            d);
        PADDLE_ENFORCE_XDNN_SUCCESS(r, "soft_softmax_with_cross_entropy_grad");
      } else {
        xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
        // Hard labels are read as int64 and narrowed to int32 for XDNN.
        int* labels_int_ptr_l3 =
            RAII_GUARD.alloc_l3_or_gm<int32_t>(labels->numel());
        PADDLE_ENFORCE_XDNN_NOT_NULL(labels_int_ptr_l3);
        r = xpu::cast_v2<int64_t, int32_t>(dev_ctx.x_context(),
                                           labels->data<int64_t>(),
                                           labels_int_ptr_l3,
                                           labels->numel());
        PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2");
        r = xpu::hard_softmax_with_cross_entropy_grad<XPUType, int>(
            dev_ctx.x_context(),
            reinterpret_cast<const XPUType*>(out_grad->data<T>()),
            labels_int_ptr_l3,
            reinterpret_cast<const XPUType*>(softmax->data<T>()),
            reinterpret_cast<XPUType*>(logit_grad->data<T>()),
            ignore_index,
            use_softmax,
            n,
            d);
        PADDLE_ENFORCE_XDNN_SUCCESS(r, "hard_softmax_with_cross_entropy_grad");
      }
    } else {
      // Softmax axis is not innermost: work on transposed scratch copies.
      int t = logit_grad->dims()[axis];
      xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
      int len = softmax->numel();
      XPUType* trans_logit = RAII_GUARD.alloc_l3_or_gm<XPUType>(len);
      PADDLE_ENFORCE_XDNN_NOT_NULL(trans_logit);
      XPUType* trans_softmax = RAII_GUARD.alloc_l3_or_gm<XPUType>(len);
      PADDLE_ENFORCE_XDNN_NOT_NULL(trans_softmax);
      r = xpu::transpose(dev_ctx.x_context(),
                         reinterpret_cast<const XPUType*>(softmax->data<T>()),
                         trans_softmax,
                         {n, t, d / t},
                         {0, 2, 1});
      PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose");

      if (soft_label) {
        XPUType* trans_labels = RAII_GUARD.alloc_l3_or_gm<XPUType>(len);
        PADDLE_ENFORCE_XDNN_NOT_NULL(trans_labels);
        r = xpu::transpose(dev_ctx.x_context(),
                           reinterpret_cast<const XPUType*>(labels->data<T>()),
                           trans_labels,
                           {n, t, d / t},
                           {0, 2, 1});
        PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose");
        r = xpu::soft_softmax_with_cross_entropy_grad<XPUType>(
            dev_ctx.x_context(),
            reinterpret_cast<const XPUType*>(out_grad->data<T>()),
            trans_labels,
            trans_softmax,
            trans_logit,
            use_softmax,
            n * d / t,
            t);
        PADDLE_ENFORCE_XDNN_SUCCESS(r, "soft_softmax_with_cross_entropy_grad");
      } else {
        // Hard labels are passed without a transpose in this branch; only the
        // int64 -> int32 narrowing is applied.
        int* labels_int_ptr_l3 =
            RAII_GUARD.alloc_l3_or_gm<int32_t>(labels->numel());
        PADDLE_ENFORCE_XDNN_NOT_NULL(labels_int_ptr_l3);
        r = xpu::cast_v2<int64_t, int32_t>(dev_ctx.x_context(),
                                           labels->data<int64_t>(),
                                           labels_int_ptr_l3,
                                           labels->numel());
        PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2");
        r = xpu::hard_softmax_with_cross_entropy_grad<XPUType, int>(
            dev_ctx.x_context(),
            reinterpret_cast<const XPUType*>(out_grad->data<T>()),
            labels_int_ptr_l3,
            trans_softmax,
            trans_logit,
            ignore_index,
            use_softmax,
            n * d / t,
            t);
        PADDLE_ENFORCE_XDNN_SUCCESS(r, "hard_softmax_with_cross_entropy_grad");
      }

      // Bring the gradient back from {n, d / t, t} to the output layout.
      r = xpu::transpose<XPUType>(
          dev_ctx.x_context(),
          trans_logit,
          reinterpret_cast<XPUType*>(logit_grad->data<T>()),
          {n, d / t, t},
          {0, 2, 1});
      PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose");
    }
  }
};
} // namespace operators
} // namespace paddle
// Register the forward and backward XPU kernels of
// `softmax_with_cross_entropy`, each instantiated for float and float16.
namespace ops = paddle::operators;
REGISTER_OP_XPU_KERNEL(
softmax_with_cross_entropy,
ops::SoftmaxWithCrossEntropyXPUKernel<float>,
ops::SoftmaxWithCrossEntropyXPUKernel<paddle::platform::float16>);
REGISTER_OP_XPU_KERNEL(
softmax_with_cross_entropy_grad,
ops::SoftmaxWithCrossEntropyGradXPUKernel<float>,
ops::SoftmaxWithCrossEntropyGradXPUKernel<paddle::platform::float16>);
#endif
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include <memory>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/transpose_op.h"
#include "xpu/refactor/math.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
// XPU kernel of the fluid `top_k_v2` operator.
//
// Reads input "X", the optional input "K" and the attributes "sorted",
// "largest", "axis" and "k"; writes the outputs "Out" (values) and "Indices"
// (int64). Only sorted, largest-first top-k is accepted; anything else is
// rejected with an error.
//
// xpu::sorted_topk is called on a {row, col} problem over the last dimension
// and produces int32 indices, which are then widened to int64. When `axis`
// is not the last dimension the input is transposed so that `axis` becomes
// innermost, top-k is run, and values/indices are transposed back.
template <typename T>
class TopkV2XPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* input = ctx.Input<Tensor>("X");
    auto* output = ctx.Output<Tensor>("Out");
    auto* indices = ctx.Output<Tensor>("Indices");
    const auto& in_dims = input->dims();
    const T* in_data = input->data<T>();

    const bool sorted = ctx.Attr<bool>("sorted");
    const bool largest = ctx.Attr<bool>("largest");
    PADDLE_ENFORCE_EQ(
        sorted,
        true,
        platform::errors::External(
            "XPU API does not support unsorted topk operation currently."
            " Operator will be supported in future update."));
    PADDLE_ENFORCE_EQ(
        largest,
        true,
        platform::errors::External(
            "XPU API does not support smallest topk operation currently."
            " Operator will be supported in future update."));

    int axis = static_cast<int>(ctx.Attr<int>("axis"));
    if (axis < 0) axis += in_dims.size();

    size_t k = static_cast<int>(ctx.Attr<int>("k"));
    auto* k_t = ctx.Input<Tensor>("K");
    if (k_t) {
      // A "K" tensor overrides the attribute and changes the output shape.
      // NOTE(review): this dereferences k_t's buffer on the host; confirm the
      // "K" tensor is host-accessible here.
      k = k_t->data<int>()[0];
      framework::DDim output_dims = output->dims();
      output_dims[axis] = k;
      output->Resize(output_dims);
      indices->Resize(output_dims);
    }

    // Allocate the outputs only after the optional resize above, so the
    // buffers match the final output shape.
    int64_t* indices_data = indices->mutable_data<int64_t>(ctx.GetPlace());
    T* output_data = output->mutable_data<T>(ctx.GetPlace());
    const auto& out_dims = output->dims();

    auto& dev_ctx = ctx.template device_context<platform::XPUDeviceContext>();
    xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());

    if (axis + 1 == in_dims.size()) {
      // Top-k over the last dimension: no transpose required.
      int32_t* indices_int_data =
          RAII_GUARD.alloc_l3_or_gm<int32_t>(indices->numel());
      const size_t row =
          phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
      const size_t col = in_dims[in_dims.size() - 1];
      int r = xpu::sorted_topk<T>(dev_ctx.x_context(),
                                  in_data,
                                  output_data,
                                  indices_int_data,
                                  row,
                                  col,
                                  k);
      PADDLE_ENFORCE_EQ(
          r,
          XPU_SUCCESS,
          platform::errors::External(
              "XPU API return wrong value[%d %s] in call kernel name "
              "[%s], please check "
              "where Baidu Kunlun Card is properly installed.",
              r,
              XPUAPIErrorMsg[r],
              "sorted_topk"));
      r = xpu::cast_v2<int32_t, int64_t>(
          dev_ctx.x_context(),
          static_cast<const int32_t*>(indices_int_data),
          indices_data,
          indices->numel());
      PADDLE_ENFORCE_EQ(
          r,
          XPU_SUCCESS,
          platform::errors::External(
              "XPU API return wrong value[%d %s] in call kernel name "
              "[%s], please check "
              "where Baidu Kunlun Card is properly installed.",
              r,
              XPUAPIErrorMsg[r],
              "cast_v2"));
    } else {
      // Permutation that moves `axis` to the last position:
      // [0 .. axis-1, axis+1 .. rank-1, axis].
      std::vector<int> trans_axes;
      for (int i = 0; i < axis; i++) {
        trans_axes.emplace_back(i);
      }
      for (int i = axis + 1; i < in_dims.size(); i++) {
        trans_axes.emplace_back(i);
      }
      trans_axes.emplace_back(axis);

      // Shapes of the transposed input and of the transposed output.
      framework::DDim trans_dims(in_dims);
      framework::DDim trans_out_dims(output->dims());
      for (size_t i = 0; i < trans_axes.size(); i++) {
        trans_dims[i] = in_dims[trans_axes[i]];
        trans_out_dims[i] = out_dims[trans_axes[i]];
      }
      std::vector<int> x_shape_host(in_dims.size(), 0);
      for (int i = 0; i < in_dims.size(); ++i) {
        x_shape_host[i] = in_dims[i];
      }

      // Transpose the input into an intermediate buffer.
      T* trans_in_data = RAII_GUARD.alloc_l3_or_gm<T>(input->numel());
      int r = xpu::transpose<T>(dev_ctx.x_context(),
                                in_data,
                                trans_in_data,
                                x_shape_host,
                                trans_axes);
      PADDLE_ENFORCE_EQ(
          r,
          xpu::Error_t::SUCCESS,
          platform::errors::External("XPU API 1st Transpose kernel"
                                     " returns wrong value[%d %s]!",
                                     r,
                                     XPUAPIErrorMsg[r]));

      T* trans_out_data = RAII_GUARD.alloc_l3_or_gm<T>(output->numel());
      int64_t* trans_idx_data =
          RAII_GUARD.alloc_l3_or_gm<int64_t>(output->numel());
      int32_t* trans_idx_int32_data =
          RAII_GUARD.alloc_l3_or_gm<int32_t>(output->numel());
      const size_t row =
          phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1));
      const size_t col = trans_dims[trans_dims.size() - 1];

      // Top-k on the transposed input.
      r = xpu::sorted_topk<T>(dev_ctx.x_context(),
                              trans_in_data,
                              trans_out_data,
                              trans_idx_int32_data,
                              row,
                              col,
                              k);
      PADDLE_ENFORCE_EQ(
          r,
          XPU_SUCCESS,
          platform::errors::External(
              "XPU API return wrong value[%d %s] in call kernel name "
              "[%s], please check "
              "where Baidu Kunlun Card is properly installed.",
              r,
              XPUAPIErrorMsg[r],
              "sorted_topk"));
      r = xpu::cast_v2<int32_t, int64_t>(
          dev_ctx.x_context(),
          static_cast<const int32_t*>(trans_idx_int32_data),
          trans_idx_data,
          indices->numel());
      PADDLE_ENFORCE_EQ(
          r,
          XPU_SUCCESS,
          platform::errors::External(
              "XPU API return wrong value[%d %s] in call kernel name "
              "[%s], please check "
              "where Baidu Kunlun Card is properly installed.",
              r,
              XPUAPIErrorMsg[r],
              "cast_v2"));

      // Inverse permutation that puts the last dimension back at `axis`:
      // [0 .. axis-1, rank-1, axis .. rank-2]. It must be collected in
      // trans_back_axes (not trans_axes), otherwise the transposes below
      // would receive an empty shape and permutation.
      std::vector<int> trans_back_axes;
      for (int i = 0; i < axis; i++) {
        trans_back_axes.emplace_back(i);
      }
      trans_back_axes.emplace_back(trans_out_dims.size() - 1);
      for (int i = axis; i < trans_out_dims.size() - 1; i++) {
        trans_back_axes.emplace_back(i);
      }
      std::vector<int> trans_out_shape_host(trans_back_axes.size(), 0);
      for (size_t i = 0; i < trans_back_axes.size(); ++i) {
        trans_out_shape_host[i] = trans_out_dims[i];
      }

      r = xpu::transpose<T>(dev_ctx.x_context(),
                            trans_out_data,
                            output_data,
                            trans_out_shape_host,
                            trans_back_axes);
      PADDLE_ENFORCE_EQ(
          r,
          xpu::Error_t::SUCCESS,
          platform::errors::External("XPU API 2nd Transpose kernel"
                                     " returns wrong value[%d %s]",
                                     r,
                                     XPUAPIErrorMsg[r]));
      r = xpu::transpose<int64_t>(dev_ctx.x_context(),
                                  trans_idx_data,
                                  indices_data,
                                  trans_out_shape_host,
                                  trans_back_axes);
      PADDLE_ENFORCE_EQ(
          r,
          xpu::Error_t::SUCCESS,
          platform::errors::External("XPU API 3rd Transpose kernel"
                                     " returns wrong value[%d %s]",
                                     r,
                                     XPUAPIErrorMsg[r]));
    }
  }
};
} // namespace operators
} // namespace paddle
// Register TopkV2XPUKernel<float> as the XPU kernel of `top_k_v2`; float is
// the only element type registered here.
namespace ops = paddle::operators;
REGISTER_OP_XPU_KERNEL(top_k_v2, ops::TopkV2XPUKernel<float>);
#endif
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/cross_entropy_grad_kernel.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/axis_utils.h"
namespace phi {
template <typename T, typename Context>
void CrossEntropyWithSoftmaxGradKernel(const Context& dev_ctx,
const DenseTensor& labels,
const DenseTensor& softmax,
const DenseTensor& loss_grad,
bool soft_label,
bool use_softmax,
bool numeric_stable_mode,
int ignore_index,
int axis_in,
DenseTensor* logit_grad) {
using XPUType = typename XPUTypeTrait<T>::Type;
dev_ctx.template Alloc<T>(logit_grad);
const int rank = logit_grad->dims().size();
const int axis = phi::funcs::CanonicalAxis(axis_in, rank);
const int n = phi::funcs::SizeToAxis(axis, logit_grad->dims());
const int d = phi::funcs::SizeFromAxis(axis, logit_grad->dims());
int r = XPU_SUCCESS;
if (axis == rank - 1) {
if (soft_label) {
r = xpu::soft_softmax_with_cross_entropy_grad<XPUType>(
dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(loss_grad.data<T>()),
reinterpret_cast<const XPUType*>(labels.data<T>()),
reinterpret_cast<const XPUType*>(softmax.data<T>()),
reinterpret_cast<XPUType*>(logit_grad->data<T>()),
use_softmax,
n,
d);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "soft_softmax_with_cross_entropy_grad");
} else {
xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
int* labels_int_ptr_l3 =
RAII_GUARD.alloc_l3_or_gm<int32_t>(labels.numel());
PADDLE_ENFORCE_XDNN_NOT_NULL(labels_int_ptr_l3);
r = xpu::cast_v2<int64_t, int32_t>(dev_ctx.x_context(),
labels.data<int64_t>(),
labels_int_ptr_l3,
labels.numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2");
r = xpu::hard_softmax_with_cross_entropy_grad<XPUType, int>(
dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(loss_grad.data<T>()),
labels_int_ptr_l3,
reinterpret_cast<const XPUType*>(softmax.data<T>()),
reinterpret_cast<XPUType*>(logit_grad->data<T>()),
ignore_index,
use_softmax,
n,
d);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "hard_softmax_with_cross_entropy_grad");
}
} else {
int t = logit_grad->dims()[axis];
xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
int len = softmax.numel();
XPUType* trans_logit = RAII_GUARD.alloc_l3_or_gm<XPUType>(len);
PADDLE_ENFORCE_XDNN_NOT_NULL(trans_logit);
XPUType* trans_softmax = RAII_GUARD.alloc_l3_or_gm<XPUType>(len);
PADDLE_ENFORCE_XDNN_NOT_NULL(trans_softmax);
r = xpu::transpose(dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(softmax.data<T>()),
trans_softmax,
{n, t, d / t},
{0, 2, 1});
PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose");
if (soft_label) {
XPUType* trans_labels = RAII_GUARD.alloc_l3_or_gm<XPUType>(len);
PADDLE_ENFORCE_XDNN_NOT_NULL(trans_labels);
r = xpu::transpose(dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(labels.data<T>()),
trans_labels,
{n, t, d / t},
{0, 2, 1});
PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose");
r = xpu::soft_softmax_with_cross_entropy_grad<XPUType>(
dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(loss_grad.data<T>()),
trans_labels,
trans_softmax,
trans_logit,
use_softmax,
n * d / t,
t);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "soft_softmax_with_cross_entropy_grad");
} else {
int* labels_int_ptr_l3 =
RAII_GUARD.alloc_l3_or_gm<int32_t>(labels.numel());
PADDLE_ENFORCE_XDNN_NOT_NULL(labels_int_ptr_l3);
r = xpu::cast_v2<int64_t, int32_t>(dev_ctx.x_context(),
labels.data<int64_t>(),
labels_int_ptr_l3,
labels.numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "clip_v2");
r = xpu::hard_softmax_with_cross_entropy_grad<XPUType, int>(
dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(loss_grad.data<T>()),
labels_int_ptr_l3,
trans_softmax,
trans_logit,
ignore_index,
use_softmax,
n * d / t,
t);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "hard_softmax_with_cross_entropy_grad");
}
r = xpu::transpose<XPUType>(
dev_ctx.x_context(),
trans_logit,
reinterpret_cast<XPUType*>(logit_grad->data<T>()),
{n, d / t, t},
{0, 2, 1});
PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose");
}
}
} // namespace phi
// Register the phi XPU kernel `cross_entropy_with_softmax_grad` for float and
// float16 under ALL_LAYOUT. The trailing braces are left empty.
PD_REGISTER_KERNEL(cross_entropy_with_softmax_grad,
XPU,
ALL_LAYOUT,
phi::CrossEntropyWithSoftmaxGradKernel,
float,
phi::dtype::float16) {}
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/cross_entropy_kernel.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/axis_utils.h"
namespace phi {
// phi XPU forward kernel for cross entropy with softmax.
//
// Parameters:
//   dev_ctx             device context; supplies x_context() and allocation.
//   logits              input logits, read as T; must live on an XPU place.
//   labels              soft labels (read as T) or hard labels (read as
//                       int64), selected by `soft_label`.
//   soft_label          selects the soft- or hard-label cross entropy.
//   use_softmax         not referenced by this kernel.
//   numeric_stable_mode not referenced by this kernel.
//   ignore_index        forwarded to xpu::hard_cross_entropy.
//   axis_in             softmax axis; canonicalized against the logits rank.
//   softmax, loss       outputs; allocated here.
//
// Flow:
//   1. On XPU2 devices, with soft labels and the softmax axis being the last
//      dimension, the fused xpu::soft_softmax_with_cross_entropy is used.
//   2. Otherwise the logits are clipped to a finite range, xpu::softmax is
//      run, and the loss is produced by xpu::soft_cross_entropy or
//      xpu::hard_cross_entropy. When the axis is not the last dimension the
//      softmax (and soft labels) are viewed as {n, t, d / t} and transposed
//      to {n, d / t, t} before the cross-entropy call.
template <typename T, typename Context>
void CrossEntropyWithSoftmaxKernel(const Context& dev_ctx,
                                   const DenseTensor& logits,
                                   const DenseTensor& labels,
                                   bool soft_label,
                                   bool use_softmax,
                                   bool numeric_stable_mode,
                                   int ignore_index,
                                   int axis_in,
                                   DenseTensor* softmax,
                                   DenseTensor* loss) {
  using XPUType = typename XPUTypeTrait<T>::Type;
  PADDLE_ENFORCE_EQ(
      logits.place().GetType() == phi::AllocationType::XPU,
      true,
      errors::PreconditionNotMet("This kernel only runs on XPU."));

  const int rank = logits.dims().size();
  const int axis = phi::funcs::CanonicalAxis(axis_in, rank);
  dev_ctx.template Alloc<T>(softmax);
  dev_ctx.template Alloc<T>(loss);

  // n and d come from the phi axis helpers; t is the extent of the softmax
  // axis itself. The transposes below treat the data as {n, t, d / t}.
  const int n = phi::funcs::SizeToAxis(axis, logits.dims());
  const int d = phi::funcs::SizeFromAxis(axis, logits.dims());
  std::vector<int> logits_dims = phi::vectorize<int>(logits.dims());
  const int t = logits_dims[axis];
  const bool axis_is_last = (axis == rank - 1);

  auto logits_data = reinterpret_cast<const XPUType*>(logits.data<T>());
  auto softmax_data = reinterpret_cast<XPUType*>(softmax->data<T>());
  auto loss_data = reinterpret_cast<XPUType*>(loss->data<T>());

  int r = XPU_SUCCESS;
  // Scratch buffers obtained from this guard are used for the rest of the
  // function.
  xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());

  // Fused path: XPU2 + soft labels + softmax over the last dimension.
  if (phi::backends::xpu::get_xpu_version(dev_ctx.GetPlace().GetDeviceId()) ==
          phi::backends::xpu::XPUVersion::XPU2 &&
      soft_label && axis_is_last) {
    auto labels_data = reinterpret_cast<const XPUType*>(labels.data<T>());
    r = xpu::soft_softmax_with_cross_entropy<XPUType>(dev_ctx.x_context(),
                                                      logits_data,
                                                      labels_data,
                                                      softmax_data,
                                                      loss_data,
                                                      n,
                                                      d);
    PADDLE_ENFORCE_XDNN_SUCCESS(r, "soft_softmax_with_cross_entropy");
    return;
  }

  // Generic path, step 1: clip the logits into a finite range.
  int len = logits.numel();
  T* clip_logits = RAII_GUARD.alloc_l3_or_gm<T>(len);
  PADDLE_ENFORCE_XDNN_NOT_NULL(clip_logits);
  XPUType* clip_logits_data = reinterpret_cast<XPUType*>(clip_logits);
  float max_val = 1e20;
  float min_val = -1e20;
  if (std::is_same<T, dtype::float16>::value) {
    // 65504 is used as the clipping bound for float16 inputs.
    max_val = 65504;
    min_val = -65504;
  }
  r = xpu::clip_v2<XPUType>(dev_ctx.x_context(),
                            logits_data,
                            clip_logits_data,
                            len,
                            static_cast<XPUType>(min_val),
                            static_cast<XPUType>(max_val));
  PADDLE_ENFORCE_XDNN_SUCCESS(r, "clip_v2");

  // Step 2: softmax along `axis`, written to the `softmax` output.
  r = xpu::softmax<XPUType>(
      dev_ctx.x_context(), clip_logits_data, softmax_data, logits_dims, axis);
  PADDLE_ENFORCE_XDNN_SUCCESS(r, "softmax");

  // Step 3: cross entropy. The cross-entropy calls receive a 2-D
  // {rows, cols} problem, so a non-last axis is moved to the back first.
  const int rows = axis_is_last ? n : n * d / t;
  const int cols = axis_is_last ? d : t;
  if (!axis_is_last) {
    XPUType* trans_softmax = RAII_GUARD.alloc_l3_or_gm<XPUType>(n * d);
    PADDLE_ENFORCE_XDNN_NOT_NULL(trans_softmax);
    r = xpu::transpose(dev_ctx.x_context(),
                       softmax_data,
                       trans_softmax,
                       {n, t, d / t},
                       {0, 2, 1});
    PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose");
    // From here on only the transposed copy is read; the `softmax` output
    // buffer itself keeps its original layout.
    softmax_data = trans_softmax;
  }

  if (soft_label) {
    auto labels_data = reinterpret_cast<const XPUType*>(labels.data<T>());
    if (!axis_is_last) {
      XPUType* trans_label = RAII_GUARD.alloc_l3_or_gm<XPUType>(n * d);
      PADDLE_ENFORCE_XDNN_NOT_NULL(trans_label);
      r = xpu::transpose(dev_ctx.x_context(),
                         labels_data,
                         trans_label,
                         {n, t, d / t},
                         {0, 2, 1});
      PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose");
      labels_data = trans_label;
    }
    r = xpu::soft_cross_entropy<XPUType>(
        dev_ctx.x_context(), softmax_data, labels_data, loss_data, rows, cols);
    PADDLE_ENFORCE_XDNN_SUCCESS(r, "soft_cross_entropy");
  } else {
    // Hard labels are read as int64 and narrowed to int32 for the
    // hard_cross_entropy<XPUType, int32_t> call.
    int* labels_int_ptr_l3 = RAII_GUARD.alloc_l3_or_gm<int32_t>(labels.numel());
    PADDLE_ENFORCE_XDNN_NOT_NULL(labels_int_ptr_l3);
    r = xpu::cast_v2<int64_t, int32_t>(dev_ctx.x_context(),
                                       labels.data<int64_t>(),
                                       labels_int_ptr_l3,
                                       labels.numel());
    PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2");
    r = xpu::hard_cross_entropy<XPUType, int32_t>(dev_ctx.x_context(),
                                                  softmax_data,
                                                  labels_int_ptr_l3,
                                                  loss_data,
                                                  nullptr,
                                                  rows,
                                                  cols,
                                                  ignore_index);
    PADDLE_ENFORCE_XDNN_SUCCESS(r, "hard_cross_entropy");
  }
}
} // namespace phi
// Register the phi XPU kernel `cross_entropy_with_softmax` for float and
// float16 under ALL_LAYOUT. The trailing braces are left empty.
PD_REGISTER_KERNEL(cross_entropy_with_softmax,
XPU,
ALL_LAYOUT,
phi::CrossEntropyWithSoftmaxKernel,
float,
phi::dtype::float16) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/top_k_kernel.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename Context>
void TopkKernel(const Context& dev_ctx,
const DenseTensor& x,
const Scalar& k_scalar,
int axis,
bool largest,
bool sorted,
DenseTensor* out,
DenseTensor* indices) {
const auto& in_dims = x.dims();
const T* in_data = x.data<T>();
int64_t* indices_data = dev_ctx.template Alloc<int64_t>(indices);
T* output_data = dev_ctx.template Alloc<T>(out);
const auto& out_dims = out->dims();
PADDLE_ENFORCE_EQ(
sorted,
true,
errors::External(
"XPU API does not support unsorted topk operation currently."
" Operator will be supported in future update."));
PADDLE_ENFORCE_EQ(
largest,
true,
errors::External(
"XPU API does not support smallest topk operation currently."
" Operator will be supported in future update."));
if (axis < 0) axis += in_dims.size();
size_t k = k_scalar.to<int>();
if (axis + 1 == in_dims.size()) {
xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
int32_t* indices_int_data =
RAII_GUARD.alloc_l3_or_gm<int32_t>(indices->numel());
const size_t row =
phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
const size_t col = in_dims[in_dims.size() - 1];
int r = xpu::sorted_topk<T>(dev_ctx.x_context(),
in_data,
output_data,
indices_int_data,
row,
col,
k);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "sorted_topk");
r = xpu::cast_v2<int32_t, int64_t>(dev_ctx.x_context(),
(const int32_t*)indices_int_data,
indices_data,
indices->numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2");
} else {
// do transpose if axis is not the last dim of input
std::vector<int> trans_axes;
for (int i = 0; i < axis; i++) {
trans_axes.emplace_back(i);
}
for (int i = axis + 1; i < in_dims.size(); i++) {
trans_axes.emplace_back(i);
}
trans_axes.emplace_back(axis);
// Get input and output dims for transpose
DDim trans_dims(in_dims);
DDim trans_out_dims(out->dims());
for (size_t i = 0; i < trans_axes.size(); i++) {
trans_dims[i] = in_dims[trans_axes[i]];
trans_out_dims[i] = out_dims[trans_axes[i]];
}
std::vector<int> x_shape_host(in_dims.size(), 0);
for (int i = 0; i < in_dims.size(); ++i) {
x_shape_host[i] = in_dims[i];
}
xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
T* trans_in_data = RAII_GUARD.alloc_l3_or_gm<T>(x.numel());
// Transpose and save interval output to trans_in
int r = xpu::transpose<T>(
dev_ctx.x_context(), in_data, trans_in_data, x_shape_host, trans_axes);
PADDLE_ENFORCE_EQ(r,
xpu::Error_t::SUCCESS,
errors::External("XPU API 1st Transpose kernel"
" returns wrong value[%d %s]!",
r,
XPUAPIErrorMsg[r]));
T* trans_out_data = RAII_GUARD.alloc_l3_or_gm<T>(out->numel());
int64_t* trans_idx_data = RAII_GUARD.alloc_l3_or_gm<int64_t>(out->numel());
int32_t* trans_idx_int32_data =
RAII_GUARD.alloc_l3_or_gm<int32_t>(out->numel());
const size_t row =
phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1));
const size_t col = trans_dims[trans_dims.size() - 1];
// Do top k on transposed input
r = xpu::sorted_topk<T>(dev_ctx.x_context(),
trans_in_data,
trans_out_data,
trans_idx_int32_data,
row,
col,
k);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "sorted_topk");
r = xpu::cast_v2<int32_t, int64_t>(dev_ctx.x_context(),
(const int32_t*)trans_idx_int32_data,
trans_idx_data,
indices->numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2");
// Transpose back to original dims
std::vector<int> trans_back_axes;
for (int i = 0; i < axis; i++) {
trans_axes.emplace_back(i);
}
trans_axes.emplace_back(trans_out_dims.size() - 1);
for (int i = axis; i < trans_out_dims.size() - 1; i++) {
trans_axes.emplace_back(i);
}
std::vector<int> trans_out_shape_host(trans_back_axes.size(), 0);
for (size_t i = 0; i < trans_back_axes.size(); ++i) {
trans_out_shape_host[i] = trans_out_dims[i];
}
r = xpu::transpose<T>(dev_ctx.x_context(),
trans_out_data,
output_data,
trans_out_shape_host,
trans_back_axes);
PADDLE_ENFORCE_EQ(r,
xpu::Error_t::SUCCESS,
errors::External("XPU API 2nd Transpose kernel"
" returns wrong value[%d %s]",
r,
XPUAPIErrorMsg[r]));
r = xpu::transpose<int64_t>(dev_ctx.x_context(),
trans_idx_data,
indices_data,
trans_out_shape_host,
trans_back_axes);
PADDLE_ENFORCE_EQ(r,
xpu::Error_t::SUCCESS,
errors::External("XPU API 3rd Transpose kernel"
" returns wrong value[%d %s]",
r,
XPUAPIErrorMsg[r]));
}
}
} // namespace phi
// Register the phi XPU kernel `top_k` under ALL_LAYOUT; float is the only
// element type registered here. The trailing braces are left empty.
PD_REGISTER_KERNEL(top_k, XPU, ALL_LAYOUT, phi::TopkKernel, float) {}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册