From 0b9d4c56917028ad687c8f30912f54c6f94ff820 Mon Sep 17 00:00:00 2001
From: ykkk2333 <77383312+ykkk2333@users.noreply.github.com>
Date: Fri, 2 Sep 2022 16:26:21 +0800
Subject: [PATCH] migrate softmax_with_cross_entropy and topk kernels to phi, test=kunlun (#45650)

---
 .../softmax_with_cross_entropy_op_xpu.cc      | 310 ------------------
 paddle/fluid/operators/top_k_v2_op_xpu.cc     | 237 -------------
 .../kernels/xpu/cross_entropy_grad_kernel.cc  | 155 +++++++++
 .../phi/kernels/xpu/cross_entropy_kernel.cc   | 161 +++++++++
 paddle/phi/kernels/xpu/top_k_kernel.cc        | 176 ++++++++++
 5 files changed, 492 insertions(+), 547 deletions(-)
 delete mode 100644 paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc
 delete mode 100644 paddle/fluid/operators/top_k_v2_op_xpu.cc
 create mode 100644 paddle/phi/kernels/xpu/cross_entropy_grad_kernel.cc
 create mode 100644 paddle/phi/kernels/xpu/cross_entropy_kernel.cc
 create mode 100644 paddle/phi/kernels/xpu/top_k_kernel.cc

diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc
deleted file mode 100644
index 8251fe21ea4..00000000000
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc
+++ /dev/null
@@ -1,310 +0,0 @@
-/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
*/ - -#ifdef PADDLE_WITH_XPU -#include -#include -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device/device_wrapper.h" -#include "paddle/phi/kernels/funcs/axis_utils.h" -#include "xpu/refactor/math.h" -#include "xpu/refactor/nn.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class SoftmaxWithCrossEntropyXPUKernel : public framework::OpKernel { - using XPUType = typename XPUTypeTrait::Type; - - public: - void Compute(const framework::ExecutionContext& context) const override { - PADDLE_ENFORCE_EQ( - platform::is_xpu_place(context.GetPlace()), - true, - platform::errors::PreconditionNotMet("This kernel only runs on XPU.")); - const Tensor* logits = context.Input("Logits"); - const Tensor* labels = context.Input("Label"); - Tensor* softmax = context.Output("Softmax"); - Tensor* loss = context.Output("Loss"); - const int rank = logits->dims().size(); - const int axis = phi::funcs::CanonicalAxis(context.Attr("axis"), rank); - softmax->mutable_data(context.GetPlace()); - loss->mutable_data(context.GetPlace()); - const int n = phi::funcs::SizeToAxis(axis, logits->dims()); - const int d = phi::funcs::SizeFromAxis(axis, logits->dims()); - std::vector logits_dims = phi::vectorize(logits->dims()); - const bool soft_label = context.Attr("soft_label"); - - int t = logits_dims[axis]; - - auto logits_data = reinterpret_cast(logits->data()); - auto softmax_data = reinterpret_cast(softmax->data()); - auto loss_data = reinterpret_cast(loss->data()); - // softmax - auto& dev_ctx = - context.template device_context(); - int r = XPU_SUCCESS; - xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); - - if (platform::get_xpu_version(context.GetPlace().GetDeviceId()) == - phi::backends::xpu::XPUVersion::XPU2 && - soft_label && axis == rank - 1) { - auto labels_data = reinterpret_cast(labels->data()); - r = xpu::soft_softmax_with_cross_entropy(dev_ctx.x_context(), - logits_data, - labels_data, - softmax_data, - loss_data, - n, - d); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "soft_softmax_with_cross_entropy"); - return; - } - - int len = logits->numel(); - T* clip_logits = RAII_GUARD.alloc_l3_or_gm(len); - PADDLE_ENFORCE_XDNN_NOT_NULL(clip_logits); - XPUType* clip_logits_data = reinterpret_cast(clip_logits); - - float max_val = 1e20; - float min_val = -1e20; - if (std::is_same::value) { - max_val = 65504; - min_val = -65504; - } - - r = xpu::clip_v2(dev_ctx.x_context(), - logits_data, - clip_logits_data, - len, - static_cast(min_val), - static_cast(max_val)); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "clip_v2"); - - r = xpu::softmax( - dev_ctx.x_context(), clip_logits_data, softmax_data, logits_dims, axis); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "softmax"); - - // cross_entropy - if (axis != rank - 1) { - XPUType* trans_softmax = RAII_GUARD.alloc_l3_or_gm(n * d); - PADDLE_ENFORCE_XDNN_NOT_NULL(trans_softmax); - - r = xpu::transpose(dev_ctx.x_context(), - softmax_data, - trans_softmax, - {n, t, d / t}, - {0, 2, 1}); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose"); - softmax_data = trans_softmax; - } - - if (soft_label) { - auto labels_data = reinterpret_cast(labels->data()); - if (axis != rank - 1) { - XPUType* trans_label = RAII_GUARD.alloc_l3_or_gm(n * d); - PADDLE_ENFORCE_XDNN_NOT_NULL(trans_label); - r = xpu::transpose(dev_ctx.x_context(), - labels_data, - trans_label, - {n, t, d / t}, - {0, 2, 1}); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose"); - labels_data = trans_label; - } - r = xpu::soft_cross_entropy(dev_ctx.x_context(), - 
softmax_data, - labels_data, - loss_data, - axis == rank - 1 ? n : n * d / t, - axis == rank - 1 ? d : t); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "soft_cross_entropy"); - } else { - auto ignore_index = context.Attr("ignore_index"); - Tensor labels_int32; - labels_int32.mutable_data(context.GetPlace(), - labels->numel() * sizeof(int32_t)); - r = xpu::cast_v2(dev_ctx.x_context(), - labels->data(), - labels_int32.data(), - labels->numel()); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "clip_v2"); - - r = xpu::hard_cross_entropy( - dev_ctx.x_context(), - softmax_data, - labels_int32.data(), - loss_data, - nullptr, - axis == rank - 1 ? n : n * d / t, - axis == rank - 1 ? d : t, - ignore_index); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "hard_cross_entropy"); - } - } -}; - -template -class SoftmaxWithCrossEntropyGradXPUKernel : public framework::OpKernel { - using XPUType = typename XPUTypeTrait::Type; - - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* out_grad = - context.Input(framework::GradVarName("Loss")); - const Tensor* labels = context.Input("Label"); - Tensor* logit_grad = - context.Output(framework::GradVarName("Logits")); - - logit_grad->mutable_data(context.GetPlace()); - - const Tensor* softmax = context.Input("Softmax"); - const bool use_softmax = context.Attr("use_softmax"); - - const bool soft_label = context.Attr("soft_label"); - auto ignore_index = context.Attr("ignore_index"); - - const int rank = logit_grad->dims().size(); - const int axis = phi::funcs::CanonicalAxis(context.Attr("axis"), rank); - const int n = phi::funcs::SizeToAxis(axis, logit_grad->dims()); - const int d = phi::funcs::SizeFromAxis(axis, logit_grad->dims()); - - auto& dev_ctx = - context.template device_context(); - int r = XPU_SUCCESS; - - if (axis == rank - 1) { - if (soft_label) { - r = xpu::soft_softmax_with_cross_entropy_grad( - dev_ctx.x_context(), - reinterpret_cast(out_grad->data()), - reinterpret_cast(labels->data()), - reinterpret_cast(softmax->data()), - reinterpret_cast(logit_grad->data()), - use_softmax, - n, - d); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "soft_softmax_with_cross_entropy_grad"); - } else { - xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); - int* labels_int_ptr_l3 = - RAII_GUARD.alloc_l3_or_gm(labels->numel()); - PADDLE_ENFORCE_XDNN_NOT_NULL(labels_int_ptr_l3); - - r = xpu::cast_v2(dev_ctx.x_context(), - labels->data(), - labels_int_ptr_l3, - labels->numel()); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2"); - - r = xpu::hard_softmax_with_cross_entropy_grad( - dev_ctx.x_context(), - reinterpret_cast(out_grad->data()), - labels_int_ptr_l3, - reinterpret_cast(softmax->data()), - reinterpret_cast(logit_grad->data()), - ignore_index, - use_softmax, - n, - d); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "hard_softmax_with_cross_entropy_grad"); - } - } else { - int t = logit_grad->dims()[axis]; - xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); - int len = softmax->numel(); - XPUType* trans_logit = RAII_GUARD.alloc_l3_or_gm(len); - PADDLE_ENFORCE_XDNN_NOT_NULL(trans_logit); - - XPUType* trans_softmax = RAII_GUARD.alloc_l3_or_gm(len); - PADDLE_ENFORCE_XDNN_NOT_NULL(trans_softmax); - r = xpu::transpose(dev_ctx.x_context(), - reinterpret_cast(softmax->data()), - trans_softmax, - {n, t, d / t}, - {0, 2, 1}); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose"); - - if (soft_label) { - XPUType* trans_labels = RAII_GUARD.alloc_l3_or_gm(len); - PADDLE_ENFORCE_XDNN_NOT_NULL(trans_labels); - r = xpu::transpose(dev_ctx.x_context(), - reinterpret_cast(labels->data()), - trans_labels, - {n, t, d / t}, 
- {0, 2, 1}); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose"); - r = xpu::soft_softmax_with_cross_entropy_grad( - dev_ctx.x_context(), - reinterpret_cast(out_grad->data()), - trans_labels, - trans_softmax, - trans_logit, - use_softmax, - n * d / t, - t); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "soft_softmax_with_cross_entropy_grad"); - } else { - int* labels_int_ptr_l3 = - RAII_GUARD.alloc_l3_or_gm(labels->numel()); - PADDLE_ENFORCE_XDNN_NOT_NULL(labels_int_ptr_l3); - - r = xpu::cast_v2(dev_ctx.x_context(), - labels->data(), - labels_int_ptr_l3, - labels->numel()); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "clip_v2"); - r = xpu::hard_softmax_with_cross_entropy_grad( - dev_ctx.x_context(), - reinterpret_cast(out_grad->data()), - labels_int_ptr_l3, - trans_softmax, - trans_logit, - ignore_index, - use_softmax, - n * d / t, - t); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "hard_softmax_with_cross_entropy_grad"); - } - - r = xpu::transpose( - dev_ctx.x_context(), - trans_logit, - reinterpret_cast(logit_grad->data()), - {n, d / t, t}, - {0, 2, 1}); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose"); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_XPU_KERNEL( - softmax_with_cross_entropy, - ops::SoftmaxWithCrossEntropyXPUKernel, - ops::SoftmaxWithCrossEntropyXPUKernel); -REGISTER_OP_XPU_KERNEL( - softmax_with_cross_entropy_grad, - ops::SoftmaxWithCrossEntropyGradXPUKernel, - ops::SoftmaxWithCrossEntropyGradXPUKernel); -#endif diff --git a/paddle/fluid/operators/top_k_v2_op_xpu.cc b/paddle/fluid/operators/top_k_v2_op_xpu.cc deleted file mode 100644 index 0b98d9192a6..00000000000 --- a/paddle/fluid/operators/top_k_v2_op_xpu.cc +++ /dev/null @@ -1,237 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef PADDLE_WITH_XPU - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/transpose_op.h" -#include "xpu/refactor/math.h" - -namespace paddle { -namespace operators { -using Tensor = framework::Tensor; -template -class TopkV2XPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - auto* indices = ctx.Output("Indices"); - const auto& in_dims = input->dims(); - const T* in_data = input->data(); - int64_t* indices_data = indices->mutable_data(ctx.GetPlace()); - T* output_data = output->mutable_data(ctx.GetPlace()); - const auto& out_dims = output->dims(); - - const auto& sorted = static_cast(ctx.Attr("sorted")); - const auto& largest = static_cast(ctx.Attr("largest")); - PADDLE_ENFORCE_EQ( - sorted, - true, - platform::errors::External( - "XPU API does not support unsorted topk operation currently." - " Operator will be supported in future update.")); - PADDLE_ENFORCE_EQ( - largest, - true, - platform::errors::External( - "XPU API does not support smallest topk operation currently." 
- " Operator will be supported in future update.")); - - int axis = static_cast(ctx.Attr("axis")); - if (axis < 0) axis += in_dims.size(); - - size_t k = static_cast(ctx.Attr("k")); - auto* k_t = ctx.Input("K"); - if (k_t) { - k = k_t->data()[0]; - framework::DDim output_dims = output->dims(); - output_dims[axis] = k; - output->Resize(output_dims); - indices->Resize(output_dims); - } - if (axis + 1 == in_dims.size()) { - auto& dev_ctx = ctx.template device_context(); - xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); - int32_t* indices_int_data = - RAII_GUARD.alloc_l3_or_gm(indices->numel()); - - const size_t row = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const size_t col = in_dims[in_dims.size() - 1]; - int r = xpu::sorted_topk(dev_ctx.x_context(), - in_data, - output_data, - indices_int_data, - row, - col, - k); - PADDLE_ENFORCE_EQ( - r, - XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d %s] in call kernel name " - "[%s], please check " - "where Baidu Kunlun Card is properly installed.", - r, - XPUAPIErrorMsg[r], - "sorted_topk")); - r = xpu::cast_v2(dev_ctx.x_context(), - (const int32_t*)indices_int_data, - indices_data, - indices->numel()); - PADDLE_ENFORCE_EQ( - r, - XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d %s] in call kernel name " - "[%s], please check " - "where Baidu Kunlun Card is properly installed.", - r, - XPUAPIErrorMsg[r], - "cast_v2")); - - } else { - // do transpose if axis is not the last dim of input - std::vector trans_axes; - for (int i = 0; i < axis; i++) { - trans_axes.emplace_back(i); - } - for (int i = axis + 1; i < in_dims.size(); i++) { - trans_axes.emplace_back(i); - } - trans_axes.emplace_back(axis); - // Get input and output dims for transpose - framework::DDim trans_dims(in_dims); - framework::DDim trans_out_dims(output->dims()); - for (size_t i = 0; i < trans_axes.size(); i++) { - trans_dims[i] = in_dims[trans_axes[i]]; - trans_out_dims[i] = out_dims[trans_axes[i]]; - } - - std::vector x_shape_host(in_dims.size(), 0); - for (int i = 0; i < in_dims.size(); ++i) { - x_shape_host[i] = in_dims[i]; - } - - auto& dev_ctx = ctx.template device_context(); - xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); - T* trans_in_data = RAII_GUARD.alloc_l3_or_gm(input->numel()); - - // Transpose and save interval output to trans_in - int r = xpu::transpose(dev_ctx.x_context(), - in_data, - trans_in_data, - x_shape_host, - trans_axes); - PADDLE_ENFORCE_EQ( - r, - xpu::Error_t::SUCCESS, - platform::errors::External("XPU API 1st Transpose kernel" - " returns wrong value[%d %s]!", - r, - XPUAPIErrorMsg[r])); - - T* trans_out_data = RAII_GUARD.alloc_l3_or_gm(output->numel()); - int64_t* trans_idx_data = - RAII_GUARD.alloc_l3_or_gm(output->numel()); - int32_t* trans_idx_int32_data = - RAII_GUARD.alloc_l3_or_gm(output->numel()); - const size_t row = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const size_t col = trans_dims[trans_dims.size() - 1]; - - // Do top k on transposed input - r = xpu::sorted_topk(dev_ctx.x_context(), - trans_in_data, - trans_out_data, - trans_idx_int32_data, - row, - col, - k); - PADDLE_ENFORCE_EQ( - r, - XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d %s] in call kernel name " - "[%s], please check " - "where Baidu Kunlun Card is properly installed.", - r, - XPUAPIErrorMsg[r], - "sorted_topk")); - - r = xpu::cast_v2(dev_ctx.x_context(), - (const int32_t*)trans_idx_int32_data, - trans_idx_data, - indices->numel()); - 
PADDLE_ENFORCE_EQ( - r, - XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d %s in call kernel name " - "[%s], please check " - "where Baidu Kunlun Card is properly installed.", - r, - XPUAPIErrorMsg[r], - "cast_v2")); - - // Transpose back to original dims - std::vector trans_back_axes; - for (int i = 0; i < axis; i++) { - trans_axes.emplace_back(i); - } - trans_axes.emplace_back(trans_out_dims.size() - 1); - for (int i = axis; i < trans_out_dims.size() - 1; i++) { - trans_axes.emplace_back(i); - } - - std::vector trans_out_shape_host(trans_back_axes.size(), 0); - for (size_t i = 0; i < trans_back_axes.size(); ++i) { - trans_out_shape_host[i] = trans_out_dims[i]; - } - r = xpu::transpose(dev_ctx.x_context(), - trans_out_data, - output_data, - trans_out_shape_host, - trans_back_axes); - PADDLE_ENFORCE_EQ( - r, - xpu::Error_t::SUCCESS, - platform::errors::External("XPU API 2nd Transpose kernel" - " returns wrong value[%d %s]", - r, - XPUAPIErrorMsg[r])); - r = xpu::transpose(dev_ctx.x_context(), - trans_idx_data, - indices_data, - trans_out_shape_host, - trans_back_axes); - PADDLE_ENFORCE_EQ( - r, - xpu::Error_t::SUCCESS, - platform::errors::External("XPU API 3rd Transpose kernel" - " returns wrong value[%d %s]", - r, - XPUAPIErrorMsg[r])); - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_XPU_KERNEL(top_k_v2, ops::TopkV2XPUKernel); -#endif diff --git a/paddle/phi/kernels/xpu/cross_entropy_grad_kernel.cc b/paddle/phi/kernels/xpu/cross_entropy_grad_kernel.cc new file mode 100644 index 00000000000..042f41df980 --- /dev/null +++ b/paddle/phi/kernels/xpu/cross_entropy_grad_kernel.cc @@ -0,0 +1,155 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#include "paddle/phi/kernels/cross_entropy_grad_kernel.h"
+
+#include "paddle/phi/backends/xpu/enforce_xpu.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/axis_utils.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void CrossEntropyWithSoftmaxGradKernel(const Context& dev_ctx,
+                                       const DenseTensor& labels,
+                                       const DenseTensor& softmax,
+                                       const DenseTensor& loss_grad,
+                                       bool soft_label,
+                                       bool use_softmax,
+                                       bool numeric_stable_mode,
+                                       int ignore_index,
+                                       int axis_in,
+                                       DenseTensor* logit_grad) {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+  dev_ctx.template Alloc<T>(logit_grad);
+
+  const int rank = logit_grad->dims().size();
+  const int axis = phi::funcs::CanonicalAxis(axis_in, rank);
+  const int n = phi::funcs::SizeToAxis(axis, logit_grad->dims());
+  const int d = phi::funcs::SizeFromAxis(axis, logit_grad->dims());
+
+  int r = XPU_SUCCESS;
+
+  if (axis == rank - 1) {
+    if (soft_label) {
+      r = xpu::soft_softmax_with_cross_entropy_grad(
+          dev_ctx.x_context(),
+          reinterpret_cast<const XPUType*>(loss_grad.data<T>()),
+          reinterpret_cast<const XPUType*>(labels.data<T>()),
+          reinterpret_cast<const XPUType*>(softmax.data<T>()),
+          reinterpret_cast<XPUType*>(logit_grad->data<T>()),
+          use_softmax,
+          n,
+          d);
+      PADDLE_ENFORCE_XDNN_SUCCESS(r, "soft_softmax_with_cross_entropy_grad");
+    } else {
+      xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
+      int* labels_int_ptr_l3 =
+          RAII_GUARD.alloc_l3_or_gm<int>(labels.numel());
+      PADDLE_ENFORCE_XDNN_NOT_NULL(labels_int_ptr_l3);
+
+      r = xpu::cast_v2<int64_t, int32_t>(dev_ctx.x_context(),
+                                         labels.data<int64_t>(),
+                                         labels_int_ptr_l3,
+                                         labels.numel());
+      PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2");
+
+      r = xpu::hard_softmax_with_cross_entropy_grad(
+          dev_ctx.x_context(),
+          reinterpret_cast<const XPUType*>(loss_grad.data<T>()),
+          labels_int_ptr_l3,
+          reinterpret_cast<const XPUType*>(softmax.data<T>()),
+          reinterpret_cast<XPUType*>(logit_grad->data<T>()),
+          ignore_index,
+          use_softmax,
+          n,
+          d);
+      PADDLE_ENFORCE_XDNN_SUCCESS(r, "hard_softmax_with_cross_entropy_grad");
+    }
+  } else {
+    int t = logit_grad->dims()[axis];
+    xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
+    int len = softmax.numel();
+    XPUType* trans_logit = RAII_GUARD.alloc_l3_or_gm<XPUType>(len);
+    PADDLE_ENFORCE_XDNN_NOT_NULL(trans_logit);
+
+    XPUType* trans_softmax = RAII_GUARD.alloc_l3_or_gm<XPUType>(len);
+    PADDLE_ENFORCE_XDNN_NOT_NULL(trans_softmax);
+    r = xpu::transpose(dev_ctx.x_context(),
+                       reinterpret_cast<const XPUType*>(softmax.data<T>()),
+                       trans_softmax,
+                       {n, t, d / t},
+                       {0, 2, 1});
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose");
+
+    if (soft_label) {
+      XPUType* trans_labels = RAII_GUARD.alloc_l3_or_gm<XPUType>(len);
+      PADDLE_ENFORCE_XDNN_NOT_NULL(trans_labels);
+      r = xpu::transpose(dev_ctx.x_context(),
+                         reinterpret_cast<const XPUType*>(labels.data<T>()),
+                         trans_labels,
+                         {n, t, d / t},
+                         {0, 2, 1});
+      PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose");
+      r = xpu::soft_softmax_with_cross_entropy_grad(
+          dev_ctx.x_context(),
+          reinterpret_cast<const XPUType*>(loss_grad.data<T>()),
+          trans_labels,
+          trans_softmax,
+          trans_logit,
+          use_softmax,
+          n * d / t,
+          t);
+      PADDLE_ENFORCE_XDNN_SUCCESS(r, "soft_softmax_with_cross_entropy_grad");
+    } else {
+      int* labels_int_ptr_l3 =
+          RAII_GUARD.alloc_l3_or_gm<int>(labels.numel());
+      PADDLE_ENFORCE_XDNN_NOT_NULL(labels_int_ptr_l3);
+
+      r = xpu::cast_v2<int64_t, int32_t>(dev_ctx.x_context(),
+                                         labels.data<int64_t>(),
+                                         labels_int_ptr_l3,
+                                         labels.numel());
+      PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2");
+      r = xpu::hard_softmax_with_cross_entropy_grad(
+          dev_ctx.x_context(),
+          reinterpret_cast<const XPUType*>(loss_grad.data<T>()),
+          labels_int_ptr_l3,
+          trans_softmax,
+          trans_logit,
+          ignore_index,
+          use_softmax,
+          n * d / t,
+          t);
+      PADDLE_ENFORCE_XDNN_SUCCESS(r, "hard_softmax_with_cross_entropy_grad");
+    }
+
+    r
= xpu::transpose( + dev_ctx.x_context(), + trans_logit, + reinterpret_cast(logit_grad->data()), + {n, d / t, t}, + {0, 2, 1}); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose"); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(cross_entropy_with_softmax_grad, + XPU, + ALL_LAYOUT, + phi::CrossEntropyWithSoftmaxGradKernel, + float, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/xpu/cross_entropy_kernel.cc b/paddle/phi/kernels/xpu/cross_entropy_kernel.cc new file mode 100644 index 00000000000..cf58374f1c0 --- /dev/null +++ b/paddle/phi/kernels/xpu/cross_entropy_kernel.cc @@ -0,0 +1,161 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/cross_entropy_kernel.h" + +#include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" + +namespace phi { + +template +void CrossEntropyWithSoftmaxKernel(const Context& dev_ctx, + const DenseTensor& logits, + const DenseTensor& labels, + bool soft_label, + bool use_softmax, + bool numeric_stable_mode, + int ignore_index, + int axis_in, + DenseTensor* softmax, + DenseTensor* loss) { + using XPUType = typename XPUTypeTrait::Type; + PADDLE_ENFORCE_EQ( + logits.place().GetType() == phi::AllocationType::XPU, + true, + errors::PreconditionNotMet("This kernel only runs on XPU.")); + + const int rank = logits.dims().size(); + const int axis = phi::funcs::CanonicalAxis(axis_in, rank); + dev_ctx.template Alloc(softmax); + dev_ctx.template Alloc(loss); + const int n = phi::funcs::SizeToAxis(axis, logits.dims()); + const int d = phi::funcs::SizeFromAxis(axis, logits.dims()); + std::vector logits_dims = phi::vectorize(logits.dims()); + + int t = logits_dims[axis]; + + auto logits_data = reinterpret_cast(logits.data()); + auto softmax_data = reinterpret_cast(softmax->data()); + auto loss_data = reinterpret_cast(loss->data()); + // softmax + int r = XPU_SUCCESS; + xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); + + if (phi::backends::xpu::get_xpu_version(dev_ctx.GetPlace().GetDeviceId()) == + phi::backends::xpu::XPUVersion::XPU2 && + soft_label && axis == rank - 1) { + auto labels_data = reinterpret_cast(labels.data()); + r = xpu::soft_softmax_with_cross_entropy(dev_ctx.x_context(), + logits_data, + labels_data, + softmax_data, + loss_data, + n, + d); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "soft_softmax_with_cross_entropy"); + return; + } + + int len = logits.numel(); + T* clip_logits = RAII_GUARD.alloc_l3_or_gm(len); + PADDLE_ENFORCE_XDNN_NOT_NULL(clip_logits); + XPUType* clip_logits_data = reinterpret_cast(clip_logits); + + float max_val = 1e20; + float min_val = -1e20; + if (std::is_same::value) { + max_val = 65504; + min_val = -65504; + } + + r = xpu::clip_v2(dev_ctx.x_context(), + logits_data, + clip_logits_data, + len, + static_cast(min_val), + static_cast(max_val)); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "clip_v2"); + + r = xpu::softmax( + dev_ctx.x_context(), clip_logits_data, softmax_data, logits_dims, axis); + 
PADDLE_ENFORCE_XDNN_SUCCESS(r, "softmax"); + + // cross_entropy + if (axis != rank - 1) { + XPUType* trans_softmax = RAII_GUARD.alloc_l3_or_gm(n * d); + PADDLE_ENFORCE_XDNN_NOT_NULL(trans_softmax); + + r = xpu::transpose(dev_ctx.x_context(), + softmax_data, + trans_softmax, + {n, t, d / t}, + {0, 2, 1}); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose"); + softmax_data = trans_softmax; + } + + if (soft_label) { + auto labels_data = reinterpret_cast(labels.data()); + if (axis != rank - 1) { + XPUType* trans_label = RAII_GUARD.alloc_l3_or_gm(n * d); + PADDLE_ENFORCE_XDNN_NOT_NULL(trans_label); + r = xpu::transpose(dev_ctx.x_context(), + labels_data, + trans_label, + {n, t, d / t}, + {0, 2, 1}); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose"); + labels_data = trans_label; + } + r = xpu::soft_cross_entropy(dev_ctx.x_context(), + softmax_data, + labels_data, + loss_data, + axis == rank - 1 ? n : n * d / t, + axis == rank - 1 ? d : t); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "soft_cross_entropy"); + } else { + DenseTensor labels_int32; + int* labels_int_ptr_l3 = RAII_GUARD.alloc_l3_or_gm(labels.numel()); + PADDLE_ENFORCE_XDNN_NOT_NULL(labels_int_ptr_l3); + + r = xpu::cast_v2(dev_ctx.x_context(), + labels.data(), + labels_int_ptr_l3, + labels.numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "clip_v2"); + + r = xpu::hard_cross_entropy( + dev_ctx.x_context(), + softmax_data, + labels_int_ptr_l3, + loss_data, + nullptr, + axis == rank - 1 ? n : n * d / t, + axis == rank - 1 ? d : t, + ignore_index); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "hard_cross_entropy"); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(cross_entropy_with_softmax, + XPU, + ALL_LAYOUT, + phi::CrossEntropyWithSoftmaxKernel, + float, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/xpu/top_k_kernel.cc b/paddle/phi/kernels/xpu/top_k_kernel.cc new file mode 100644 index 00000000000..d68ff8df8c0 --- /dev/null +++ b/paddle/phi/kernels/xpu/top_k_kernel.cc @@ -0,0 +1,176 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/top_k_kernel.h" + +#include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void TopkKernel(const Context& dev_ctx, + const DenseTensor& x, + const Scalar& k_scalar, + int axis, + bool largest, + bool sorted, + DenseTensor* out, + DenseTensor* indices) { + const auto& in_dims = x.dims(); + const T* in_data = x.data(); + int64_t* indices_data = dev_ctx.template Alloc(indices); + T* output_data = dev_ctx.template Alloc(out); + + const auto& out_dims = out->dims(); + + PADDLE_ENFORCE_EQ( + sorted, + true, + errors::External( + "XPU API does not support unsorted topk operation currently." + " Operator will be supported in future update.")); + PADDLE_ENFORCE_EQ( + largest, + true, + errors::External( + "XPU API does not support smallest topk operation currently." 
+ " Operator will be supported in future update.")); + + if (axis < 0) axis += in_dims.size(); + + size_t k = k_scalar.to(); + if (axis + 1 == in_dims.size()) { + xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); + int32_t* indices_int_data = + RAII_GUARD.alloc_l3_or_gm(indices->numel()); + + const size_t row = + phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); + const size_t col = in_dims[in_dims.size() - 1]; + int r = xpu::sorted_topk(dev_ctx.x_context(), + in_data, + output_data, + indices_int_data, + row, + col, + k); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "sorted_topk"); + + r = xpu::cast_v2(dev_ctx.x_context(), + (const int32_t*)indices_int_data, + indices_data, + indices->numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2"); + } else { + // do transpose if axis is not the last dim of input + std::vector trans_axes; + for (int i = 0; i < axis; i++) { + trans_axes.emplace_back(i); + } + for (int i = axis + 1; i < in_dims.size(); i++) { + trans_axes.emplace_back(i); + } + trans_axes.emplace_back(axis); + // Get input and output dims for transpose + DDim trans_dims(in_dims); + DDim trans_out_dims(out->dims()); + for (size_t i = 0; i < trans_axes.size(); i++) { + trans_dims[i] = in_dims[trans_axes[i]]; + trans_out_dims[i] = out_dims[trans_axes[i]]; + } + + std::vector x_shape_host(in_dims.size(), 0); + for (int i = 0; i < in_dims.size(); ++i) { + x_shape_host[i] = in_dims[i]; + } + + xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); + T* trans_in_data = RAII_GUARD.alloc_l3_or_gm(x.numel()); + + // Transpose and save interval output to trans_in + int r = xpu::transpose( + dev_ctx.x_context(), in_data, trans_in_data, x_shape_host, trans_axes); + PADDLE_ENFORCE_EQ(r, + xpu::Error_t::SUCCESS, + errors::External("XPU API 1st Transpose kernel" + " returns wrong value[%d %s]!", + r, + XPUAPIErrorMsg[r])); + + T* trans_out_data = RAII_GUARD.alloc_l3_or_gm(out->numel()); + int64_t* trans_idx_data = RAII_GUARD.alloc_l3_or_gm(out->numel()); + int32_t* trans_idx_int32_data = + RAII_GUARD.alloc_l3_or_gm(out->numel()); + const size_t row = + phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); + const size_t col = trans_dims[trans_dims.size() - 1]; + + // Do top k on transposed input + r = xpu::sorted_topk(dev_ctx.x_context(), + trans_in_data, + trans_out_data, + trans_idx_int32_data, + row, + col, + k); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "sorted_topk"); + + r = xpu::cast_v2(dev_ctx.x_context(), + (const int32_t*)trans_idx_int32_data, + trans_idx_data, + indices->numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2"); + // Transpose back to original dims + std::vector trans_back_axes; + for (int i = 0; i < axis; i++) { + trans_axes.emplace_back(i); + } + trans_axes.emplace_back(trans_out_dims.size() - 1); + for (int i = axis; i < trans_out_dims.size() - 1; i++) { + trans_axes.emplace_back(i); + } + + std::vector trans_out_shape_host(trans_back_axes.size(), 0); + for (size_t i = 0; i < trans_back_axes.size(); ++i) { + trans_out_shape_host[i] = trans_out_dims[i]; + } + r = xpu::transpose(dev_ctx.x_context(), + trans_out_data, + output_data, + trans_out_shape_host, + trans_back_axes); + PADDLE_ENFORCE_EQ(r, + xpu::Error_t::SUCCESS, + errors::External("XPU API 2nd Transpose kernel" + " returns wrong value[%d %s]", + r, + XPUAPIErrorMsg[r])); + r = xpu::transpose(dev_ctx.x_context(), + trans_idx_data, + indices_data, + trans_out_shape_host, + trans_back_axes); + PADDLE_ENFORCE_EQ(r, + xpu::Error_t::SUCCESS, + errors::External("XPU API 3rd Transpose kernel" + " returns wrong value[%d %s]", 
+                          r,
+                          XPUAPIErrorMsg[r]));
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(top_k, XPU, ALL_LAYOUT, phi::TopkKernel, float) {}
--
GitLab
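
For context, the migrated kernels are dispatched through the regular Python APIs once this patch is applied. Below is a minimal smoke-test sketch, assuming a PaddlePaddle build with Kunlun/XPU support (so that paddle.device.set_device("xpu") is available); the shapes, the random data, and the use of the softmax_with_cross_entropy functional helper are illustrative assumptions and not part of the patch itself.

    # Minimal smoke-test sketch (not part of the patch): assumes an XPU-enabled
    # PaddlePaddle build so that "xpu" is a valid device string.
    import numpy as np
    import paddle
    import paddle.nn.functional as F

    paddle.device.set_device("xpu")  # route ops to the XPU kernels registered above

    logits = paddle.to_tensor(np.random.rand(4, 10).astype("float32"))
    labels = paddle.to_tensor(np.random.randint(0, 10, size=(4, 1)).astype("int64"))
    logits.stop_gradient = False

    # Exercises cross_entropy_with_softmax (forward) and, via backward(),
    # cross_entropy_with_softmax_grad.
    loss = F.softmax_with_cross_entropy(logits, labels)
    loss.mean().backward()

    # Exercises the top_k kernel on its last-axis, sorted, largest path.
    values, indices = paddle.topk(logits, k=3, axis=-1)
    print(float(loss.mean()), values.shape, indices.shape)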