From b7bbe39c9fac0867e1e129e2958b33fd958d5206 Mon Sep 17 00:00:00 2001
From: Linjie Chen <40840292+linjieccc@users.noreply.github.com>
Date: Fri, 4 Mar 2022 15:40:03 +0800
Subject: [PATCH] [phi] move sigmoid_cross_entropy_with_logits log_loss cumsum
 auc kernel to phi (#39976)

* move sigmoid cross entropy with logits to phi
* fix ci
* move log_loss to phi
* move cumsum to phi
* revert infershape
* fix xpu ci
* move auc to phi
* remove comment
* update sigmoid_cross_entropy_with_logits_op.cu
* update sigmoid_cross_entropy_with_logits_op
* Update log_loss
---
 paddle/fluid/operators/cum_op.h                | 115 ------
 paddle/fluid/operators/cumsum_op.cc            |   7 +-
 paddle/fluid/operators/cumsum_op.cu            | 325 -----------------
 paddle/fluid/operators/cumsum_op_npu.cc        |   2 +-
 paddle/fluid/operators/log_loss_op.cc          |  12 +-
 paddle/fluid/operators/log_loss_op.h           |  74 ----
 paddle/fluid/operators/log_loss_op_npu.cc      |   2 +-
 paddle/fluid/operators/log_loss_op_xpu.cc      |   4 +-
 paddle/fluid/operators/metrics/auc_op.cc       |   3 +-
 paddle/fluid/operators/metrics/auc_op.cu       | 232 ------------
 paddle/fluid/operators/metrics/auc_op.h        | 186 ----------
 .../sigmoid_cross_entropy_with_logits_op.cc    |  14 +-
 .../sigmoid_cross_entropy_with_logits_op.cu    | 264 --------------
 .../sigmoid_cross_entropy_with_logits_op.h     | 114 ------
 ...igmoid_cross_entropy_with_logits_op_npu.cc  |   3 +-
 ...igmoid_cross_entropy_with_logits_op_xpu.cc  |   4 +-
 paddle/phi/kernels/auc_kernel.h                |  36 ++
 paddle/phi/kernels/cpu/auc_kernel.cc           | 190 ++++++++++
 paddle/phi/kernels/cpu/cumsum_kernel.cc        | 143 ++++++++
 .../phi/kernels/cpu/log_loss_grad_kernel.cc    |  22 ++
 paddle/phi/kernels/cpu/log_loss_kernel.cc      |  21 ++
 ...d_cross_entropy_with_logits_grad_kernel.cc  |  70 ++++
 ...igmoid_cross_entropy_with_logits_kernel.cc  |  71 ++++
 paddle/phi/kernels/cumsum_kernel.h             |  30 ++
 paddle/phi/kernels/gpu/auc_kernel.cu           | 258 ++++++++++++++
 paddle/phi/kernels/gpu/cumsum_kernel.cu        | 336 ++++++++++++++++++
 .../phi/kernels/gpu/log_loss_grad_kernel.cu    |  22 ++
 paddle/phi/kernels/gpu/log_loss_kernel.cu      |  21 ++
 .../gpu/sigmoid_cross_entropy_with_logits.h    |  69 ++++
 ...d_cross_entropy_with_logits_grad_kernel.cu  | 126 +++++++
 ...igmoid_cross_entropy_with_logits_kernel.cu  | 128 +++++++
 .../kernels/impl/log_loss_grad_kernel_impl.h   |  43 +++
 .../phi/kernels/impl/log_loss_kernel_impl.h    |  40 +++
 paddle/phi/kernels/log_loss_grad_kernel.h      |  29 ++
 paddle/phi/kernels/log_loss_kernel.h           |  28 ++
 ...id_cross_entropy_with_logits_grad_kernel.h  |  30 ++
 ...sigmoid_cross_entropy_with_logits_kernel.h  |  29 ++
 paddle/phi/ops/compat/log_loss_sig.cc          |  29 ++
 .../sigmoid_cross_entropy_with_logits_sig.cc   |  31 ++
 39 files changed, 1817 insertions(+), 1346 deletions(-)
 delete mode 100644 paddle/fluid/operators/cum_op.h
 delete mode 100644 paddle/fluid/operators/cumsum_op.cu
 delete mode 100644 paddle/fluid/operators/log_loss_op.h
 delete mode 100644 paddle/fluid/operators/metrics/auc_op.cu
 delete mode 100644 paddle/fluid/operators/metrics/auc_op.h
 delete mode 100644 paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu
 delete mode 100644 paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h
 create mode 100644 paddle/phi/kernels/auc_kernel.h
 create mode 100644 paddle/phi/kernels/cpu/auc_kernel.cc
 create mode 100644 paddle/phi/kernels/cpu/cumsum_kernel.cc
 create mode 100644 paddle/phi/kernels/cpu/log_loss_grad_kernel.cc
 create mode 100644 paddle/phi/kernels/cpu/log_loss_kernel.cc
 create mode 100644 paddle/phi/kernels/cpu/sigmoid_cross_entropy_with_logits_grad_kernel.cc
 create mode 100644
paddle/phi/kernels/cpu/sigmoid_cross_entropy_with_logits_kernel.cc create mode 100644 paddle/phi/kernels/cumsum_kernel.h create mode 100644 paddle/phi/kernels/gpu/auc_kernel.cu create mode 100644 paddle/phi/kernels/gpu/cumsum_kernel.cu create mode 100644 paddle/phi/kernels/gpu/log_loss_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/log_loss_kernel.cu create mode 100644 paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits.h create mode 100644 paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu create mode 100644 paddle/phi/kernels/impl/log_loss_grad_kernel_impl.h create mode 100644 paddle/phi/kernels/impl/log_loss_kernel_impl.h create mode 100644 paddle/phi/kernels/log_loss_grad_kernel.h create mode 100644 paddle/phi/kernels/log_loss_kernel.h create mode 100644 paddle/phi/kernels/sigmoid_cross_entropy_with_logits_grad_kernel.h create mode 100644 paddle/phi/kernels/sigmoid_cross_entropy_with_logits_kernel.h create mode 100644 paddle/phi/ops/compat/log_loss_sig.cc create mode 100644 paddle/phi/ops/compat/sigmoid_cross_entropy_with_logits_sig.cc diff --git a/paddle/fluid/operators/cum_op.h b/paddle/fluid/operators/cum_op.h deleted file mode 100644 index ab3860ecafc..00000000000 --- a/paddle/fluid/operators/cum_op.h +++ /dev/null @@ -1,115 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace operators { - -template -class CumKernel : public framework::OpKernel { - public: - using T = typename Functor::ELEMENT_TYPE; - - void Compute(const framework::ExecutionContext& context) const override { - auto& X = GET_DATA_SAFELY(context.Input("X"), "Input", - "X", "Cum"); - - auto& Out = GET_DATA_SAFELY(context.Output("Out"), - "Output", "Out", "Cum"); - int axis = context.Attr("axis"); - bool exclusive = context.Attr("exclusive"); - bool reverse = context.Attr("reverse"); - auto out_dims = Out.dims(); - - PADDLE_ENFORCE_EQ( - axis < out_dims.size() && axis >= (0 - out_dims.size()), true, - platform::errors::OutOfRange( - "Attr(axis) is out of range, It's expected " - "to be in range of [-%d, %d]. 
But received Attr(axis) = %d.", - out_dims.size(), out_dims.size() - 1, axis)); - if (axis < 0) { - axis += out_dims.size(); - } - - Out.template mutable_data(context.GetPlace()); - - int pre = 1; - int post = 1; - int mid = out_dims[axis]; - for (int i = 0; i < axis; ++i) { - pre *= out_dims[i]; - } - for (int i = axis + 1; i < out_dims.size(); ++i) { - post *= out_dims[i]; - } - - auto x = framework::EigenVector::Flatten(X); - auto out = framework::EigenVector::Flatten(Out); - auto* place = - context.template device_context().eigen_device(); - - using IndexT = Eigen::DenseIndex; - if (pre == 1) { - if (post == 1) { - ComputeImp(*place, Eigen::DSizes(mid), x, out, - /* axis= */ 0, reverse, exclusive); - } else { - ComputeImp(*place, Eigen::DSizes(mid, post), x, out, - /* axis= */ 0, reverse, exclusive); - } - } else { - if (post == 1) { - ComputeImp(*place, Eigen::DSizes(pre, mid), x, out, - /* axis= */ 1, reverse, exclusive); - } else { - ComputeImp(*place, Eigen::DSizes(pre, mid, post), x, out, - /* axis= */ 1, reverse, exclusive); - } - } - } - - private: - template - void ComputeImp(Device d, const Dim& dims, X x, Out out, int axis, - bool reverse, bool exclusive) const { - if (!reverse) { - out.reshape(dims).device(d) = Functor()(x.reshape(dims), axis, exclusive); - } else { - std::array rev; - rev.fill(false); - rev[axis] = reverse; - out.reshape(dims).device(d) = - Functor()(x.reshape(dims).reverse(rev), axis, exclusive).reverse(rev); - } - } -}; - -template -struct CumsumFunctor { - using ELEMENT_TYPE = T; - template - const typename X::TensorScanSumOp operator()(X x, int axis, - bool exclusive) const { - return x.cumsum(axis, exclusive); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/cumsum_op.cc b/paddle/fluid/operators/cumsum_op.cc index 9fa355a9246..7c80917a713 100644 --- a/paddle/fluid/operators/cumsum_op.cc +++ b/paddle/fluid/operators/cumsum_op.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/operators/cum_op.h" namespace paddle { namespace operators { @@ -91,11 +91,6 @@ using CPU = paddle::platform::CPUDeviceContext; REGISTER_OPERATOR(cumsum, ops::CumOp, ops::CumsumOpMaker, ops::CumsumGradMaker, ops::CumsumGradMaker); -REGISTER_OP_CPU_KERNEL(cumsum, ops::CumKernel>, - ops::CumKernel>, - ops::CumKernel>, - ops::CumKernel>, - ops::CumKernel>); REGISTER_OP_VERSION(cumsum) .AddCheckpoint( diff --git a/paddle/fluid/operators/cumsum_op.cu b/paddle/fluid/operators/cumsum_op.cu deleted file mode 100644 index 3402f42521f..00000000000 --- a/paddle/fluid/operators/cumsum_op.cu +++ /dev/null @@ -1,325 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include -#include -#ifdef __NVCC__ -#include -#endif -#ifdef __HIPCC__ -#include -namespace cub = hipcub; -#endif -#include "paddle/fluid/operators/cum_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" - -using Tensor = paddle::framework::Tensor; -using LoDTensor = paddle::framework::LoDTensor; - -namespace paddle { -namespace operators { - -template -__device__ void BlockReverse(const T* idata, T* odata, int src_base, - int dst_base, int valid_item) { - __shared__ T sh_mem[BLOCK_SIZE]; - int tx = threadIdx.x; - - int offset = tx; - int in_index = src_base + offset; - if (offset >= valid_item) { - sh_mem[offset] = 0; - } else { - int sh_mem_index = BLOCK_SIZE - offset - 1; - T data = idata[in_index]; - sh_mem[sh_mem_index] = data; - } - - __syncthreads(); - int out_index = dst_base - offset; - if (offset < valid_item) { - int sh_mem_index = BLOCK_SIZE - offset - 1; - odata[out_index] = sh_mem[sh_mem_index]; - } -} - -template -__global__ void MatrixRowReverse(const T* matrix_data, T* reverse_data, - int reverse_size, int outer_size, - int inner_size) { - int bx = blockIdx.x; - int by = blockIdx.y; - int item_per_block = 1024; - - for (int block_offset = 0; block_offset < reverse_size; - block_offset += item_per_block) { - int valid_item = (reverse_size - block_offset > item_per_block) - ? item_per_block - : reverse_size - block_offset; - int src_offset = - bx * reverse_size + block_offset + by * (inner_size * reverse_size); - int dst_offset = bx * reverse_size + by * (inner_size * reverse_size) + - reverse_size - 1 - block_offset; - if (reverse_size < item_per_block) { - valid_item = reverse_size; - } - - BlockReverse(matrix_data, reverse_data, src_offset, dst_offset, - valid_item); - } -} - -template -struct BlockPrefixCallbackOp { - // Running prefix - T running_total; - // Constructor - __device__ BlockPrefixCallbackOp(T running_total) - : running_total(running_total) {} - // Callback operator to be entered by the first warp of threads in the block. - // Thread-0 is responsible for returning a value for seeding the block-wide - // scan. 
- __device__ T operator()(T block_aggregate) { - T old_prefix = running_total; - running_total = old_prefix + block_aggregate; - return old_prefix; - } -}; - -// No bank-conflict transpose -template -__global__ void MatrixTranspose(T* odata, const T* idata, size_t height, - size_t width) { - __shared__ T tile[TILE_DIM][TILE_DIM + 1]; - - int x = blockIdx.x * TILE_DIM + threadIdx.x; - int y = blockIdx.y * TILE_DIM + threadIdx.y; - for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) { - if (x < width && (y + j) < height) { - tile[threadIdx.y + j][threadIdx.x] = idata[(y + j) * width + x]; - } else { - tile[threadIdx.y + j][threadIdx.x] = 0; - } - } - - __syncthreads(); - - x = blockIdx.y * TILE_DIM + threadIdx.x; // transpose block offset - y = blockIdx.x * TILE_DIM + threadIdx.y; - - for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) { - if (x < height && (y + j) < width) { - odata[(y + j) * height + x] = tile[threadIdx.x][threadIdx.y + j]; - } - } -} - -template -__global__ void BlockScanKernel(T* d_out, const T* d_in, int inner_size, - int outer_size, int scan_size, bool exclusive) { - // Specialize BlockLoad, BlockStore, and BlockRadixSort collective types - typedef cub::BlockLoad - BlockLoadT; - typedef cub::BlockStore - BlockStoreT; - typedef cub::BlockScan BlockScanT; - // Allocate type-safe, repurposable shared memory for collectives - __shared__ union { - typename BlockLoadT::TempStorage load; - typename BlockStoreT::TempStorage store; - typename BlockScanT::TempStorage scan; - } temp_storage; - - int bx = blockIdx.x; - int by = blockIdx.y; - - BlockPrefixCallbackOp prefix_op(0); - T block_aggregate = static_cast(0); - - // Obtain this block's segment of consecutive keys (blocked across threads) - int item_per_block = BLOCK_THREADS * ITEMS_PER_THREAD; - for (int block_offset = 0; block_offset < scan_size; - block_offset += BLOCK_THREADS * ITEMS_PER_THREAD) { - int valid_item = (scan_size - block_offset > item_per_block) - ? item_per_block - : (scan_size - block_offset); - if (scan_size < item_per_block) { - valid_item = scan_size; - } - - int offset = bx * scan_size + block_offset + by * (inner_size * scan_size); - - T thread_keys[ITEMS_PER_THREAD]; - BlockLoadT(temp_storage.load) - .Load(d_in + offset, thread_keys, valid_item, 0); - - __syncthreads(); - if (exclusive) { - T init_value = static_cast(0); - BlockScanT(temp_storage.scan) - .ExclusiveScan(thread_keys, thread_keys, cub::Sum(), prefix_op); - } else { - BlockScanT(temp_storage.scan) - .InclusiveScan(thread_keys, thread_keys, cub::Sum(), prefix_op); - } - __syncthreads(); - - BlockStoreT(temp_storage.store) - .Store(d_out + offset, thread_keys, valid_item); - } -} - -template -class CumCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); - auto* out = context.Output("Out"); - - int axis = context.Attr("axis"); - bool exclusive = context.Attr("exclusive"); - bool reverse = context.Attr("reverse"); - auto out_dims = out->dims(); - auto size = in->numel(); - - PADDLE_ENFORCE_EQ( - axis < out_dims.size() && axis >= (0 - out_dims.size()), true, - platform::errors::OutOfRange( - "Attr(axis) is out of range, It's expected " - "to be in range of [-%d, %d]. 
But received Attr(axis) = %d.", - out_dims.size(), out_dims.size() - 1, axis)); - if (axis < 0) { - axis += out_dims.size(); - } - - T* out_data = out->mutable_data(context.GetPlace()); - const T* in_data = in->data(); - - // Use thrust for parallel acceleration when the input size is equal to the - // length of the ‘axis’ dimension. - if (size == out_dims[axis]) { - if (reverse) { - thrust::device_ptr dev_ptr = - thrust::device_pointer_cast(in_data); - thrust::device_vector vec(dev_ptr, dev_ptr + size); - if (exclusive) { - thrust::exclusive_scan(thrust::device, vec.rbegin(), vec.rend(), - out_data); - } else { - thrust::inclusive_scan(thrust::device, vec.rbegin(), vec.rend(), - out_data); - } - thrust::reverse(thrust::device, out_data, out_data + size); - } else { - if (exclusive) { - thrust::exclusive_scan(thrust::device, in_data, in_data + size, - out_data); - } else { - thrust::inclusive_scan(thrust::device, in_data, in_data + size, - out_data); - } - } - return; - } - - size_t height = 1; - size_t width = 1; - for (size_t i = 0; i <= axis; i++) { - height *= out_dims[i]; - } - - for (size_t i = axis + 1; i < out_dims.size(); i++) { - width *= out_dims[i]; - } - int scan_size = out_dims[axis]; - bool transpose = (axis != out_dims.size() - 1); - - int tile_size = 32; - dim3 blocks(32, 8); - dim3 transpose_grids((width + tile_size - 1) / tile_size, - (height + tile_size - 1) / tile_size); - auto& dev_ctx = context.template device_context(); - framework::Tensor tmp; - tmp.Resize(out_dims); - auto* tmp_data = tmp.mutable_data(context.GetPlace()); - T* next_in_data = out_data; - T* next_out_data = tmp_data; - if (transpose) { - MatrixTranspose<<>>( - out_data, in_data, height, width); - next_in_data = out_data; - next_out_data = tmp_data; - } - auto swap_ptr = [](T*& ptr1, T*& ptr2) { - T* tmp = ptr2; - ptr2 = ptr1; - ptr1 = tmp; - }; - int outer_size = height / scan_size; - int inner_size = width; - // Consider the size of shared memory, here block size is 128 - dim3 scan_grid(outer_size, inner_size); - dim3 reverse_grid = scan_grid; - if (reverse) { - if (transpose) { - reverse_grid.x = scan_grid.y; - reverse_grid.y = scan_grid.x; - MatrixRowReverse<<>>( - next_in_data, next_out_data, scan_size, outer_size, inner_size); - if (!transpose) next_in_data = tmp_data; - swap_ptr(next_in_data, next_out_data); - } else { - MatrixRowReverse<<>>( - in_data, out_data, scan_size, outer_size, inner_size); - } - } - if (!transpose && !reverse) { - BlockScanKernel<<>>( - out_data, in_data, outer_size, inner_size, scan_size, exclusive); - - } else { - BlockScanKernel<<>>( - next_out_data, next_in_data, outer_size, inner_size, scan_size, - exclusive); - } - swap_ptr(next_in_data, next_out_data); - if (reverse) { - MatrixRowReverse<<>>( - next_in_data, next_out_data, scan_size, outer_size, inner_size); - swap_ptr(next_in_data, next_out_data); - } - if (transpose) { - transpose_grids.x = (height + tile_size - 1) / tile_size; - transpose_grids.y = (width + tile_size - 1) / tile_size; - MatrixTranspose<<>>( - next_out_data, next_in_data, width, height); - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - cumsum, ops::CumCUDAKernel, - ops::CumCUDAKernel, - ops::CumCUDAKernel, - ops::CumCUDAKernel, - ops::CumCUDAKernel); diff --git a/paddle/fluid/operators/cumsum_op_npu.cc b/paddle/fluid/operators/cumsum_op_npu.cc index 38bf53ca0aa..d197e4362e9 100644 --- a/paddle/fluid/operators/cumsum_op_npu.cc +++ 
b/paddle/fluid/operators/cumsum_op_npu.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/cum_op.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/log_loss_op.cc b/paddle/fluid/operators/log_loss_op.cc index df4d0ebbccd..2e596ff3e62 100644 --- a/paddle/fluid/operators/log_loss_op.cc +++ b/paddle/fluid/operators/log_loss_op.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/log_loss_op.h" #include +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { @@ -149,13 +149,3 @@ REGISTER_OPERATOR(log_loss, ops::LogLossOp, ops::LogLossOpMaker, ops::LogLossGradMaker, ops::LogLossGradMaker); REGISTER_OPERATOR(log_loss_grad, ops::LogLossGradOp); -REGISTER_OP_CPU_KERNEL( - log_loss, ops::LogLossKernel); -REGISTER_OP_CPU_KERNEL( - log_loss_grad, - ops::LogLossGradKernel); -REGISTER_OP_CUDA_KERNEL( - log_loss, ops::LogLossKernel); -REGISTER_OP_CUDA_KERNEL( - log_loss_grad, - ops::LogLossGradKernel); diff --git a/paddle/fluid/operators/log_loss_op.h b/paddle/fluid/operators/log_loss_op.h deleted file mode 100644 index e7985ab810b..00000000000 --- a/paddle/fluid/operators/log_loss_op.h +++ /dev/null @@ -1,74 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -template -using EigenVector = framework::EigenVector; - -template -class LogLossKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* loss_out = ctx.Output("Loss"); - - loss_out->mutable_data(ctx.GetPlace()); - - auto epsilon = static_cast(ctx.Attr("epsilon")); - - auto prediction = EigenVector::Flatten(*ctx.Input("Predicted")); - auto label = EigenVector::Flatten(*ctx.Input("Labels")); - - auto loss = EigenVector::Flatten(*loss_out); - auto& place = *ctx.template device_context().eigen_device(); - - EigenLogLoss, T>::Eval( - place, loss, prediction, label, epsilon); - } -}; - -template -class LogLossGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto epsilon = static_cast(ctx.Attr("epsilon")); - - auto prediction = EigenVector::Flatten(*ctx.Input("Predicted")); - auto label = EigenVector::Flatten(*ctx.Input("Labels")); - - auto* dloss = ctx.Input(framework::GradVarName("Loss")); - auto* dpred = ctx.Output(framework::GradVarName("Predicted")); - - auto dl = EigenVector::Flatten(*dloss); - auto& place = *ctx.template device_context().eigen_device(); - - if (dpred) { - dpred->mutable_data(ctx.GetPlace()); - auto dx = framework::EigenVector::Flatten(*dpred); - EigenLogLossGrad, T>::Eval( - place, dx, dl, prediction, label, epsilon); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/log_loss_op_npu.cc b/paddle/fluid/operators/log_loss_op_npu.cc index 9775910bba5..f103a69707a 100644 --- a/paddle/fluid/operators/log_loss_op_npu.cc +++ b/paddle/fluid/operators/log_loss_op_npu.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/log_loss_op.h" #include +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/log_loss_op_xpu.cc b/paddle/fluid/operators/log_loss_op_xpu.cc index b2e68e9870d..aa5fdd86745 100644 --- a/paddle/fluid/operators/log_loss_op_xpu.cc +++ b/paddle/fluid/operators/log_loss_op_xpu.cc @@ -10,11 +10,13 @@ See the License for the specific language governing permissions and limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/log_loss_op.h" #include +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; + template class LogLossXPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/metrics/auc_op.cc b/paddle/fluid/operators/metrics/auc_op.cc index 2a3a0fa5d1f..54ecba08a82 100644 --- a/paddle/fluid/operators/metrics/auc_op.cc +++ b/paddle/fluid/operators/metrics/auc_op.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/metrics/auc_op.h" +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { @@ -146,4 +146,3 @@ There are two types of possible curves: namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(auc, ops::AucOp, ops::AucOpMaker); -REGISTER_OP_CPU_KERNEL(auc, ops::AucKernel); diff --git a/paddle/fluid/operators/metrics/auc_op.cu b/paddle/fluid/operators/metrics/auc_op.cu deleted file mode 100644 index 1cb7eba8775..00000000000 --- a/paddle/fluid/operators/metrics/auc_op.cu +++ /dev/null @@ -1,232 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/metrics/auc_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { -using platform::PADDLE_CUDA_NUM_THREADS; -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -__global__ void ClearObsoleteDataKernel(int64_t *pos, int64_t *neg, - const int bucket_length, - const int slide_steps) { - int cur_step_index = - static_cast(pos[(slide_steps + 1) * bucket_length]) % slide_steps; - int cur_step_begin = cur_step_index * bucket_length; - int sum_step_begin = slide_steps * bucket_length; - CUDA_KERNEL_LOOP(i, bucket_length) { - pos[sum_step_begin + i] -= pos[cur_step_begin + i]; - neg[sum_step_begin + i] -= neg[cur_step_begin + i]; - pos[cur_step_begin + i] = neg[cur_step_begin + i] = 0; - } -} - -__global__ void UpdateSumDataKernel(int64_t *pos, int64_t *neg, - const int bucket_length, - const int slide_steps) { - int cur_step_index = - static_cast(pos[(slide_steps + 1) * bucket_length]) % slide_steps; - int cur_step_begin = cur_step_index * bucket_length; - int sum_step_begin = slide_steps * bucket_length; - CUDA_KERNEL_LOOP(i, bucket_length) { - pos[sum_step_begin + i] += pos[cur_step_begin + i]; - neg[sum_step_begin + i] += neg[cur_step_begin + i]; - } -} - -template -__global__ void AddDataKernel(const int64_t *label_data, const T *pred_data, - const int inference_width, - const int num_thresholds, int64_t *pos, - int64_t *neg, const int numel, - const int slide_steps) { - int cur_step_begin = 0; - if (slide_steps > 0) { - int cur_step_index = - static_cast(pos[(slide_steps + 1) * (1 + num_thresholds)]) % - slide_steps; - cur_step_begin = cur_step_index * (1 + num_thresholds); - } - CUDA_KERNEL_LOOP(i, numel) { - auto predict_data = pred_data[i * inference_width + (inference_width - 1)]; - PADDLE_ENFORCE(predict_data <= 1, "The predict data must less or equal 1."); - PADDLE_ENFORCE(predict_data >= 0, - "The predict data must gather or equal 0."); - uint32_t binIdx = static_cast(predict_data * num_thresholds); - if (label_data[i]) { - paddle::platform::CudaAtomicAdd(pos + cur_step_begin + binIdx, 1); - } else { - paddle::platform::CudaAtomicAdd(neg + cur_step_begin + binIdx, 1); - } - } -} -__global__ void 
CalcAucKernel(int64_t *stat_pos, int64_t *stat_neg, - int num_thresholds, double *auc, - bool need_add_batch_num) { - *auc = 0.0f; - double totPos = 0.0; - double totNeg = 0.0; - double totPosPrev = 0.0; - double totNegPrev = 0.0; - - int idx = num_thresholds; - - while (idx >= 0) { - totPosPrev = totPos; - totNegPrev = totNeg; - totPos += stat_pos[idx]; - totNeg += stat_neg[idx]; - *auc += (totNeg - totNegPrev) * (totPos + totPosPrev) / 2.0; - --idx; - } - - if (totPos > 0.0 && totNeg > 0.0) { - *auc = *auc / totPos / totNeg; - } - if (need_add_batch_num) { - stat_pos[num_thresholds + 1] += 1; - stat_neg[num_thresholds + 1] += 1; - } -} - -template -class AucCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *predict = ctx.Input("Predict"); - auto *label = ctx.Input("Label"); - - int num_thresholds = ctx.Attr("num_thresholds"); - int slide_steps = ctx.Attr("slide_steps"); - - // Only use output var for now, make sure it's persistable and - // not cleaned up for each batch. - auto *auc_tensor = ctx.Output("AUC"); - auto *stat_pos = ctx.Output("StatPosOut"); - auto *stat_neg = ctx.Output("StatNegOut"); - - auto *origin_stat_pos = stat_pos->mutable_data(ctx.GetPlace()); - auto *origin_stat_neg = stat_neg->mutable_data(ctx.GetPlace()); - auto *auc_value = auc_tensor->mutable_data(ctx.GetPlace()); - - auto *stat_pos_in_tensor = ctx.Input("StatPos"); - auto *pos_in_data = stat_pos_in_tensor->data(); - auto *stat_neg_in_tensor = ctx.Input("StatNeg"); - auto *neg_in_data = stat_neg_in_tensor->data(); -#ifdef PADDLE_WITH_CUDA - if (stat_pos_in_tensor != stat_pos) { - cudaMemcpy(origin_stat_pos, pos_in_data, - ((1 + slide_steps) * (num_thresholds + 1) + - (slide_steps > 0 ? 1 : 0)) * - sizeof(int64_t), - cudaMemcpyDeviceToDevice); - } - if (stat_neg_in_tensor != stat_neg) { - cudaMemcpy(origin_stat_neg, neg_in_data, - ((1 + slide_steps) * (num_thresholds + 1) + - (slide_steps > 0 ? 1 : 0)) * - sizeof(int64_t), - cudaMemcpyDeviceToDevice); - } -#else - if (stat_pos_in_tensor != stat_pos) { - hipMemcpy(origin_stat_pos, pos_in_data, - ((1 + slide_steps) * (num_thresholds + 1) + - (slide_steps > 0 ? 1 : 0)) * - sizeof(int64_t), - hipMemcpyDeviceToDevice); - } - if (stat_neg_in_tensor != stat_neg) { - hipMemcpy(origin_stat_neg, neg_in_data, - ((1 + slide_steps) * (num_thresholds + 1) + - (slide_steps > 0 ? 1 : 0)) * - sizeof(int64_t), - hipMemcpyDeviceToDevice); - } -#endif - - statAuc(ctx, label, predict, num_thresholds, slide_steps, origin_stat_pos, - origin_stat_neg); - int sum_offset = slide_steps * (num_thresholds + 1); - auto stream = - ctx.template device_context().stream(); - CalcAucKernel<<<1, 1, 0, stream>>>( - origin_stat_pos + sum_offset, origin_stat_neg + sum_offset, - num_thresholds, auc_value, slide_steps > 0); - } - - private: - inline static double trapezoidArea(double X1, double X2, double Y1, - double Y2) { - return (X1 > X2 ? 
(X1 - X2) : (X2 - X1)) * (Y1 + Y2) / 2.0; - } - - inline static void statAuc(const framework::ExecutionContext &ctx, - const framework::Tensor *label, - const framework::Tensor *predict, - const int num_thresholds, const int slide_steps, - int64_t *origin_stat_pos, - int64_t *origin_stat_neg) { - size_t batch_size = predict->dims()[0]; - size_t inference_width = predict->dims()[1]; - const T *inference_data = predict->data(); - const auto *label_data = label->data(); - const int bucket_length = num_thresholds + 1; - auto stream = - ctx.template device_context().stream(); - if (slide_steps == 0) { - AddDataKernel<<<(batch_size + PADDLE_CUDA_NUM_THREADS - 1) / - PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - label_data, inference_data, inference_width, num_thresholds, - origin_stat_pos, origin_stat_neg, batch_size, slide_steps); - return; - } - // the last number of origin_stat_pos store the index should be used in - // current step - int cur_step_index = - static_cast(origin_stat_pos[(slide_steps + 1) * bucket_length]) % - slide_steps; - int cur_step_begin = cur_step_index * bucket_length; - int sum_step_begin = slide_steps * bucket_length; - - ClearObsoleteDataKernel<<<(bucket_length + PADDLE_CUDA_NUM_THREADS - 1) / - PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - origin_stat_pos, origin_stat_neg, bucket_length, slide_steps); - - AddDataKernel<<<(batch_size + PADDLE_CUDA_NUM_THREADS - 1) / - PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - label_data, inference_data, inference_width, num_thresholds, - origin_stat_pos, origin_stat_neg, batch_size, slide_steps); - UpdateSumDataKernel<<<(bucket_length + PADDLE_CUDA_NUM_THREADS - 1) / - PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - origin_stat_pos, origin_stat_neg, bucket_length, slide_steps); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(auc, - ops::AucCUDAKernel); diff --git a/paddle/fluid/operators/metrics/auc_op.h b/paddle/fluid/operators/metrics/auc_op.h deleted file mode 100644 index 10403472c69..00000000000 --- a/paddle/fluid/operators/metrics/auc_op.h +++ /dev/null @@ -1,186 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class AucKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *predict = ctx.Input("Predict"); - auto *label = ctx.Input("Label"); - - int num_thresholds = ctx.Attr("num_thresholds"); - int slide_steps = ctx.Attr("slide_steps"); - - // Only use output var for now, make sure it's persistable and - // not cleaned up for each batch. 
- auto *auc_tensor = ctx.Output("AUC"); - auto *stat_pos = ctx.Output("StatPosOut"); - auto *stat_neg = ctx.Output("StatNegOut"); - - auto *origin_stat_pos = stat_pos->mutable_data(ctx.GetPlace()); - auto *origin_stat_neg = stat_neg->mutable_data(ctx.GetPlace()); - auto *auc_value = auc_tensor->mutable_data(ctx.GetPlace()); - - // Just for pass UT, since UT's input & output connot be set same var - auto *stat_pos_in_tensor = ctx.Input("StatPos"); - auto *pos_in_data = stat_pos_in_tensor->data(); - auto *stat_neg_in_tensor = ctx.Input("StatNeg"); - auto *neg_in_data = stat_neg_in_tensor->data(); - if (stat_pos_in_tensor != stat_pos) { - memcpy(origin_stat_pos, pos_in_data, - ((1 + slide_steps) * (num_thresholds + 1) + - (slide_steps > 0 ? 1 : 0)) * - sizeof(int64_t)); - } - if (stat_neg_in_tensor != stat_neg) { - memcpy(origin_stat_neg, neg_in_data, - ((1 + slide_steps) * (num_thresholds + 1) + - (slide_steps > 0 ? 1 : 0)) * - sizeof(int64_t)); - } - statAuc(label, predict, num_thresholds, slide_steps, origin_stat_pos, - origin_stat_neg); - - int sum_offset = slide_steps * (num_thresholds + 1); - calcAuc(origin_stat_pos + sum_offset, origin_stat_neg + sum_offset, - num_thresholds, auc_value); - if (slide_steps) { - origin_stat_pos[(slide_steps + 1) * (num_thresholds + 1)] += 1; - origin_stat_neg[(slide_steps + 1) * (num_thresholds + 1)] += 1; - } - } - - private: - inline static double trapezoidArea(double X1, double X2, double Y1, - double Y2) { - return (X1 > X2 ? (X1 - X2) : (X2 - X1)) * (Y1 + Y2) / 2.0; - } - - inline static void statAuc(const framework::Tensor *label, - const framework::Tensor *predict, - const int num_thresholds, const int slide_steps, - int64_t *origin_stat_pos, - int64_t *origin_stat_neg) { - size_t batch_size = predict->dims()[0]; - size_t inference_width = predict->dims()[1]; - const T *inference_data = predict->data(); - const auto *label_data = label->data(); - const int bucket_length = num_thresholds + 1; - if (slide_steps == 0) { - for (size_t i = 0; i < batch_size; i++) { - // if predict_data[i] has dim of 2, then predict_data[i][1] is pos prob - // if predict_data[i] has dim of 1, then predict_data[i][0] is pos prob - auto predict_data = - inference_data[i * inference_width + (inference_width - 1)]; - PADDLE_ENFORCE_LE(predict_data, 1, - platform::errors::PreconditionNotMet( - "The predict data must less or equal 1.")); - PADDLE_ENFORCE_GE(predict_data, 0, - platform::errors::PreconditionNotMet( - "The predict data must gather or equal 0.")); - - uint32_t binIdx = static_cast(predict_data * num_thresholds); - if (label_data[i] > 0) { - origin_stat_pos[binIdx] += 1; - } else if (label_data[i] == 0) { - origin_stat_neg[binIdx] += 1; - } - } - return; - } - // the last number of origin_stat_pos store the index should be used in - // current step - int cur_step_index = - static_cast(origin_stat_pos[(slide_steps + 1) * bucket_length]) % - slide_steps; - int cur_step_begin = cur_step_index * bucket_length; - int sum_step_begin = slide_steps * bucket_length; - for (int i = 0; i < bucket_length; ++i) { - origin_stat_pos[sum_step_begin + i] -= - origin_stat_pos[cur_step_begin + i]; - origin_stat_neg[sum_step_begin + i] -= - origin_stat_neg[cur_step_begin + i]; - } - - std::memset(origin_stat_pos + cur_step_begin, 0, - bucket_length * sizeof(int64_t)); - std::memset(origin_stat_neg + cur_step_begin, 0, - bucket_length * sizeof(int64_t)); - - for (size_t i = 0; i < batch_size; i++) { - // if predict_data[i] has dim of 2, then predict_data[i][1] is pos prob - // if 
predict_data[i] has dim of 1, then predict_data[i][0] is pos prob - auto predict_data = - inference_data[i * inference_width + (inference_width - 1)]; - PADDLE_ENFORCE_LE(predict_data, 1, - platform::errors::PreconditionNotMet( - "The predict data must less or equal 1.")); - PADDLE_ENFORCE_GE(predict_data, 0, - platform::errors::PreconditionNotMet( - "The predict data must gather or equal 0.")); - - uint32_t binIdx = static_cast(predict_data * num_thresholds); - if (label_data[i] > 0) { - origin_stat_pos[cur_step_begin + binIdx] += 1; - } else if (label_data[i] == 0) { - origin_stat_neg[cur_step_begin + binIdx] += 1; - } - } - for (int i = 0; i < bucket_length; ++i) { - origin_stat_pos[sum_step_begin + i] += - origin_stat_pos[cur_step_begin + i]; - origin_stat_neg[sum_step_begin + i] += - origin_stat_neg[cur_step_begin + i]; - } - } - - inline static void calcAuc(const int64_t *stat_pos, const int64_t *stat_neg, - int num_thresholds, double *auc) { - *auc = 0.0f; - - double totPos = 0.0; - double totNeg = 0.0; - double totPosPrev = 0.0; - double totNegPrev = 0.0; - - int idx = num_thresholds; - - while (idx >= 0) { - totPosPrev = totPos; - totNegPrev = totNeg; - totPos += stat_pos[idx]; - totNeg += stat_neg[idx]; - *auc += trapezoidArea(totNeg, totNegPrev, totPos, totPosPrev); - --idx; - } - - if (totPos > 0.0 && totNeg > 0.0) { - *auc = *auc / totPos / totNeg; - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc index a4e80343903..8e502fc04db 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc @@ -12,15 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" #include #include #include +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { using framework::Tensor; +const int kIgnoreIndex = -100; class SigmoidCrossEntropyWithLogitsOp : public framework::OperatorWithKernel { public: @@ -209,14 +210,3 @@ REGISTER_OPERATOR( REGISTER_OPERATOR(sigmoid_cross_entropy_with_logits_grad, ops::SigmoidCrossEntropyWithLogitsGradOp, ops::SigmoidCrossEntropyWithLogitsGradInplaceInferer); -REGISTER_OP_CPU_KERNEL( - sigmoid_cross_entropy_with_logits, - ops::SigmoidCrossEntropyWithLogitsKernel, - ops::SigmoidCrossEntropyWithLogitsKernel); -REGISTER_OP_CPU_KERNEL(sigmoid_cross_entropy_with_logits_grad, - ops::SigmoidCrossEntropyWithLogitsGradKernel< - paddle::platform::CPUDeviceContext, float>, - ops::SigmoidCrossEntropyWithLogitsGradKernel< - paddle::platform::CPUDeviceContext, double>); diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu deleted file mode 100644 index 18402d908c4..00000000000 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu +++ /dev/null @@ -1,264 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#ifdef __NVCC__ -#include "cub/cub.cuh" -#endif -#ifdef __HIPCC__ -#include -namespace cub = hipcub; -#endif -#include "paddle/fluid/memory/malloc.h" -#include "paddle/fluid/operators/math.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/phi/core/hostdevice.h" -#include "paddle/phi/kernels/funcs/elementwise_base.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -#ifdef __HIPCC__ -static constexpr int kNumCUDAThreads = 256; -#else -static constexpr int kNumCUDAThreads = 512; -#endif -static constexpr int kNumMaxinumNumBlocks = 4096; - -static inline int NumBlocks(const int N) { - return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, - kNumMaxinumNumBlocks); -} - -template -struct NonzeroFunctor { - HOSTDEVICE explicit inline NonzeroFunctor() {} - HOSTDEVICE inline T operator()(const T x) const { - return static_cast(static_cast(x) != 0); - } -}; - -template -struct SigmoidFwdFunctor { - T ignore_index_; - T eps = static_cast(1e-5); - - HOSTDEVICE inline SigmoidFwdFunctor(const T ignore_index) - : ignore_index_(ignore_index) {} - - HOSTDEVICE inline phi::Array operator()(const T x, const T label) { - T counts; - T out_data; - - T diff = label - static_cast(ignore_index_); - if ((diff > -eps) && (diff < eps)) { - out_data = static_cast(0.); - counts = 0; - } else { - T term1 = (x > 0) ? 
x : 0; - T term2 = x * label; - T term3 = real_log(static_cast(1) + real_exp(static_cast(-abs(x)))); - - out_data = term1 - term2 + term3; - counts = 1; - } - phi::Array outs; - - outs[0] = out_data; - outs[1] = counts; - return outs; - } -}; - -template -struct SigmoidBwdFunctor { - T ignore_index_; - T eps = static_cast(1e-5); - - HOSTDEVICE inline SigmoidBwdFunctor(const T ignore_index) - : ignore_index_(ignore_index) {} - - HOSTDEVICE inline phi::Array operator()(const T x, const T label, - const T dout) { - T counts; - T dx_data; - - T diff = label - static_cast(ignore_index_); - if ((diff > -eps) && (diff < eps)) { - dx_data = static_cast(0.); - counts = 0; - } else { - T simoid_x = static_cast(1) / (static_cast(1) + real_exp(-x)); - T diff = simoid_x - label; - dx_data = dout * diff; - counts = 1; - } - phi::Array outs; - - outs[0] = dx_data; - outs[1] = counts; - return outs; - } -}; - -template -struct DivFunctor { - const T norm_; - HOSTDEVICE inline DivFunctor(const T norm) : norm_(norm) {} - - HOSTDEVICE inline T operator()(T loss) { - loss /= norm_; - return loss; - } -}; - -// Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X))) -template -class GPUSigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - const Tensor *X = context.Input("X"); - const Tensor *Labels = context.Input("Label"); - Tensor *Out = context.Output("Out"); - int ignore_index = context.Attr("ignore_index"); - auto out_data = Out->mutable_data(context.GetPlace()); - - auto &dev_ctx = context.cuda_device_context(); - bool normalize = context.Attr("normalize"); - - // Temporary memory - Tensor *counts_tensor = new Tensor(); - counts_tensor->mutable_data(context.GetPlace(), - Labels->numel() * sizeof(T)); - counts_tensor->Resize(Out->dims()); - int limit = Out->numel(); - int blocks = NumBlocks(limit); - int threads = kNumCUDAThreads; - std::vector ins = {X, Labels}; - std::vector outs = {Out, counts_tensor}; - auto functor = SigmoidFwdFunctor(ignore_index); - constexpr int Size = 2; - phi::funcs::ElementwiseKernel(dev_ctx, ins, - &outs, functor); - if (normalize) { - T *counts = counts_tensor->mutable_data(context.GetPlace()); - Tensor *norm_tensor = new Tensor(); - norm_tensor->mutable_data(context.GetPlace(), sizeof(T)); - auto dims = phi::vectorize(counts_tensor->dims()); - std::vector reduce_dim = {}; - for (int i = 0; i < dims.size(); i++) { - reduce_dim.push_back(i); - } - - TensorReduceImpl>( - context.cuda_device_context(), *counts_tensor, norm_tensor, - NonzeroFunctor(), reduce_dim, dev_ctx.stream()); - T *norm = norm_tensor->mutable_data(context.GetPlace()); - auto norm_cpu_mem = memory::Alloc(platform::CPUPlace(), sizeof(T)); - T *norm_cpu_ptr = reinterpret_cast(norm_cpu_mem->ptr()); - memory::Copy(platform::CPUPlace(), norm_cpu_ptr, dev_ctx.GetPlace(), norm, - sizeof(T), dev_ctx.stream()); - auto eps = static_cast(1e-5); - *norm_cpu_ptr = *norm_cpu_ptr > eps ? 
*norm_cpu_ptr : eps; - - std::vector div_ins = {Out}; - std::vector div_outs = {Out}; - auto div_functor = DivFunctor(*norm_cpu_ptr); - phi::funcs::ElementwiseKernel(dev_ctx, div_ins, &div_outs, - div_functor); - - delete norm_tensor; - delete counts_tensor; - } - } -}; - -// dX = sigmoid(X) - labels -template -class GPUSigmoidCrossEntropyWithLogitsGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - const Tensor *X = context.Input("X"); - const Tensor *Labels = context.Input("Label"); - const Tensor *dOut = context.Input(framework::GradVarName("Out")); - Tensor *dX = context.Output(framework::GradVarName("X")); - auto dx_data = dX->mutable_data(context.GetPlace()); - - int ignore_index = context.Attr("ignore_index"); - - auto &dev_ctx = context.cuda_device_context(); - // Temporary memory - Tensor *counts_tensor = new Tensor(); - counts_tensor->mutable_data(context.GetPlace(), - Labels->numel() * sizeof(T)); - counts_tensor->Resize(dX->dims()); - - int limit = dX->numel(); - int blocks = NumBlocks(limit); - int threads = kNumCUDAThreads; - std::vector ins = {X, Labels, dOut}; - std::vector outs = {dX, counts_tensor}; - auto functor = SigmoidBwdFunctor(ignore_index); - constexpr int Size = 2; - phi::funcs::ElementwiseKernel(dev_ctx, ins, - &outs, functor); - bool normalize = context.Attr("normalize"); - if (normalize) { - T *counts = counts_tensor->mutable_data(context.GetPlace()); - Tensor *norm_tensor = new Tensor(); - norm_tensor->mutable_data(context.GetPlace(), sizeof(T)); - auto dims = phi::vectorize(counts_tensor->dims()); - std::vector reduce_dim = {}; - for (int i = 0; i < dims.size(); i++) { - reduce_dim.push_back(i); - } - - TensorReduceImpl>( - context.cuda_device_context(), *counts_tensor, norm_tensor, - NonzeroFunctor(), reduce_dim, dev_ctx.stream()); - T *norm = norm_tensor->mutable_data(context.GetPlace()); - auto norm_cpu_mem = memory::Alloc(platform::CPUPlace(), sizeof(T)); - T *norm_cpu_ptr = reinterpret_cast(norm_cpu_mem->ptr()); - memory::Copy(platform::CPUPlace(), norm_cpu_ptr, dev_ctx.GetPlace(), norm, - sizeof(T), dev_ctx.stream()); - auto eps = static_cast(1e-5); - *norm_cpu_ptr = *norm_cpu_ptr > eps ? *norm_cpu_ptr : eps; - - std::vector div_ins = {dX}; - std::vector div_outs = {dX}; - auto div_functor = DivFunctor(*norm_cpu_ptr); - phi::funcs::ElementwiseKernel(dev_ctx, div_ins, &div_outs, - div_functor); - delete norm_tensor; - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(sigmoid_cross_entropy_with_logits, - ops::GPUSigmoidCrossEntropyWithLogitsKernel< - paddle::platform::CUDADeviceContext, float>, - ops::GPUSigmoidCrossEntropyWithLogitsKernel< - paddle::platform::CUDADeviceContext, double>); -REGISTER_OP_CUDA_KERNEL(sigmoid_cross_entropy_with_logits_grad, - ops::GPUSigmoidCrossEntropyWithLogitsGradKernel< - paddle::platform::CUDADeviceContext, float>, - ops::GPUSigmoidCrossEntropyWithLogitsGradKernel< - paddle::platform::CUDADeviceContext, double>); diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h deleted file mode 100644 index d2ced490cef..00000000000 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h +++ /dev/null @@ -1,114 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -const int kIgnoreIndex = -100; - -// Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X))) -template -class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - const Tensor *X = context.Input("X"); - const Tensor *Labels = context.Input("Label"); - Tensor *Out = context.Output("Out"); - int ignore_index = context.Attr("ignore_index"); - auto out_data = Out->mutable_data(context.GetPlace()); - int limit = Out->numel(); - auto x_data = X->data(); - auto label_data = Labels->data(); - for (int idx = 0; idx < limit; ++idx) { - T x = x_data[idx]; - T label = label_data[idx]; - if (static_cast(label) == ignore_index) { - out_data[idx] = static_cast(0.); - } else { - T term1 = (x > 0) ? x : 0; - T term2 = x * label; - T term3 = std::log(static_cast(1) + std::exp(-std::abs(x))); - out_data[idx] = term1 - term2 + term3; - } - } - bool normalize = context.Attr("normalize"); - if (normalize) { - int norm = 0; - T eps = static_cast(1e-6); - for (int idx = 0; idx < limit; ++idx) { - T diff = label_data[idx] - static_cast(ignore_index); - if ((diff < -eps) || (diff > eps)) { - norm += 1; - } - } - eps = static_cast(1e-5); - norm = norm > eps ? norm : eps; - std::for_each(out_data, out_data + limit, [norm](T &v) { v = v / norm; }); - } - } -}; - -// dX = sigmoid(X) - labels -template -class SigmoidCrossEntropyWithLogitsGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - const Tensor *X = context.Input("X"); - const Tensor *Labels = context.Input("Label"); - const Tensor *dOut = context.Input(framework::GradVarName("Out")); - Tensor *dX = context.Output(framework::GradVarName("X")); - auto dx_data = dX->mutable_data(context.GetPlace()); - - int ignore_index = context.Attr("ignore_index"); - int limit = dX->numel(); - auto x_data = X->data(); - auto label_data = Labels->data(); - auto dout_data = dOut->data(); - for (int idx = 0; idx < limit; ++idx) { - T x = x_data[idx]; - T label = label_data[idx]; - T dout = dout_data[idx]; - if (static_cast(label) == ignore_index) { - dx_data[idx] = static_cast(0.); - } else { - T simoid_x = static_cast(1) / (static_cast(1) + std::exp(-x)); - T diff = simoid_x - label; - dx_data[idx] = dout * diff; - } - } - bool normalize = context.Attr("normalize"); - if (normalize) { - int norm = 0; - T eps = static_cast(1e-6); - for (int idx = 0; idx < limit; ++idx) { - T diff = label_data[idx] - static_cast(ignore_index); - if ((diff < -eps) || (diff > eps)) { - norm += 1; - } - } - eps = static_cast(1e-5); - norm = norm > eps ? 
norm : eps; - std::for_each(dx_data, dx_data + limit, [norm](T &v) { v = v / norm; }); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc index 40852425997..f186f95a2b9 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc @@ -12,13 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; +const int kIgnoreIndex = -100; void CheckAttrs(const framework::ExecutionContext& ctx) { // Add this check is is due to Ascend SigmoidCrossEntropyWithLogits diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_xpu.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_xpu.cc index 6395aa1caa0..c37731580d1 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_xpu.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_xpu.cc @@ -17,13 +17,15 @@ #include #include -#include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; + template class SigmoidCrossEntropyWithLogitsXPUKernel : public framework::OpKernel { using XPUType = typename XPUTypeTrait::Type; diff --git a/paddle/phi/kernels/auc_kernel.h b/paddle/phi/kernels/auc_kernel.h new file mode 100644 index 00000000000..acbd17c7801 --- /dev/null +++ b/paddle/phi/kernels/auc_kernel.h @@ -0,0 +1,36 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void AucKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& label, + const DenseTensor& stat_pos, + const DenseTensor& stat_neg, + const std::string& curve, + int num_thresholds, + int slide_steps, + DenseTensor* auc, + DenseTensor* stat_pos_out, + DenseTensor* stat_neg_out); + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/auc_kernel.cc b/paddle/phi/kernels/cpu/auc_kernel.cc new file mode 100644 index 00000000000..bc25091de75 --- /dev/null +++ b/paddle/phi/kernels/cpu/auc_kernel.cc @@ -0,0 +1,190 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/auc_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +inline static double trapezoidArea(double X1, double X2, double Y1, double Y2) { + return (X1 > X2 ? (X1 - X2) : (X2 - X1)) * (Y1 + Y2) / 2.0; +} + +template +void statAuc(const DenseTensor &label, + const DenseTensor &predict, + const int num_thresholds, + const int slide_steps, + int64_t *origin_stat_pos, + int64_t *origin_stat_neg) { + size_t batch_size = predict.dims()[0]; + size_t inference_width = predict.dims()[1]; + const T *inference_data = predict.data(); + const auto *label_data = label.data(); + const int bucket_length = num_thresholds + 1; + if (slide_steps == 0) { + for (size_t i = 0; i < batch_size; i++) { + // if predict_data[i] has dim of 2, then predict_data[i][1] is pos prob + // if predict_data[i] has dim of 1, then predict_data[i][0] is pos prob + auto predict_data = + inference_data[i * inference_width + (inference_width - 1)]; + PADDLE_ENFORCE_LE(predict_data, + 1, + phi::errors::PreconditionNotMet( + "The predict data must less or equal 1.")); + PADDLE_ENFORCE_GE(predict_data, + 0, + phi::errors::PreconditionNotMet( + "The predict data must gather or equal 0.")); + + uint32_t binIdx = static_cast(predict_data * num_thresholds); + if (label_data[i] > 0) { + origin_stat_pos[binIdx] += 1; + } else if (label_data[i] == 0) { + origin_stat_neg[binIdx] += 1; + } + } + return; + } + // the last number of origin_stat_pos store the index should be used in + // current step + int cur_step_index = + static_cast(origin_stat_pos[(slide_steps + 1) * bucket_length]) % + slide_steps; + int cur_step_begin = cur_step_index * bucket_length; + int sum_step_begin = slide_steps * bucket_length; + for (int i = 0; i < bucket_length; ++i) { + origin_stat_pos[sum_step_begin + i] -= origin_stat_pos[cur_step_begin + i]; + origin_stat_neg[sum_step_begin + i] -= origin_stat_neg[cur_step_begin + i]; + } + + std::memset( + origin_stat_pos + cur_step_begin, 0, bucket_length * sizeof(int64_t)); + std::memset( + origin_stat_neg + cur_step_begin, 0, bucket_length * sizeof(int64_t)); + + for (size_t i = 0; i < batch_size; i++) { + // if predict_data[i] has dim of 2, then predict_data[i][1] is pos prob + // if predict_data[i] has dim of 1, then predict_data[i][0] is pos prob + auto predict_data = + inference_data[i * inference_width + (inference_width - 1)]; + PADDLE_ENFORCE_LE(predict_data, + 1, + phi::errors::PreconditionNotMet( + "The predict data must less or equal 1.")); + PADDLE_ENFORCE_GE(predict_data, + 0, + phi::errors::PreconditionNotMet( + "The predict data must gather or equal 0.")); + + uint32_t binIdx = static_cast(predict_data * num_thresholds); + if (label_data[i] > 0) { + origin_stat_pos[cur_step_begin + binIdx] += 1; + } else if (label_data[i] == 0) { + origin_stat_neg[cur_step_begin + binIdx] += 1; + } + } + for (int i = 0; i < bucket_length; ++i) { + 
origin_stat_pos[sum_step_begin + i] += origin_stat_pos[cur_step_begin + i]; + origin_stat_neg[sum_step_begin + i] += origin_stat_neg[cur_step_begin + i]; + } +} + +inline static void calcAuc(const int64_t *stat_pos, + const int64_t *stat_neg, + int num_thresholds, + double *auc) { + *auc = 0.0f; + + double totPos = 0.0; + double totNeg = 0.0; + double totPosPrev = 0.0; + double totNegPrev = 0.0; + + int idx = num_thresholds; + + while (idx >= 0) { + totPosPrev = totPos; + totNegPrev = totNeg; + totPos += stat_pos[idx]; + totNeg += stat_neg[idx]; + *auc += trapezoidArea(totNeg, totNegPrev, totPos, totPosPrev); + --idx; + } + + if (totPos > 0.0 && totNeg > 0.0) { + *auc = *auc / totPos / totNeg; + } +} + +template +void AucKernel(const Context &dev_ctx, + const DenseTensor &input, + const DenseTensor &label, + const DenseTensor &stat_pos, + const DenseTensor &stat_neg, + const std::string &curve, + int num_thresholds, + int slide_steps, + DenseTensor *auc, + DenseTensor *stat_pos_out, + DenseTensor *stat_neg_out) { + // Only use output var for now, make sure it's persistable and + // not cleaned up for each batch. + auto *origin_stat_pos = dev_ctx.template Alloc(stat_pos_out); + auto *origin_stat_neg = dev_ctx.template Alloc(stat_neg_out); + auto *auc_value = dev_ctx.template Alloc(auc); + + // Just for pass UT, since UT's input & output connot be set same var + auto *stat_pos_in_tensor = &stat_pos; + auto *stat_neg_in_tensor = &stat_neg; + auto *pos_in_data = stat_pos.data(); + auto *neg_in_data = stat_neg.data(); + if (stat_pos_in_tensor != stat_pos_out) { + memcpy( + origin_stat_pos, + pos_in_data, + ((1 + slide_steps) * (num_thresholds + 1) + (slide_steps > 0 ? 1 : 0)) * + sizeof(int64_t)); + } + if (stat_neg_in_tensor != stat_neg_out) { + memcpy( + origin_stat_neg, + neg_in_data, + ((1 + slide_steps) * (num_thresholds + 1) + (slide_steps > 0 ? 1 : 0)) * + sizeof(int64_t)); + } + statAuc(label, + input, + num_thresholds, + slide_steps, + origin_stat_pos, + origin_stat_neg); + + int sum_offset = slide_steps * (num_thresholds + 1); + calcAuc(origin_stat_pos + sum_offset, + origin_stat_neg + sum_offset, + num_thresholds, + auc_value); + if (slide_steps) { + origin_stat_pos[(slide_steps + 1) * (num_thresholds + 1)] += 1; + origin_stat_neg[(slide_steps + 1) * (num_thresholds + 1)] += 1; + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(auc, CPU, ALL_LAYOUT, phi::AucKernel, float) {} diff --git a/paddle/phi/kernels/cpu/cumsum_kernel.cc b/paddle/phi/kernels/cpu/cumsum_kernel.cc new file mode 100644 index 00000000000..d32e18479aa --- /dev/null +++ b/paddle/phi/kernels/cpu/cumsum_kernel.cc @@ -0,0 +1,143 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
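Note: the CPU cumsum kernel added below collapses the tensor into a [pre, mid, post] view around the scanned axis and hands the scan to Eigen, with optional exclusive and reverse modes. As a rough standalone sketch of those semantics (illustrative names only, not the phi API; assumes a row-major buffer):

#include <cstddef>
#include <vector>

std::vector<float> CumsumAxis(const std::vector<float>& x,
                              size_t pre, size_t mid, size_t post,
                              bool exclusive, bool reverse) {
  std::vector<float> out(x.size(), 0.0f);
  for (size_t p = 0; p < pre; ++p) {
    for (size_t q = 0; q < post; ++q) {
      float running = 0.0f;
      for (size_t k = 0; k < mid; ++k) {
        // Walk the scanned axis forwards or backwards depending on "reverse".
        size_t m = reverse ? (mid - 1 - k) : k;
        size_t idx = (p * mid + m) * post + q;
        // An exclusive scan stores the sum of the elements seen so far,
        // not including the current one.
        out[idx] = exclusive ? running : running + x[idx];
        running += x[idx];
      }
    }
  }
  return out;
}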
+ +#include "paddle/phi/kernels/cumsum_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" + +namespace phi { + +struct CumsumFunctor { + template + const typename X::TensorScanSumOp operator()(X x, + int axis, + bool exclusive) const { + return x.cumsum(axis, exclusive); + } +}; + +template +void ComputeImp(Device d, + const Dim& dims, + X x, + Out out, + int axis, + bool reverse, + bool exclusive) { + if (!reverse) { + out.reshape(dims).device(d) = + CumsumFunctor()(x.reshape(dims), axis, exclusive); + } else { + std::array rev; + rev.fill(false); + rev[axis] = reverse; + out.reshape(dims).device(d) = + CumsumFunctor()(x.reshape(dims).reverse(rev), axis, exclusive) + .reverse(rev); + } +} + +template +void CumsumKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + bool flatten, + bool exclusive, + bool reverse, + DenseTensor* out) { + auto out_dims = out->dims(); + + PADDLE_ENFORCE_EQ( + axis < out_dims.size() && axis >= (0 - out_dims.size()), + true, + phi::errors::OutOfRange( + "Attr(axis) is out of range, It's expected " + "to be in range of [-%d, %d]. But received Attr(axis) = %d.", + out_dims.size(), + out_dims.size() - 1, + axis)); + if (axis < 0) { + axis += out_dims.size(); + } + + dev_ctx.template Alloc(out); + + int pre = 1; + int post = 1; + int mid = out_dims[axis]; + for (int i = 0; i < axis; ++i) { + pre *= out_dims[i]; + } + for (int i = axis + 1; i < out_dims.size(); ++i) { + post *= out_dims[i]; + } + + auto x0 = EigenVector::Flatten(x); + auto out0 = EigenVector::Flatten(*out); + auto& place = *dev_ctx.eigen_device(); + + using IndexT = Eigen::DenseIndex; + if (pre == 1) { + if (post == 1) { + ComputeImp(place, + Eigen::DSizes(mid), + x0, + out0, + /* axis= */ 0, + reverse, + exclusive); + } else { + ComputeImp(place, + Eigen::DSizes(mid, post), + x0, + out0, + /* axis= */ 0, + reverse, + exclusive); + } + } else { + if (post == 1) { + ComputeImp(place, + Eigen::DSizes(pre, mid), + x0, + out0, + /* axis= */ 1, + reverse, + exclusive); + } else { + ComputeImp(place, + Eigen::DSizes(pre, mid, post), + x0, + out0, + /* axis= */ 1, + reverse, + exclusive); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(cumsum, + CPU, + ALL_LAYOUT, + phi::CumsumKernel, + float, + double, + int16_t, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/log_loss_grad_kernel.cc b/paddle/phi/kernels/cpu/log_loss_grad_kernel.cc new file mode 100644 index 00000000000..2e2d94df59e --- /dev/null +++ b/paddle/phi/kernels/cpu/log_loss_grad_kernel.cc @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/log_loss_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/log_loss_grad_kernel_impl.h" + +PD_REGISTER_KERNEL( + log_loss_grad, CPU, ALL_LAYOUT, phi::LogLossGradKernel, float) {} diff --git a/paddle/phi/kernels/cpu/log_loss_kernel.cc b/paddle/phi/kernels/cpu/log_loss_kernel.cc new file mode 100644 index 00000000000..38e93486f7b --- /dev/null +++ b/paddle/phi/kernels/cpu/log_loss_kernel.cc @@ -0,0 +1,21 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/log_loss_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/log_loss_kernel_impl.h" + +PD_REGISTER_KERNEL(log_loss, CPU, ALL_LAYOUT, phi::LogLossKernel, float) {} diff --git a/paddle/phi/kernels/cpu/sigmoid_cross_entropy_with_logits_grad_kernel.cc b/paddle/phi/kernels/cpu/sigmoid_cross_entropy_with_logits_grad_kernel.cc new file mode 100644 index 00000000000..468db18aa21 --- /dev/null +++ b/paddle/phi/kernels/cpu/sigmoid_cross_entropy_with_logits_grad_kernel.cc @@ -0,0 +1,70 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/sigmoid_cross_entropy_with_logits_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void SigmoidCrossEntropyWithLogitsGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& label, + const DenseTensor& out_grad, + bool normalize, + int ignore_index, + DenseTensor* in_grad) { + auto dx_data = dev_ctx.template Alloc(in_grad); + + int limit = in_grad->numel(); + auto x_data = x.data(); + auto label_data = label.data(); + auto dout_data = out_grad.data(); + for (int idx = 0; idx < limit; ++idx) { + T x = x_data[idx]; + T label = label_data[idx]; + T dout = dout_data[idx]; + if (static_cast(label) == ignore_index) { + dx_data[idx] = static_cast(0.); + } else { + T simoid_x = static_cast(1) / (static_cast(1) + std::exp(-x)); + T diff = simoid_x - label; + dx_data[idx] = dout * diff; + } + } + if (normalize) { + int norm = 0; + T eps = static_cast(1e-6); + for (int idx = 0; idx < limit; ++idx) { + T diff = label_data[idx] - static_cast(ignore_index); + if ((diff < -eps) || (diff > eps)) { + norm += 1; + } + } + eps = static_cast(1e-5); + norm = norm > eps ? norm : eps; + std::for_each(dx_data, dx_data + limit, [norm](T& v) { v = v / norm; }); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(sigmoid_cross_entropy_with_logits_grad, + CPU, + ALL_LAYOUT, + phi::SigmoidCrossEntropyWithLogitsGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/sigmoid_cross_entropy_with_logits_kernel.cc b/paddle/phi/kernels/cpu/sigmoid_cross_entropy_with_logits_kernel.cc new file mode 100644 index 00000000000..366d300320b --- /dev/null +++ b/paddle/phi/kernels/cpu/sigmoid_cross_entropy_with_logits_kernel.cc @@ -0,0 +1,71 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/sigmoid_cross_entropy_with_logits_kernel.h" + +#include +#include + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void SigmoidCrossEntropyWithLogitsKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& label, + bool normalize, + int ignore_index, + DenseTensor* out) { + auto out_data = dev_ctx.template Alloc(out); + int limit = out->numel(); + auto x_data = x.data(); + auto label_data = label.data(); + for (int idx = 0; idx < limit; ++idx) { + T x = x_data[idx]; + T label = label_data[idx]; + if (static_cast(label) == ignore_index) { + out_data[idx] = static_cast(0.); + } else { + T term1 = (x > 0) ? 
x : 0; + T term2 = x * label; + T term3 = std::log(static_cast(1) + std::exp(-std::abs(x))); + out_data[idx] = term1 - term2 + term3; + } + } + + if (normalize) { + int norm = 0; + T eps = static_cast(1e-6); + for (int idx = 0; idx < limit; ++idx) { + T diff = label_data[idx] - static_cast(ignore_index); + if ((diff < -eps) || (diff > eps)) { + norm += 1; + } + } + eps = static_cast(1e-5); + norm = norm > eps ? norm : eps; + std::for_each(out_data, out_data + limit, [norm](T& v) { v = v / norm; }); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(sigmoid_cross_entropy_with_logits, + CPU, + ALL_LAYOUT, + phi::SigmoidCrossEntropyWithLogitsKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cumsum_kernel.h b/paddle/phi/kernels/cumsum_kernel.h new file mode 100644 index 00000000000..fd90c7b8f5e --- /dev/null +++ b/paddle/phi/kernels/cumsum_kernel.h @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void CumsumKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + bool flatten, + bool exclusive, + bool reverse, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/gpu/auc_kernel.cu b/paddle/phi/kernels/gpu/auc_kernel.cu new file mode 100644 index 00000000000..5a1bb9874fe --- /dev/null +++ b/paddle/phi/kernels/gpu/auc_kernel.cu @@ -0,0 +1,258 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
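Note: both the CPU AUC kernel above and the GPU kernel added below histogram positive and negative predictions into num_thresholds + 1 buckets and then integrate the ROC curve with the trapezoid rule. Ignoring the sliding-window bookkeeping, the core computation can be sketched as follows (illustrative names, predictions assumed to lie in [0, 1]):

#include <cstddef>
#include <cstdint>
#include <vector>

double BucketedAucRef(const std::vector<float>& pred,
                      const std::vector<int64_t>& label,
                      int num_thresholds) {
  std::vector<int64_t> pos(num_thresholds + 1, 0);
  std::vector<int64_t> neg(num_thresholds + 1, 0);
  for (std::size_t i = 0; i < pred.size(); ++i) {
    int bin = static_cast<int>(pred[i] * num_thresholds);
    if (label[i] > 0) {
      pos[bin] += 1;
    } else if (label[i] == 0) {
      neg[bin] += 1;
    }
  }
  double auc = 0.0, tot_pos = 0.0, tot_neg = 0.0;
  // Sweep thresholds from high to low, accumulating true/false positives and
  // integrating the ROC curve with the trapezoid rule.
  for (int idx = num_thresholds; idx >= 0; --idx) {
    double prev_pos = tot_pos, prev_neg = tot_neg;
    tot_pos += pos[idx];
    tot_neg += neg[idx];
    auc += (tot_neg - prev_neg) * (tot_pos + prev_pos) / 2.0;
  }
  return (tot_pos > 0.0 && tot_neg > 0.0) ? auc / tot_pos / tot_neg : 0.0;
}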
+ +#include "paddle/phi/kernels/auc_kernel.h" + +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +using paddle::platform::PADDLE_CUDA_NUM_THREADS; + +__global__ void ClearObsoleteDataKernel(int64_t *pos, + int64_t *neg, + const int bucket_length, + const int slide_steps) { + int cur_step_index = + static_cast(pos[(slide_steps + 1) * bucket_length]) % slide_steps; + int cur_step_begin = cur_step_index * bucket_length; + int sum_step_begin = slide_steps * bucket_length; + CUDA_KERNEL_LOOP(i, bucket_length) { + pos[sum_step_begin + i] -= pos[cur_step_begin + i]; + neg[sum_step_begin + i] -= neg[cur_step_begin + i]; + pos[cur_step_begin + i] = neg[cur_step_begin + i] = 0; + } +} + +__global__ void UpdateSumDataKernel(int64_t *pos, + int64_t *neg, + const int bucket_length, + const int slide_steps) { + int cur_step_index = + static_cast(pos[(slide_steps + 1) * bucket_length]) % slide_steps; + int cur_step_begin = cur_step_index * bucket_length; + int sum_step_begin = slide_steps * bucket_length; + CUDA_KERNEL_LOOP(i, bucket_length) { + pos[sum_step_begin + i] += pos[cur_step_begin + i]; + neg[sum_step_begin + i] += neg[cur_step_begin + i]; + } +} + +template +__global__ void AddDataKernel(const int64_t *label_data, + const T *pred_data, + const int inference_width, + const int num_thresholds, + int64_t *pos, + int64_t *neg, + const int numel, + const int slide_steps) { + int cur_step_begin = 0; + if (slide_steps > 0) { + int cur_step_index = + static_cast(pos[(slide_steps + 1) * (1 + num_thresholds)]) % + slide_steps; + cur_step_begin = cur_step_index * (1 + num_thresholds); + } + CUDA_KERNEL_LOOP(i, numel) { + auto predict_data = pred_data[i * inference_width + (inference_width - 1)]; + PADDLE_ENFORCE(predict_data <= 1, "The predict data must less or equal 1."); + PADDLE_ENFORCE(predict_data >= 0, + "The predict data must gather or equal 0."); + uint32_t binIdx = static_cast(predict_data * num_thresholds); + if (label_data[i]) { + paddle::platform::CudaAtomicAdd(pos + cur_step_begin + binIdx, 1); + } else { + paddle::platform::CudaAtomicAdd(neg + cur_step_begin + binIdx, 1); + } + } +} + +__global__ void CalcAucKernel(int64_t *stat_pos, + int64_t *stat_neg, + int num_thresholds, + double *auc, + bool need_add_batch_num) { + *auc = 0.0f; + double totPos = 0.0; + double totNeg = 0.0; + double totPosPrev = 0.0; + double totNegPrev = 0.0; + + int idx = num_thresholds; + + while (idx >= 0) { + totPosPrev = totPos; + totNegPrev = totNeg; + totPos += stat_pos[idx]; + totNeg += stat_neg[idx]; + *auc += (totNeg - totNegPrev) * (totPos + totPosPrev) / 2.0; + --idx; + } + + if (totPos > 0.0 && totNeg > 0.0) { + *auc = *auc / totPos / totNeg; + } + if (need_add_batch_num) { + stat_pos[num_thresholds + 1] += 1; + stat_neg[num_thresholds + 1] += 1; + } +} + +inline static double trapezoidArea(double X1, double X2, double Y1, double Y2) { + return (X1 > X2 ? 
(X1 - X2) : (X2 - X1)) * (Y1 + Y2) / 2.0; +} + +template +void statAuc(const Context &dev_ctx, + const DenseTensor &label, + const DenseTensor &predict, + const int num_thresholds, + const int slide_steps, + int64_t *origin_stat_pos, + int64_t *origin_stat_neg) { + size_t batch_size = predict.dims()[0]; + size_t inference_width = predict.dims()[1]; + const T *inference_data = predict.data(); + const auto *label_data = label.data(); + const int bucket_length = num_thresholds + 1; + + if (slide_steps == 0) { + AddDataKernel<<<(batch_size + PADDLE_CUDA_NUM_THREADS - 1) / + PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, + 0, + dev_ctx.stream()>>>(label_data, + inference_data, + inference_width, + num_thresholds, + origin_stat_pos, + origin_stat_neg, + batch_size, + slide_steps); + return; + } + // the last number of origin_stat_pos store the index should be used in + // current step + int cur_step_index = + static_cast(origin_stat_pos[(slide_steps + 1) * bucket_length]) % + slide_steps; + int cur_step_begin = cur_step_index * bucket_length; + int sum_step_begin = slide_steps * bucket_length; + + ClearObsoleteDataKernel<<<(bucket_length + PADDLE_CUDA_NUM_THREADS - 1) / + PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, + 0, + dev_ctx.stream()>>>( + origin_stat_pos, origin_stat_neg, bucket_length, slide_steps); + + AddDataKernel<<<(batch_size + PADDLE_CUDA_NUM_THREADS - 1) / + PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, + 0, + dev_ctx.stream()>>>(label_data, + inference_data, + inference_width, + num_thresholds, + origin_stat_pos, + origin_stat_neg, + batch_size, + slide_steps); + UpdateSumDataKernel<<<(bucket_length + PADDLE_CUDA_NUM_THREADS - 1) / + PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, + 0, + dev_ctx.stream()>>>( + origin_stat_pos, origin_stat_neg, bucket_length, slide_steps); +} + +template +void AucKernel(const Context &dev_ctx, + const DenseTensor &input, + const DenseTensor &label, + const DenseTensor &stat_pos, + const DenseTensor &stat_neg, + const std::string &curve, + int num_thresholds, + int slide_steps, + DenseTensor *auc, + DenseTensor *stat_pos_out, + DenseTensor *stat_neg_out) { + // Only use output var for now, make sure it's persistable and + // not cleaned up for each batch. + auto *origin_stat_pos = dev_ctx.template Alloc(stat_pos_out); + auto *origin_stat_neg = dev_ctx.template Alloc(stat_neg_out); + auto *auc_value = dev_ctx.template Alloc(auc); + + auto *stat_pos_in_tensor = &stat_pos; + auto *stat_neg_in_tensor = &stat_neg; + auto *pos_in_data = stat_pos.data(); + auto *neg_in_data = stat_neg.data(); +#ifdef PADDLE_WITH_CUDA + if (stat_pos_in_tensor != stat_pos_out) { + cudaMemcpy( + origin_stat_pos, + pos_in_data, + ((1 + slide_steps) * (num_thresholds + 1) + (slide_steps > 0 ? 1 : 0)) * + sizeof(int64_t), + cudaMemcpyDeviceToDevice); + } + if (stat_neg_in_tensor != stat_neg_out) { + cudaMemcpy( + origin_stat_neg, + neg_in_data, + ((1 + slide_steps) * (num_thresholds + 1) + (slide_steps > 0 ? 1 : 0)) * + sizeof(int64_t), + cudaMemcpyDeviceToDevice); + } +#else + if (stat_pos_in_tensor != stat_pos_out) { + hipMemcpy( + origin_stat_pos, + pos_in_data, + ((1 + slide_steps) * (num_thresholds + 1) + (slide_steps > 0 ? 1 : 0)) * + sizeof(int64_t), + hipMemcpyDeviceToDevice); + } + if (stat_neg_in_tensor != stat_neg_out) { + hipMemcpy( + origin_stat_neg, + neg_in_data, + ((1 + slide_steps) * (num_thresholds + 1) + (slide_steps > 0 ? 
1 : 0)) * + sizeof(int64_t), + hipMemcpyDeviceToDevice); + } +#endif + + statAuc(dev_ctx, + label, + input, + num_thresholds, + slide_steps, + origin_stat_pos, + origin_stat_neg); + int sum_offset = slide_steps * (num_thresholds + 1); + CalcAucKernel<<<1, 1, 0, dev_ctx.stream()>>>(origin_stat_pos + sum_offset, + origin_stat_neg + sum_offset, + num_thresholds, + auc_value, + slide_steps > 0); +} + +} // namespace phi + +PD_REGISTER_KERNEL(auc, GPU, ALL_LAYOUT, phi::AucKernel, float) {} diff --git a/paddle/phi/kernels/gpu/cumsum_kernel.cu b/paddle/phi/kernels/gpu/cumsum_kernel.cu new file mode 100644 index 00000000000..a253e6f4ad2 --- /dev/null +++ b/paddle/phi/kernels/gpu/cumsum_kernel.cu @@ -0,0 +1,336 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/cumsum_kernel.h" + +#include +#include +#include +#include +#ifdef __NVCC__ +#include +#endif +#ifdef __HIPCC__ +#include +namespace cub = hipcub; +#endif + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +__device__ void BlockReverse( + const T* idata, T* odata, int src_base, int dst_base, int valid_item) { + __shared__ T sh_mem[BLOCK_SIZE]; + int tx = threadIdx.x; + + int offset = tx; + int in_index = src_base + offset; + if (offset >= valid_item) { + sh_mem[offset] = 0; + } else { + int sh_mem_index = BLOCK_SIZE - offset - 1; + T data = idata[in_index]; + sh_mem[sh_mem_index] = data; + } + + __syncthreads(); + int out_index = dst_base - offset; + if (offset < valid_item) { + int sh_mem_index = BLOCK_SIZE - offset - 1; + odata[out_index] = sh_mem[sh_mem_index]; + } +} + +template +__global__ void MatrixRowReverse(const T* matrix_data, + T* reverse_data, + int reverse_size, + int outer_size, + int inner_size) { + int bx = blockIdx.x; + int by = blockIdx.y; + int item_per_block = 1024; + + for (int block_offset = 0; block_offset < reverse_size; + block_offset += item_per_block) { + int valid_item = (reverse_size - block_offset > item_per_block) + ? item_per_block + : reverse_size - block_offset; + int src_offset = + bx * reverse_size + block_offset + by * (inner_size * reverse_size); + int dst_offset = bx * reverse_size + by * (inner_size * reverse_size) + + reverse_size - 1 - block_offset; + if (reverse_size < item_per_block) { + valid_item = reverse_size; + } + + BlockReverse( + matrix_data, reverse_data, src_offset, dst_offset, valid_item); + } +} + +template +struct BlockPrefixCallbackOp { + // Running prefix + T running_total; + // Constructor + __device__ BlockPrefixCallbackOp(T running_total) + : running_total(running_total) {} + // Callback operator to be entered by the first warp of threads in the block. + // Thread-0 is responsible for returning a value for seeding the block-wide + // scan. 
+ __device__ T operator()(T block_aggregate) { + T old_prefix = running_total; + running_total = old_prefix + block_aggregate; + return old_prefix; + } +}; + +// No bank-conflict transpose +template +__global__ void MatrixTranspose(T* odata, + const T* idata, + size_t height, + size_t width) { + __shared__ T tile[TILE_DIM][TILE_DIM + 1]; + + int x = blockIdx.x * TILE_DIM + threadIdx.x; + int y = blockIdx.y * TILE_DIM + threadIdx.y; + for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) { + if (x < width && (y + j) < height) { + tile[threadIdx.y + j][threadIdx.x] = idata[(y + j) * width + x]; + } else { + tile[threadIdx.y + j][threadIdx.x] = 0; + } + } + + __syncthreads(); + + x = blockIdx.y * TILE_DIM + threadIdx.x; // transpose block offset + y = blockIdx.x * TILE_DIM + threadIdx.y; + + for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) { + if (x < height && (y + j) < width) { + odata[(y + j) * height + x] = tile[threadIdx.x][threadIdx.y + j]; + } + } +} + +template +__global__ void BlockScanKernel(T* d_out, + const T* d_in, + int inner_size, + int outer_size, + int scan_size, + bool exclusive) { + // Specialize BlockLoad, BlockStore, and BlockRadixSort collective types + typedef cub:: + BlockLoad + BlockLoadT; + typedef cub:: + BlockStore + BlockStoreT; + typedef cub::BlockScan BlockScanT; + // Allocate type-safe, repurposable shared memory for collectives + __shared__ union { + typename BlockLoadT::TempStorage load; + typename BlockStoreT::TempStorage store; + typename BlockScanT::TempStorage scan; + } temp_storage; + + int bx = blockIdx.x; + int by = blockIdx.y; + + BlockPrefixCallbackOp prefix_op(0); + T block_aggregate = static_cast(0); + + // Obtain this block's segment of consecutive keys (blocked across threads) + int item_per_block = BLOCK_THREADS * ITEMS_PER_THREAD; + for (int block_offset = 0; block_offset < scan_size; + block_offset += BLOCK_THREADS * ITEMS_PER_THREAD) { + int valid_item = (scan_size - block_offset > item_per_block) + ? item_per_block + : (scan_size - block_offset); + if (scan_size < item_per_block) { + valid_item = scan_size; + } + + int offset = bx * scan_size + block_offset + by * (inner_size * scan_size); + + T thread_keys[ITEMS_PER_THREAD]; + BlockLoadT(temp_storage.load) + .Load(d_in + offset, thread_keys, valid_item, 0); + + __syncthreads(); + if (exclusive) { + T init_value = static_cast(0); + BlockScanT(temp_storage.scan) + .ExclusiveScan(thread_keys, thread_keys, cub::Sum(), prefix_op); + } else { + BlockScanT(temp_storage.scan) + .InclusiveScan(thread_keys, thread_keys, cub::Sum(), prefix_op); + } + __syncthreads(); + + BlockStoreT(temp_storage.store) + .Store(d_out + offset, thread_keys, valid_item); + } +} + +template +void CumsumKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + bool flatten, + bool exclusive, + bool reverse, + DenseTensor* out) { + auto out_dims = out->dims(); + auto size = x.numel(); + + PADDLE_ENFORCE_EQ( + axis < out_dims.size() && axis >= (0 - out_dims.size()), + true, + phi::errors::OutOfRange( + "Attr(axis) is out of range, It's expected " + "to be in range of [-%d, %d]. But received Attr(axis) = %d.", + out_dims.size(), + out_dims.size() - 1, + axis)); + if (axis < 0) { + axis += out_dims.size(); + } + + T* out_data = dev_ctx.template Alloc(out); + const T* in_data = x.data(); + + // Use thrust for parallel acceleration when the input size is equal to the + // length of the ‘axis’ dimension. 
+ if (size == out_dims[axis]) { + if (reverse) { + thrust::device_ptr dev_ptr = + thrust::device_pointer_cast(in_data); + thrust::device_vector vec(dev_ptr, dev_ptr + size); + if (exclusive) { + thrust::exclusive_scan( + thrust::device, vec.rbegin(), vec.rend(), out_data); + } else { + thrust::inclusive_scan( + thrust::device, vec.rbegin(), vec.rend(), out_data); + } + thrust::reverse(thrust::device, out_data, out_data + size); + } else { + if (exclusive) { + thrust::exclusive_scan( + thrust::device, in_data, in_data + size, out_data); + } else { + thrust::inclusive_scan( + thrust::device, in_data, in_data + size, out_data); + } + } + return; + } + + size_t height = 1; + size_t width = 1; + for (size_t i = 0; i <= axis; i++) { + height *= out_dims[i]; + } + + for (size_t i = axis + 1; i < out_dims.size(); i++) { + width *= out_dims[i]; + } + int scan_size = out_dims[axis]; + bool transpose = (axis != out_dims.size() - 1); + + int tile_size = 32; + dim3 blocks(32, 8); + dim3 transpose_grids((width + tile_size - 1) / tile_size, + (height + tile_size - 1) / tile_size); + out->Resize(out_dims); + auto* tmp_data = out->data(); + + T* next_in_data = out_data; + T* next_out_data = tmp_data; + if (transpose) { + MatrixTranspose<<>>( + out_data, in_data, height, width); + next_in_data = out_data; + next_out_data = tmp_data; + } + auto swap_ptr = [](T*& ptr1, T*& ptr2) { + T* tmp = ptr2; + ptr2 = ptr1; + ptr1 = tmp; + }; + int outer_size = height / scan_size; + int inner_size = width; + // Consider the size of shared memory, here block size is 128 + dim3 scan_grid(outer_size, inner_size); + dim3 reverse_grid = scan_grid; + if (reverse) { + if (transpose) { + reverse_grid.x = scan_grid.y; + reverse_grid.y = scan_grid.x; + MatrixRowReverse<<>>( + next_in_data, next_out_data, scan_size, outer_size, inner_size); + if (!transpose) next_in_data = tmp_data; + swap_ptr(next_in_data, next_out_data); + } else { + MatrixRowReverse<<>>( + in_data, out_data, scan_size, outer_size, inner_size); + } + } + if (!transpose && !reverse) { + BlockScanKernel<<>>( + out_data, in_data, outer_size, inner_size, scan_size, exclusive); + + } else { + BlockScanKernel<<>>( + next_out_data, + next_in_data, + outer_size, + inner_size, + scan_size, + exclusive); + } + swap_ptr(next_in_data, next_out_data); + if (reverse) { + MatrixRowReverse<<>>( + next_in_data, next_out_data, scan_size, outer_size, inner_size); + swap_ptr(next_in_data, next_out_data); + } + if (transpose) { + transpose_grids.x = (height + tile_size - 1) / tile_size; + transpose_grids.y = (width + tile_size - 1) / tile_size; + MatrixTranspose<<>>( + next_out_data, next_in_data, width, height); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(cumsum, + GPU, + ALL_LAYOUT, + phi::CumsumKernel, + float, + double, + int16_t, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/log_loss_grad_kernel.cu b/paddle/phi/kernels/gpu/log_loss_grad_kernel.cu new file mode 100644 index 00000000000..3bb256ad032 --- /dev/null +++ b/paddle/phi/kernels/gpu/log_loss_grad_kernel.cu @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/log_loss_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/log_loss_grad_kernel_impl.h" + +PD_REGISTER_KERNEL( + log_loss_grad, GPU, ALL_LAYOUT, phi::LogLossGradKernel, float) {} diff --git a/paddle/phi/kernels/gpu/log_loss_kernel.cu b/paddle/phi/kernels/gpu/log_loss_kernel.cu new file mode 100644 index 00000000000..0934520ea4a --- /dev/null +++ b/paddle/phi/kernels/gpu/log_loss_kernel.cu @@ -0,0 +1,21 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/log_loss_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/log_loss_kernel_impl.h" + +PD_REGISTER_KERNEL(log_loss, GPU, ALL_LAYOUT, phi::LogLossKernel, float) {} diff --git a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits.h b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits.h new file mode 100644 index 00000000000..6f9cda83a9a --- /dev/null +++ b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits.h @@ -0,0 +1,69 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
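Note: the shared helper header added below fixes the CUDA thread count per block and caps the 1-D grid size, so very large inputs are covered by letting each thread handle several elements. A host-side sketch of that launch arithmetic (the constants mirror the header but are assumptions here, not an exported API):

#include <algorithm>
#include <cstdio>

constexpr int kThreadsPerBlock = 512;  // the header uses 256 on HIP builds
constexpr int kMaxBlocks = 4096;       // cap on the 1-D grid size

int NumBlocksRef(int n) {
  return std::min((n + kThreadsPerBlock - 1) / kThreadsPerBlock, kMaxBlocks);
}

int main() {
  // For 10 million elements the grid is capped at 4096 blocks, so each
  // thread ends up covering several elements.
  std::printf("%d\n", NumBlocksRef(10000000));
  return 0;
}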
+ +#pragma once + +#include +#include "paddle/fluid/memory/malloc.h" +#include "paddle/fluid/operators/math.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_helper.h" +#include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/gpu/reduce.h" + +#ifdef __NVCC__ +#include "cub/cub.cuh" +#endif +#ifdef __HIPCC__ +#include +namespace cub = hipcub; +#endif + +namespace phi { + +#ifdef __HIPCC__ +static constexpr int kNumCUDAThreads = 256; +#else +static constexpr int kNumCUDAThreads = 512; +#endif +static constexpr int kNumMaxinumNumBlocks = 4096; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaxinumNumBlocks); +} + +template +struct NonzeroFunctor { + HOSTDEVICE explicit inline NonzeroFunctor() {} + HOSTDEVICE inline T operator()(const T x) const { + return static_cast(static_cast(x) != 0); + } +}; + +template +struct DivFunctor { + const T norm_; + HOSTDEVICE inline DivFunctor(const T norm) : norm_(norm) {} + + HOSTDEVICE inline T operator()(T loss) { + loss /= norm_; + return loss; + } +}; + +} // namespace phi diff --git a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu new file mode 100644 index 00000000000..ae3cefd9e82 --- /dev/null +++ b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu @@ -0,0 +1,126 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
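Note: the GPU gradient kernel added below and the forward kernel in the next file are element-wise over the numerically stable form of the loss, max(x, 0) - x * label + log(1 + exp(-|x|)); its derivative with respect to x is sigmoid(x) - label, which is exactly what the gradient functor emits. A standalone sketch of the stable forward value (illustrative only, not the phi API; ignore_index and normalize handling mirror the earlier gradient sketch):

#include <cmath>
#include <cstddef>
#include <vector>

std::vector<float> SigmoidXentRef(const std::vector<float>& x,
                                  const std::vector<float>& label) {
  std::vector<float> out(x.size());
  for (std::size_t i = 0; i < x.size(); ++i) {
    float term1 = x[i] > 0.0f ? x[i] : 0.0f;               // max(x, 0)
    float term3 = std::log1p(std::exp(-std::fabs(x[i])));  // log(1 + e^-|x|)
    out[i] = term1 - x[i] * label[i] + term3;
  }
  return out;
}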
+ +#include "paddle/phi/kernels/sigmoid_cross_entropy_with_logits_grad_kernel.h" + +#include "paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits.h" + +namespace phi { + +template +struct SigmoidBwdFunctor { + T ignore_index_; + T eps = static_cast(1e-5); + + HOSTDEVICE inline SigmoidBwdFunctor(const T ignore_index) + : ignore_index_(ignore_index) {} + + HOSTDEVICE inline phi::Array operator()(const T x, + const T label, + const T dout) { + T counts; + T dx_data; + + T diff = label - static_cast(ignore_index_); + if ((diff > -eps) && (diff < eps)) { + dx_data = static_cast(0.); + counts = 0; + } else { + T simoid_x = static_cast(1) / + (static_cast(1) + paddle::operators::real_exp(-x)); + T diff = simoid_x - label; + dx_data = dout * diff; + counts = 1; + } + phi::Array outs; + + outs[0] = dx_data; + outs[1] = counts; + return outs; + } +}; + +template +void SigmoidCrossEntropyWithLogitsGradKernel(const Context &dev_ctx, + const DenseTensor &x, + const DenseTensor &label, + const DenseTensor &out_grad, + bool normalize, + int ignore_index, + DenseTensor *in_grad) { + auto dx_data = dev_ctx.template Alloc(in_grad); + + // Temporary memory + DenseTensor *counts_tensor = new DenseTensor(); + + int64_t out_dims = label.numel() * sizeof(T); + counts_tensor->Resize({out_dims}); + dev_ctx.template Alloc(counts_tensor); + counts_tensor->Resize(in_grad->dims()); + + int limit = in_grad->numel(); + int blocks = NumBlocks(limit); + int threads = kNumCUDAThreads; + std::vector ins = {&x, &label, &out_grad}; + std::vector outs = {in_grad, counts_tensor}; + auto functor = SigmoidBwdFunctor(ignore_index); + constexpr int Size = 2; + phi::funcs::ElementwiseKernel( + dev_ctx, ins, &outs, functor); + if (normalize) { + T *counts = dev_ctx.template Alloc(counts_tensor); + DenseTensor *norm_tensor = new DenseTensor(); + norm_tensor->Resize({sizeof(T)}); + dev_ctx.template Alloc(norm_tensor); + auto dims = phi::vectorize(counts_tensor->dims()); + std::vector reduce_dim = {}; + for (int i = 0; i < dims.size(); i++) { + reduce_dim.push_back(i); + } + + kernels::TensorReduceImpl>( + dev_ctx, + *counts_tensor, + norm_tensor, + NonzeroFunctor(), + reduce_dim, + dev_ctx.stream()); + T *norm = dev_ctx.template Alloc(norm_tensor); + auto norm_cpu_mem = paddle::memory::Alloc(phi::CPUPlace(), sizeof(T)); + T *norm_cpu_ptr = reinterpret_cast(norm_cpu_mem->ptr()); + paddle::memory::Copy(phi::CPUPlace(), + norm_cpu_ptr, + dev_ctx.GetPlace(), + norm, + sizeof(T), + dev_ctx.stream()); + auto eps = static_cast(1e-5); + *norm_cpu_ptr = *norm_cpu_ptr > eps ? *norm_cpu_ptr : eps; + + std::vector div_ins = {in_grad}; + std::vector div_outs = {in_grad}; + auto div_functor = DivFunctor(*norm_cpu_ptr); + phi::funcs::ElementwiseKernel(dev_ctx, div_ins, &div_outs, div_functor); + delete norm_tensor; + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(sigmoid_cross_entropy_with_logits_grad, + GPU, + ALL_LAYOUT, + phi::SigmoidCrossEntropyWithLogitsGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu new file mode 100644 index 00000000000..fb63badf56a --- /dev/null +++ b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu @@ -0,0 +1,128 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/sigmoid_cross_entropy_with_logits_kernel.h" + +#include "paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits.h" + +namespace phi { + +template +struct SigmoidFwdFunctor { + T ignore_index_; + T eps = static_cast(1e-5); + + HOSTDEVICE inline SigmoidFwdFunctor(const T ignore_index) + : ignore_index_(ignore_index) {} + + HOSTDEVICE inline phi::Array operator()(const T x, const T label) { + T counts; + T out_data; + + T diff = label - static_cast(ignore_index_); + if ((diff > -eps) && (diff < eps)) { + out_data = static_cast(0.); + counts = 0; + } else { + T term1 = (x > 0) ? x : 0; + T term2 = x * label; + T term3 = paddle::operators::real_log( + static_cast(1) + + paddle::operators::real_exp(static_cast(-abs(x)))); + + out_data = term1 - term2 + term3; + counts = 1; + } + phi::Array outs; + + outs[0] = out_data; + outs[1] = counts; + return outs; + } +}; + +template +void SigmoidCrossEntropyWithLogitsKernel(const Context &dev_ctx, + const DenseTensor &x, + const DenseTensor &label, + bool normalize, + int ignore_index, + DenseTensor *out) { + auto out_data = dev_ctx.template Alloc(out); + + // Temporary memory + DenseTensor *counts_tensor = new DenseTensor(); + + int64_t out_dims = label.numel() * sizeof(T); + counts_tensor->Resize({out_dims}); + dev_ctx.template Alloc(counts_tensor); + counts_tensor->Resize(out->dims()); + + int limit = out->numel(); + int blocks = NumBlocks(limit); + int threads = kNumCUDAThreads; + std::vector ins = {&x, &label}; + std::vector outs = {out, counts_tensor}; + auto functor = SigmoidFwdFunctor(ignore_index); + constexpr int Size = 2; + phi::funcs::ElementwiseKernel( + dev_ctx, ins, &outs, functor); + if (normalize) { + T *counts = dev_ctx.template Alloc(counts_tensor); + DenseTensor *norm_tensor = new DenseTensor(); + norm_tensor->Resize({sizeof(T)}); + dev_ctx.template Alloc(norm_tensor); + auto dims = phi::vectorize(counts_tensor->dims()); + std::vector reduce_dim = {}; + for (int i = 0; i < dims.size(); i++) { + reduce_dim.push_back(i); + } + + kernels::TensorReduceImpl>( + dev_ctx, + *counts_tensor, + norm_tensor, + NonzeroFunctor(), + reduce_dim, + dev_ctx.stream()); + T *norm = dev_ctx.template Alloc(norm_tensor); + auto norm_cpu_mem = paddle::memory::Alloc(phi::CPUPlace(), sizeof(T)); + T *norm_cpu_ptr = reinterpret_cast(norm_cpu_mem->ptr()); + paddle::memory::Copy(phi::CPUPlace(), + norm_cpu_ptr, + dev_ctx.GetPlace(), + norm, + sizeof(T), + dev_ctx.stream()); + auto eps = static_cast(1e-5); + *norm_cpu_ptr = *norm_cpu_ptr > eps ? 
*norm_cpu_ptr : eps; + + std::vector div_ins = {out}; + std::vector div_outs = {out}; + auto div_functor = DivFunctor(*norm_cpu_ptr); + phi::funcs::ElementwiseKernel(dev_ctx, div_ins, &div_outs, div_functor); + + delete norm_tensor; + delete counts_tensor; + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(sigmoid_cross_entropy_with_logits, + GPU, + ALL_LAYOUT, + phi::SigmoidCrossEntropyWithLogitsKernel, + float, + double) {} diff --git a/paddle/phi/kernels/impl/log_loss_grad_kernel_impl.h b/paddle/phi/kernels/impl/log_loss_grad_kernel_impl.h new file mode 100644 index 00000000000..6f84133d5f4 --- /dev/null +++ b/paddle/phi/kernels/impl/log_loss_grad_kernel_impl.h @@ -0,0 +1,43 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" + +namespace phi { + +template +void LogLossGradKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& label, + const DenseTensor& out_grad, + float epsilon, + DenseTensor* in_grad) { + auto prediction = EigenVector::Flatten(input); + auto label_out = EigenVector::Flatten(label); + + auto dl = EigenVector::Flatten(out_grad); + auto& place = *dev_ctx.eigen_device(); + + if (in_grad) { + dev_ctx.template Alloc(in_grad); + auto dx = EigenVector::Flatten(*in_grad); + phi::funcs::EigenLogLossGrad, T>::Eval( + place, dx, dl, prediction, label_out, epsilon); + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/log_loss_kernel_impl.h b/paddle/phi/kernels/impl/log_loss_kernel_impl.h new file mode 100644 index 00000000000..d49144c8354 --- /dev/null +++ b/paddle/phi/kernels/impl/log_loss_kernel_impl.h @@ -0,0 +1,40 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
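Note: the Eigen expression in the implementation added below evaluates the element-wise log loss, the forward counterpart of the gradient sketched earlier next to the CPU registration. Assuming the usual definition, a standalone sketch (hypothetical names):

#include <cmath>
#include <cstddef>
#include <vector>

std::vector<float> LogLossRef(const std::vector<float>& pred,
                              const std::vector<float>& label,
                              float eps) {
  std::vector<float> loss(pred.size());
  for (std::size_t i = 0; i < pred.size(); ++i) {
    loss[i] = -label[i] * std::log(pred[i] + eps) -
              (1.0f - label[i]) * std::log(1.0f - pred[i] + eps);
  }
  return loss;
}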
+ +#pragma once + +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" + +namespace phi { + +template +void LogLossKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& label, + float epsilon, + DenseTensor* out) { + dev_ctx.template Alloc(out); + + auto prediction = EigenVector::Flatten(input); + auto label_out = EigenVector::Flatten(label); + + auto loss = EigenVector::Flatten(*out); + auto& place = *dev_ctx.eigen_device(); + + phi::funcs::EigenLogLoss, T>::Eval( + place, loss, prediction, label_out, epsilon); +} + +} // namespace phi diff --git a/paddle/phi/kernels/log_loss_grad_kernel.h b/paddle/phi/kernels/log_loss_grad_kernel.h new file mode 100644 index 00000000000..6853140b19b --- /dev/null +++ b/paddle/phi/kernels/log_loss_grad_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void LogLossGradKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& label, + const DenseTensor& out_grad, + float epsilon, + DenseTensor* in_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/log_loss_kernel.h b/paddle/phi/kernels/log_loss_kernel.h new file mode 100644 index 00000000000..cd16c0f2c7c --- /dev/null +++ b/paddle/phi/kernels/log_loss_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void LogLossKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& label, + float epsilon, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/sigmoid_cross_entropy_with_logits_grad_kernel.h b/paddle/phi/kernels/sigmoid_cross_entropy_with_logits_grad_kernel.h new file mode 100644 index 00000000000..6bc75b7670f --- /dev/null +++ b/paddle/phi/kernels/sigmoid_cross_entropy_with_logits_grad_kernel.h @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void SigmoidCrossEntropyWithLogitsGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& label, + const DenseTensor& out_grad, + bool normalize, + int ignore_index, + DenseTensor* in_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/sigmoid_cross_entropy_with_logits_kernel.h b/paddle/phi/kernels/sigmoid_cross_entropy_with_logits_kernel.h new file mode 100644 index 00000000000..7ea3e6589f7 --- /dev/null +++ b/paddle/phi/kernels/sigmoid_cross_entropy_with_logits_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void SigmoidCrossEntropyWithLogitsKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& label, + bool normalize, + int ignore_index, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/ops/compat/log_loss_sig.cc b/paddle/phi/ops/compat/log_loss_sig.cc new file mode 100644 index 00000000000..c4ae746e975 --- /dev/null +++ b/paddle/phi/ops/compat/log_loss_sig.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature LogLossGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("log_loss_grad", + {"Predicted", "Labels", GradVarName("Loss")}, + {"epsilon"}, + {GradVarName("Predicted")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(log_loss_grad, phi::LogLossGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/sigmoid_cross_entropy_with_logits_sig.cc b/paddle/phi/ops/compat/sigmoid_cross_entropy_with_logits_sig.cc new file mode 100644 index 00000000000..61ad9627a96 --- /dev/null +++ b/paddle/phi/ops/compat/sigmoid_cross_entropy_with_logits_sig.cc @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature SigmoidCrossEntropyWithLogitsKernelGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("sigmoid_cross_entropy_with_logits_grad", + {"X", "Label", GradVarName("Out")}, + {"normalize", "ignore_index"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN( + sigmoid_cross_entropy_with_logits_grad, + phi::SigmoidCrossEntropyWithLogitsKernelGradOpArgumentMapping); -- GitLab