diff --git a/paddle/fluid/operators/kthvalue_op.cc b/paddle/fluid/operators/kthvalue_op.cc index 2a79cee27814e86b277d927082e9a772359217f1..4c679d30263863c70176bebb686556af056068d0 100644 --- a/paddle/fluid/operators/kthvalue_op.cc +++ b/paddle/fluid/operators/kthvalue_op.cc @@ -12,11 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/kthvalue_op.h" #include #include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -25,54 +26,6 @@ class KthvalueOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "kthvalue"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "kthvalue"); - OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "kthvalue"); - auto input_dims = ctx->GetInputDim("X"); - const int& dim_size = input_dims.size(); - int axis = static_cast(ctx->Attrs().Get("axis")); - PADDLE_ENFORCE_LT(axis, dim_size, - paddle::platform::errors::InvalidArgument( - "the axis must be [-%d, %d), but received %d .", - dim_size, dim_size, axis)); - PADDLE_ENFORCE_GE(axis, -dim_size, - paddle::platform::errors::InvalidArgument( - "the axis must be [-%d, %d), but received %d .", - dim_size, dim_size, axis)); - if (axis < 0) axis += dim_size; - int k = static_cast(ctx->Attrs().Get("k")); - PADDLE_ENFORCE_GE( - k, 1, paddle::platform::errors::InvalidArgument( - "the k in the kthvalue must >= 1, but received %d .", k)); - PADDLE_ENFORCE_GE(input_dims.size(), 1, - paddle::platform::errors::InvalidArgument( - "input of kthvalue must have >= 1d shape")); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_GE( - input_dims[axis], k, - paddle::platform::errors::InvalidArgument( - "input of kthvalue must have >= %d columns in axis of %d", k, - axis)); - } - bool keepdim = ctx->Attrs().Get("keepdim"); - std::vector dimvec; - for (int64_t i = 0; i < axis; i++) { - dimvec.emplace_back(input_dims[i]); - } - if (keepdim) { - dimvec.emplace_back(static_cast(1)); - } - for (int64_t i = axis + 1; i < dim_size; i++) { - dimvec.emplace_back(input_dims[i]); - } - framework::DDim dims = phi::make_ddim(dimvec); - ctx->SetOutputDim("Out", dims); - ctx->SetOutputDim("Indices", dims); - ctx->ShareLoD("X", "Out"); - ctx->ShareLoD("X", "Indices"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -155,20 +108,13 @@ class KthvalueGradOpMaker : public framework::SingleGradOpMaker { } // namespace operators } // namespace paddle +DECLARE_INFER_SHAPE_FUNCTOR(kthvalue, KthvalueInferShapeFunctor, + PD_INFER_META(phi::KthvalueInferMeta)); + namespace ops = paddle::operators; REGISTER_OPERATOR(kthvalue, ops::KthvalueOp, ops::KthvalueOpMaker, ops::KthvalueGradOpMaker, - ops::KthvalueGradOpMaker); -REGISTER_OP_CPU_KERNEL( - kthvalue, ops::KthvalueCPUKernel, - ops::KthvalueCPUKernel, - ops::KthvalueCPUKernel, - ops::KthvalueCPUKernel); + ops::KthvalueGradOpMaker, + KthvalueInferShapeFunctor); REGISTER_OPERATOR(kthvalue_grad, ops::KthvalueOpGrad); -REGISTER_OP_CPU_KERNEL( - kthvalue_grad, - 
ops::KthvalueGradCPUKernel, - ops::KthvalueGradCPUKernel, - ops::KthvalueGradCPUKernel, - ops::KthvalueGradCPUKernel); diff --git a/paddle/fluid/operators/kthvalue_op.cu b/paddle/fluid/operators/kthvalue_op.cu deleted file mode 100644 index f6f56f70f1a11971b31e679ef879f2d1d0a96085..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/kthvalue_op.cu +++ /dev/null @@ -1,278 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/kthvalue_op.h" -#include "paddle/fluid/operators/top_k_function_cuda.h" -#ifdef __NVCC__ -#include "cub/cub.cuh" -#endif -#ifdef __HIPCC__ -#include -#endif - -namespace paddle { -namespace operators { - -int getBlockSize(int col) { - if (col > 512) - return 1024; - else if (col > 256 && col <= 512) - return 512; - else if (col > 128 && col <= 256) - return 256; - else if (col > 64 && col <= 128) - return 128; - else - return 64; -} - -template -bool SortKthvalue(const platform::CUDADeviceContext& ctx, - const framework::Tensor* input_tensor, const int64_t num_cols, - const int64_t num_rows, const int k, - framework::Tensor* out_tensor, - framework::Tensor* indices_tensor) { - auto cu_stream = ctx.stream(); - framework::Tensor input_indices; - const std::vector dims = {num_rows, num_cols}; - auto dim = phi::make_ddim(dims); - input_indices.Resize(dim); - input_indices.mutable_data(ctx.GetPlace()); - size_t temp_storage_bytes = -1; - int block_size = getBlockSize(num_cols); - unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0]; - unsigned int grid_size = num_rows < maxGridDimX - ? 
static_cast(num_rows) - : maxGridDimX; - InitIndex<<>>( - input_indices.data(), num_rows, num_cols); - cub::CountingInputIterator counting_iter(0); - cub::TransformInputIterator> - segment_offsets_t(counting_iter, SegmentOffsetIter(num_cols)); - T* sorted_values_ptr; - int64_t* sorted_indices_ptr; - framework::Tensor temp_values, temp_indices; - const T* input = input_tensor->data(); - T* values = out_tensor->data(); - int64_t* indices = indices_tensor->mutable_data(ctx.GetPlace()); - temp_values.Resize(dim); - temp_indices.Resize(dim); - sorted_values_ptr = temp_values.mutable_data(ctx.GetPlace()); - sorted_indices_ptr = temp_indices.mutable_data(ctx.GetPlace()); - auto err = cub::DeviceSegmentedRadixSort::SortPairs( - nullptr, temp_storage_bytes, input, sorted_values_ptr, - input_indices.data(), sorted_indices_ptr, num_cols * num_rows, - num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, - cu_stream); -#ifdef __HIPCC__ - if (err != hipSuccess) { - LOG(ERROR) << "KthvalueOP failed as could not launch " - "hipcub::DeviceSegmentedRadixSort::SortPairs, status: " - << hipGetErrorString(err); - return false; - } -#else - if (err != cudaSuccess) { - LOG(ERROR) << "KthvalueOP failed as could not launch " - "cub::DeviceSegmentedRadixSort::SortPairs, status: " - << cudaGetErrorString(err); - return false; - } -#endif - framework::Tensor temp_storage; - temp_storage.mutable_data(ctx.GetPlace(), temp_storage_bytes); - - err = cub::DeviceSegmentedRadixSort::SortPairs( - temp_storage.data(), temp_storage_bytes, input, - sorted_values_ptr, input_indices.data(), sorted_indices_ptr, - num_cols * num_rows, num_rows, segment_offsets_t, segment_offsets_t + 1, - 0, sizeof(T) * 8, cu_stream); -#ifdef __HIPCC__ - if (err != hipSuccess) { - LOG(ERROR) << "KthvalueOP failed as could not launch " - "hipcub::DeviceSegmentedRadixSort::SortPairs, " - << temp_storage_bytes << ", status: " << hipGetErrorString(err); - return false; - } -#else - if (err != cudaSuccess) { - LOG(ERROR) << "KthvalueOP failed as could not launch " - "cub::DeviceSegmentedRadixSort::SortPairs, " - << temp_storage_bytes << ", status: " << cudaGetErrorString(err); - return false; - } -#endif - auto& dev = *ctx.eigen_device(); - const Eigen::DSizes slice_indices{0, k - 1}; - const Eigen::DSizes slice_sizes{num_rows, 1}; - auto e_indices = framework::EigenMatrix::From(*indices_tensor, dim); - auto e_tmp_indices = framework::EigenMatrix::From( - static_cast(temp_indices)); - std::vector odims = {static_cast(num_rows), static_cast(1)}; - dim = phi::make_ddim(odims); - auto e_values = framework::EigenMatrix::From(*out_tensor, dim); - auto e_tmp_values = framework::EigenMatrix::From( - static_cast(temp_values)); - - EigenSlice, int64_t, 2>::Eval( - dev, e_indices, e_tmp_indices, slice_indices, slice_sizes); - EigenSlice, T, 2>::Eval( - dev, e_values, e_tmp_values, slice_indices, slice_sizes); - return true; -} - -template -class KthvalueOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument( - "It must use CUDAPlace, you must check your device set.")); - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - auto* indices = ctx.Output("Indices"); - int k = static_cast(ctx.Attr("k")); - int axis = static_cast(ctx.Attr("axis")); - bool keepdim = static_cast(ctx.Attr("keepdim")); - const auto& in_dims = input->dims(); - if (axis < 0) axis += 
in_dims.size(); - auto out_dims = output->dims(); - const T* input_data = input->data(); - T* output_data = output->mutable_data(ctx.GetPlace()); - int64_t* indices_data = indices->mutable_data(ctx.GetPlace()); - - if (axis == in_dims.size() - 1) { - const int64_t& input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t& input_width = in_dims[in_dims.size() - 1]; - const auto& dev_ctx = ctx.cuda_device_context(); - PADDLE_ENFORCE_EQ(SortKthvalue(dev_ctx, input, input_width, - input_height, k, output, indices), - true, platform::errors::External( - "KthvalueOP: Error when use cub sorting")); - return; - } else { - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.emplace_back(i); - } - trans.emplace_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.emplace_back(i); - } - trans.emplace_back(axis); - if (!keepdim) { - std::vector tmp_out_shape; - for (int i = 0; i < axis; i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - tmp_out_shape.emplace_back(1); - for (int i = axis + 1; i < in_dims.size(); i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - framework::DDim tmp_out_dims = phi::make_ddim(tmp_out_shape); - output->Resize(tmp_out_dims); - indices->Resize(tmp_out_dims); - } - framework::DDim trans_dims(in_dims); - framework::DDim trans_out_dims(in_dims); - for (int i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - trans_out_dims[i] = in_dims[trans[i]]; - } - trans_out_dims[in_dims.size() - 1] = 1; - framework::Tensor trans_input; - trans_input.mutable_data(trans_dims, ctx.GetPlace()); - int ndims = trans.size(); - const auto& dev_ctx = ctx.cuda_device_context(); - TransCompute(ndims, dev_ctx, *input, - &trans_input, trans); - framework::Tensor trans_ind, trans_out; - trans_ind.mutable_data(trans_out_dims, ctx.GetPlace()); - trans_out.mutable_data(trans_out_dims, ctx.GetPlace()); - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - PADDLE_ENFORCE_EQ( - SortKthvalue(dev_ctx, &trans_input, input_width, input_height, k, - &trans_out, &trans_ind), - true, - platform::errors::External("KthvalueOP: Error when use cub sorting")); - TransCompute( - ndims, dev_ctx, trans_ind, indices, trans); - TransCompute(ndims, dev_ctx, trans_out, - output, trans); - if (!keepdim) { - output->Resize(out_dims); - indices->Resize(out_dims); - } - } - } -}; - -template -class KthvalueOpGradCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(context.GetPlace()), true, - platform::errors::InvalidArgument( - "It must use CUDAPlace, you must check your device set.")); - auto* x = context.Input("X"); - auto* out_grad = - context.Input(framework::GradVarName("Out")); - auto* indices = context.Input("Indices"); - auto* x_grad = - context.Output(framework::GradVarName("X")); - int axis = context.Attr("axis"); - int k = static_cast(context.Attr("k")); - const auto& in_dims = x->dims(); - auto out_dims = indices->dims(); - if (axis < 0) axis += in_dims.size(); - T* x_grad_data = x_grad->mutable_data(context.GetPlace()); - const T* out_grad_data = out_grad->data(); - const int64_t* indices_data = indices->data(); - int pre, n, post; - GetDims(in_dims, axis, &pre, &n, &post); - auto& dev_ctx = context.cuda_device_context(); - int block_size = getBlockSize(post * k); - int 
max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1); - int grid_size = std::min(max_blocks, pre); - AssignGradWithAxis<<>>( - out_grad_data, indices_data, x_grad_data, pre, post, n, 1); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - kthvalue, - ops::KthvalueOpCUDAKernel, - ops::KthvalueOpCUDAKernel, - ops::KthvalueOpCUDAKernel, - ops::KthvalueOpCUDAKernel); -REGISTER_OP_CUDA_KERNEL( - kthvalue_grad, - ops::KthvalueOpGradCUDAKernel, - ops::KthvalueOpGradCUDAKernel, - ops::KthvalueOpGradCUDAKernel, - ops::KthvalueOpGradCUDAKernel); diff --git a/paddle/fluid/operators/kthvalue_op.h b/paddle/fluid/operators/kthvalue_op.h deleted file mode 100644 index 15df0a10c6992f07f9913b867319bff342180c3d..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/kthvalue_op.h +++ /dev/null @@ -1,281 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/transpose_op.h" - -namespace paddle { -namespace operators { -template -static void getKthvalue(Type input_height, Type input_width, int input_dim, - const framework::Tensor* input, T* t_out, - Type* t_indices, const int& k) { - bool partial_sort_flag = (k * 64) < input_width; -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (Type i = 0; i < input_height; ++i) { - std::vector> col_vec; - col_vec.reserve(input_width); - if (input_dim == 1) { - auto e_input = framework::EigenVector::Flatten(*input); - for (Type j = 0; j < input_width; ++j) { - col_vec.emplace_back(std::pair(e_input(j), j)); - } - } else { - auto e_input = framework::EigenMatrix::Reshape(*input, input_dim - 1); - for (Type j = 0; j < input_width; ++j) { - col_vec.emplace_back(std::pair(e_input(i, j), j)); - } - } - if (partial_sort_flag) { - std::partial_sort( - col_vec.begin(), col_vec.begin() + k, col_vec.end(), - [](const std::pair& l, const std::pair& r) { - return (!std::isnan(static_cast(l.first)) && - std::isnan(static_cast(r.first))) || - (l.first < r.first); - }); - } else { - std::nth_element( - col_vec.begin(), col_vec.begin() + k - 1, col_vec.end(), - [](const std::pair& l, const std::pair& r) { - return (!std::isnan(static_cast(l.first)) && - std::isnan(static_cast(r.first))) || - (l.first < r.first); - }); - } - t_out[i] = col_vec[k - 1].first; - t_indices[i] = col_vec[k - 1].second; - } -} - -template -static void kthvalueAssign(const Type& input_height, const Type& input_width, - const int& input_dim, const framework::Tensor* input, - const framework::Tensor* indices, T* output_data) { -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (Type i = 0; i < input_height; ++i) { - if (input_dim == 1) { - auto e_input = framework::EigenVector::Flatten(*input); - auto 
e_indices = framework::EigenVector::Flatten(*indices); - output_data[i * input_width + e_indices(0)] = e_input(0); - } else { - auto e_input = framework::EigenMatrix::Reshape(*input, input_dim - 1); - auto e_indices = - framework::EigenMatrix::Reshape(*indices, input_dim - 1); - output_data[i * input_width + e_indices(i, 0)] = e_input(i, 0); - } - } -} - -template -class KthvalueCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - auto* indices = context.Output("Indices"); - const auto& in_dims = input->dims(); - int k = static_cast(context.Attr("k")); - bool keepdim = static_cast(context.Attr("keepdim")); - int axis = static_cast(context.Attr("axis")); - if (axis < 0) axis += in_dims.size(); - T* output_data = output->mutable_data(context.GetPlace()); - int64_t* indices_data = indices->mutable_data(context.GetPlace()); - auto out_dims = output->dims(); - if (axis == in_dims.size() - 1) { - const int64_t& input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t& input_width = in_dims[in_dims.size() - 1]; - getKthvalue(input_height, input_width, in_dims.size(), input, - output_data, indices_data, k); - } else { - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.emplace_back(i); - } - trans.emplace_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.emplace_back(i); - } - trans.emplace_back(axis); - if (!keepdim) { - std::vector tmp_out_shape; - for (int i = 0; i < axis; i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - tmp_out_shape.emplace_back(1); - for (int i = axis + 1; i < in_dims.size(); i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - framework::DDim tmp_out_dims = phi::make_ddim(tmp_out_shape); - output->Resize(tmp_out_dims); - indices->Resize(tmp_out_dims); - } - framework::DDim trans_dims(in_dims); - framework::DDim trans_out_dims(in_dims); - - for (size_t i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - trans_out_dims[i] = in_dims[trans[i]]; - } - trans_out_dims[in_dims.size() - 1] = 1; - framework::Tensor trans_inp; - trans_inp.mutable_data(trans_dims, context.GetPlace()); - int ndims = trans.size(); - auto& dev_context = - context.template device_context(); - TransCompute(ndims, dev_context, *input, - &trans_inp, trans); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - framework::Tensor tmp_out, tmp_indices; - T* t_out = tmp_out.mutable_data(trans_out_dims, context.GetPlace()); - auto* t_ind = - tmp_indices.mutable_data(trans_out_dims, context.GetPlace()); - - getKthvalue(input_height, input_width, in_dims.size(), - &trans_inp, t_out, t_ind, k); - TransCompute( - ndims, dev_context, tmp_indices, indices, trans); - TransCompute(ndims, dev_context, tmp_out, - output, trans); - if (!keepdim) { - output->Resize(out_dims); - indices->Resize(out_dims); - } - } - } -}; - -template -class KthvalueGradCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* out_grad = - context.Input(framework::GradVarName("Out")); - auto* indices = context.Input("Indices"); - auto* x_grad = - context.Output(framework::GradVarName("X")); - int axis = static_cast(context.Attr("axis")); - bool keepdim = 
static_cast(context.Attr("keepdim")); - auto in_dims = x->dims(); - auto out_dims = indices->dims(); - axis = (axis < 0) ? (in_dims.size() + axis) : axis; - if (!keepdim) { - std::vector tmp_out_shape; - for (int i = 0; i < axis; i++) { - tmp_out_shape.emplace_back(out_dims[i]); - } - tmp_out_shape.emplace_back(1); - for (int i = axis + 1; i < in_dims.size(); i++) { - tmp_out_shape.emplace_back(out_dims[i - 1]); - } - out_dims = phi::make_ddim(tmp_out_shape); - } - T* x_grad_data = x_grad->mutable_data(context.GetPlace()); - if (axis == in_dims.size() - 1) { - const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t input_width = in_dims[in_dims.size() - 1]; - memset(x_grad_data, 0, x_grad->numel() * sizeof(T)); - if (keepdim) { - kthvalueAssign(input_height, input_width, in_dims.size(), out_grad, - indices, x_grad_data); - } else { - auto& dev_context = - context.template device_context(); - framework::Tensor out_grad_tmp, indices_tmp; - out_grad_tmp.mutable_data(out_grad->dims(), dev_context.GetPlace()); - indices_tmp.mutable_data(indices->dims(), - dev_context.GetPlace()); - framework::TensorCopy(*out_grad, dev_context.GetPlace(), dev_context, - &out_grad_tmp); - framework::TensorCopy(*indices, dev_context.GetPlace(), dev_context, - &indices_tmp); - out_grad_tmp.Resize(out_dims); - indices_tmp.Resize(out_dims); - kthvalueAssign(input_height, input_width, in_dims.size(), &out_grad_tmp, - &indices_tmp, x_grad_data); - } - } else { - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.emplace_back(i); - } - trans.emplace_back(out_dims.size() - 1); - for (int i = axis + 1; i < out_dims.size() - 1; i++) { - trans.emplace_back(i); - } - trans.emplace_back(axis); - framework::DDim trans_dims(out_dims); - framework::DDim trans_in_dims(in_dims); - for (size_t i = 0; i < trans.size(); i++) { - trans_dims[i] = out_dims[trans[i]]; - trans_in_dims[i] = in_dims[trans[i]]; - } - framework::Tensor trans_dO, trans_ind; - trans_dO.mutable_data(trans_dims, context.GetPlace()); - trans_ind.mutable_data(trans_dims, context.GetPlace()); - int ndims = trans.size(); - auto& dev_context = - context.template device_context(); - if (keepdim) { - TransCompute( - ndims, dev_context, *out_grad, &trans_dO, trans); - TransCompute( - ndims, dev_context, *indices, &trans_ind, trans); - } else { - framework::Tensor out_grad_tmp, indices_tmp; - out_grad_tmp.mutable_data(out_grad->dims(), dev_context.GetPlace()); - indices_tmp.mutable_data(indices->dims(), - dev_context.GetPlace()); - framework::TensorCopy(*out_grad, dev_context.GetPlace(), dev_context, - &out_grad_tmp); - framework::TensorCopy(*indices, dev_context.GetPlace(), dev_context, - &indices_tmp); - out_grad_tmp.Resize(out_dims); - indices_tmp.Resize(out_dims); - TransCompute( - ndims, dev_context, out_grad_tmp, &trans_dO, trans); - TransCompute( - ndims, dev_context, indices_tmp, &trans_ind, trans); - } - const int64_t input_height = phi::product( - phi::slice_ddim(trans_in_dims, 0, trans_in_dims.size() - 1)); - const int64_t input_width = trans_in_dims[trans_in_dims.size() - 1]; - framework::Tensor tmp_out; - T* t_out = tmp_out.mutable_data(trans_in_dims, context.GetPlace()); - memset(t_out, 0, x_grad->numel() * sizeof(T)); - kthvalueAssign(input_height, input_width, in_dims.size(), - &trans_dO, &trans_ind, t_out); - TransCompute(ndims, dev_context, tmp_out, - x_grad, trans); - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/phi/infermeta/unary.cc 
b/paddle/phi/infermeta/unary.cc index 8a2d718f124578dbab0164048f8daa09e9a54e8f..baa5b39670f3795c738acc8df0adb6a1a00ee67f 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -554,6 +554,67 @@ void IsfiniteInferMeta(const MetaTensor& x, MetaTensor* out) { out->set_dtype(DataType::BOOL); } +void KthvalueInferMeta(const MetaTensor& x, + int k, + int axis, + bool keepdim, + MetaTensor* out, + MetaTensor* indices, + MetaConfig config) { + auto input_dims = x.dims(); + const int& dim_size = input_dims.size(); + PADDLE_ENFORCE_LT(axis, + dim_size, + phi::errors::InvalidArgument( + "the axis must be [-%d, %d), but received %d .", + dim_size, + dim_size, + axis)); + PADDLE_ENFORCE_GE(axis, + -dim_size, + phi::errors::InvalidArgument( + "the axis must be [-%d, %d), but received %d .", + dim_size, + dim_size, + axis)); + if (axis < 0) axis += dim_size; + PADDLE_ENFORCE_GE( + k, + 1, + phi::errors::InvalidArgument( + "the k in the kthvalue must >= 1, but received %d .", k)); + PADDLE_ENFORCE_GE( + input_dims.size(), + 1, + phi::errors::InvalidArgument("input of kthvalue must have >= 1d shape")); + if (config.is_runtime) { + PADDLE_ENFORCE_GE( + input_dims[axis], + k, + phi::errors::InvalidArgument( + "input of kthvalue must have >= %d columns in axis of %d", + k, + axis)); + } + std::vector dimvec; + for (int64_t i = 0; i < axis; i++) { + dimvec.emplace_back(input_dims[i]); + } + if (keepdim) { + dimvec.emplace_back(static_cast(1)); + } + for (int64_t i = axis + 1; i < dim_size; i++) { + dimvec.emplace_back(input_dims[i]); + } + DDim dims = phi::make_ddim(dimvec); + out->set_dims(dims); + out->share_lod(x); + out->set_dtype(x.dtype()); + indices->set_dims(dims); + indices->share_lod(x); + indices->set_dtype(x.dtype()); +} + void MatrixPowerInferMeta(const MetaTensor& x, int n, MetaTensor* out) { auto dims = x.dims(); auto n_dim = dims.size(); diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 7203a327b55698c0d4bd0271b2908cbc4a9b5ca1..00026f8598b078591adc9fb2103f4866868e1307 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -100,6 +100,14 @@ void IsEmptyInferMeta(const MetaTensor& x, MetaTensor* out); void IsfiniteInferMeta(const MetaTensor& input, MetaTensor* out); +void KthvalueInferMeta(const MetaTensor& x, + int k, + int axis, + bool keepdim, + MetaTensor* out, + MetaTensor* indices, + MetaConfig = MetaConfig()); + void MatrixPowerInferMeta(const MetaTensor& x, int n, MetaTensor* out); void MaxPoolWithIndexInferMeta(const MetaTensor& x, diff --git a/paddle/phi/kernels/cpu/kthvalue_grad_kernel.cc b/paddle/phi/kernels/cpu/kthvalue_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..185d6cbedc85db83032cecd3c2f6cd1b0f46cbaf --- /dev/null +++ b/paddle/phi/kernels/cpu/kthvalue_grad_kernel.cc @@ -0,0 +1,168 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/kthvalue_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { +template +static void kthvalueAssign(const Type& input_height, + const Type& input_width, + const int& input_dim, + const DenseTensor* input, + const DenseTensor* indices, + T* output_data) { +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (Type i = 0; i < input_height; ++i) { + if (input_dim == 1) { + auto e_input = EigenVector::Flatten(*input); + auto e_indices = EigenVector::Flatten(*indices); + output_data[i * input_width + e_indices(0)] = e_input(0); + } else { + auto e_input = EigenMatrix::Reshape(*input, input_dim - 1); + auto e_indices = EigenMatrix::Reshape(*indices, input_dim - 1); + output_data[i * input_width + e_indices(i, 0)] = e_input(i, 0); + } + } +} + +template +void KthvalueGradKernel(const Context& dev_ctx, + const DenseTensor& d_out, + const DenseTensor& x, + const DenseTensor& indices, + int k, + int axis, + bool keepdim, + DenseTensor* d_x) { + auto in_dims = x.dims(); + auto out_dims = indices.dims(); + axis = (axis < 0) ? (in_dims.size() + axis) : axis; + if (!keepdim) { + std::vector tmp_out_shape; + for (int i = 0; i < axis; i++) { + tmp_out_shape.emplace_back(out_dims[i]); + } + tmp_out_shape.emplace_back(1); + for (int i = axis + 1; i < in_dims.size(); i++) { + tmp_out_shape.emplace_back(out_dims[i - 1]); + } + out_dims = phi::make_ddim(tmp_out_shape); + } + T* x_grad_data = dev_ctx.template Alloc(d_x); + if (axis == in_dims.size() - 1) { + const int64_t input_height = + phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); + const int64_t input_width = in_dims[in_dims.size() - 1]; + memset(x_grad_data, 0, d_x->numel() * sizeof(T)); + if (keepdim) { + kthvalueAssign(input_height, + input_width, + in_dims.size(), + &d_out, + &indices, + x_grad_data); + } else { + DenseTensor out_grad_tmp, indices_tmp; + out_grad_tmp.Resize(d_out.dims()); + indices_tmp.Resize(indices.dims()); + dev_ctx.template Alloc(&out_grad_tmp); + dev_ctx.template Alloc(&indices_tmp); + Copy(dev_ctx, d_out, dev_ctx.GetPlace(), false, &out_grad_tmp); + Copy(dev_ctx, indices, dev_ctx.GetPlace(), false, &indices_tmp); + out_grad_tmp.Resize(out_dims); + indices_tmp.Resize(out_dims); + kthvalueAssign(input_height, + input_width, + in_dims.size(), + &out_grad_tmp, + &indices_tmp, + x_grad_data); + } + } else { + std::vector trans; + for (int i = 0; i < axis; i++) { + trans.emplace_back(i); + } + trans.emplace_back(out_dims.size() - 1); + for (int i = axis + 1; i < out_dims.size() - 1; i++) { + trans.emplace_back(i); + } + trans.emplace_back(axis); + DDim trans_dims(out_dims); + DDim trans_in_dims(in_dims); + for (size_t i = 0; i < trans.size(); i++) { + trans_dims[i] = out_dims[trans[i]]; + trans_in_dims[i] = in_dims[trans[i]]; + } + DenseTensor trans_dO, trans_ind; + trans_dO.Resize(trans_dims); + trans_ind.Resize(trans_dims); + dev_ctx.template Alloc(&trans_dO); + dev_ctx.template Alloc(&trans_ind); + int ndims = trans.size(); + if (keepdim) { + funcs::TransCompute( + ndims, dev_ctx, d_out, &trans_dO, trans); + funcs::TransCompute( + ndims, dev_ctx, indices, &trans_ind, trans); + } else { + DenseTensor out_grad_tmp, indices_tmp; + out_grad_tmp.Resize(d_out.dims()); + indices_tmp.Resize(indices.dims()); + dev_ctx.template Alloc(&out_grad_tmp); + 
dev_ctx.template Alloc(&indices_tmp); + Copy(dev_ctx, d_out, dev_ctx.GetPlace(), false, &out_grad_tmp); + Copy(dev_ctx, indices, dev_ctx.GetPlace(), false, &indices_tmp); + out_grad_tmp.Resize(out_dims); + indices_tmp.Resize(out_dims); + funcs::TransCompute( + ndims, dev_ctx, out_grad_tmp, &trans_dO, trans); + funcs::TransCompute( + ndims, dev_ctx, indices_tmp, &trans_ind, trans); + } + const int64_t input_height = phi::product( + phi::slice_ddim(trans_in_dims, 0, trans_in_dims.size() - 1)); + const int64_t input_width = trans_in_dims[trans_in_dims.size() - 1]; + DenseTensor tmp_out; + tmp_out.Resize(trans_in_dims); + T* t_out = dev_ctx.template Alloc(&tmp_out); + memset(t_out, 0, d_x->numel() * sizeof(T)); + kthvalueAssign(input_height, + input_width, + in_dims.size(), + &trans_dO, + &trans_ind, + t_out); + funcs::TransCompute( + ndims, dev_ctx, tmp_out, d_x, trans); + } +} +} // namespace phi + +PD_REGISTER_KERNEL(kthvalue_grad, + CPU, + ALL_LAYOUT, + phi::KthvalueGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/kthvalue_kernel.cc b/paddle/phi/kernels/cpu/kthvalue_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..5e436623cae7bbc27903eff8e2bf01a41ded9c94 --- /dev/null +++ b/paddle/phi/kernels/cpu/kthvalue_kernel.cc @@ -0,0 +1,167 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/kthvalue_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { +template +static void getKthvalue(Type input_height, + Type input_width, + int input_dim, + const DenseTensor* input, + T* t_out, + Type* t_indices, + const int& k) { + bool partial_sort_flag = (k * 64) < input_width; +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (Type i = 0; i < input_height; ++i) { + std::vector> col_vec; + col_vec.reserve(input_width); + if (input_dim == 1) { + auto e_input = EigenVector::Flatten(*input); + for (Type j = 0; j < input_width; ++j) { + col_vec.emplace_back(std::pair(e_input(j), j)); + } + } else { + auto e_input = EigenMatrix::Reshape(*input, input_dim - 1); + for (Type j = 0; j < input_width; ++j) { + col_vec.emplace_back(std::pair(e_input(i, j), j)); + } + } + if (partial_sort_flag) { + std::partial_sort( + col_vec.begin(), + col_vec.begin() + k, + col_vec.end(), + [](const std::pair& l, const std::pair& r) { + return (!std::isnan(static_cast(l.first)) && + std::isnan(static_cast(r.first))) || + (l.first < r.first); + }); + } else { + std::nth_element( + col_vec.begin(), + col_vec.begin() + k - 1, + col_vec.end(), + [](const std::pair& l, const std::pair& r) { + return (!std::isnan(static_cast(l.first)) && + std::isnan(static_cast(r.first))) || + (l.first < r.first); + }); + } + t_out[i] = col_vec[k - 1].first; + t_indices[i] = col_vec[k - 1].second; + } +} + +template +void KthvalueKernel(const Context& dev_ctx, + const DenseTensor& x, + int k, + int axis, + bool keepdim, + DenseTensor* output, + DenseTensor* indices) { + const auto& in_dims = x.dims(); + if (axis < 0) axis += in_dims.size(); + T* output_data = dev_ctx.template Alloc(output); + int64_t* indices_data = dev_ctx.template Alloc(indices); + auto out_dims = output->dims(); + if (axis == in_dims.size() - 1) { + const int64_t& input_height = + phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); + const int64_t& input_width = in_dims[in_dims.size() - 1]; + getKthvalue(input_height, + input_width, + in_dims.size(), + &x, + output_data, + indices_data, + k); + } else { + std::vector trans; + for (int i = 0; i < axis; i++) { + trans.emplace_back(i); + } + trans.emplace_back(in_dims.size() - 1); + for (int i = axis + 1; i < in_dims.size() - 1; i++) { + trans.emplace_back(i); + } + trans.emplace_back(axis); + if (!keepdim) { + std::vector tmp_out_shape; + for (int i = 0; i < axis; i++) { + tmp_out_shape.emplace_back(in_dims[i]); + } + tmp_out_shape.emplace_back(1); + for (int i = axis + 1; i < in_dims.size(); i++) { + tmp_out_shape.emplace_back(in_dims[i]); + } + DDim tmp_out_dims = phi::make_ddim(tmp_out_shape); + output->Resize(tmp_out_dims); + indices->Resize(tmp_out_dims); + } + DDim trans_dims(in_dims); + DDim trans_out_dims(in_dims); + + for (size_t i = 0; i < trans.size(); i++) { + trans_dims[i] = in_dims[trans[i]]; + trans_out_dims[i] = in_dims[trans[i]]; + } + trans_out_dims[in_dims.size() - 1] = 1; + DenseTensor trans_inp; + trans_inp.Resize(trans_dims); + dev_ctx.template Alloc(&trans_inp); + int ndims = trans.size(); + funcs::TransCompute( + ndims, dev_ctx, x, &trans_inp, trans); + + const int64_t input_height = + phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); + const int64_t input_width = trans_dims[trans_dims.size() - 1]; + DenseTensor tmp_out, tmp_indices; + 
tmp_out.Resize(trans_out_dims); + T* t_out = dev_ctx.template Alloc(&tmp_out); + tmp_indices.Resize(trans_out_dims); + int64_t* t_ind = dev_ctx.template Alloc(&tmp_indices); + getKthvalue( + input_height, input_width, in_dims.size(), &trans_inp, t_out, t_ind, k); + funcs::TransCompute( + ndims, dev_ctx, tmp_indices, indices, trans); + funcs::TransCompute( + ndims, dev_ctx, tmp_out, output, trans); + if (!keepdim) { + output->Resize(out_dims); + indices->Resize(out_dims); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(kthvalue, + CPU, + ALL_LAYOUT, + phi::KthvalueKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/kthvalue_grad_kernel.cu b/paddle/phi/kernels/gpu/kthvalue_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..f6e96046a2bd799f4a6b8d30a239afb505582deb --- /dev/null +++ b/paddle/phi/kernels/gpu/kthvalue_grad_kernel.cu @@ -0,0 +1,70 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/kthvalue_grad_kernel.h" + +#include "paddle/fluid/operators/top_k_function_cuda.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { +static int getBlockSize(int col) { + if (col > 512) + return 1024; + else if (col > 256 && col <= 512) + return 512; + else if (col > 128 && col <= 256) + return 256; + else if (col > 64 && col <= 128) + return 128; + else + return 64; +} + +template +void KthvalueGradKernel(const Context& dev_ctx, + const DenseTensor& d_out, + const DenseTensor& x, + const DenseTensor& indices, + int k, + int axis, + bool keepdim, + DenseTensor* d_x) { + const auto& in_dims = x.dims(); + auto out_dims = indices.dims(); + if (axis < 0) axis += in_dims.size(); + T* x_grad_data = dev_ctx.template Alloc(d_x); + const T* out_grad_data = d_out.data(); + const int64_t* indices_data = indices.data(); + int pre, n, post; + paddle::operators::GetDims(in_dims, axis, &pre, &n, &post); + int block_size = getBlockSize(post * k); + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1); + int grid_size = std::min(max_blocks, pre); + paddle::operators::AssignGradWithAxis< + T><<>>( + out_grad_data, indices_data, x_grad_data, pre, post, n, 1); +} + +} // namespace phi + +PD_REGISTER_KERNEL(kthvalue_grad, + GPU, + ALL_LAYOUT, + phi::KthvalueGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/kthvalue_kernel.cu b/paddle/phi/kernels/gpu/kthvalue_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..4218e153ec29bd1757b2405f0af638040de9bff2 --- /dev/null +++ b/paddle/phi/kernels/gpu/kthvalue_kernel.cu @@ -0,0 +1,252 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/kthvalue_kernel.h" + +#include "paddle/fluid/operators/top_k_function_cuda.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { +inline int getBlockSize(int col) { + if (col > 512) + return 1024; + else if (col > 256 && col <= 512) + return 512; + else if (col > 128 && col <= 256) + return 256; + else if (col > 64 && col <= 128) + return 128; + else + return 64; +} + +template +bool SortKthvalue(const phi::GPUContext& dev_ctx, + const DenseTensor* input_tensor, + const int64_t num_cols, + const int64_t num_rows, + const int k, + DenseTensor* out_tensor, + DenseTensor* indices_tensor) { + auto cu_stream = dev_ctx.stream(); + DenseTensor input_indices; + const std::vector dims = {num_rows, num_cols}; + auto dim = phi::make_ddim(dims); + input_indices.Resize(dim); + dev_ctx.template Alloc(&input_indices); + size_t temp_storage_bytes = -1; + int block_size = getBlockSize(num_cols); + unsigned int maxGridDimX = dev_ctx.GetCUDAMaxGridDimSize()[0]; + unsigned int grid_size = num_rows < maxGridDimX + ? static_cast(num_rows) + : maxGridDimX; + paddle::operators::InitIndex< + int64_t><<>>( + input_indices.data(), num_rows, num_cols); + cub::CountingInputIterator counting_iter(0); + cub::TransformInputIterator> + segment_offsets_t(counting_iter, + paddle::operators::SegmentOffsetIter(num_cols)); + T* sorted_values_ptr; + int64_t* sorted_indices_ptr; + DenseTensor temp_values, temp_indices; + const T* input = input_tensor->data(); + T* values = out_tensor->data(); + int64_t* indices = indices_tensor->mutable_data(dev_ctx.GetPlace()); + temp_values.Resize(dim); + temp_indices.Resize(dim); + sorted_values_ptr = dev_ctx.template Alloc(&temp_values); + sorted_indices_ptr = dev_ctx.template Alloc(&temp_indices); + auto err = + cub::DeviceSegmentedRadixSort::SortPairs(nullptr, + temp_storage_bytes, + input, + sorted_values_ptr, + input_indices.data(), + sorted_indices_ptr, + num_cols * num_rows, + num_rows, + segment_offsets_t, + segment_offsets_t + 1, + 0, + sizeof(T) * 8, + cu_stream); +#ifdef __HIPCC__ + if (err != hipSuccess) { + LOG(ERROR) << "KthvalueOP failed as could not launch " + "hipcub::DeviceSegmentedRadixSort::SortPairs, status: " + << hipGetErrorString(err); + return false; + } +#else + if (err != cudaSuccess) { + LOG(ERROR) << "KthvalueOP failed as could not launch " + "cub::DeviceSegmentedRadixSort::SortPairs, status: " + << cudaGetErrorString(err); + return false; + } +#endif + DenseTensor temp_storage; + temp_storage.Resize({static_cast(temp_storage_bytes / sizeof(uint8_t))}); + uint8_t* temp_storage_data = dev_ctx.template Alloc(&temp_storage); + + err = cub::DeviceSegmentedRadixSort::SortPairs(temp_storage_data, + temp_storage_bytes, + input, + sorted_values_ptr, + input_indices.data(), + sorted_indices_ptr, + num_cols * num_rows, + num_rows, + segment_offsets_t, + segment_offsets_t + 1, + 0, + sizeof(T) * 8, + 
cu_stream); +#ifdef __HIPCC__ + if (err != hipSuccess) { + LOG(ERROR) << "KthvalueOP failed as could not launch " + "hipcub::DeviceSegmentedRadixSort::SortPairs, " + << temp_storage_bytes << ", status: " << hipGetErrorString(err); + return false; + } +#else + if (err != cudaSuccess) { + LOG(ERROR) << "KthvalueOP failed as could not launch " + "cub::DeviceSegmentedRadixSort::SortPairs, " + << temp_storage_bytes << ", status: " << cudaGetErrorString(err); + return false; + } +#endif + auto& dev = *dev_ctx.eigen_device(); + const Eigen::DSizes slice_indices{0, k - 1}; + const Eigen::DSizes slice_sizes{num_rows, 1}; + auto e_indices = EigenMatrix::From(*indices_tensor, dim); + auto e_tmp_indices = + EigenMatrix::From(static_cast(temp_indices)); + std::vector odims = {static_cast(num_rows), static_cast(1)}; + dim = phi::make_ddim(odims); + auto e_values = EigenMatrix::From(*out_tensor, dim); + auto e_tmp_values = + EigenMatrix::From(static_cast(temp_values)); + + funcs::EigenSlice, int64_t, 2>::Eval( + dev, e_indices, e_tmp_indices, slice_indices, slice_sizes); + funcs::EigenSlice, T, 2>::Eval( + dev, e_values, e_tmp_values, slice_indices, slice_sizes); + return true; +} + +template +void KthvalueKernel(const Context& dev_ctx, + const DenseTensor& x, + int k, + int axis, + bool keepdim, + DenseTensor* output, + DenseTensor* indices) { + const auto& in_dims = x.dims(); + if (axis < 0) axis += in_dims.size(); + auto out_dims = output->dims(); + const T* input_data = x.data(); + T* output_data = dev_ctx.template Alloc(output); + int64_t* indices_data = dev_ctx.template Alloc(indices); + + if (axis == in_dims.size() - 1) { + const int64_t& input_height = + phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); + const int64_t& input_width = in_dims[in_dims.size() - 1]; + PADDLE_ENFORCE_EQ( + SortKthvalue( + dev_ctx, &x, input_width, input_height, k, output, indices), + true, + phi::errors::External("KthvalueOP: Error when use cub sorting")); + return; + } else { + std::vector trans; + for (int i = 0; i < axis; i++) { + trans.emplace_back(i); + } + trans.emplace_back(in_dims.size() - 1); + for (int i = axis + 1; i < in_dims.size() - 1; i++) { + trans.emplace_back(i); + } + trans.emplace_back(axis); + if (!keepdim) { + std::vector tmp_out_shape; + for (int i = 0; i < axis; i++) { + tmp_out_shape.emplace_back(in_dims[i]); + } + tmp_out_shape.emplace_back(1); + for (int i = axis + 1; i < in_dims.size(); i++) { + tmp_out_shape.emplace_back(in_dims[i]); + } + DDim tmp_out_dims = phi::make_ddim(tmp_out_shape); + output->Resize(tmp_out_dims); + indices->Resize(tmp_out_dims); + } + DDim trans_dims(in_dims); + DDim trans_out_dims(in_dims); + for (int i = 0; i < trans.size(); i++) { + trans_dims[i] = in_dims[trans[i]]; + trans_out_dims[i] = in_dims[trans[i]]; + } + trans_out_dims[in_dims.size() - 1] = 1; + DenseTensor trans_input; + trans_input.mutable_data(trans_dims, dev_ctx.GetPlace()); + int ndims = trans.size(); + funcs::TransCompute( + ndims, dev_ctx, x, &trans_input, trans); + DenseTensor trans_ind, trans_out; + trans_ind.mutable_data(trans_out_dims, dev_ctx.GetPlace()); + trans_out.mutable_data(trans_out_dims, dev_ctx.GetPlace()); + const int64_t input_height = + phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); + const int64_t input_width = trans_dims[trans_dims.size() - 1]; + PADDLE_ENFORCE_EQ( + SortKthvalue(dev_ctx, + &trans_input, + input_width, + input_height, + k, + &trans_out, + &trans_ind), + true, + phi::errors::External("KthvalueOP: Error when use cub sorting")); 
+    funcs::TransCompute<phi::GPUContext, int64_t>(
+        ndims, dev_ctx, trans_ind, indices, trans);
+    funcs::TransCompute<phi::GPUContext, T>(
+        ndims, dev_ctx, trans_out, output, trans);
+    if (!keepdim) {
+      output->Resize(out_dims);
+      indices->Resize(out_dims);
+    }
+  }
+}
+}  // namespace phi
+
+PD_REGISTER_KERNEL(kthvalue,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::KthvalueKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/kthvalue_grad_kernel.h b/paddle/phi/kernels/kthvalue_grad_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..488dde8237b0882b6606834f8a510ef360da1b24
--- /dev/null
+++ b/paddle/phi/kernels/kthvalue_grad_kernel.h
@@ -0,0 +1,30 @@
+
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+template <typename T, typename Context>
+void KthvalueGradKernel(const Context& dev_ctx,
+                        const DenseTensor& d_out,
+                        const DenseTensor& x,
+                        const DenseTensor& indices,
+                        int k,
+                        int axis,
+                        bool keepdim,
+                        DenseTensor* d_x);
+}  // namespace phi
diff --git a/paddle/phi/kernels/kthvalue_kernel.h b/paddle/phi/kernels/kthvalue_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..4809b9af4832f5d9c036d26adfc6ba5c7a808889
--- /dev/null
+++ b/paddle/phi/kernels/kthvalue_kernel.h
@@ -0,0 +1,30 @@
+
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void KthvalueKernel(const Context& dev_ctx,
+                    const DenseTensor& x,
+                    int k,
+                    int axis,
+                    bool keepdim,
+                    DenseTensor* out,
+                    DenseTensor* indices);
+}  // namespace phi
diff --git a/paddle/phi/ops/compat/kthvalue_sig.cc b/paddle/phi/ops/compat/kthvalue_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e59e9de1e43822ed8d50b8c1d1888e0d1d14540f
--- /dev/null
+++ b/paddle/phi/ops/compat/kthvalue_sig.cc
@@ -0,0 +1,29 @@
+
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature KthvalueGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("kthvalue_grad",
+                         {GradVarName("Out"), "X", "Indices"},
+                         {"k", "axis", "keepdim"},
+                         {GradVarName("X")});
+}
+
+}  // namespace phi
+PD_REGISTER_ARG_MAPPING_FN(kthvalue_grad, phi::KthvalueGradOpArgumentMapping);
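For context, the shape logic this patch moves from `KthvalueOp::InferShape` into `phi::KthvalueInferMeta` reduces the chosen axis to size 1 when `keepdim` is true and drops it otherwise, assigning the same dims to both `Out` and `Indices`. Below is a minimal standalone sketch of that rule in plain C++ over `std::vector`, not Paddle's `DDim`/`MetaTensor` API; the function and variable names are illustrative only.

```cpp
#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

// Output-shape rule applied to both Out and Indices: keep dims before and
// after `axis`; the reduced axis becomes 1 with keepdim, disappears without it.
std::vector<int64_t> KthvalueOutShape(const std::vector<int64_t>& in_dims,
                                      int k, int axis, bool keepdim) {
  const int rank = static_cast<int>(in_dims.size());
  assert(rank >= 1 && k >= 1);
  if (axis < 0) axis += rank;      // negative axis counts from the back
  assert(axis >= 0 && axis < rank);
  assert(in_dims[axis] >= k);      // checked at runtime by the InferMeta
  std::vector<int64_t> out;
  for (int i = 0; i < axis; ++i) out.push_back(in_dims[i]);
  if (keepdim) out.push_back(1);
  for (int i = axis + 1; i < rank; ++i) out.push_back(in_dims[i]);
  return out;
}

int main() {
  // A [2, 5, 3] input reduced along axis 1 gives [2, 3] (or [2, 1, 3] with keepdim).
  for (int64_t d : KthvalueOutShape({2, 5, 3}, /*k=*/2, /*axis=*/1, /*keepdim=*/false)) {
    std::cout << d << ' ';
  }
  std::cout << '\n';
  return 0;
}
```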