Unverified commit d7ccd6bf, authored by xiongkun, committed by GitHub

[phi] transfer kthvalue from fluid to phi (#40676)

* transfer kthvalue from fluid to phi

* transfer infershape
Parent 8c713223
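For reference, kthvalue returns the k-th smallest value (and its index) along an axis; the CPU path in this patch implements it with std::nth_element / std::partial_sort. A minimal standalone sketch of the same semantics for a single row follows; the names and sample data are illustrative only and are not part of the patch.

// Illustrative sketch: k-th smallest value and its index for one row,
// mirroring the std::nth_element path used by the CPU kernel in this patch.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

int main() {
  std::vector<float> row = {3.f, 1.f, 4.f, 1.5f, 9.f};  // assumed sample data
  int k = 2;  // 1-based, as in the op's "k" attribute
  std::vector<std::pair<float, int64_t>> col_vec;
  for (int64_t j = 0; j < static_cast<int64_t>(row.size()); ++j) {
    col_vec.emplace_back(row[j], j);
  }
  std::nth_element(
      col_vec.begin(), col_vec.begin() + k - 1, col_vec.end(),
      [](const auto& l, const auto& r) { return l.first < r.first; });
  // Prints "value=1.5 index=3" for this sample row.
  std::cout << "value=" << col_vec[k - 1].first
            << " index=" << col_vec[k - 1].second << std::endl;
  return 0;
}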
...@@ -12,11 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/kthvalue_op.h"
#include <memory>
#include "paddle/fluid/framework/generator.h"
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/phi/infermeta/unary.h"
namespace paddle {
namespace operators {
...@@ -25,54 +26,6 @@ class KthvalueOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "kthvalue");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "kthvalue");
OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "kthvalue");
auto input_dims = ctx->GetInputDim("X");
const int& dim_size = input_dims.size();
int axis = static_cast<int>(ctx->Attrs().Get<int>("axis"));
PADDLE_ENFORCE_LT(axis, dim_size,
paddle::platform::errors::InvalidArgument(
"the axis must be [-%d, %d), but received %d .",
dim_size, dim_size, axis));
PADDLE_ENFORCE_GE(axis, -dim_size,
paddle::platform::errors::InvalidArgument(
"the axis must be [-%d, %d), but received %d .",
dim_size, dim_size, axis));
if (axis < 0) axis += dim_size;
int k = static_cast<int>(ctx->Attrs().Get<int>("k"));
PADDLE_ENFORCE_GE(
k, 1, paddle::platform::errors::InvalidArgument(
"the k in the kthvalue must >= 1, but received %d .", k));
PADDLE_ENFORCE_GE(input_dims.size(), 1,
paddle::platform::errors::InvalidArgument(
"input of kthvalue must have >= 1d shape"));
if (ctx->IsRuntime()) {
PADDLE_ENFORCE_GE(
input_dims[axis], k,
paddle::platform::errors::InvalidArgument(
"input of kthvalue must have >= %d columns in axis of %d", k,
axis));
}
bool keepdim = ctx->Attrs().Get<bool>("keepdim");
std::vector<int64_t> dimvec;
for (int64_t i = 0; i < axis; i++) {
dimvec.emplace_back(input_dims[i]);
}
if (keepdim) {
dimvec.emplace_back(static_cast<int64_t>(1));
}
for (int64_t i = axis + 1; i < dim_size; i++) {
dimvec.emplace_back(input_dims[i]);
}
framework::DDim dims = phi::make_ddim(dimvec);
ctx->SetOutputDim("Out", dims);
ctx->SetOutputDim("Indices", dims);
ctx->ShareLoD("X", "Out");
ctx->ShareLoD("X", "Indices");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
...@@ -155,20 +108,13 @@ class KthvalueGradOpMaker : public framework::SingleGradOpMaker<T> {
} // namespace operators
} // namespace paddle
DECLARE_INFER_SHAPE_FUNCTOR(kthvalue, KthvalueInferShapeFunctor,
PD_INFER_META(phi::KthvalueInferMeta));
namespace ops = paddle::operators;
REGISTER_OPERATOR(kthvalue, ops::KthvalueOp, ops::KthvalueOpMaker,
ops::KthvalueGradOpMaker<paddle::framework::OpDesc>,
ops::KthvalueGradOpMaker<paddle::imperative::OpBase>,
KthvalueInferShapeFunctor);
REGISTER_OP_CPU_KERNEL(
kthvalue, ops::KthvalueCPUKernel<paddle::platform::CPUPlace, float>,
ops::KthvalueCPUKernel<paddle::platform::CPUPlace, double>,
ops::KthvalueCPUKernel<paddle::platform::CPUPlace, int32_t>,
ops::KthvalueCPUKernel<paddle::platform::CPUPlace, int64_t>);
REGISTER_OPERATOR(kthvalue_grad, ops::KthvalueOpGrad);
REGISTER_OP_CPU_KERNEL(
kthvalue_grad,
ops::KthvalueGradCPUKernel<paddle::platform::CPUPlace, float>,
ops::KthvalueGradCPUKernel<paddle::platform::CPUPlace, double>,
ops::KthvalueGradCPUKernel<paddle::platform::CPUPlace, int32_t>,
ops::KthvalueGradCPUKernel<paddle::platform::CPUPlace, int64_t>);
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/kthvalue_op.h"
#include "paddle/fluid/operators/top_k_function_cuda.h"
#ifdef __NVCC__
#include "cub/cub.cuh"
#endif
#ifdef __HIPCC__
#include <hipcub/hipcub.hpp>
#endif
namespace paddle {
namespace operators {
int getBlockSize(int col) {
if (col > 512)
return 1024;
else if (col > 256 && col <= 512)
return 512;
else if (col > 128 && col <= 256)
return 256;
else if (col > 64 && col <= 128)
return 128;
else
return 64;
}
template <typename T>
bool SortKthvalue(const platform::CUDADeviceContext& ctx,
const framework::Tensor* input_tensor, const int64_t num_cols,
const int64_t num_rows, const int k,
framework::Tensor* out_tensor,
framework::Tensor* indices_tensor) {
auto cu_stream = ctx.stream();
framework::Tensor input_indices;
const std::vector<int64_t> dims = {num_rows, num_cols};
auto dim = phi::make_ddim(dims);
input_indices.Resize(dim);
input_indices.mutable_data<int64_t>(ctx.GetPlace());
size_t temp_storage_bytes = -1;
int block_size = getBlockSize(num_cols);
unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0];
unsigned int grid_size = num_rows < maxGridDimX
? static_cast<unsigned int>(num_rows)
: maxGridDimX;
InitIndex<int64_t><<<grid_size, block_size, 0, cu_stream>>>(
input_indices.data<int64_t>(), num_rows, num_cols);
cub::CountingInputIterator<int64_t> counting_iter(0);
cub::TransformInputIterator<int64_t, SegmentOffsetIter,
cub::CountingInputIterator<int64_t>>
segment_offsets_t(counting_iter, SegmentOffsetIter(num_cols));
T* sorted_values_ptr;
int64_t* sorted_indices_ptr;
framework::Tensor temp_values, temp_indices;
const T* input = input_tensor->data<T>();
T* values = out_tensor->data<T>();
int64_t* indices = indices_tensor->mutable_data<int64_t>(ctx.GetPlace());
temp_values.Resize(dim);
temp_indices.Resize(dim);
sorted_values_ptr = temp_values.mutable_data<T>(ctx.GetPlace());
sorted_indices_ptr = temp_indices.mutable_data<int64_t>(ctx.GetPlace());
auto err = cub::DeviceSegmentedRadixSort::SortPairs(
nullptr, temp_storage_bytes, input, sorted_values_ptr,
input_indices.data<int64_t>(), sorted_indices_ptr, num_cols * num_rows,
num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8,
cu_stream);
#ifdef __HIPCC__
if (err != hipSuccess) {
LOG(ERROR) << "KthvalueOP failed as could not launch "
"hipcub::DeviceSegmentedRadixSort::SortPairs, status: "
<< hipGetErrorString(err);
return false;
}
#else
if (err != cudaSuccess) {
LOG(ERROR) << "KthvalueOP failed as could not launch "
"cub::DeviceSegmentedRadixSort::SortPairs, status: "
<< cudaGetErrorString(err);
return false;
}
#endif
framework::Tensor temp_storage;
temp_storage.mutable_data<uint8_t>(ctx.GetPlace(), temp_storage_bytes);
err = cub::DeviceSegmentedRadixSort::SortPairs(
temp_storage.data<uint8_t>(), temp_storage_bytes, input,
sorted_values_ptr, input_indices.data<int64_t>(), sorted_indices_ptr,
num_cols * num_rows, num_rows, segment_offsets_t, segment_offsets_t + 1,
0, sizeof(T) * 8, cu_stream);
#ifdef __HIPCC__
if (err != hipSuccess) {
LOG(ERROR) << "KthvalueOP failed as could not launch "
"hipcub::DeviceSegmentedRadixSort::SortPairs, "
<< temp_storage_bytes << ", status: " << hipGetErrorString(err);
return false;
}
#else
if (err != cudaSuccess) {
LOG(ERROR) << "KthvalueOP failed as could not launch "
"cub::DeviceSegmentedRadixSort::SortPairs, "
<< temp_storage_bytes << ", status: " << cudaGetErrorString(err);
return false;
}
#endif
auto& dev = *ctx.eigen_device();
const Eigen::DSizes<Eigen::DenseIndex, 2> slice_indices{0, k - 1};
const Eigen::DSizes<Eigen::DenseIndex, 2> slice_sizes{num_rows, 1};
auto e_indices = framework::EigenMatrix<int64_t>::From(*indices_tensor, dim);
auto e_tmp_indices = framework::EigenMatrix<int64_t>::From(
static_cast<const framework::Tensor>(temp_indices));
std::vector<int> odims = {static_cast<int>(num_rows), static_cast<int>(1)};
dim = phi::make_ddim(odims);
auto e_values = framework::EigenMatrix<T>::From(*out_tensor, dim);
auto e_tmp_values = framework::EigenMatrix<T>::From(
static_cast<const framework::Tensor>(temp_values));
EigenSlice<std::decay_t<decltype(dev)>, int64_t, 2>::Eval(
dev, e_indices, e_tmp_indices, slice_indices, slice_sizes);
EigenSlice<std::decay_t<decltype(dev)>, T, 2>::Eval(
dev, e_values, e_tmp_values, slice_indices, slice_sizes);
return true;
}
template <typename DeviceContext, typename T>
class KthvalueOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(ctx.GetPlace()), true,
platform::errors::InvalidArgument(
"It must use CUDAPlace, you must check your device set."));
auto* input = ctx.Input<framework::Tensor>("X");
auto* output = ctx.Output<framework::Tensor>("Out");
auto* indices = ctx.Output<framework::Tensor>("Indices");
int k = static_cast<int>(ctx.Attr<int>("k"));
int axis = static_cast<int>(ctx.Attr<int>("axis"));
bool keepdim = static_cast<bool>(ctx.Attr<bool>("keepdim"));
const auto& in_dims = input->dims();
if (axis < 0) axis += in_dims.size();
auto out_dims = output->dims();
const T* input_data = input->data<T>();
T* output_data = output->mutable_data<T>(ctx.GetPlace());
int64_t* indices_data = indices->mutable_data<int64_t>(ctx.GetPlace());
if (axis == in_dims.size() - 1) {
const int64_t& input_height =
phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
const int64_t& input_width = in_dims[in_dims.size() - 1];
const auto& dev_ctx = ctx.cuda_device_context();
PADDLE_ENFORCE_EQ(SortKthvalue<T>(dev_ctx, input, input_width,
input_height, k, output, indices),
true, platform::errors::External(
"KthvalueOP: Error when use cub sorting"));
return;
} else {
std::vector<int> trans;
for (int i = 0; i < axis; i++) {
trans.emplace_back(i);
}
trans.emplace_back(in_dims.size() - 1);
for (int i = axis + 1; i < in_dims.size() - 1; i++) {
trans.emplace_back(i);
}
trans.emplace_back(axis);
if (!keepdim) {
std::vector<int> tmp_out_shape;
for (int i = 0; i < axis; i++) {
tmp_out_shape.emplace_back(in_dims[i]);
}
tmp_out_shape.emplace_back(1);
for (int i = axis + 1; i < in_dims.size(); i++) {
tmp_out_shape.emplace_back(in_dims[i]);
}
framework::DDim tmp_out_dims = phi::make_ddim(tmp_out_shape);
output->Resize(tmp_out_dims);
indices->Resize(tmp_out_dims);
}
framework::DDim trans_dims(in_dims);
framework::DDim trans_out_dims(in_dims);
for (int i = 0; i < trans.size(); i++) {
trans_dims[i] = in_dims[trans[i]];
trans_out_dims[i] = in_dims[trans[i]];
}
trans_out_dims[in_dims.size() - 1] = 1;
framework::Tensor trans_input;
trans_input.mutable_data<T>(trans_dims, ctx.GetPlace());
int ndims = trans.size();
const auto& dev_ctx = ctx.cuda_device_context();
TransCompute<platform::CUDADeviceContext, T>(ndims, dev_ctx, *input,
&trans_input, trans);
framework::Tensor trans_ind, trans_out;
trans_ind.mutable_data<int64_t>(trans_out_dims, ctx.GetPlace());
trans_out.mutable_data<T>(trans_out_dims, ctx.GetPlace());
const int64_t input_height =
phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1));
const int64_t input_width = trans_dims[trans_dims.size() - 1];
PADDLE_ENFORCE_EQ(
SortKthvalue<T>(dev_ctx, &trans_input, input_width, input_height, k,
&trans_out, &trans_ind),
true,
platform::errors::External("KthvalueOP: Error when use cub sorting"));
TransCompute<platform::CUDADeviceContext, int64_t>(
ndims, dev_ctx, trans_ind, indices, trans);
TransCompute<platform::CUDADeviceContext, T>(ndims, dev_ctx, trans_out,
output, trans);
if (!keepdim) {
output->Resize(out_dims);
indices->Resize(out_dims);
}
}
}
};
template <typename DeviceContext, typename T>
class KthvalueOpGradCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(context.GetPlace()), true,
platform::errors::InvalidArgument(
"It must use CUDAPlace, you must check your device set."));
auto* x = context.Input<framework::Tensor>("X");
auto* out_grad =
context.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* indices = context.Input<framework::Tensor>("Indices");
auto* x_grad =
context.Output<framework::Tensor>(framework::GradVarName("X"));
int axis = context.Attr<int>("axis");
int k = static_cast<int>(context.Attr<int>("k"));
const auto& in_dims = x->dims();
auto out_dims = indices->dims();
if (axis < 0) axis += in_dims.size();
T* x_grad_data = x_grad->mutable_data<T>(context.GetPlace());
const T* out_grad_data = out_grad->data<T>();
const int64_t* indices_data = indices->data<int64_t>();
int pre, n, post;
GetDims(in_dims, axis, &pre, &n, &post);
auto& dev_ctx = context.cuda_device_context();
int block_size = getBlockSize(post * k);
int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1);
int grid_size = std::min(max_blocks, pre);
AssignGradWithAxis<T><<<grid_size, block_size, 64 * 4, dev_ctx.stream()>>>(
out_grad_data, indices_data, x_grad_data, pre, post, n, 1);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
kthvalue,
ops::KthvalueOpCUDAKernel<paddle::platform::CUDADeviceContext, float>,
ops::KthvalueOpCUDAKernel<paddle::platform::CUDADeviceContext, double>,
ops::KthvalueOpCUDAKernel<paddle::platform::CUDADeviceContext, int>,
ops::KthvalueOpCUDAKernel<paddle::platform::CUDADeviceContext, int64_t>);
REGISTER_OP_CUDA_KERNEL(
kthvalue_grad,
ops::KthvalueOpGradCUDAKernel<paddle::platform::CUDADeviceContext, float>,
ops::KthvalueOpGradCUDAKernel<paddle::platform::CUDADeviceContext, double>,
ops::KthvalueOpGradCUDAKernel<paddle::platform::CUDADeviceContext, int>,
ops::KthvalueOpGradCUDAKernel<paddle::platform::CUDADeviceContext,
int64_t>);
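The CUDA path above (and its phi counterpart later in this diff) reduces kthvalue to a segmented radix sort over rows followed by a slice at column k-1. A minimal CPU-side sketch of that sort-then-slice idea, with illustrative names and data only, assuming a row-major 2D layout:

// Sketch of the sort-then-slice strategy used by SortKthvalue: sort each row
// together with its indices, then take column k-1. Not the CUDA code itself.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
  const int64_t num_rows = 2, num_cols = 4;
  std::vector<float> data = {4.f, 2.f, 8.f, 6.f,   // row 0
                             1.f, 7.f, 5.f, 3.f};  // row 1
  const int k = 2;
  for (int64_t r = 0; r < num_rows; ++r) {
    std::vector<int64_t> idx(num_cols);
    std::iota(idx.begin(), idx.end(), 0);
    std::sort(idx.begin(), idx.end(), [&](int64_t a, int64_t b) {
      return data[r * num_cols + a] < data[r * num_cols + b];
    });
    int64_t col = idx[k - 1];  // column k-1 of the sorted row
    // Prints: row 0 -> value=4 index=0, row 1 -> value=3 index=3.
    std::cout << "row " << r << ": value=" << data[r * num_cols + col]
              << " index=" << col << "\n";
  }
  return 0;
}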
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <iostream>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/transpose_op.h"
namespace paddle {
namespace operators {
template <typename T, typename Type>
static void getKthvalue(Type input_height, Type input_width, int input_dim,
const framework::Tensor* input, T* t_out,
Type* t_indices, const int& k) {
bool partial_sort_flag = (k * 64) < input_width;
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for
#endif
for (Type i = 0; i < input_height; ++i) {
std::vector<std::pair<T, Type>> col_vec;
col_vec.reserve(input_width);
if (input_dim == 1) {
auto e_input = framework::EigenVector<T>::Flatten(*input);
for (Type j = 0; j < input_width; ++j) {
col_vec.emplace_back(std::pair<T, Type>(e_input(j), j));
}
} else {
auto e_input = framework::EigenMatrix<T>::Reshape(*input, input_dim - 1);
for (Type j = 0; j < input_width; ++j) {
col_vec.emplace_back(std::pair<T, Type>(e_input(i, j), j));
}
}
if (partial_sort_flag) {
std::partial_sort(
col_vec.begin(), col_vec.begin() + k, col_vec.end(),
[](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
return (!std::isnan(static_cast<double>(l.first)) &&
std::isnan(static_cast<double>(r.first))) ||
(l.first < r.first);
});
} else {
std::nth_element(
col_vec.begin(), col_vec.begin() + k - 1, col_vec.end(),
[](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
return (!std::isnan(static_cast<double>(l.first)) &&
std::isnan(static_cast<double>(r.first))) ||
(l.first < r.first);
});
}
t_out[i] = col_vec[k - 1].first;
t_indices[i] = col_vec[k - 1].second;
}
}
template <typename T, typename Type>
static void kthvalueAssign(const Type& input_height, const Type& input_width,
const int& input_dim, const framework::Tensor* input,
const framework::Tensor* indices, T* output_data) {
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for
#endif
for (Type i = 0; i < input_height; ++i) {
if (input_dim == 1) {
auto e_input = framework::EigenVector<T>::Flatten(*input);
auto e_indices = framework::EigenVector<Type>::Flatten(*indices);
output_data[i * input_width + e_indices(0)] = e_input(0);
} else {
auto e_input = framework::EigenMatrix<T>::Reshape(*input, input_dim - 1);
auto e_indices =
framework::EigenMatrix<Type>::Reshape(*indices, input_dim - 1);
output_data[i * input_width + e_indices(i, 0)] = e_input(i, 0);
}
}
}
template <typename DeviceContext, typename T>
class KthvalueCPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* input = context.Input<framework::Tensor>("X");
auto* output = context.Output<framework::Tensor>("Out");
auto* indices = context.Output<framework::Tensor>("Indices");
const auto& in_dims = input->dims();
int k = static_cast<int>(context.Attr<int>("k"));
bool keepdim = static_cast<bool>(context.Attr<bool>("keepdim"));
int axis = static_cast<int>(context.Attr<int>("axis"));
if (axis < 0) axis += in_dims.size();
T* output_data = output->mutable_data<T>(context.GetPlace());
int64_t* indices_data = indices->mutable_data<int64_t>(context.GetPlace());
auto out_dims = output->dims();
if (axis == in_dims.size() - 1) {
const int64_t& input_height =
phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
const int64_t& input_width = in_dims[in_dims.size() - 1];
getKthvalue<T, int64_t>(input_height, input_width, in_dims.size(), input,
output_data, indices_data, k);
} else {
std::vector<int> trans;
for (int i = 0; i < axis; i++) {
trans.emplace_back(i);
}
trans.emplace_back(in_dims.size() - 1);
for (int i = axis + 1; i < in_dims.size() - 1; i++) {
trans.emplace_back(i);
}
trans.emplace_back(axis);
if (!keepdim) {
std::vector<int> tmp_out_shape;
for (int i = 0; i < axis; i++) {
tmp_out_shape.emplace_back(in_dims[i]);
}
tmp_out_shape.emplace_back(1);
for (int i = axis + 1; i < in_dims.size(); i++) {
tmp_out_shape.emplace_back(in_dims[i]);
}
framework::DDim tmp_out_dims = phi::make_ddim(tmp_out_shape);
output->Resize(tmp_out_dims);
indices->Resize(tmp_out_dims);
}
framework::DDim trans_dims(in_dims);
framework::DDim trans_out_dims(in_dims);
for (size_t i = 0; i < trans.size(); i++) {
trans_dims[i] = in_dims[trans[i]];
trans_out_dims[i] = in_dims[trans[i]];
}
trans_out_dims[in_dims.size() - 1] = 1;
framework::Tensor trans_inp;
trans_inp.mutable_data<T>(trans_dims, context.GetPlace());
int ndims = trans.size();
auto& dev_context =
context.template device_context<platform::CPUDeviceContext>();
TransCompute<platform::CPUDeviceContext, T>(ndims, dev_context, *input,
&trans_inp, trans);
const int64_t input_height =
phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1));
const int64_t input_width = trans_dims[trans_dims.size() - 1];
framework::Tensor tmp_out, tmp_indices;
T* t_out = tmp_out.mutable_data<T>(trans_out_dims, context.GetPlace());
auto* t_ind =
tmp_indices.mutable_data<int64_t>(trans_out_dims, context.GetPlace());
getKthvalue<T, int64_t>(input_height, input_width, in_dims.size(),
&trans_inp, t_out, t_ind, k);
TransCompute<platform::CPUDeviceContext, int64_t>(
ndims, dev_context, tmp_indices, indices, trans);
TransCompute<platform::CPUDeviceContext, T>(ndims, dev_context, tmp_out,
output, trans);
if (!keepdim) {
output->Resize(out_dims);
indices->Resize(out_dims);
}
}
}
};
template <typename DeviceContext, typename T>
class KthvalueGradCPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* x = context.Input<framework::Tensor>("X");
auto* out_grad =
context.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* indices = context.Input<framework::Tensor>("Indices");
auto* x_grad =
context.Output<framework::Tensor>(framework::GradVarName("X"));
int axis = static_cast<int>(context.Attr<int>("axis"));
bool keepdim = static_cast<bool>(context.Attr<bool>("keepdim"));
auto in_dims = x->dims();
auto out_dims = indices->dims();
axis = (axis < 0) ? (in_dims.size() + axis) : axis;
if (!keepdim) {
std::vector<int> tmp_out_shape;
for (int i = 0; i < axis; i++) {
tmp_out_shape.emplace_back(out_dims[i]);
}
tmp_out_shape.emplace_back(1);
for (int i = axis + 1; i < in_dims.size(); i++) {
tmp_out_shape.emplace_back(out_dims[i - 1]);
}
out_dims = phi::make_ddim(tmp_out_shape);
}
T* x_grad_data = x_grad->mutable_data<T>(context.GetPlace());
if (axis == in_dims.size() - 1) {
const int64_t input_height =
phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
const int64_t input_width = in_dims[in_dims.size() - 1];
memset(x_grad_data, 0, x_grad->numel() * sizeof(T));
if (keepdim) {
kthvalueAssign(input_height, input_width, in_dims.size(), out_grad,
indices, x_grad_data);
} else {
auto& dev_context =
context.template device_context<platform::CPUDeviceContext>();
framework::Tensor out_grad_tmp, indices_tmp;
out_grad_tmp.mutable_data<T>(out_grad->dims(), dev_context.GetPlace());
indices_tmp.mutable_data<int64_t>(indices->dims(),
dev_context.GetPlace());
framework::TensorCopy(*out_grad, dev_context.GetPlace(), dev_context,
&out_grad_tmp);
framework::TensorCopy(*indices, dev_context.GetPlace(), dev_context,
&indices_tmp);
out_grad_tmp.Resize(out_dims);
indices_tmp.Resize(out_dims);
kthvalueAssign(input_height, input_width, in_dims.size(), &out_grad_tmp,
&indices_tmp, x_grad_data);
}
} else {
std::vector<int> trans;
for (int i = 0; i < axis; i++) {
trans.emplace_back(i);
}
trans.emplace_back(out_dims.size() - 1);
for (int i = axis + 1; i < out_dims.size() - 1; i++) {
trans.emplace_back(i);
}
trans.emplace_back(axis);
framework::DDim trans_dims(out_dims);
framework::DDim trans_in_dims(in_dims);
for (size_t i = 0; i < trans.size(); i++) {
trans_dims[i] = out_dims[trans[i]];
trans_in_dims[i] = in_dims[trans[i]];
}
framework::Tensor trans_dO, trans_ind;
trans_dO.mutable_data<T>(trans_dims, context.GetPlace());
trans_ind.mutable_data<int64_t>(trans_dims, context.GetPlace());
int ndims = trans.size();
auto& dev_context =
context.template device_context<platform::CPUDeviceContext>();
if (keepdim) {
TransCompute<platform::CPUDeviceContext, T>(
ndims, dev_context, *out_grad, &trans_dO, trans);
TransCompute<platform::CPUDeviceContext, int64_t>(
ndims, dev_context, *indices, &trans_ind, trans);
} else {
framework::Tensor out_grad_tmp, indices_tmp;
out_grad_tmp.mutable_data<T>(out_grad->dims(), dev_context.GetPlace());
indices_tmp.mutable_data<int64_t>(indices->dims(),
dev_context.GetPlace());
framework::TensorCopy(*out_grad, dev_context.GetPlace(), dev_context,
&out_grad_tmp);
framework::TensorCopy(*indices, dev_context.GetPlace(), dev_context,
&indices_tmp);
out_grad_tmp.Resize(out_dims);
indices_tmp.Resize(out_dims);
TransCompute<platform::CPUDeviceContext, T>(
ndims, dev_context, out_grad_tmp, &trans_dO, trans);
TransCompute<platform::CPUDeviceContext, int64_t>(
ndims, dev_context, indices_tmp, &trans_ind, trans);
}
const int64_t input_height = phi::product(
phi::slice_ddim(trans_in_dims, 0, trans_in_dims.size() - 1));
const int64_t input_width = trans_in_dims[trans_in_dims.size() - 1];
framework::Tensor tmp_out;
T* t_out = tmp_out.mutable_data<T>(trans_in_dims, context.GetPlace());
memset(t_out, 0, x_grad->numel() * sizeof(T));
kthvalueAssign<T, int64_t>(input_height, input_width, in_dims.size(),
&trans_dO, &trans_ind, t_out);
TransCompute<platform::CPUDeviceContext, T>(ndims, dev_context, tmp_out,
x_grad, trans);
}
}
};
} // namespace operators
} // namespace paddle
...@@ -554,6 +554,67 @@ void IsfiniteInferMeta(const MetaTensor& x, MetaTensor* out) {
out->set_dtype(DataType::BOOL);
}
void KthvalueInferMeta(const MetaTensor& x,
int k,
int axis,
bool keepdim,
MetaTensor* out,
MetaTensor* indices,
MetaConfig config) {
auto input_dims = x.dims();
const int& dim_size = input_dims.size();
PADDLE_ENFORCE_LT(axis,
dim_size,
phi::errors::InvalidArgument(
"the axis must be [-%d, %d), but received %d .",
dim_size,
dim_size,
axis));
PADDLE_ENFORCE_GE(axis,
-dim_size,
phi::errors::InvalidArgument(
"the axis must be [-%d, %d), but received %d .",
dim_size,
dim_size,
axis));
if (axis < 0) axis += dim_size;
PADDLE_ENFORCE_GE(
k,
1,
phi::errors::InvalidArgument(
"the k in the kthvalue must >= 1, but received %d .", k));
PADDLE_ENFORCE_GE(
input_dims.size(),
1,
phi::errors::InvalidArgument("input of kthvalue must have >= 1d shape"));
if (config.is_runtime) {
PADDLE_ENFORCE_GE(
input_dims[axis],
k,
phi::errors::InvalidArgument(
"input of kthvalue must have >= %d columns in axis of %d",
k,
axis));
}
std::vector<int64_t> dimvec;
for (int64_t i = 0; i < axis; i++) {
dimvec.emplace_back(input_dims[i]);
}
if (keepdim) {
dimvec.emplace_back(static_cast<int64_t>(1));
}
for (int64_t i = axis + 1; i < dim_size; i++) {
dimvec.emplace_back(input_dims[i]);
}
DDim dims = phi::make_ddim(dimvec);
out->set_dims(dims);
out->share_lod(x);
out->set_dtype(x.dtype());
indices->set_dims(dims);
indices->share_lod(x);
indices->set_dtype(x.dtype());
}
void MatrixPowerInferMeta(const MetaTensor& x, int n, MetaTensor* out) {
auto dims = x.dims();
auto n_dim = dims.size();
......
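As a quick check of the shape logic in KthvalueInferMeta above: with an assumed input of dims [2, 3, 4], axis = 1 and keepdim = false, the loops below reproduce the same dimvec construction and yield output dims [2, 4] (or [2, 1, 4] with keepdim = true). This is an illustrative sketch only, not code from the patch.

// Standalone illustration of the dim bookkeeping in KthvalueInferMeta.
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  std::vector<int64_t> input_dims = {2, 3, 4};  // assumed example shape
  int axis = 1;         // already normalized to be non-negative
  bool keepdim = false;
  std::vector<int64_t> dimvec;
  for (int64_t i = 0; i < axis; i++) dimvec.emplace_back(input_dims[i]);
  if (keepdim) dimvec.emplace_back(1);
  for (int64_t i = axis + 1; i < static_cast<int64_t>(input_dims.size()); i++)
    dimvec.emplace_back(input_dims[i]);
  for (auto d : dimvec) std::cout << d << " ";  // prints "2 4"
  std::cout << std::endl;
  return 0;
}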
...@@ -100,6 +100,14 @@ void IsEmptyInferMeta(const MetaTensor& x, MetaTensor* out);
void IsfiniteInferMeta(const MetaTensor& input, MetaTensor* out);
void KthvalueInferMeta(const MetaTensor& x,
int k,
int axis,
bool keepdim,
MetaTensor* out,
MetaTensor* indices,
MetaConfig = MetaConfig());
void MatrixPowerInferMeta(const MetaTensor& x, int n, MetaTensor* out);
void MaxPoolWithIndexInferMeta(const MetaTensor& x,
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/kthvalue_grad_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
template <typename T, typename Type>
static void kthvalueAssign(const Type& input_height,
const Type& input_width,
const int& input_dim,
const DenseTensor* input,
const DenseTensor* indices,
T* output_data) {
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for
#endif
for (Type i = 0; i < input_height; ++i) {
if (input_dim == 1) {
auto e_input = EigenVector<T>::Flatten(*input);
auto e_indices = EigenVector<Type>::Flatten(*indices);
output_data[i * input_width + e_indices(0)] = e_input(0);
} else {
auto e_input = EigenMatrix<T>::Reshape(*input, input_dim - 1);
auto e_indices = EigenMatrix<Type>::Reshape(*indices, input_dim - 1);
output_data[i * input_width + e_indices(i, 0)] = e_input(i, 0);
}
}
}
template <typename T, typename Context>
void KthvalueGradKernel(const Context& dev_ctx,
const DenseTensor& d_out,
const DenseTensor& x,
const DenseTensor& indices,
int k,
int axis,
bool keepdim,
DenseTensor* d_x) {
auto in_dims = x.dims();
auto out_dims = indices.dims();
axis = (axis < 0) ? (in_dims.size() + axis) : axis;
if (!keepdim) {
std::vector<int> tmp_out_shape;
for (int i = 0; i < axis; i++) {
tmp_out_shape.emplace_back(out_dims[i]);
}
tmp_out_shape.emplace_back(1);
for (int i = axis + 1; i < in_dims.size(); i++) {
tmp_out_shape.emplace_back(out_dims[i - 1]);
}
out_dims = phi::make_ddim(tmp_out_shape);
}
T* x_grad_data = dev_ctx.template Alloc<T>(d_x);
if (axis == in_dims.size() - 1) {
const int64_t input_height =
phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
const int64_t input_width = in_dims[in_dims.size() - 1];
memset(x_grad_data, 0, d_x->numel() * sizeof(T));
if (keepdim) {
kthvalueAssign(input_height,
input_width,
in_dims.size(),
&d_out,
&indices,
x_grad_data);
} else {
DenseTensor out_grad_tmp, indices_tmp;
out_grad_tmp.Resize(d_out.dims());
indices_tmp.Resize(indices.dims());
dev_ctx.template Alloc<T>(&out_grad_tmp);
dev_ctx.template Alloc<int64_t>(&indices_tmp);
Copy(dev_ctx, d_out, dev_ctx.GetPlace(), false, &out_grad_tmp);
Copy(dev_ctx, indices, dev_ctx.GetPlace(), false, &indices_tmp);
out_grad_tmp.Resize(out_dims);
indices_tmp.Resize(out_dims);
kthvalueAssign(input_height,
input_width,
in_dims.size(),
&out_grad_tmp,
&indices_tmp,
x_grad_data);
}
} else {
std::vector<int> trans;
for (int i = 0; i < axis; i++) {
trans.emplace_back(i);
}
trans.emplace_back(out_dims.size() - 1);
for (int i = axis + 1; i < out_dims.size() - 1; i++) {
trans.emplace_back(i);
}
trans.emplace_back(axis);
DDim trans_dims(out_dims);
DDim trans_in_dims(in_dims);
for (size_t i = 0; i < trans.size(); i++) {
trans_dims[i] = out_dims[trans[i]];
trans_in_dims[i] = in_dims[trans[i]];
}
DenseTensor trans_dO, trans_ind;
trans_dO.Resize(trans_dims);
trans_ind.Resize(trans_dims);
dev_ctx.template Alloc<T>(&trans_dO);
dev_ctx.template Alloc<int64_t>(&trans_ind);
int ndims = trans.size();
if (keepdim) {
funcs::TransCompute<phi::CPUContext, T>(
ndims, dev_ctx, d_out, &trans_dO, trans);
funcs::TransCompute<phi::CPUContext, int64_t>(
ndims, dev_ctx, indices, &trans_ind, trans);
} else {
DenseTensor out_grad_tmp, indices_tmp;
out_grad_tmp.Resize(d_out.dims());
indices_tmp.Resize(indices.dims());
dev_ctx.template Alloc<T>(&out_grad_tmp);
dev_ctx.template Alloc<int64_t>(&indices_tmp);
Copy(dev_ctx, d_out, dev_ctx.GetPlace(), false, &out_grad_tmp);
Copy(dev_ctx, indices, dev_ctx.GetPlace(), false, &indices_tmp);
out_grad_tmp.Resize(out_dims);
indices_tmp.Resize(out_dims);
funcs::TransCompute<phi::CPUContext, T>(
ndims, dev_ctx, out_grad_tmp, &trans_dO, trans);
funcs::TransCompute<phi::CPUContext, int64_t>(
ndims, dev_ctx, indices_tmp, &trans_ind, trans);
}
const int64_t input_height = phi::product(
phi::slice_ddim(trans_in_dims, 0, trans_in_dims.size() - 1));
const int64_t input_width = trans_in_dims[trans_in_dims.size() - 1];
DenseTensor tmp_out;
tmp_out.Resize(trans_in_dims);
T* t_out = dev_ctx.template Alloc<T>(&tmp_out);
memset(t_out, 0, d_x->numel() * sizeof(T));
kthvalueAssign<T, int64_t>(input_height,
input_width,
in_dims.size(),
&trans_dO,
&trans_ind,
t_out);
funcs::TransCompute<phi::CPUContext, T>(
ndims, dev_ctx, tmp_out, d_x, trans);
}
}
} // namespace phi
PD_REGISTER_KERNEL(kthvalue_grad,
CPU,
ALL_LAYOUT,
phi::KthvalueGradKernel,
float,
double,
int,
int64_t) {}
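The grad kernel above zeroes d_x and then scatters each element of d_out back to the position recorded in Indices (kthvalueAssign). A minimal sketch of that scatter for one row, with assumed sample values that are not taken from the patch:

// Sketch of the gradient scatter done by kthvalueAssign: d_x is zero except at
// the k-th value's index, which receives the corresponding d_out element.
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  const int64_t input_width = 5;           // size of the reduced axis
  std::vector<float> x_grad(input_width, 0.f);
  int64_t kth_index = 3;                   // assumed index stored in Indices
  float out_grad = 1.0f;                   // assumed upstream gradient
  x_grad[kth_index] = out_grad;            // scatter back along the axis
  for (float g : x_grad) std::cout << g << " ";  // prints "0 0 0 1 0"
  std::cout << std::endl;
  return 0;
}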
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/kthvalue_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
template <typename T, typename Type>
static void getKthvalue(Type input_height,
Type input_width,
int input_dim,
const DenseTensor* input,
T* t_out,
Type* t_indices,
const int& k) {
bool partial_sort_flag = (k * 64) < input_width;
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for
#endif
for (Type i = 0; i < input_height; ++i) {
std::vector<std::pair<T, Type>> col_vec;
col_vec.reserve(input_width);
if (input_dim == 1) {
auto e_input = EigenVector<T>::Flatten(*input);
for (Type j = 0; j < input_width; ++j) {
col_vec.emplace_back(std::pair<T, Type>(e_input(j), j));
}
} else {
auto e_input = EigenMatrix<T>::Reshape(*input, input_dim - 1);
for (Type j = 0; j < input_width; ++j) {
col_vec.emplace_back(std::pair<T, Type>(e_input(i, j), j));
}
}
if (partial_sort_flag) {
std::partial_sort(
col_vec.begin(),
col_vec.begin() + k,
col_vec.end(),
[](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
return (!std::isnan(static_cast<double>(l.first)) &&
std::isnan(static_cast<double>(r.first))) ||
(l.first < r.first);
});
} else {
std::nth_element(
col_vec.begin(),
col_vec.begin() + k - 1,
col_vec.end(),
[](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
return (!std::isnan(static_cast<double>(l.first)) &&
std::isnan(static_cast<double>(r.first))) ||
(l.first < r.first);
});
}
t_out[i] = col_vec[k - 1].first;
t_indices[i] = col_vec[k - 1].second;
}
}
template <typename T, typename Context>
void KthvalueKernel(const Context& dev_ctx,
const DenseTensor& x,
int k,
int axis,
bool keepdim,
DenseTensor* output,
DenseTensor* indices) {
const auto& in_dims = x.dims();
if (axis < 0) axis += in_dims.size();
T* output_data = dev_ctx.template Alloc<T>(output);
int64_t* indices_data = dev_ctx.template Alloc<int64_t>(indices);
auto out_dims = output->dims();
if (axis == in_dims.size() - 1) {
const int64_t& input_height =
phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
const int64_t& input_width = in_dims[in_dims.size() - 1];
getKthvalue<T, int64_t>(input_height,
input_width,
in_dims.size(),
&x,
output_data,
indices_data,
k);
} else {
std::vector<int> trans;
for (int i = 0; i < axis; i++) {
trans.emplace_back(i);
}
trans.emplace_back(in_dims.size() - 1);
for (int i = axis + 1; i < in_dims.size() - 1; i++) {
trans.emplace_back(i);
}
trans.emplace_back(axis);
if (!keepdim) {
std::vector<int> tmp_out_shape;
for (int i = 0; i < axis; i++) {
tmp_out_shape.emplace_back(in_dims[i]);
}
tmp_out_shape.emplace_back(1);
for (int i = axis + 1; i < in_dims.size(); i++) {
tmp_out_shape.emplace_back(in_dims[i]);
}
DDim tmp_out_dims = phi::make_ddim(tmp_out_shape);
output->Resize(tmp_out_dims);
indices->Resize(tmp_out_dims);
}
DDim trans_dims(in_dims);
DDim trans_out_dims(in_dims);
for (size_t i = 0; i < trans.size(); i++) {
trans_dims[i] = in_dims[trans[i]];
trans_out_dims[i] = in_dims[trans[i]];
}
trans_out_dims[in_dims.size() - 1] = 1;
DenseTensor trans_inp;
trans_inp.Resize(trans_dims);
dev_ctx.template Alloc<T>(&trans_inp);
int ndims = trans.size();
funcs::TransCompute<phi::CPUContext, T>(
ndims, dev_ctx, x, &trans_inp, trans);
const int64_t input_height =
phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1));
const int64_t input_width = trans_dims[trans_dims.size() - 1];
DenseTensor tmp_out, tmp_indices;
tmp_out.Resize(trans_out_dims);
T* t_out = dev_ctx.template Alloc<T>(&tmp_out);
tmp_indices.Resize(trans_out_dims);
int64_t* t_ind = dev_ctx.template Alloc<int64_t>(&tmp_indices);
getKthvalue<T, int64_t>(
input_height, input_width, in_dims.size(), &trans_inp, t_out, t_ind, k);
funcs::TransCompute<phi::CPUContext, int64_t>(
ndims, dev_ctx, tmp_indices, indices, trans);
funcs::TransCompute<phi::CPUContext, T>(
ndims, dev_ctx, tmp_out, output, trans);
if (!keepdim) {
output->Resize(out_dims);
indices->Resize(out_dims);
}
}
}
} // namespace phi
PD_REGISTER_KERNEL(kthvalue,
CPU,
ALL_LAYOUT,
phi::KthvalueKernel,
float,
double,
int,
int64_t) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/kthvalue_grad_kernel.h"
#include "paddle/fluid/operators/top_k_function_cuda.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
static int getBlockSize(int col) {
if (col > 512)
return 1024;
else if (col > 256 && col <= 512)
return 512;
else if (col > 128 && col <= 256)
return 256;
else if (col > 64 && col <= 128)
return 128;
else
return 64;
}
template <typename T, typename Context>
void KthvalueGradKernel(const Context& dev_ctx,
const DenseTensor& d_out,
const DenseTensor& x,
const DenseTensor& indices,
int k,
int axis,
bool keepdim,
DenseTensor* d_x) {
const auto& in_dims = x.dims();
auto out_dims = indices.dims();
if (axis < 0) axis += in_dims.size();
T* x_grad_data = dev_ctx.template Alloc<T>(d_x);
const T* out_grad_data = d_out.data<T>();
const int64_t* indices_data = indices.data<int64_t>();
int pre, n, post;
paddle::operators::GetDims(in_dims, axis, &pre, &n, &post);
int block_size = getBlockSize(post * k);
int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1);
int grid_size = std::min(max_blocks, pre);
paddle::operators::AssignGradWithAxis<
T><<<grid_size, block_size, 64 * 4, dev_ctx.stream()>>>(
out_grad_data, indices_data, x_grad_data, pre, post, n, 1);
}
} // namespace phi
PD_REGISTER_KERNEL(kthvalue_grad,
GPU,
ALL_LAYOUT,
phi::KthvalueGradKernel,
float,
double,
int,
int64_t) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/kthvalue_kernel.h"
#include "paddle/fluid/operators/top_k_function_cuda.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
inline int getBlockSize(int col) {
if (col > 512)
return 1024;
else if (col > 256 && col <= 512)
return 512;
else if (col > 128 && col <= 256)
return 256;
else if (col > 64 && col <= 128)
return 128;
else
return 64;
}
template <typename T>
bool SortKthvalue(const phi::GPUContext& dev_ctx,
const DenseTensor* input_tensor,
const int64_t num_cols,
const int64_t num_rows,
const int k,
DenseTensor* out_tensor,
DenseTensor* indices_tensor) {
auto cu_stream = dev_ctx.stream();
DenseTensor input_indices;
const std::vector<int64_t> dims = {num_rows, num_cols};
auto dim = phi::make_ddim(dims);
input_indices.Resize(dim);
dev_ctx.template Alloc<int64_t>(&input_indices);
size_t temp_storage_bytes = -1;
int block_size = getBlockSize(num_cols);
unsigned int maxGridDimX = dev_ctx.GetCUDAMaxGridDimSize()[0];
unsigned int grid_size = num_rows < maxGridDimX
? static_cast<unsigned int>(num_rows)
: maxGridDimX;
paddle::operators::InitIndex<
int64_t><<<grid_size, block_size, 0, cu_stream>>>(
input_indices.data<int64_t>(), num_rows, num_cols);
cub::CountingInputIterator<int64_t> counting_iter(0);
cub::TransformInputIterator<int64_t,
paddle::operators::SegmentOffsetIter,
cub::CountingInputIterator<int64_t>>
segment_offsets_t(counting_iter,
paddle::operators::SegmentOffsetIter(num_cols));
T* sorted_values_ptr;
int64_t* sorted_indices_ptr;
DenseTensor temp_values, temp_indices;
const T* input = input_tensor->data<T>();
T* values = out_tensor->data<T>();
int64_t* indices = indices_tensor->mutable_data<int64_t>(dev_ctx.GetPlace());
temp_values.Resize(dim);
temp_indices.Resize(dim);
sorted_values_ptr = dev_ctx.template Alloc<T>(&temp_values);
sorted_indices_ptr = dev_ctx.template Alloc<int64_t>(&temp_indices);
auto err =
cub::DeviceSegmentedRadixSort::SortPairs(nullptr,
temp_storage_bytes,
input,
sorted_values_ptr,
input_indices.data<int64_t>(),
sorted_indices_ptr,
num_cols * num_rows,
num_rows,
segment_offsets_t,
segment_offsets_t + 1,
0,
sizeof(T) * 8,
cu_stream);
#ifdef __HIPCC__
if (err != hipSuccess) {
LOG(ERROR) << "KthvalueOP failed as could not launch "
"hipcub::DeviceSegmentedRadixSort::SortPairs, status: "
<< hipGetErrorString(err);
return false;
}
#else
if (err != cudaSuccess) {
LOG(ERROR) << "KthvalueOP failed as could not launch "
"cub::DeviceSegmentedRadixSort::SortPairs, status: "
<< cudaGetErrorString(err);
return false;
}
#endif
DenseTensor temp_storage;
temp_storage.Resize({static_cast<int>(temp_storage_bytes / sizeof(uint8_t))});
uint8_t* temp_storage_data = dev_ctx.template Alloc<uint8_t>(&temp_storage);
err = cub::DeviceSegmentedRadixSort::SortPairs(temp_storage_data,
temp_storage_bytes,
input,
sorted_values_ptr,
input_indices.data<int64_t>(),
sorted_indices_ptr,
num_cols * num_rows,
num_rows,
segment_offsets_t,
segment_offsets_t + 1,
0,
sizeof(T) * 8,
cu_stream);
#ifdef __HIPCC__
if (err != hipSuccess) {
LOG(ERROR) << "KthvalueOP failed as could not launch "
"hipcub::DeviceSegmentedRadixSort::SortPairs, "
<< temp_storage_bytes << ", status: " << hipGetErrorString(err);
return false;
}
#else
if (err != cudaSuccess) {
LOG(ERROR) << "KthvalueOP failed as could not launch "
"cub::DeviceSegmentedRadixSort::SortPairs, "
<< temp_storage_bytes << ", status: " << cudaGetErrorString(err);
return false;
}
#endif
auto& dev = *dev_ctx.eigen_device();
const Eigen::DSizes<Eigen::DenseIndex, 2> slice_indices{0, k - 1};
const Eigen::DSizes<Eigen::DenseIndex, 2> slice_sizes{num_rows, 1};
auto e_indices = EigenMatrix<int64_t>::From(*indices_tensor, dim);
auto e_tmp_indices =
EigenMatrix<int64_t>::From(static_cast<const DenseTensor>(temp_indices));
std::vector<int> odims = {static_cast<int>(num_rows), static_cast<int>(1)};
dim = phi::make_ddim(odims);
auto e_values = EigenMatrix<T>::From(*out_tensor, dim);
auto e_tmp_values =
EigenMatrix<T>::From(static_cast<const DenseTensor>(temp_values));
funcs::EigenSlice<std::decay_t<decltype(dev)>, int64_t, 2>::Eval(
dev, e_indices, e_tmp_indices, slice_indices, slice_sizes);
funcs::EigenSlice<std::decay_t<decltype(dev)>, T, 2>::Eval(
dev, e_values, e_tmp_values, slice_indices, slice_sizes);
return true;
}
template <typename T, typename Context>
void KthvalueKernel(const Context& dev_ctx,
const DenseTensor& x,
int k,
int axis,
bool keepdim,
DenseTensor* output,
DenseTensor* indices) {
const auto& in_dims = x.dims();
if (axis < 0) axis += in_dims.size();
auto out_dims = output->dims();
const T* input_data = x.data<T>();
T* output_data = dev_ctx.template Alloc<T>(output);
int64_t* indices_data = dev_ctx.template Alloc<int64_t>(indices);
if (axis == in_dims.size() - 1) {
const int64_t& input_height =
phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
const int64_t& input_width = in_dims[in_dims.size() - 1];
PADDLE_ENFORCE_EQ(
SortKthvalue<T>(
dev_ctx, &x, input_width, input_height, k, output, indices),
true,
phi::errors::External("KthvalueOP: Error when use cub sorting"));
return;
} else {
std::vector<int> trans;
for (int i = 0; i < axis; i++) {
trans.emplace_back(i);
}
trans.emplace_back(in_dims.size() - 1);
for (int i = axis + 1; i < in_dims.size() - 1; i++) {
trans.emplace_back(i);
}
trans.emplace_back(axis);
if (!keepdim) {
std::vector<int> tmp_out_shape;
for (int i = 0; i < axis; i++) {
tmp_out_shape.emplace_back(in_dims[i]);
}
tmp_out_shape.emplace_back(1);
for (int i = axis + 1; i < in_dims.size(); i++) {
tmp_out_shape.emplace_back(in_dims[i]);
}
DDim tmp_out_dims = phi::make_ddim(tmp_out_shape);
output->Resize(tmp_out_dims);
indices->Resize(tmp_out_dims);
}
DDim trans_dims(in_dims);
DDim trans_out_dims(in_dims);
for (int i = 0; i < trans.size(); i++) {
trans_dims[i] = in_dims[trans[i]];
trans_out_dims[i] = in_dims[trans[i]];
}
trans_out_dims[in_dims.size() - 1] = 1;
DenseTensor trans_input;
trans_input.mutable_data<T>(trans_dims, dev_ctx.GetPlace());
int ndims = trans.size();
funcs::TransCompute<phi::GPUContext, T>(
ndims, dev_ctx, x, &trans_input, trans);
DenseTensor trans_ind, trans_out;
trans_ind.mutable_data<int64_t>(trans_out_dims, dev_ctx.GetPlace());
trans_out.mutable_data<T>(trans_out_dims, dev_ctx.GetPlace());
const int64_t input_height =
phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1));
const int64_t input_width = trans_dims[trans_dims.size() - 1];
PADDLE_ENFORCE_EQ(
SortKthvalue<T>(dev_ctx,
&trans_input,
input_width,
input_height,
k,
&trans_out,
&trans_ind),
true,
phi::errors::External("KthvalueOP: Error when use cub sorting"));
funcs::TransCompute<phi::GPUContext, int64_t>(
ndims, dev_ctx, trans_ind, indices, trans);
funcs::TransCompute<phi::GPUContext, T>(
ndims, dev_ctx, trans_out, output, trans);
if (!keepdim) {
output->Resize(out_dims);
indices->Resize(out_dims);
}
}
}
} // namespace phi
PD_REGISTER_KERNEL(kthvalue,
GPU,
ALL_LAYOUT,
phi::KthvalueKernel,
float,
double,
int,
int64_t) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void KthvalueGradKernel(const Context& dev_ctx,
const DenseTensor& d_out,
const DenseTensor& x,
const DenseTensor& indices,
int k,
int axis,
bool keepdim,
DenseTensor* d_x);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void KthvalueKernel(const Context& dev_ctx,
const DenseTensor& x,
int k,
int axis,
bool keepdim,
DenseTensor* out,
DenseTensor* indices);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature KthvalueGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("kthvalue_grad",
{GradVarName("Out"), "X", "Indices"},
{"k", "axis", "keepdim"},
{GradVarName("X")});
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(kthvalue_grad, phi::KthvalueGradOpArgumentMapping);