未验证 提交 74894cd7 编写于 作者: C csy0225 提交者: GitHub

fix conflict (#40851)

上级 e559fe41
...@@ -14,6 +14,10 @@ limitations under the License. */ ...@@ -14,6 +14,10 @@ limitations under the License. */
#include "paddle/fluid/operators/range_op.h" #include "paddle/fluid/operators/range_op.h"
#include <string> #include <string>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/ternary.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -22,51 +26,6 @@ class RangeOp : public framework::OperatorWithKernel { ...@@ -22,51 +26,6 @@ class RangeOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
if (ctx->HasInput("Start")) {
auto s_dims = ctx->GetInputDim("Start");
PADDLE_ENFORCE_EQ(
s_dims.size(), 1,
platform::errors::InvalidArgument(
"The dim of the shape of Input(Start) should be 1, but got %d",
s_dims.size()));
PADDLE_ENFORCE_EQ(s_dims[0], 1,
platform::errors::InvalidArgument(
"The first dim of the shape of Input(Start) should "
"be 1, but got %d",
s_dims[0]));
}
if (ctx->HasInput("End")) {
auto e_dims = ctx->GetInputDim("End");
PADDLE_ENFORCE_EQ(
e_dims.size(), 1,
platform::errors::InvalidArgument(
"The dim of the shape of Input(End) should be 1, but got %d",
e_dims.size()));
PADDLE_ENFORCE_EQ(e_dims[0], 1, platform::errors::InvalidArgument(
"The first dim of the shape of "
"Input(End) should be 1, but got %d",
e_dims[0]));
}
if (ctx->HasInput("Step")) {
auto step_dims = ctx->GetInputDim("Step");
PADDLE_ENFORCE_EQ(
step_dims.size(), 1,
platform::errors::InvalidArgument(
"The dim of the shape of Input(Step) should be 1, but got %d",
step_dims.size()));
PADDLE_ENFORCE_EQ(step_dims[0], 1,
platform::errors::InvalidArgument(
"The first dim of the shape of Input(Step) should "
"be 1, but got %d",
step_dims[0]));
}
ctx->SetOutputDim("Out", {-1});
}
protected: protected:
framework::OpKernelType GetKernelTypeForVar( framework::OpKernelType GetKernelTypeForVar(
const std::string &var_name, const framework::Tensor &tensor, const std::string &var_name, const framework::Tensor &tensor,
...@@ -101,7 +60,7 @@ class RangeOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -101,7 +60,7 @@ class RangeOpMaker : public framework::OpProtoAndCheckerMaker {
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(range, ops::RangeOp, ops::RangeOpMaker); DECLARE_INFER_SHAPE_FUNCTOR(range, RangeInferMetaFunctor,
REGISTER_OP_CPU_KERNEL(range, ops::CPURangeKernel<int>, PD_INFER_META(phi::RangeInferMeta));
ops::CPURangeKernel<float>, ops::CPURangeKernel<double>, REGISTER_OP_WITHOUT_GRADIENT(range, ops::RangeOp, ops::RangeOpMaker,
ops::CPURangeKernel<int64_t>); RangeInferMetaFunctor);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/range_op.h"
#include "paddle/fluid/operators/utils.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
namespace paddle {
namespace operators {
template <typename T>
__global__ void RangeKernel(T start, T step, int64_t size, T* out) {
CUDA_KERNEL_LOOP(index, size) { out[index] = start + step * index; }
}
template <typename T>
class CUDARangeKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* start_t = context.Input<framework::Tensor>("Start");
auto* end_t = context.Input<framework::Tensor>("End");
auto* step_t = context.Input<framework::Tensor>("Step");
auto* out = context.Output<framework::Tensor>("Out");
T start = GetValue<T>(start_t);
T end = GetValue<T>(end_t);
T step = GetValue<T>(step_t);
int64_t size = 0;
GetSize(start, end, step, &size);
out->Resize(phi::make_ddim({size}));
T* out_data = out->mutable_data<T>(context.GetPlace());
auto stream = context.cuda_device_context().stream();
int block = std::min(size, static_cast<int64_t>(256));
int grid = (size + block - 1) / block;
RangeKernel<T><<<grid, block, 0, stream>>>(start, step, size, out_data);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(range, ops::CUDARangeKernel<int>,
ops::CUDARangeKernel<int64_t>,
ops::CUDARangeKernel<float>,
ops::CUDARangeKernel<double>);
...@@ -30,7 +30,7 @@ limitations under the License. */ ...@@ -30,7 +30,7 @@ limitations under the License. */
namespace f = paddle::framework; namespace f = paddle::framework;
namespace p = paddle::platform; namespace p = paddle::platform;
USE_OP(range); USE_OP_ITSELF(range);
USE_OP_DEVICE_KERNEL(range, NPU); USE_OP_DEVICE_KERNEL(range, NPU);
template <typename T> template <typename T>
......
...@@ -12,9 +12,12 @@ ...@@ -12,9 +12,12 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/stack_op.h"
#include <memory> #include <memory>
#include <vector> #include <vector>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/multiary.h"
namespace plat = paddle::platform; namespace plat = paddle::platform;
namespace ops = paddle::operators; namespace ops = paddle::operators;
...@@ -26,52 +29,6 @@ class StackOp : public framework::OperatorWithKernel { ...@@ -26,52 +29,6 @@ class StackOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE_GT(ctx->Inputs("X").size(), 0,
platform::errors::InvalidArgument(
"Number of Inputs(X) must be larger than 0, but"
" received value is:%d.",
ctx->Inputs("X").size()));
PADDLE_ENFORCE_EQ(ctx->HasOutput("Y"), true,
platform::errors::InvalidArgument(
"Output(Y) of stack_op should not be null."));
auto input_dims = ctx->GetInputsDim("X");
for (size_t i = 1; i < input_dims.size(); ++i) {
PADDLE_ENFORCE_EQ(input_dims[i], input_dims[0],
platform::errors::InvalidArgument(
"Dims of all Inputs(X) must be the same, but"
" received input %d dim is:%d not equal to input 0"
" dim:%d.",
i, input_dims[i], input_dims[0]));
}
// Only lod of X[0] would be shared with Y
ctx->ShareLoD("X", /*->*/ "Y");
int axis = ctx->Attrs().Get<int>("axis");
int rank = input_dims[0].size();
PADDLE_ENFORCE_GE(
axis, -(rank + 1),
platform::errors::InvalidArgument(
"Attr(axis) must be inside [-(rank+1), rank+1), where rank = %d, "
"but received axis is:%d.",
rank, axis));
PADDLE_ENFORCE_LT(
axis, rank + 1,
platform::errors::InvalidArgument(
"Attr(axis) must be inside [-(rank+1), rank+1), where rank = %d, "
"but received axis is:%d",
rank, axis));
if (axis < 0) axis += (rank + 1);
auto vec = phi::vectorize<int>(input_dims[0]);
vec.insert(vec.begin() + axis, input_dims.size());
ctx->SetOutputDim("Y", phi::make_ddim(vec));
}
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override { const framework::ExecutionContext &ctx) const override {
auto input_data_type = auto input_data_type =
...@@ -168,21 +125,10 @@ class StackGradOpMaker : public framework::SingleGradOpMaker<T> { ...@@ -168,21 +125,10 @@ class StackGradOpMaker : public framework::SingleGradOpMaker<T> {
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
DECLARE_INFER_SHAPE_FUNCTOR(stack, StackInferMetaFunctor,
PD_INFER_META(phi::StackInferMeta));
REGISTER_OPERATOR(stack, ops::StackOp, ops::StackOpMaker, REGISTER_OPERATOR(stack, ops::StackOp, ops::StackOpMaker,
ops::StackGradOpMaker<paddle::framework::OpDesc>, ops::StackGradOpMaker<paddle::framework::OpDesc>,
ops::StackGradOpMaker<paddle::imperative::OpBase>); ops::StackGradOpMaker<paddle::imperative::OpBase>,
StackInferMetaFunctor);
REGISTER_OPERATOR(stack_grad, ops::StackOpGrad); REGISTER_OPERATOR(stack_grad, ops::StackOpGrad);
REGISTER_OP_CPU_KERNEL(
stack, ops::StackKernel<plat::CPUDeviceContext, float>,
ops::StackKernel<plat::CPUDeviceContext, double>,
ops::StackKernel<plat::CPUDeviceContext, int>,
ops::StackKernel<plat::CPUDeviceContext, int64_t>,
ops::StackKernel<plat::CPUDeviceContext, paddle::platform::bfloat16>);
REGISTER_OP_CPU_KERNEL(
stack_grad, ops::StackGradKernel<plat::CPUDeviceContext, float>,
ops::StackGradKernel<plat::CPUDeviceContext, double>,
ops::StackGradKernel<plat::CPUDeviceContext, int>,
ops::StackGradKernel<plat::CPUDeviceContext, int64_t>,
ops::StackGradKernel<plat::CPUDeviceContext, paddle::platform::bfloat16>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <limits>
#include <vector>
#include "paddle/fluid/operators/stack_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
namespace plat = paddle::platform;
namespace ops = paddle::operators;
namespace paddle {
namespace operators {
template <typename T, typename IntType>
__global__ void StackCUDAKernel(T** input_ptrs, int split_size, int rows,
int cols, T* __restrict__ output) {
IntType grid_x = blockIdx.x * blockDim.x + threadIdx.x;
for (; grid_x < cols; grid_x += blockDim.x * gridDim.x) {
IntType grid_y = blockIdx.y * blockDim.y + threadIdx.y;
IntType split = grid_x / split_size;
const T* input_ptr = input_ptrs[split];
IntType col_offset = grid_x % split_size;
#pragma unroll
for (; grid_y < rows; grid_y += blockDim.y * gridDim.y) {
output[grid_y * cols + grid_x] =
input_ptr[grid_y * split_size + col_offset];
}
}
}
template <typename T>
class StackGPUKernel : public framework::OpKernel<T> {
using Tensor = framework::LoDTensor;
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto x = ctx.MultiInput<Tensor>("X");
auto* y = ctx.Output<Tensor>("Y");
int axis = ctx.Attr<int>("axis");
if (axis < 0) axis += (x[0]->dims().size() + 1);
int n = static_cast<int>(x.size());
auto* y_data = y->mutable_data<T>(ctx.GetPlace());
std::vector<const T*> x_datas(n);
for (int i = 0; i < n; i++) {
x_datas[i] = x[i]->data<T>();
}
auto& dev_ctx = ctx.template device_context<plat::CUDADeviceContext>();
auto tmp_x_data = memory::Alloc(dev_ctx, x_datas.size() * sizeof(T*));
memory::Copy(dev_ctx.GetPlace(), tmp_x_data->ptr(), platform::CPUPlace(),
reinterpret_cast<void*>(x_datas.data()),
x_datas.size() * sizeof(T*), dev_ctx.stream());
// Split x dim from axis to matrix
int x_row = 1, x_col = 1;
for (int i = 0; i < axis; ++i) {
x_row *= x[0]->dims()[i];
}
x_col = x[0]->numel() / x_row;
int out_col = x_col * n;
auto config = GetGpuLaunchConfig2D(dev_ctx, out_col, x_row);
if (y->numel() < std::numeric_limits<int32_t>::max()) {
StackCUDAKernel<T,
int32_t><<<config.block_per_grid, config.thread_per_block,
0, dev_ctx.stream()>>>(
reinterpret_cast<T**>(tmp_x_data->ptr()), x_col, x_row, out_col,
y_data);
} else {
StackCUDAKernel<T,
int64_t><<<config.block_per_grid, config.thread_per_block,
0, dev_ctx.stream()>>>(
reinterpret_cast<T**>(tmp_x_data->ptr()), x_col, x_row, out_col,
y_data);
}
}
};
template <typename T, typename IntType>
__global__ void UnStackHelperCUDAKernel(const T* __restrict__ input,
int pre_dim_size, int split_dim_size,
int suf_dim_size, int num_split,
T** output_ptrs) {
assert(blockDim.y == 1);
assert(blockDim.z == 1);
// In this case they are equal
assert(split_dim_size % num_split == 0);
IntType size = pre_dim_size * split_dim_size * suf_dim_size;
IntType each_dim_size = split_dim_size / num_split;
for (IntType offset = blockIdx.x * blockDim.x + threadIdx.x; offset < size;
offset += blockDim.x * gridDim.x) {
IntType i = offset / (split_dim_size * suf_dim_size);
IntType j = (offset % (split_dim_size * suf_dim_size)) / suf_dim_size;
IntType k = offset % suf_dim_size;
T* output = output_ptrs[j / each_dim_size];
if (output == nullptr) {
return;
}
IntType output_ind = i * each_dim_size * suf_dim_size +
(j % each_dim_size) * suf_dim_size + k;
*(output + output_ind) = input[offset];
}
}
template <typename T>
class StackGradGPUKernel : public framework::OpKernel<T> {
using Tensor = framework::LoDTensor;
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* dy = ctx.Input<Tensor>(framework::GradVarName("Y"));
auto dx = ctx.MultiOutput<Tensor>(framework::GradVarName("X"));
int axis = ctx.Attr<int>("axis");
if (axis < 0) axis += dy->dims().size();
int n = dy->dims()[axis];
PADDLE_ENFORCE_EQ(n, dx.size(),
platform::errors::InvalidArgument(
"Output dx size should be equal to n, but"
" received n is:%d dx size is:%d.",
n, dx.size()));
// dx is output, so save each data address, then copy each dy into dx_data
std::vector<T*> outputs(n);
auto out_var_names = ctx.OutputNames(framework::GradVarName("X"));
for (size_t j = 0; j < dx.size(); ++j) {
if (dx[j] == nullptr) {
outputs[j] = nullptr;
}
if (out_var_names[j] != framework::kEmptyVarName &&
dx[j]->numel() != 0UL) {
T* ptr = dx[j]->mutable_data<T>(ctx.GetPlace());
outputs[j] = ptr;
} else {
outputs[j] = nullptr;
}
}
auto dy_data = dy->data<T>();
// each dx should have same shape
int dy_pre = 1, dy_suf = 1;
auto dy_dims = dy->dims();
int split_dim = n;
for (int i = 0; i < axis; ++i) {
dy_pre *= dy_dims[i];
}
dy_suf = dy->numel() / (split_dim * dy_pre);
auto& dev_ctx = ctx.template device_context<plat::CUDADeviceContext>();
auto tmp_out_data = memory::Alloc(dev_ctx, outputs.size() * sizeof(T*));
memory::Copy(dev_ctx.GetPlace(), tmp_out_data->ptr(), platform::CPUPlace(),
reinterpret_cast<void*>(outputs.data()),
outputs.size() * sizeof(T*), dev_ctx.stream());
auto config = GetGpuLaunchConfig1D(dev_ctx, dy_pre * split_dim * dy_suf);
if (dy->numel() < std::numeric_limits<int32_t>::max()) {
UnStackHelperCUDAKernel<
T, int32_t><<<config.block_per_grid.x, config.thread_per_block.x, 0,
dev_ctx.stream()>>>(
dy_data, dy_pre, split_dim, dy_suf, split_dim,
reinterpret_cast<T**>(tmp_out_data->ptr()));
} else {
UnStackHelperCUDAKernel<
T, int64_t><<<config.block_per_grid.x, config.thread_per_block.x, 0,
dev_ctx.stream()>>>(
dy_data, dy_pre, split_dim, dy_suf, split_dim,
reinterpret_cast<T**>(tmp_out_data->ptr()));
}
}
};
} // namespace operators
} // namespace paddle
REGISTER_OP_CUDA_KERNEL(stack, ops::StackGPUKernel<float>,
ops::StackGPUKernel<double>, ops::StackGPUKernel<int>,
ops::StackGPUKernel<int64_t>,
ops::StackGPUKernel<plat::float16>,
ops::StackGPUKernel<plat::bfloat16>);
REGISTER_OP_CUDA_KERNEL(stack_grad, ops::StackGradGPUKernel<float>,
ops::StackGradGPUKernel<double>,
ops::StackGradGPUKernel<int>,
ops::StackGradGPUKernel<int64_t>,
ops::StackGradGPUKernel<plat::float16>,
ops::StackGradGPUKernel<plat::bfloat16>);
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/stack_op.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle { namespace paddle {
......
...@@ -13,9 +13,9 @@ ...@@ -13,9 +13,9 @@
// limitations under the License. // limitations under the License.
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
#include "paddle/fluid/operators/stack_op.h"
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/concat_op.h" #include "paddle/fluid/operators/concat_op.h"
#include "paddle/fluid/platform/device/xpu/xpu_header.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h"
......
...@@ -13,7 +13,11 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/unique_op.h" #include "paddle/fluid/operators/unique_op.h"
#include "paddle/fluid/framework/op_version_registry.h" #include <memory>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/unary.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -25,62 +29,54 @@ class UniqueOp : public framework::OperatorWithKernel { ...@@ -25,62 +29,54 @@ class UniqueOp : public framework::OperatorWithKernel {
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "unique"); OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "unique");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "unique"); OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "unique");
auto in_dims = ctx->GetInputDim("X");
if (!ctx->Attrs().Get<bool>("is_sorted")) {
OP_INOUT_CHECK(ctx->HasOutput("Index"), "Output", "Index", "unique");
PADDLE_ENFORCE_EQ(in_dims.size(), 1,
platform::errors::InvalidArgument(
"The Input(X) should be 1-D Tensor, "
"But now the dims of Input(X) is %d.",
in_dims.size()));
ctx->SetOutputDim("Out", {-1});
ctx->SetOutputDim("Index", in_dims);
return;
}
bool return_index = ctx->Attrs().Get<bool>("return_index"); bool return_index = ctx->Attrs().Get<bool>("return_index");
bool return_inverse = ctx->Attrs().Get<bool>("return_inverse"); bool return_inverse = ctx->Attrs().Get<bool>("return_inverse");
bool return_counts = ctx->Attrs().Get<bool>("return_counts"); bool return_counts = ctx->Attrs().Get<bool>("return_counts");
auto axis_vec = ctx->Attrs().Get<std::vector<int>>("axis"); auto axis_vec = ctx->Attrs().Get<std::vector<int>>("axis");
auto data_type =
static_cast<phi::DataType>(static_cast<framework::proto::VarType::Type>(
ctx->Attrs().Get<int>("dtype")));
// Construct MetaTensor for InferMeta Func
using CompatMetaTensor = framework::CompatMetaTensor;
CompatMetaTensor x(ctx->GetInputVarPtrs("X")[0], ctx->IsRuntime());
CompatMetaTensor out(ctx->GetOutputVarPtrs("Out")[0], ctx->IsRuntime());
std::unique_ptr<CompatMetaTensor> indices(nullptr);
std::unique_ptr<CompatMetaTensor> index(nullptr);
std::unique_ptr<CompatMetaTensor> counts(nullptr);
if (return_index) { if (return_index) {
OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "unique"); OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "unique");
indices =
std::move(std::unique_ptr<CompatMetaTensor>(new CompatMetaTensor(
ctx->GetOutputVarPtrs("Indices")[0], ctx->IsRuntime())));
} }
if (return_inverse) { if (return_inverse) {
OP_INOUT_CHECK(ctx->HasOutput("Index"), "Output", "Index", "unique"); OP_INOUT_CHECK(ctx->HasOutput("Index"), "Output", "Index", "unique");
index = std::move(std::unique_ptr<CompatMetaTensor>(new CompatMetaTensor(
ctx->GetOutputVarPtrs("Index")[0], ctx->IsRuntime())));
} }
if (return_counts) { if (return_counts) {
OP_INOUT_CHECK(ctx->HasOutput("Counts"), "Output", "Counts", "unique"); OP_INOUT_CHECK(ctx->HasOutput("Counts"), "Output", "Counts", "unique");
counts = std::move(std::unique_ptr<CompatMetaTensor>(new CompatMetaTensor(
ctx->GetOutputVarPtrs("Counts")[0], ctx->IsRuntime())));
} }
bool is_sorted = ctx->Attrs().Get<bool>("is_sorted");
if (axis_vec.empty()) { if (is_sorted) {
ctx->SetOutputDim("Out", {-1}); phi::UniqueInferMeta(x, return_index, return_inverse, return_counts,
if (return_inverse) { axis_vec, data_type, &out, indices.get(),
ctx->SetOutputDim("Index", {phi::product(in_dims)}); index.get(), counts.get());
}
} else { } else {
int axis = axis_vec[0]; OP_INOUT_CHECK(ctx->HasOutput("Index"), "Output", "Index", "unique");
if (axis < 0) { if (index == nullptr) {
axis += in_dims.size(); index =
} std::move(std::unique_ptr<CompatMetaTensor>(new CompatMetaTensor(
PADDLE_ENFORCE_LT( ctx->GetOutputVarPtrs("Index")[0], ctx->IsRuntime())));
axis, in_dims.size(),
platform::errors::InvalidArgument("The axis(%d) should be less than "
"the dimension size(%d) of x.",
axis, in_dims.size()));
auto out_dims = in_dims;
out_dims[axis] = -1;
ctx->SetOutputDim("Out", out_dims);
if (return_inverse) {
ctx->SetOutputDim("Index", {in_dims[axis]});
} }
} phi::UniqueRawInferMeta(x, return_index, return_inverse, return_counts,
if (return_index) { axis_vec, data_type, is_sorted, &out,
ctx->SetOutputDim("Indices", {-1}); indices.get(), index.get(), counts.get());
}
if (return_counts) {
ctx->SetOutputDim("Counts", {-1});
} }
} }
...@@ -152,40 +148,5 @@ class UniqueOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -152,40 +148,5 @@ class UniqueOpMaker : public framework::OpProtoAndCheckerMaker {
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(unique, ops::UniqueOp, ops::UniqueOpMaker); REGISTER_OP_WITHOUT_GRADIENT(unique, ops::UniqueOp, ops::UniqueOpMaker);
REGISTER_OP_CPU_KERNEL(
unique, ops::UniqueKernel<paddle::platform::CPUDeviceContext, float>,
ops::UniqueKernel<paddle::platform::CPUDeviceContext, double>,
ops::UniqueKernel<paddle::platform::CPUDeviceContext, int32_t>,
ops::UniqueKernel<paddle::platform::CPUDeviceContext, int64_t>);
REGISTER_OP_VERSION(unique)
.AddCheckpoint(
R"ROC(
Upgrade unique, add 2 outputs [Indices, Counts] and 5 attribute
[return_index, return_inverse, return_counts, axis, is_sorted].
)ROC",
paddle::framework::compatible::OpVersionDesc()
.NewOutput("Indices",
"The indices of the input tensor that result in the "
"unique tensor.")
.NewOutput("Counts", "The counts for each unique element.")
.NewAttr("return_index",
"If True, also return the indices of the input"
" tensor that result in the unique Tensor.",
false)
.NewAttr("return_inverse",
"If True, also return the indices for where elements"
" in the original input ended up in the returned unique "
"tensor.",
false)
.NewAttr("return_counts",
"If True, also return the counts for each unique element.",
false)
.NewAttr("axis",
"The axis to apply unique. If None, the input will be "
"flattened.",
std::vector<int>{})
.NewAttr("is_sorted",
"If True, the unique elements of X are in ascending order."
"Otherwise, the unique elements are not sorted.",
false));
...@@ -12,12 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,12 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/unstack_op.h"
#include <memory> #include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/for_range.h" #include "paddle/fluid/platform/for_range.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/unary.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -25,43 +27,6 @@ namespace operators { ...@@ -25,43 +27,6 @@ namespace operators {
class UnStackOp : public framework::OperatorWithKernel { class UnStackOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "UnStack");
int axis = ctx->Attrs().Get<int>("axis");
int num = ctx->Attrs().Get<int>("num");
auto x_dim = ctx->GetInputDim("X");
int rank = x_dim.size();
PADDLE_ENFORCE_GE(axis, -rank,
platform::errors::InvalidArgument(
"The attribute axis is out of range, it must be "
"inside [-rank, rank), where rank = %d",
rank));
PADDLE_ENFORCE_LT(axis, rank,
platform::errors::InvalidArgument(
"The attribute axis is out of range, it must be "
"inside [-rank, rank), where rank = %d",
rank));
if (axis < 0) axis += rank;
PADDLE_ENFORCE_EQ(ctx->Outputs("Y").size(), static_cast<size_t>(num),
platform::errors::InvalidArgument(
"Number of Outputs(Y) is wrong. Got %d , but it must "
"equal to attribute num which is %d.",
ctx->Outputs("Y").size(), static_cast<size_t>(num)));
if (x_dim[axis] > 0) {
PADDLE_ENFORCE_EQ(
num, x_dim[axis],
platform::errors::InvalidArgument(
"The number of attribute num is not equal to the length of the "
"%d axis of Input(X). Expect %d but got %d.",
axis, x_dim[axis], num));
}
auto vec = phi::vectorize<int>(x_dim);
vec.erase(vec.begin() + axis);
ctx->SetOutputsDim("Y", std::vector<framework::DDim>( // NOLINT
x_dim[axis], phi::make_ddim(vec)));
}
}; };
class UnStackOpMaker : public framework::OpProtoAndCheckerMaker { class UnStackOpMaker : public framework::OpProtoAndCheckerMaker {
...@@ -141,20 +106,12 @@ class UnStackGradOp : public framework::OperatorWithKernel { ...@@ -141,20 +106,12 @@ class UnStackGradOp : public framework::OperatorWithKernel {
namespace plat = paddle::platform; namespace plat = paddle::platform;
namespace ops = paddle::operators; namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(unstack, UnStackInferMetaFunctor,
PD_INFER_META(phi::UnStackInferMeta));
REGISTER_OPERATOR(unstack, ops::UnStackOp, ops::UnStackOpMaker, REGISTER_OPERATOR(unstack, ops::UnStackOp, ops::UnStackOpMaker,
ops::UnStackGradOpMaker<paddle::framework::OpDesc>, ops::UnStackGradOpMaker<paddle::framework::OpDesc>,
ops::UnStackGradOpMaker<paddle::imperative::OpBase>); ops::UnStackGradOpMaker<paddle::imperative::OpBase>,
UnStackInferMetaFunctor);
REGISTER_OPERATOR(unstack_grad, ops::UnStackGradOp); REGISTER_OPERATOR(unstack_grad, ops::UnStackGradOp);
REGISTER_OP_CPU_KERNEL(unstack,
ops::UnStackKernel<plat::CPUDeviceContext, float>,
ops::UnStackKernel<plat::CPUDeviceContext, double>,
ops::UnStackKernel<plat::CPUDeviceContext, int>,
ops::UnStackKernel<plat::CPUDeviceContext, int64_t>);
REGISTER_OP_CPU_KERNEL(unstack_grad,
ops::UnStackGradKernel<plat::CPUDeviceContext, float>,
ops::UnStackGradKernel<plat::CPUDeviceContext, double>,
ops::UnStackGradKernel<plat::CPUDeviceContext, int>,
ops::UnStackGradKernel<plat::CPUDeviceContext, int64_t>);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/for_range.h"
#if defined(__NVCC__) || defined(__HIPCC__)
#include <thrust/device_vector.h>
#endif
namespace paddle {
namespace operators {
template <typename VecXType, typename T>
struct StackFunctor {
HOSTDEVICE StackFunctor(const VecXType &x, T *y, int n, int post)
: x_(x), y_(y), n_(n), post_(post) {}
HOSTDEVICE void operator()(int idx) {
int i = idx / (n_ * post_);
int which_x = idx / post_ - i * n_;
int x_index = i * post_ + idx % post_;
y_[idx] = x_[which_x][x_index];
}
private:
VecXType x_;
T *y_;
int n_;
int post_;
};
template <typename VecDxType, typename T>
struct StackGradFunctor {
HOSTDEVICE StackGradFunctor(const VecDxType &dx, const T *dy, int n, int post)
: dx_(dx), dy_(dy), n_(n), post_(post) {}
HOSTDEVICE void operator()(int idx) {
int i = idx / (n_ * post_);
int which_x = idx / post_ - i * n_;
int x_index = i * post_ + idx % post_;
dx_[which_x][x_index] = dy_[idx];
}
private:
VecDxType dx_;
const T *dy_;
int n_;
int post_;
};
template <typename DeviceContext, typename VecXType, typename T>
static inline void StackFunctorForRange(const DeviceContext &ctx,
const VecXType &x, T *y, int total_num,
int n, int post) {
platform::ForRange<DeviceContext> for_range(ctx, total_num);
for_range(StackFunctor<VecXType, T>(x, y, n, post));
}
template <typename DeviceContext, typename VecDxType, typename T>
static inline void StackGradFunctorForRange(const DeviceContext &ctx,
const VecDxType &dx, const T *dy,
int total_num, int n, int post) {
platform::ForRange<DeviceContext> for_range(ctx, total_num);
for_range(StackGradFunctor<VecDxType, T>(dx, dy, n, post));
}
template <typename DeviceContext, typename T>
class UnStackGradKernel : public framework::OpKernel<T> {
using Tensor = framework::LoDTensor;
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto x = ctx.MultiInput<Tensor>(framework::GradVarName("Y"));
auto *y = ctx.Output<Tensor>(framework::GradVarName("X"));
int axis = ctx.Attr<int>("axis");
if (axis < 0) axis += (x[0]->dims().size() + 1);
int n = static_cast<int>(x.size());
auto *y_data = y->mutable_data<T>(ctx.GetPlace());
std::vector<const T *> x_datas(n);
for (int i = 0; i < n; i++) x_datas[i] = x[i]->data<T>();
int pre = 1;
int post = 1;
auto &dim = x[0]->dims();
for (auto i = 0; i < axis; ++i) pre *= dim[i];
for (auto i = axis; i < dim.size(); ++i) post *= dim[i];
#if defined(__NVCC__) || defined(__HIPCC__)
int total_num = pre * n * post;
auto &dev_ctx = ctx.template device_context<DeviceContext>();
thrust::device_vector<const T *> device_x_vec(x_datas);
auto x_data_arr = device_x_vec.data().get();
StackFunctorForRange(dev_ctx, x_data_arr, y_data, total_num, n, post);
// Wait() must be called because device_x_vec may be destructed before
// kernel ends
dev_ctx.Wait();
#else
auto x_data_arr = x_datas.data();
size_t x_offset = 0;
size_t y_offset = 0;
for (int i = 0; i < pre; i++) {
for (int j = 0; j < n; j++) {
std::memcpy(y_data + y_offset, x_data_arr[j] + x_offset,
post * sizeof(T));
y_offset += post;
}
x_offset += post;
}
#endif
}
};
template <typename DeviceContext, typename T>
class UnStackKernel : public framework::OpKernel<T> {
using Tensor = framework::LoDTensor;
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *dy = ctx.Input<Tensor>("X");
auto dx = ctx.MultiOutput<Tensor>("Y");
int axis = ctx.Attr<int>("axis");
if (axis < 0) axis += dy->dims().size();
int n = dy->dims()[axis];
std::vector<T *> dx_datas(n); // NOLINT
for (int i = 0; i < n; i++) {
dx_datas[i] = dx[i]->mutable_data<T>(ctx.GetPlace());
}
auto dy_data = dy->data<T>();
if (dy->numel() == 0) return;
int pre = 1;
for (int i = 0; i < axis; ++i) pre *= dy->dims()[i];
int total_num = dy->numel();
int post = total_num / (n * pre);
auto &dev_ctx = ctx.template device_context<DeviceContext>();
#if defined(__NVCC__) || defined(__HIPCC__)
thrust::device_vector<T *> device_dx_vec(dx_datas);
auto dx_data_arr = device_dx_vec.data().get();
#else
auto dx_data_arr = dx_datas.data();
#endif
StackGradFunctorForRange(dev_ctx, dx_data_arr, dy_data, total_num, n, post);
#if defined(__NVCC__) || defined(__HIPCC__)
// Wait() must be called because device_dx_vec may be destructed before
// kernel ends
dev_ctx.Wait();
#endif
}
};
} // namespace operators
} // namespace paddle
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/unstack_op.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle { namespace paddle {
......
...@@ -44,6 +44,10 @@ namespace phi { ...@@ -44,6 +44,10 @@ namespace phi {
_PhiForEachDataTypeHelper_( \ _PhiForEachDataTypeHelper_( \
callback, ::phi::dtype::complex<double>, DataType::COMPLEX128); callback, ::phi::dtype::complex<double>, DataType::COMPLEX128);
#define _PhiForEachDataTypeTiny_(callback) \
_PhiForEachDataTypeHelper_(callback, int, DataType::INT32); \
_PhiForEachDataTypeHelper_(callback, int64_t, DataType::INT64);
template <typename Visitor> template <typename Visitor>
inline void VisitDataType(phi::DataType type, Visitor visitor) { inline void VisitDataType(phi::DataType type, Visitor visitor) {
#define PhiVisitDataTypeCallback(cpp_type, data_type) \ #define PhiVisitDataTypeCallback(cpp_type, data_type) \
...@@ -59,4 +63,21 @@ inline void VisitDataType(phi::DataType type, Visitor visitor) { ...@@ -59,4 +63,21 @@ inline void VisitDataType(phi::DataType type, Visitor visitor) {
PADDLE_THROW(phi::errors::Unimplemented( PADDLE_THROW(phi::errors::Unimplemented(
"Not supported phi::DataType(%d) as data type.", static_cast<int>(type))); "Not supported phi::DataType(%d) as data type.", static_cast<int>(type)));
} }
template <typename Visitor>
inline void VisitDataTypeTiny(phi::DataType type, Visitor visitor) {
#define PhiVisitDataTypeCallbackTiny(cpp_type, data_type) \
do { \
if (type == data_type) { \
visitor.template apply<cpp_type>(); \
return; \
} \
} while (0)
_PhiForEachDataTypeTiny_(PhiVisitDataTypeCallbackTiny);
#undef PhiVisitDataTypeCallbackTiny
PADDLE_THROW(phi::errors::Unimplemented(
"Not supported phi::DataType(%d) as data type.", static_cast<int>(type)));
}
} // namespace phi } // namespace phi
...@@ -1167,6 +1167,52 @@ void RnnInferMeta(const MetaTensor& x, ...@@ -1167,6 +1167,52 @@ void RnnInferMeta(const MetaTensor& x,
} }
} }
void StackInferMeta(const std::vector<MetaTensor*>& x,
int axis,
MetaTensor* out) {
PADDLE_ENFORCE_GT(x.size(),
0UL,
phi::errors::InvalidArgument(
"Number of Inputs(x) must be larger than 0, but"
" received value is:%d.",
x.size()));
const auto& input_dims = GetMetaTensorsDim(x);
for (size_t i = 1; i < input_dims.size(); ++i) {
PADDLE_ENFORCE_EQ(input_dims[i],
input_dims[0],
phi::errors::InvalidArgument(
"Dims of all Inputs(X) must be the same, but"
" received input %d dim is:%d not equal to input 0"
" dim:%d.",
i,
input_dims[i],
input_dims[0]));
}
int rank = input_dims[0].size();
PADDLE_ENFORCE_GE(
axis,
-(rank + 1),
phi::errors::InvalidArgument(
"Attr(axis) must be inside [-(rank+1), rank+1), where rank = %d, "
"but received axis is:%d.",
rank,
axis));
PADDLE_ENFORCE_LT(
axis,
rank + 1,
phi::errors::InvalidArgument(
"Attr(axis) must be inside [-(rank+1), rank+1), where rank = %d, "
"but received axis is:%d",
rank,
axis));
if (axis < 0) axis += (rank + 1);
auto vec = phi::vectorize<int>(input_dims[0]);
vec.insert(vec.begin() + axis, input_dims.size());
out->set_dims(phi::make_ddim(vec));
out->set_dtype(x.at(0)->dtype());
out->share_lod(*x.at(0));
}
void WarpctcInferMeta(const MetaTensor& logits, void WarpctcInferMeta(const MetaTensor& logits,
const MetaTensor& label, const MetaTensor& label,
const paddle::optional<const MetaTensor&> logits_length, const paddle::optional<const MetaTensor&> logits_length,
......
...@@ -231,6 +231,10 @@ void RnnInferMeta(const MetaTensor& x, ...@@ -231,6 +231,10 @@ void RnnInferMeta(const MetaTensor& x,
std::vector<MetaTensor*> state, std::vector<MetaTensor*> state,
MetaTensor* reserve); MetaTensor* reserve);
void StackInferMeta(const std::vector<MetaTensor*>& x,
int axis,
MetaTensor* out);
void WarpctcInferMeta(const MetaTensor& logits, void WarpctcInferMeta(const MetaTensor& logits,
const MetaTensor& label, const MetaTensor& label,
const paddle::optional<const MetaTensor&> logits_length, const paddle::optional<const MetaTensor&> logits_length,
......
...@@ -345,6 +345,56 @@ void PutAlongAxisInferMeta(const MetaTensor& x, ...@@ -345,6 +345,56 @@ void PutAlongAxisInferMeta(const MetaTensor& x,
out->set_dtype(x.dtype()); out->set_dtype(x.dtype());
} }
void RangeInferMeta(const MetaTensor& start,
const MetaTensor& end,
const MetaTensor& step,
MetaTensor* out) {
auto start_dims = start.dims();
auto end_dims = end.dims();
auto step_dims = step.dims();
PADDLE_ENFORCE_EQ(
start_dims.size(),
1,
phi::errors::InvalidArgument(
"The dim of the shape of Input(Start) should be 1, but got %d",
start_dims.size()));
PADDLE_ENFORCE_EQ(start_dims[0],
1,
phi::errors::InvalidArgument(
"The first dim of the shape of Input(Start) should "
"be 1, but got %d",
start_dims[0]));
PADDLE_ENFORCE_EQ(
end_dims.size(),
1,
phi::errors::InvalidArgument(
"The dim of the shape of Input(End) should be 1, but got %d",
end_dims.size()));
PADDLE_ENFORCE_EQ(
end_dims[0],
1,
phi::errors::InvalidArgument("The first dim of the shape of "
"Input(End) should be 1, but got %d",
end_dims[0]));
PADDLE_ENFORCE_EQ(
step_dims.size(),
1,
phi::errors::InvalidArgument(
"The dim of the shape of Input(Step) should be 1, but got %d",
step_dims.size()));
PADDLE_ENFORCE_EQ(step_dims[0],
1,
phi::errors::InvalidArgument(
"The first dim of the shape of Input(Step) should "
"be 1, but got %d",
step_dims[0]));
out->set_dims({-1});
out->set_dtype(start.dtype());
}
void RoiAlignInferMeta(const MetaTensor& x, void RoiAlignInferMeta(const MetaTensor& x,
const MetaTensor& boxes, const MetaTensor& boxes,
paddle::optional<const MetaTensor&> boxes_num, paddle::optional<const MetaTensor&> boxes_num,
......
...@@ -81,6 +81,11 @@ void PutAlongAxisInferMeta(const MetaTensor& x, ...@@ -81,6 +81,11 @@ void PutAlongAxisInferMeta(const MetaTensor& x,
const std::string& reduce, const std::string& reduce,
MetaTensor* out); MetaTensor* out);
void RangeInferMeta(const MetaTensor& start,
const MetaTensor& end,
const MetaTensor& step,
MetaTensor* out);
void RoiAlignInferMeta(const MetaTensor& x, void RoiAlignInferMeta(const MetaTensor& x,
const MetaTensor& boxes, const MetaTensor& boxes,
paddle::optional<const MetaTensor&> boxes_num, paddle::optional<const MetaTensor&> boxes_num,
......
...@@ -2552,6 +2552,85 @@ void UnfoldInferMeta(const MetaTensor& x, ...@@ -2552,6 +2552,85 @@ void UnfoldInferMeta(const MetaTensor& x,
out->set_dims(phi::make_ddim(out_dims)); out->set_dims(phi::make_ddim(out_dims));
} }
void UniqueInferMeta(const MetaTensor& x,
bool return_index,
bool return_inverse,
bool return_counts,
const std::vector<int>& axis,
DataType dtype,
MetaTensor* out,
MetaTensor* indices,
MetaTensor* index,
MetaTensor* counts) {
bool is_sorted = true;
UniqueRawInferMeta(x,
return_index,
return_inverse,
return_counts,
axis,
dtype,
is_sorted,
out,
indices,
index,
counts);
}
void UniqueRawInferMeta(const MetaTensor& x,
bool return_index,
bool return_inverse,
bool return_counts,
const std::vector<int>& axis,
DataType dtype,
bool is_sorted,
MetaTensor* out,
MetaTensor* indices,
MetaTensor* index,
MetaTensor* counts) {
if (!is_sorted) {
PADDLE_ENFORCE_EQ(
x.dims().size(),
1,
phi::errors::InvalidArgument("The Input(X) should be 1-D Tensor, "
"But now the dims of Input(X) is %d.",
x.dims().size()));
out->set_dims(phi::make_ddim({-1}));
index->set_dims(x.dims());
return;
}
if (axis.empty()) {
out->set_dims(phi::make_ddim({-1}));
if (return_inverse) {
index->set_dims(phi::make_ddim({phi::product(x.dims())}));
}
} else {
int axis_value = axis[0];
if (axis_value < 0) {
axis_value += x.dims().size();
}
PADDLE_ENFORCE_LT(
axis_value,
x.dims().size(),
phi::errors::InvalidArgument("The axis(%d) should be less than "
"the dimension size(%d) of x.",
axis_value,
x.dims().size()));
auto out_dims = x.dims();
out_dims[axis_value] = -1;
out->set_dims(out_dims);
if (return_inverse) {
index->set_dims(phi::make_ddim({x.dims()[axis_value]}));
}
}
if (return_index) {
indices->set_dims(phi::make_ddim({-1}));
}
if (return_counts) {
counts->set_dims(phi::make_ddim({-1}));
}
}
void UnsqueezeInferMeta(const MetaTensor& x, void UnsqueezeInferMeta(const MetaTensor& x,
const IntArray& axes, const IntArray& axes,
MetaTensor* xshape, MetaTensor* xshape,
...@@ -2595,6 +2674,53 @@ void UnsqueezeInferMeta(const MetaTensor& x, ...@@ -2595,6 +2674,53 @@ void UnsqueezeInferMeta(const MetaTensor& x,
xshape->set_dtype(x.dtype()); xshape->set_dtype(x.dtype());
} }
void UnStackInferMeta(const MetaTensor& x,
int axis,
int num,
std::vector<MetaTensor*> outs) {
auto x_dim = x.dims();
int rank = x_dim.size();
PADDLE_ENFORCE_GE(axis,
-rank,
phi::errors::InvalidArgument(
"The attribute axis is out of range, it must be "
"inside [-rank, rank), where rank = %d",
rank));
PADDLE_ENFORCE_LT(axis,
rank,
phi::errors::InvalidArgument(
"The attribute axis is out of range, it must be "
"inside [-rank, rank), where rank = %d",
rank));
if (axis < 0) axis += rank;
size_t output_count = outs.size();
PADDLE_ENFORCE_EQ(output_count,
static_cast<size_t>(num),
phi::errors::InvalidArgument(
"Number of Outputs(Y) is wrong. Got %d , but it must "
"equal to attribute num which is %d.",
output_count,
static_cast<size_t>(num)));
if (x_dim[axis] > 0) {
PADDLE_ENFORCE_EQ(
num,
x_dim[axis],
phi::errors::InvalidArgument(
"The number of attribute num is not equal to the length of the "
"%d axis of Input(X). Expect %d but got %d.",
axis,
x_dim[axis],
num));
}
auto vec = phi::vectorize<int>(x_dim);
vec.erase(vec.begin() + axis);
for (size_t i = 0; i < output_count; i++) {
outs[i]->set_dims(phi::make_ddim(vec));
outs[i]->set_dtype(x.dtype());
}
}
void OneHotRawInferMeta(const MetaTensor& x, void OneHotRawInferMeta(const MetaTensor& x,
int32_t depth, int32_t depth,
DataType dtype, DataType dtype,
......
...@@ -360,12 +360,40 @@ void UnfoldInferMeta(const MetaTensor& x, ...@@ -360,12 +360,40 @@ void UnfoldInferMeta(const MetaTensor& x,
MetaTensor* out, MetaTensor* out,
MetaConfig config = MetaConfig()); MetaConfig config = MetaConfig());
void UniqueInferMeta(const MetaTensor& x,
bool return_index,
bool return_inverse,
bool return_counts,
const std::vector<int>& axis,
DataType dtype,
MetaTensor* out,
MetaTensor* indices,
MetaTensor* index,
MetaTensor* counts);
void UniqueRawInferMeta(const MetaTensor& x,
bool return_index,
bool return_inverse,
bool return_counts,
const std::vector<int>& axis,
DataType dtype,
bool is_sorted,
MetaTensor* out,
MetaTensor* indices,
MetaTensor* index,
MetaTensor* counts);
void UnsqueezeInferMeta(const MetaTensor& x, void UnsqueezeInferMeta(const MetaTensor& x,
const IntArray& axes, const IntArray& axes,
MetaTensor* xshape, MetaTensor* xshape,
MetaTensor* out, MetaTensor* out,
MetaConfig config = MetaConfig()); MetaConfig config = MetaConfig());
void UnStackInferMeta(const MetaTensor& x,
int axis,
int num,
std::vector<MetaTensor*> outs);
void OneHotRawInferMeta(const MetaTensor& x, void OneHotRawInferMeta(const MetaTensor& x,
int32_t depth, int32_t depth,
DataType dtype, DataType dtype,
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/range_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/range_function.h"
namespace phi {
template <typename T, typename Context>
void RangeKernel(const Context& dev_ctx,
const DenseTensor& start,
const DenseTensor& end,
const DenseTensor& step,
DenseTensor* out) {
T start_value = start.data<T>()[0];
T end_value = end.data<T>()[0];
T step_value = step.data<T>()[0];
int64_t size = 0;
phi::funcs::GetSize(start_value, end_value, step_value, &size);
out->Resize(phi::make_ddim({size}));
T* out_data = dev_ctx.template Alloc<T>(out);
T value = start_value;
for (int64_t i = 0; i < size; ++i) {
out_data[i] = value;
value += step_value;
}
}
} // namespace phi
PD_REGISTER_KERNEL(
range, CPU, ALL_LAYOUT, phi::RangeKernel, float, double, int, int64_t) {}
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/stack_grad_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/stack_functor.h"
namespace phi {
template <typename T, typename Context>
void StackGradKernel(const Context& dev_ctx,
const DenseTensor& out,
int axis,
std::vector<DenseTensor*> x_grad) {
if (axis < 0) axis += out.dims().size();
int n = out.dims()[axis];
std::vector<T*> dx_datas(n); // NOLINT
for (int i = 0; i < n; i++) {
if (x_grad[i] == nullptr) {
dx_datas[i] = nullptr;
} else {
dx_datas[i] = dev_ctx.template Alloc<T>(x_grad[i]);
}
}
auto dy_data = out.data<T>();
int pre = 1;
for (int i = 0; i < axis; ++i) pre *= out.dims()[i];
int total_num = out.numel();
int post = total_num / (n * pre);
auto dx_data_arr = dx_datas.data();
phi::funcs::StackGradFunctorForRange(
dev_ctx, dx_data_arr, dy_data, total_num, n, post);
}
} // namespace phi
PD_REGISTER_KERNEL(stack_grad,
CPU,
ALL_LAYOUT,
phi::StackGradKernel,
float,
double,
int64_t,
int,
phi::dtype::bfloat16) {}
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/stack_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename Context>
void StackKernel(const Context& dev_ctx,
const std::vector<const DenseTensor*>& x,
int axis,
DenseTensor* out) {
if (axis < 0) axis += (x[0]->dims().size() + 1);
int n = static_cast<int>(x.size());
T* y_data = dev_ctx.template Alloc<T>(out);
std::vector<const T*> x_datas(n);
for (int i = 0; i < n; i++) x_datas[i] = x[i]->data<T>();
int pre = 1, post = 1;
auto& dim = x[0]->dims();
for (auto i = 0; i < axis; ++i) pre *= dim[i];
for (auto i = axis; i < dim.size(); ++i) post *= dim[i];
auto x_data_arr = x_datas.data();
size_t x_offset = 0;
size_t y_offset = 0;
for (int i = 0; i < pre; i++) {
for (int j = 0; j < n; j++) {
std::memcpy(
y_data + y_offset, x_data_arr[j] + x_offset, post * sizeof(T));
y_offset += post;
}
x_offset += post;
}
}
} // namespace phi
PD_REGISTER_KERNEL(stack,
CPU,
ALL_LAYOUT,
phi::StackKernel,
float,
double,
int,
int64_t,
phi::dtype::bfloat16) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/unique_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/utils/data_type.h"
#include "paddle/phi/kernels/funcs/unique_functor.h"
namespace phi {
template <typename T, typename Context>
void UniqueKernel(const Context& context,
const DenseTensor& x,
bool return_index,
bool return_inverse,
bool return_counts,
const std::vector<int>& axis,
DataType dtype,
DenseTensor* out,
DenseTensor* indices,
DenseTensor* index,
DenseTensor* counts) {
bool is_sorted = true;
UniqueRawKernel<T, Context>(context,
x,
return_index,
return_inverse,
return_counts,
axis,
dtype,
is_sorted,
out,
indices,
index,
counts);
}
template <typename T, typename Context>
void UniqueRawKernel(const Context& context,
const DenseTensor& x,
bool return_index,
bool return_inverse,
bool return_counts,
const std::vector<int>& axis,
DataType dtype,
bool is_sorted,
DenseTensor* out,
DenseTensor* indices,
DenseTensor* index,
DenseTensor* counts) {
if (dtype == phi::DataType::INT32) {
PADDLE_ENFORCE_LE(
x.numel(),
INT_MAX,
phi::errors::InvalidArgument(
"The number of elements in Input(X) should be less than or "
"equal to INT_MAX, but received num is %d. Please set `dtype` to "
"int64.",
x.numel()));
}
if (!is_sorted) {
phi::VisitDataType(
dtype,
phi::funcs::UniqueOpFunctor<Context, T>(context, out, index, &x));
return;
}
if (x.numel() == 0) {
context.template Alloc<T>(out);
return;
}
if (axis.empty()) {
phi::VisitDataTypeTiny(
dtype,
phi::funcs::UniqueFlattendTensorFunctor<Context, T>(context,
x,
out,
indices,
index,
counts,
return_index,
return_inverse,
return_counts));
} else {
int axis_value = axis[0];
phi::VisitDataTypeTiny(
dtype,
phi::funcs::UniqueDimFunctor<Context, T>(context,
x,
out,
indices,
index,
counts,
axis_value,
return_index,
return_inverse,
return_counts));
}
}
} // namespace phi
PD_REGISTER_KERNEL(unique,
CPU,
ALL_LAYOUT,
phi::UniqueKernel,
float,
double,
int32_t,
int64_t) {}
PD_REGISTER_KERNEL(unique_raw,
CPU,
ALL_LAYOUT,
phi::UniqueRawKernel,
float,
double,
int32_t,
int64_t) {}
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
...@@ -12,21 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,21 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/unstack_op.h" #include "paddle/phi/kernels/unstack_grad_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/unstack_grad_kernel_impl.h"
namespace plat = paddle::platform; PD_REGISTER_KERNEL(unstack_grad,
namespace ops = paddle::operators; CPU,
ALL_LAYOUT,
REGISTER_OP_CUDA_KERNEL( phi::UnStackGradKernel,
unstack, ops::UnStackKernel<plat::CUDADeviceContext, float>, float,
ops::UnStackKernel<plat::CUDADeviceContext, double>, double,
ops::UnStackKernel<plat::CUDADeviceContext, int>, int,
ops::UnStackKernel<plat::CUDADeviceContext, int64_t>, int64_t) {}
ops::UnStackKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(
unstack_grad, ops::UnStackGradKernel<plat::CUDADeviceContext, float>,
ops::UnStackGradKernel<plat::CUDADeviceContext, double>,
ops::UnStackGradKernel<plat::CUDADeviceContext, int>,
ops::UnStackGradKernel<plat::CUDADeviceContext, int64_t>,
ops::UnStackGradKernel<plat::CUDADeviceContext, plat::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/unstack_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/unstack_kernel_impl.h"
PD_REGISTER_KERNEL(
unstack, CPU, ALL_LAYOUT, phi::UnStackKernel, float, double, int, int64_t) {
}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/enforce.h"
namespace phi {
namespace funcs {
template <typename T>
void GetSize(T start, T end, T step, int64_t* size) {
PADDLE_ENFORCE_NE(
step,
0,
phi::errors::InvalidArgument("The step of range op should not be 0."));
if (start < end) {
PADDLE_ENFORCE_GT(
step,
0,
phi::errors::InvalidArgument(
"The step should be greater than 0 while start < end."));
}
if (start > end) {
PADDLE_ENFORCE_LT(step,
0,
phi::errors::InvalidArgument(
"The step should be less than 0 while start > end."));
}
*size = std::is_integral<T>::value
? ((std::abs(end - start) + std::abs(step) - 1) / std::abs(step))
: std::ceil(std::abs((end - start) / step));
}
} // namespace funcs
} // namespace phi
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
...@@ -14,12 +14,29 @@ ...@@ -14,12 +14,29 @@
#pragma once #pragma once
#include <memory> #include "paddle/phi/kernels/funcs/for_range.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/for_range.h"
namespace paddle { namespace phi {
namespace operators { namespace funcs {
template <typename VecXType, typename T>
struct StackFunctor {
HOSTDEVICE StackFunctor(const VecXType &x, T *y, int n, int post)
: x_(x), y_(y), n_(n), post_(post) {}
HOSTDEVICE void operator()(int idx) {
int i = idx / (n_ * post_);
int which_x = idx / post_ - i * n_;
int x_index = i * post_ + idx % post_;
y_[idx] = x_[which_x][x_index];
}
private:
VecXType x_;
T *y_;
int n_;
int post_;
};
template <typename VecDxType, typename T> template <typename VecDxType, typename T>
struct StackGradFunctor { struct StackGradFunctor {
...@@ -40,81 +57,27 @@ struct StackGradFunctor { ...@@ -40,81 +57,27 @@ struct StackGradFunctor {
int post_; int post_;
}; };
template <typename DeviceContext, typename VecXType, typename T>
static inline void StackFunctorForRange(const DeviceContext &ctx,
const VecXType &x,
T *y,
int total_num,
int n,
int post) {
phi::funcs::ForRange<DeviceContext> for_range(ctx, total_num);
for_range(StackFunctor<VecXType, T>(x, y, n, post));
}
template <typename DeviceContext, typename VecDxType, typename T> template <typename DeviceContext, typename VecDxType, typename T>
static inline void StackGradFunctorForRange(const DeviceContext &ctx, static inline void StackGradFunctorForRange(const DeviceContext &ctx,
const VecDxType &dx, const T *dy, const VecDxType &dx,
int total_num, int n, int post) { const T *dy,
platform::ForRange<DeviceContext> for_range(ctx, total_num); int total_num,
int n,
int post) {
phi::funcs::ForRange<DeviceContext> for_range(ctx, total_num);
for_range(StackGradFunctor<VecDxType, T>(dx, dy, n, post)); for_range(StackGradFunctor<VecDxType, T>(dx, dy, n, post));
} }
template <typename DeviceContext, typename T> } // namespace funcs
class StackKernel : public framework::OpKernel<T> { } // namespace phi
using Tensor = framework::LoDTensor;
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto x = ctx.MultiInput<Tensor>("X");
auto *y = ctx.Output<Tensor>("Y");
int axis = ctx.Attr<int>("axis");
if (axis < 0) axis += (x[0]->dims().size() + 1);
int n = static_cast<int>(x.size());
auto *y_data = y->mutable_data<T>(ctx.GetPlace());
std::vector<const T *> x_datas(n);
for (int i = 0; i < n; i++) x_datas[i] = x[i]->data<T>();
int pre = 1, post = 1;
auto &dim = x[0]->dims();
for (auto i = 0; i < axis; ++i) pre *= dim[i];
for (auto i = axis; i < dim.size(); ++i) post *= dim[i];
auto x_data_arr = x_datas.data();
size_t x_offset = 0;
size_t y_offset = 0;
for (int i = 0; i < pre; i++) {
for (int j = 0; j < n; j++) {
std::memcpy(y_data + y_offset, x_data_arr[j] + x_offset,
post * sizeof(T));
y_offset += post;
}
x_offset += post;
}
}
};
template <typename DeviceContext, typename T>
class StackGradKernel : public framework::OpKernel<T> {
using Tensor = framework::LoDTensor;
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *dy = ctx.Input<Tensor>(framework::GradVarName("Y"));
auto dx = ctx.MultiOutput<Tensor>(framework::GradVarName("X"));
int axis = ctx.Attr<int>("axis");
if (axis < 0) axis += dy->dims().size();
int n = dy->dims()[axis];
std::vector<T *> dx_datas(n); // NOLINT
for (int i = 0; i < n; i++) {
if (dx[i] == nullptr) {
dx_datas[i] = nullptr;
} else {
dx_datas[i] = dx[i]->mutable_data<T>(ctx.GetPlace());
}
}
auto dy_data = dy->data<T>();
int pre = 1;
for (int i = 0; i < axis; ++i) pre *= dy->dims()[i];
int total_num = dy->numel();
int post = total_num / (n * pre);
auto &dev_ctx = ctx.template device_context<DeviceContext>();
auto dx_data_arr = dx_datas.data();
StackGradFunctorForRange(dev_ctx, dx_data_arr, dy_data, total_num, n, post);
}
};
} // namespace operators
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
namespace funcs {
template <typename Context, typename InT>
struct UniqueOpFunctor {
const Context& context_;
DenseTensor* out_;
DenseTensor* index_;
const DenseTensor* in_;
DenseTensor* count_;
UniqueOpFunctor(const Context& context,
DenseTensor* out,
DenseTensor* index,
const DenseTensor* in,
DenseTensor* count = nullptr)
: context_(context), out_(out), index_(index), in_(in), count_(count) {}
template <typename IndexT>
void apply() const {
auto* in_data = in_->data<InT>();
auto* index_data = context_.template Alloc<IndexT>(index_);
int64_t j = 0;
// TODO(fangzeyang): Should optimize performance here.
std::unordered_map<InT, int64_t> dict;
std::vector<InT> uniq;
PADDLE_ENFORCE_LT(
in_->numel(),
pow(2, 31),
phi::errors::InvalidArgument(
"The num of Input(X) elements should be less then INT_MAX, "
"but received num is %d.",
in_->numel()));
for (auto i = 0; i < in_->numel(); i++) {
auto it = dict.find(in_data[i]);
if (it == dict.end()) {
dict.emplace(std::make_pair(in_data[i], j));
uniq.emplace_back(in_data[i]);
index_data[i] = static_cast<IndexT>(j);
j++;
} else {
index_data[i] = static_cast<IndexT>(it->second);
}
}
if (count_ != nullptr) {
// Resize the count tensor dims to allocate the memory
count_->Resize(phi::make_ddim({static_cast<int64_t>(uniq.size())}));
IndexT* count_data = context_.template Alloc<IndexT>(count_);
// init count_data to 0
memset(count_data, 0, uniq.size() * sizeof(IndexT));
const auto& index_type = index_->dtype();
bool index_type_match =
index_type == DataType::INT32 || index_type == DataType::INT64;
PADDLE_ENFORCE_EQ(
index_type_match,
true,
phi::errors::InvalidArgument(
"Index holds the wrong type, it holds %s, "
"but desires to be %s or %s",
paddle::framework::DataTypeToString(
paddle::framework::TransToProtoVarType(index_type)),
paddle::framework::DataTypeToString(
paddle::framework::TransToProtoVarType(DataType::INT32)),
paddle::framework::DataTypeToString(
paddle::framework::TransToProtoVarType(DataType::INT64))));
if (index_type == DataType::INT32) {
for (auto i = 0; i < in_->numel(); ++i) {
const IndexT& index = index_data[i];
count_data[static_cast<int32_t>(index)] += static_cast<IndexT>(1);
}
} else {
for (auto i = 0; i < in_->numel(); ++i) {
const IndexT& index = index_data[i];
count_data[static_cast<int64_t>(index)] += static_cast<IndexT>(1);
}
}
}
out_->Resize(phi::make_ddim({static_cast<int64_t>(uniq.size())}));
auto* out_data = context_.template Alloc<InT>(out_);
std::memcpy(out_data, uniq.data(), uniq.size() * sizeof(InT));
}
};
static std::vector<DenseTensor> Unbind(const DenseTensor& in) {
int64_t size = in.dims()[0];
std::vector<DenseTensor> tensors(size);
for (int64_t i = 0; i < size; ++i) {
tensors[i] = in.Slice(i, i + 1);
}
return tensors;
}
template <typename T>
static bool Equal(const DenseTensor& a, const DenseTensor& b) {
if (a.numel() != b.numel()) {
return false;
}
for (int64_t i = 0; i < a.numel(); ++i) {
if (a.data<T>()[i] != b.data<T>()[i]) {
return false;
}
}
return true;
}
template <typename Context, typename InT, typename IndexT>
static void UniqueFlattendTensor(const Context& context,
const DenseTensor& in,
DenseTensor* out,
DenseTensor* indices,
DenseTensor* index,
DenseTensor* count,
bool return_index,
bool return_inverse,
bool return_counts) {
const InT* in_data = in.data<InT>();
std::set<InT> unique(in_data, in_data + in.numel());
out->Resize(phi::make_ddim({static_cast<int64_t>(unique.size())}));
auto* out_data = context.template Alloc<InT>(out);
std::copy(unique.begin(), unique.end(), out_data);
if (return_index) {
indices->Resize(phi::make_ddim({out->numel()}));
auto indices_data = context.template Alloc<IndexT>(indices);
std::unordered_map<InT, IndexT> indices_map;
indices_map.reserve(out->numel());
for (int64_t i = 0; i < in.numel(); ++i) {
if (indices_map.find(in_data[i]) != indices_map.end()) continue;
indices_map[in_data[i]] = i;
}
for (int64_t i = 0; i < out->numel(); ++i) {
indices_data[i] = indices_map[out_data[i]];
}
}
if (return_inverse) {
index->Resize(phi::make_ddim({in.numel()}));
auto inverse_data = context.template Alloc<IndexT>(index);
std::unordered_map<InT, IndexT> inverse_map;
inverse_map.reserve(out->numel());
for (int64_t i = 0; i < out->numel(); ++i) {
inverse_map[out_data[i]] = i;
}
for (int64_t i = 0; i < in.numel(); ++i) {
inverse_data[i] = inverse_map[in_data[i]];
}
}
if (return_counts) {
count->Resize(phi::make_ddim({out->numel()}));
auto count_data = context.template Alloc<IndexT>(count);
std::unordered_map<InT, IndexT> counts_map;
counts_map.reserve(out->numel());
for (int64_t i = 0; i < out->numel(); ++i) {
counts_map[out_data[i]] = 0;
}
for (int64_t i = 0; i < in.numel(); i++) {
counts_map[in_data[i]] += 1;
}
for (int64_t i = 0; i < out->numel(); i++) {
count_data[i] = counts_map[out_data[i]];
}
}
}
template <typename Context, typename ForwardIt, typename InT, typename IndexT>
static ForwardIt UniqueDimImpl(const Context& context,
ForwardIt first,
ForwardIt last,
const std::vector<IndexT>& sorted_indices_vec,
std::vector<IndexT>* inverse_vec,
std::vector<IndexT>* counts_vec,
std::vector<IndexT>* indices_vec) {
if (first == last) {
return last;
}
(*inverse_vec)[sorted_indices_vec[0]] = 0;
(*counts_vec)[0] = 1;
(*indices_vec)[0] = sorted_indices_vec[0];
ForwardIt begin = first;
ForwardIt result = first;
while (++first != last) {
int64_t idx_first = std::distance(begin, first);
int64_t idx_result = std::distance(begin, result);
if (!Equal<InT>(*result, *first)) {
if (++result != first) {
*result = std::move(*first);
}
idx_result += 1;
(*indices_vec)[idx_result] = sorted_indices_vec[idx_first];
}
(*inverse_vec)[sorted_indices_vec[idx_first]] = idx_result;
(*counts_vec)[idx_result] += 1;
}
return ++result;
}
template <typename Context, typename InT, typename IndexT>
static void UniqueDim(const Context& context,
const DenseTensor& in,
DenseTensor* out,
DenseTensor* indices,
DenseTensor* index,
DenseTensor* count,
bool return_index,
bool return_inverse,
bool return_counts,
int axis) {
// transpose tensor: eg. axis=1, [dim0, dim1, dim2] -> [dim1, dim0, dim2]
std::vector<int> permute(in.dims().size());
std::iota(permute.begin(), permute.end(), 0);
permute[axis] = 0;
permute[0] = axis;
std::vector<int64_t> in_trans_dims_vec(phi::vectorize(in.dims()));
in_trans_dims_vec[axis] = in.dims()[0];
in_trans_dims_vec[0] = in.dims()[axis];
DenseTensor in_trans;
phi::DDim in_trans_dims = phi::make_ddim(in_trans_dims_vec);
in_trans.Resize(in_trans_dims);
context.template Alloc<InT>(&in_trans);
TransCompute<Context, InT>(in.dims().size(), context, in, &in_trans, permute);
// reshape tensor: eg. [dim1, dim0, dim2] -> [dim1, dim0*dim2]
phi::DDim in_trans_flat_dims = phi::flatten_to_2d(in_trans_dims, 1);
in_trans.Resize(in_trans_flat_dims);
// sort indices
std::vector<IndexT> sorted_indices_vec(in_trans.dims()[0]);
std::iota(sorted_indices_vec.begin(), sorted_indices_vec.end(), 0);
int64_t col = in_trans.dims()[1];
const InT* in_trans_data = in_trans.data<InT>();
std::sort(sorted_indices_vec.begin(),
sorted_indices_vec.end(),
[&](int64_t a, int64_t b) -> bool {
for (int64_t i = 0; i < col; ++i) {
InT lhs = in_trans_data[i + a * col];
InT rhs = in_trans_data[i + b * col];
if (lhs < rhs) {
return true;
} else if (lhs > rhs) {
return false;
}
}
return false;
});
// sort tensor according to indices
DenseTensor input_sorted;
input_sorted.Resize(in_trans_dims);
context.template Alloc<InT>(&input_sorted);
InT* input_sorted_data = input_sorted.data<InT>();
for (size_t i = 0; i < sorted_indices_vec.size(); ++i) {
memcpy(input_sorted_data + i * col,
in_trans_data + static_cast<int64_t>(sorted_indices_vec[i]) * col,
col * sizeof(InT));
}
std::vector<DenseTensor> input_unbind = Unbind(input_sorted);
std::vector<IndexT> inverse_vec(sorted_indices_vec.size(), 0);
std::vector<IndexT> counts_vec(sorted_indices_vec.size(), 0);
std::vector<IndexT> indices_vec(sorted_indices_vec.size(), 0);
auto last = UniqueDimImpl<Context, std::vector<DenseTensor>::iterator, InT>(
context,
input_unbind.begin(),
input_unbind.end(),
sorted_indices_vec,
&inverse_vec,
&counts_vec,
&indices_vec);
input_unbind.erase(last, input_unbind.end());
counts_vec.erase(counts_vec.begin() + input_unbind.size(), counts_vec.end());
indices_vec.erase(indices_vec.begin() + input_unbind.size(),
indices_vec.end());
phi::funcs::ConcatFunctor<Context, InT> concat_functor;
DenseTensor out_trans;
std::vector<int64_t> out_trans_dims_vec = in_trans_dims_vec;
out_trans_dims_vec[0] = input_unbind.size();
out_trans.Resize(phi::make_ddim(out_trans_dims_vec));
context.template Alloc<InT>(&out_trans);
std::swap(out_trans_dims_vec[0], out_trans_dims_vec[axis]);
out->Resize(phi::make_ddim(out_trans_dims_vec));
context.template Alloc<InT>(out);
concat_functor(context, input_unbind, 0, &out_trans);
TransCompute<Context, InT>(
out_trans.dims().size(), context, out_trans, out, permute);
if (return_inverse) {
paddle::framework::TensorFromVector(inverse_vec, context, index);
}
if (return_counts) {
paddle::framework::TensorFromVector(counts_vec, context, count);
}
if (return_index) {
paddle::framework::TensorFromVector(indices_vec, context, indices);
}
}
template <typename Context, typename InT>
struct UniqueFlattendTensorFunctor {
const Context& ctx_; /* */
const DenseTensor& in_;
DenseTensor* out_;
DenseTensor* indices_;
DenseTensor* index_;
DenseTensor* count_;
const bool return_index_;
const bool return_inverse_;
const bool return_counts_;
UniqueFlattendTensorFunctor(const Context& context,
const DenseTensor& in,
DenseTensor* out,
DenseTensor* indices,
DenseTensor* index,
DenseTensor* count,
bool return_index,
bool return_inverse,
bool return_counts)
: ctx_(context),
in_(in),
out_(out),
indices_(indices),
index_(index),
count_(count),
return_index_(return_index),
return_inverse_(return_inverse),
return_counts_(return_counts) {}
template <typename IndexT>
void apply() const {
UniqueFlattendTensor<Context, InT, IndexT>(ctx_,
in_,
out_,
indices_,
index_,
count_,
return_index_,
return_inverse_,
return_counts_);
}
};
template <typename Context, typename InT>
struct UniqueDimFunctor {
const Context& ctx_;
const DenseTensor& in_;
DenseTensor* out_;
DenseTensor* indices_;
DenseTensor* index_;
DenseTensor* count_;
const int axis_;
const bool return_index_;
const bool return_inverse_;
const bool return_counts_;
UniqueDimFunctor(const Context& context,
const DenseTensor& in,
DenseTensor* out,
DenseTensor* indices,
DenseTensor* index,
DenseTensor* count,
const int axis,
bool return_index,
bool return_inverse,
bool return_counts)
: ctx_(context),
in_(in),
out_(out),
indices_(indices),
index_(index),
count_(count),
axis_(axis),
return_index_(return_index),
return_inverse_(return_inverse),
return_counts_(return_counts) {}
template <typename IndexT>
void apply() const {
UniqueDim<Context, InT, IndexT>(ctx_,
in_,
out_,
indices_,
index_,
count_,
return_index_,
return_inverse_,
return_counts_,
axis_);
}
};
} // namespace funcs
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/range_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/funcs/range_function.h"
namespace phi {
template <typename T>
__global__ void Range(T start, T step, int64_t size, T* out) {
CUDA_KERNEL_LOOP(index, size) { out[index] = start + step * index; }
}
template <typename T, typename Context>
void RangeKernel(const Context& dev_ctx,
const DenseTensor& start,
const DenseTensor& end,
const DenseTensor& step,
DenseTensor* out) {
T start_value = start.data<T>()[0];
T end_value = end.data<T>()[0];
T step_value = step.data<T>()[0];
int64_t size = 0;
phi::funcs::GetSize(start_value, end_value, step_value, &size);
out->Resize(phi::make_ddim({size}));
T* out_data = dev_ctx.template Alloc<T>(out);
auto stream = dev_ctx.stream();
int block = std::min(size, static_cast<int64_t>(256));
int grid = (size + block - 1) / block;
Range<T><<<grid, block, 0, stream>>>(start_value, step_value, size, out_data);
}
} // namespace phi
PD_REGISTER_KERNEL(
range, GPU, ALL_LAYOUT, phi::RangeKernel, float, double, int64_t, int) {
kernel->InputAt(0).SetBackend(phi::Backend::CPU);
kernel->InputAt(1).SetBackend(phi::Backend::CPU);
kernel->InputAt(2).SetBackend(phi::Backend::CPU);
}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/stack_grad_kernel.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename IntType>
__global__ void UnStackHelperCUDAKernel(const T* __restrict__ input,
int pre_dim_size,
int split_dim_size,
int suf_dim_size,
int num_split,
T** output_ptrs) {
assert(blockDim.y == 1);
assert(blockDim.z == 1);
// In this case they are equal
assert(split_dim_size % num_split == 0);
IntType size = pre_dim_size * split_dim_size * suf_dim_size;
IntType each_dim_size = split_dim_size / num_split;
for (IntType offset = blockIdx.x * blockDim.x + threadIdx.x; offset < size;
offset += blockDim.x * gridDim.x) {
IntType i = offset / (split_dim_size * suf_dim_size);
IntType j = (offset % (split_dim_size * suf_dim_size)) / suf_dim_size;
IntType k = offset % suf_dim_size;
T* output = output_ptrs[j / each_dim_size];
if (output == nullptr) {
return;
}
IntType output_ind = i * each_dim_size * suf_dim_size +
(j % each_dim_size) * suf_dim_size + k;
*(output + output_ind) = input[offset];
}
}
template <typename T, typename Context>
void StackGradKernel(const Context& dev_ctx,
const DenseTensor& out,
int axis,
std::vector<DenseTensor*> x_grad) {
if (axis < 0) axis += out.dims().size();
int n = out.dims()[axis];
PADDLE_ENFORCE_EQ(n,
x_grad.size(),
phi::errors::InvalidArgument(
"Output x_grad size should be equal to n, but"
" received n is:%d x_grad size is:%d.",
n,
x_grad.size()));
// x_grad is output, so save each data address, then copy each dy into dx_data
std::vector<T*> outputs(n);
for (size_t j = 0; j < x_grad.size(); ++j) {
if (x_grad[j] == nullptr) {
outputs[j] = nullptr;
continue;
}
if (x_grad[j]->numel() != 0UL) {
T* ptr = dev_ctx.template Alloc<T>(x_grad[j]);
outputs[j] = ptr;
} else {
outputs[j] = nullptr;
}
}
auto dy_data = out.data<T>();
// each x_grad should have same shape
int dy_pre = 1, dy_suf = 1;
auto dy_dims = out.dims();
int split_dim = n;
for (int i = 0; i < axis; ++i) {
dy_pre *= dy_dims[i];
}
dy_suf = out.numel() / (split_dim * dy_pre);
auto tmp_out_data =
paddle::memory::Alloc(dev_ctx, outputs.size() * sizeof(T*));
paddle::memory::Copy(dev_ctx.GetPlace(),
tmp_out_data->ptr(),
phi::CPUPlace(),
reinterpret_cast<void*>(outputs.data()),
outputs.size() * sizeof(T*),
dev_ctx.stream());
auto config = phi::backends::gpu::GetGpuLaunchConfig1D(
dev_ctx, dy_pre * split_dim * dy_suf);
if (out.numel() < std::numeric_limits<int32_t>::max()) {
UnStackHelperCUDAKernel<T, int32_t><<<config.block_per_grid.x,
config.thread_per_block.x,
0,
dev_ctx.stream()>>>(
dy_data,
dy_pre,
split_dim,
dy_suf,
split_dim,
reinterpret_cast<T**>(tmp_out_data->ptr()));
} else {
UnStackHelperCUDAKernel<T, int64_t><<<config.block_per_grid.x,
config.thread_per_block.x,
0,
dev_ctx.stream()>>>(
dy_data,
dy_pre,
split_dim,
dy_suf,
split_dim,
reinterpret_cast<T**>(tmp_out_data->ptr()));
}
}
} // namespace phi
PD_REGISTER_KERNEL(stack_grad,
GPU,
ALL_LAYOUT,
phi::StackGradKernel,
float,
double,
int64_t,
int,
phi::dtype::float16,
phi::dtype::bfloat16) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/stack_kernel.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename IntType>
__global__ void StackCUDAKernel(T** input_ptrs,
int split_size,
int rows,
int cols,
T* __restrict__ output) {
IntType grid_x = blockIdx.x * blockDim.x + threadIdx.x;
for (; grid_x < cols; grid_x += blockDim.x * gridDim.x) {
IntType grid_y = blockIdx.y * blockDim.y + threadIdx.y;
IntType split = grid_x / split_size;
const T* input_ptr = input_ptrs[split];
IntType col_offset = grid_x % split_size;
#pragma unroll
for (; grid_y < rows; grid_y += blockDim.y * gridDim.y) {
output[grid_y * cols + grid_x] =
input_ptr[grid_y * split_size + col_offset];
}
}
}
template <typename T, typename Context>
void StackKernel(const Context& dev_ctx,
const std::vector<const DenseTensor*>& x,
int axis,
DenseTensor* out) {
if (axis < 0) axis += (x[0]->dims().size() + 1);
int n = static_cast<int>(x.size());
T* y_data = dev_ctx.template Alloc<T>(out);
std::vector<const T*> x_datas(n);
for (int i = 0; i < n; i++) {
x_datas[i] = x[i]->data<T>();
}
auto tmp_x_data = paddle::memory::Alloc(dev_ctx, x_datas.size() * sizeof(T*));
paddle::memory::Copy(dev_ctx.GetPlace(),
tmp_x_data->ptr(),
phi::CPUPlace(),
reinterpret_cast<void*>(x_datas.data()),
x_datas.size() * sizeof(T*),
dev_ctx.stream());
// Split x dim from axis to matrix
int x_row = 1, x_col = 1;
for (int i = 0; i < axis; ++i) {
x_row *= x[0]->dims()[i];
}
x_col = x[0]->numel() / x_row;
int out_col = x_col * n;
auto config =
phi::backends::gpu::GetGpuLaunchConfig2D(dev_ctx, out_col, x_row);
if (out->numel() < std::numeric_limits<int32_t>::max()) {
StackCUDAKernel<T, int32_t><<<config.block_per_grid,
config.thread_per_block,
0,
dev_ctx.stream()>>>(
reinterpret_cast<T**>(tmp_x_data->ptr()),
x_col,
x_row,
out_col,
y_data);
} else {
StackCUDAKernel<T, int64_t><<<config.block_per_grid,
config.thread_per_block,
0,
dev_ctx.stream()>>>(
reinterpret_cast<T**>(tmp_x_data->ptr()),
x_col,
x_row,
out_col,
y_data);
}
}
} // namespace phi
PD_REGISTER_KERNEL(stack,
GPU,
ALL_LAYOUT,
phi::StackKernel,
float,
double,
int64_t,
int,
phi::dtype::float16,
phi::dtype::bfloat16) {}
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/unique_kernel.h"
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <thrust/adjacent_difference.h> #include <thrust/adjacent_difference.h>
#include <thrust/device_vector.h> #include <thrust/device_vector.h>
#include <thrust/execution_policy.h> #include <thrust/execution_policy.h>
...@@ -21,12 +24,12 @@ limitations under the License. */ ...@@ -21,12 +24,12 @@ limitations under the License. */
#include <iostream> #include <iostream>
#include <vector> #include <vector>
#include "paddle/fluid/framework/tensor_util.h" // TensorToVector() #include "paddle/fluid/framework/tensor_util.h" // TensorToVector()
#include "paddle/fluid/operators/unique_op.h" // TransComute() #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
namespace paddle { #include "paddle/phi/kernels/copy_kernel.h"
namespace operators { #include "paddle/phi/kernels/funcs/unique_functor.h"
using Tensor = framework::Tensor; namespace phi {
// Binary function 'less than' // Binary function 'less than'
template <typename InT> template <typename InT>
...@@ -93,10 +96,12 @@ struct BinaryNotEqual { ...@@ -93,10 +96,12 @@ struct BinaryNotEqual {
} }
}; };
// index_select() function for Tensor // index_select() function for DenseTensor
template <typename InT, typename IndexT> template <typename Context, typename InT, typename IndexT>
void IndexSelect(const framework::ExecutionContext& context, void IndexSelect(const Context& context,
const Tensor& input, const Tensor& index, Tensor* output, const DenseTensor& input,
const DenseTensor& index,
DenseTensor* output,
int dim) { int dim) {
auto input_dim = input.dims(); auto input_dim = input.dims();
auto input_dim_size = input_dim.size(); auto input_dim_size = input_dim.size();
...@@ -119,27 +124,29 @@ void IndexSelect(const framework::ExecutionContext& context, ...@@ -119,27 +124,29 @@ void IndexSelect(const framework::ExecutionContext& context,
std::vector<InT> input_vec; std::vector<InT> input_vec;
std::vector<IndexT> index_vec; std::vector<IndexT> index_vec;
paddle::framework::TensorToVector(input, context.device_context(), paddle::framework::TensorToVector(input, context, &input_vec);
&input_vec); paddle::framework::TensorToVector(index, context, &index_vec);
paddle::framework::TensorToVector(index, context.device_context(),
&index_vec);
std::vector<InT> out_vec(output->numel()); std::vector<InT> out_vec(output->numel());
for (int i = 0; i < index_size; i++) { for (int i = 0; i < index_size; i++) {
PADDLE_ENFORCE_GE( PADDLE_ENFORCE_GE(
index_vec[i], 0, index_vec[i],
platform::errors::InvalidArgument( 0,
phi::errors::InvalidArgument(
"Variable value (index) of OP(index_select) " "Variable value (index) of OP(index_select) "
"expected >= 0 and < %ld, but got %ld. Please check input " "expected >= 0 and < %ld, but got %ld. Please check input "
"value.", "value.",
input_dim[dim], index_vec[i])); input_dim[dim],
index_vec[i]));
PADDLE_ENFORCE_LT( PADDLE_ENFORCE_LT(
index_vec[i], input_dim[dim], index_vec[i],
platform::errors::InvalidArgument( input_dim[dim],
phi::errors::InvalidArgument(
"Variable value (index) of OP(index_select) " "Variable value (index) of OP(index_select) "
"expected >= 0 and < %ld, but got %ld. Please check input " "expected >= 0 and < %ld, but got %ld. Please check input "
"value.", "value.",
input_dim[dim], index_vec[i])); input_dim[dim],
index_vec[i]));
} }
for (auto i = 0; i < outer_nums; i++) { for (auto i = 0; i < outer_nums; i++) {
...@@ -154,150 +161,193 @@ void IndexSelect(const framework::ExecutionContext& context, ...@@ -154,150 +161,193 @@ void IndexSelect(const framework::ExecutionContext& context,
} }
} }
} }
output->mutable_data<InT>(context.GetPlace()); context.template Alloc<IndexT>(output);
framework::TensorFromVector(out_vec, context.device_context(), output); paddle::framework::TensorFromVector(out_vec, context, output);
output->Resize(output_dim); output->Resize(output_dim);
} }
// The core logic of computing Unique for a flattend Tensor // The core logic of computing Unique for a flattend DenseTensor
template <typename InT, typename IndexT, typename equal_T, typename not_equal_T> template <typename Context,
static void UniqueFlattendCUDATensor(const framework::ExecutionContext& context, typename InT,
const Tensor& in, Tensor* out, typename IndexT,
bool return_index, bool return_inverse, typename equal_T,
bool return_counts, equal_T equal, typename not_equal_T>
not_equal_T not_equal, int64_t num_input) { static void UniqueFlattendCUDATensor(const Context& context,
const DenseTensor& in,
DenseTensor* out,
DenseTensor* indices,
DenseTensor* index,
DenseTensor* counts,
bool return_index,
bool return_inverse,
bool return_counts,
equal_T equal,
not_equal_T not_equal,
int64_t num_input) {
// 0. Prepration // 0. Prepration
Tensor in_hat; DenseTensor in_hat;
framework::TensorCopy(in, context.GetPlace(), &in_hat); phi::Copy(context, in, context.GetPlace(), false, &in_hat);
auto in_data_hat = in_hat.mutable_data<InT>(context.GetPlace()); auto* in_data_hat = context.template Alloc<InT>(&in_hat);
Tensor* sorted_indices = context.Output<Tensor>("Indices"); indices->Resize(phi::make_ddim({num_input}));
sorted_indices->Resize(phi::make_ddim({num_input})); auto* indices_data = context.template Alloc<IndexT>(indices);
auto sorted_indices_data =
sorted_indices->mutable_data<IndexT>(context.GetPlace()); thrust::sequence(thrust::device, indices_data, indices_data + num_input);
thrust::sequence(thrust::device, sorted_indices_data, thrust::sort_by_key(
sorted_indices_data + num_input); thrust::device, in_data_hat, in_data_hat + num_input, indices_data);
thrust::sort_by_key(thrust::device, in_data_hat, in_data_hat + num_input,
sorted_indices_data);
// 1. Calculate op result: 'out' // 1. Calculate op result: 'out'
Tensor range; DenseTensor range;
range.Resize(phi::make_ddim({num_input + 1})); range.Resize(phi::make_ddim({num_input + 1}));
auto range_data_ptr = range.mutable_data<IndexT>(context.GetPlace()); auto* range_data_ptr = context.template Alloc<IndexT>(&range);
thrust::sequence(thrust::device, range_data_ptr, thrust::sequence(
range_data_ptr + num_input + 1); thrust::device, range_data_ptr, range_data_ptr + num_input + 1);
framework::TensorCopy(in_hat, context.GetPlace(), out); phi::Copy(context, in_hat, context.GetPlace(), false, out);
int num_out; int num_out;
auto out_data = out->mutable_data<InT>(context.GetPlace()); auto out_data = context.template Alloc<InT>(out);
num_out = thrust::unique_by_key(thrust::device, out_data, num_out =
out_data + num_input, range_data_ptr, equal) thrust::unique_by_key(
.first - thrust::device, out_data, out_data + num_input, range_data_ptr, equal)
out_data; .first -
out_data;
out->Resize(phi::make_ddim({num_out})); out->Resize(phi::make_ddim({num_out}));
// 3. Calculate inverse index: 'inverse' // 3. Calculate inverse index: 'inverse'
if (return_inverse) { if (return_inverse) {
Tensor* inverse = context.Output<Tensor>("Index"); index->Resize(phi::make_ddim({num_input}));
inverse->Resize(phi::make_ddim({num_input})); auto* inverse_data = context.template Alloc<IndexT>(index);
auto inverse_data = inverse->mutable_data<IndexT>(context.GetPlace()); DenseTensor inv_loc;
Tensor inv_loc;
inv_loc.Resize(phi::make_ddim({num_input})); inv_loc.Resize(phi::make_ddim({num_input}));
auto inv_loc_data_ptr = inv_loc.mutable_data<IndexT>(context.GetPlace()); auto inv_loc_data_ptr = context.template Alloc<IndexT>(&inv_loc);
thrust::adjacent_difference(thrust::device, in_data_hat, thrust::adjacent_difference(thrust::device,
in_data_hat + num_input, inv_loc_data_ptr, in_data_hat,
in_data_hat + num_input,
inv_loc_data_ptr,
not_equal); not_equal);
thrust::device_ptr<IndexT> inv_loc_data_dev(inv_loc_data_ptr); thrust::device_ptr<IndexT> inv_loc_data_dev(inv_loc_data_ptr);
inv_loc_data_dev[0] = 0; // without device_ptr, segmentation fault inv_loc_data_dev[0] = 0; // without device_ptr, segmentation fault
thrust::inclusive_scan(thrust::device, inv_loc_data_ptr, thrust::inclusive_scan(thrust::device,
inv_loc_data_ptr + num_input, inv_loc_data_ptr); inv_loc_data_ptr,
thrust::scatter(thrust::device, inv_loc_data_ptr, inv_loc_data_ptr + num_input,
inv_loc_data_ptr + num_input, sorted_indices_data, inv_loc_data_ptr);
thrust::scatter(thrust::device,
inv_loc_data_ptr,
inv_loc_data_ptr + num_input,
indices_data,
inverse_data); inverse_data);
} }
// 2. Calculate sorted index: 'sorted_indices' // 2. Calculate sorted index: 'indices'
if (return_index) { if (return_index) {
Tensor indices; DenseTensor tmp_indices;
indices.Resize(phi::make_ddim({num_input})); tmp_indices.Resize(phi::make_ddim({num_input}));
auto indices_data_ptr = indices.mutable_data<IndexT>(context.GetPlace()); auto* tmp_indices_data_ptr = context.template Alloc<IndexT>(&tmp_indices);
thrust::copy(thrust::device, in_data_hat, in_data_hat + num_input, thrust::copy(thrust::device,
indices_data_ptr); in_data_hat,
thrust::unique_by_key(thrust::device, indices_data_ptr, in_data_hat + num_input,
indices_data_ptr + num_input, sorted_indices_data, tmp_indices_data_ptr);
thrust::unique_by_key(thrust::device,
tmp_indices_data_ptr,
tmp_indices_data_ptr + num_input,
indices_data,
equal); equal);
sorted_indices->Resize(phi::make_ddim({num_out})); indices->Resize(phi::make_ddim({num_out}));
} }
// 4. Calculate 'counts' // 4. Calculate 'counts'
if (return_counts) { if (return_counts) {
Tensor* counts = context.Output<Tensor>("Counts");
counts->Resize(phi::make_ddim({num_out})); counts->Resize(phi::make_ddim({num_out}));
auto count_data = counts->mutable_data<IndexT>(context.GetPlace()); auto count_data = context.template Alloc<IndexT>(counts);
// init 'count_data' as 0 // init 'count_data' as 0
thrust::fill(thrust::device, count_data, count_data + num_out, 0); thrust::fill(thrust::device, count_data, count_data + num_out, 0);
thrust::device_ptr<IndexT> range_data_ptr_dev(range_data_ptr); thrust::device_ptr<IndexT> range_data_ptr_dev(range_data_ptr);
range_data_ptr_dev[num_out] = num_input; range_data_ptr_dev[num_out] = num_input;
thrust::adjacent_difference(thrust::device, range_data_ptr + 1, thrust::adjacent_difference(thrust::device,
range_data_ptr + num_out + 1, count_data); range_data_ptr + 1,
range_data_ptr + num_out + 1,
count_data);
} }
} }
// The logic of compute unique with axis required, it's a little different // The logic of compute unique with axis required, it's a little different
// from above function // from above function
template <typename InT, typename IndexT, typename equal_T, typename not_equal_T> template <typename Context,
static void ComputeUniqueDims(const framework::ExecutionContext& context, typename InT,
Tensor* sorted_indices, typename IndexT,
IndexT* sorted_indices_data, Tensor* out, typename equal_T,
bool return_index, bool return_inverse, typename not_equal_T>
bool return_counts, equal_T equal, static void ComputeUniqueDims(const Context& context,
not_equal_T not_equal, int64_t row) { DenseTensor* sorted_indices,
IndexT* sorted_indices_data,
DenseTensor* out,
DenseTensor* inverse,
DenseTensor* counts,
bool return_index,
bool return_inverse,
bool return_counts,
equal_T equal,
not_equal_T not_equal,
int64_t row) {
// 1. inverse indices: 'inverse' // 1. inverse indices: 'inverse'
Tensor* inverse = context.Output<Tensor>("Index");
inverse->Resize(phi::make_ddim({row})); inverse->Resize(phi::make_ddim({row}));
auto inverse_data = inverse->mutable_data<IndexT>(context.GetPlace()); auto* inverse_data = context.template Alloc<IndexT>(inverse);
Tensor inv_loc; DenseTensor inv_loc;
inv_loc.Resize(phi::make_ddim({row})); inv_loc.Resize(phi::make_ddim({row}));
auto inv_loc_data_ptr = inv_loc.mutable_data<IndexT>(context.GetPlace()); auto inv_loc_data_ptr = context.template Alloc<IndexT>(&inv_loc);
thrust::adjacent_difference(thrust::device, sorted_indices_data, thrust::adjacent_difference(thrust::device,
sorted_indices_data + row, inv_loc_data_ptr, sorted_indices_data,
sorted_indices_data + row,
inv_loc_data_ptr,
not_equal); not_equal);
thrust::device_ptr<IndexT> inv_loc_data_dev(inv_loc_data_ptr); thrust::device_ptr<IndexT> inv_loc_data_dev(inv_loc_data_ptr);
inv_loc_data_dev[0] = 0; inv_loc_data_dev[0] = 0;
thrust::inclusive_scan(thrust::device, inv_loc_data_ptr, thrust::inclusive_scan(thrust::device,
inv_loc_data_ptr + row, inv_loc_data_ptr); inv_loc_data_ptr,
thrust::scatter(thrust::device, inv_loc_data_ptr, inv_loc_data_ptr + row, inv_loc_data_ptr + row,
sorted_indices_data, inverse_data); inv_loc_data_ptr);
thrust::scatter(thrust::device,
inv_loc_data_ptr,
inv_loc_data_ptr + row,
sorted_indices_data,
inverse_data);
// 2. sorted indices // 2. sorted indices
Tensor range; DenseTensor range;
range.Resize(phi::make_ddim({row + 1})); range.Resize(phi::make_ddim({row + 1}));
auto range_data_ptr = range.mutable_data<IndexT>(context.GetPlace()); auto range_data_ptr = context.template Alloc<IndexT>(&range);
thrust::sequence(thrust::device, range_data_ptr, range_data_ptr + row + 1); thrust::sequence(thrust::device, range_data_ptr, range_data_ptr + row + 1);
int num_out; int num_out;
num_out = num_out = thrust::unique_by_key(thrust::device,
thrust::unique_by_key(thrust::device, sorted_indices_data, sorted_indices_data,
sorted_indices_data + row, range_data_ptr, equal) sorted_indices_data + row,
.first - range_data_ptr,
sorted_indices_data; equal)
.first -
sorted_indices_data;
thrust::device_ptr<IndexT> range_data_ptr_dev(range_data_ptr); thrust::device_ptr<IndexT> range_data_ptr_dev(range_data_ptr);
range_data_ptr_dev[num_out] = row; range_data_ptr_dev[num_out] = row;
sorted_indices->Resize(phi::make_ddim({num_out})); sorted_indices->Resize(phi::make_ddim({num_out}));
// 3. counts: 'counts' // 3. counts: 'counts'
Tensor* counts = context.Output<Tensor>("Counts");
counts->Resize(phi::make_ddim({num_out})); counts->Resize(phi::make_ddim({num_out}));
auto count_data = counts->mutable_data<IndexT>(context.GetPlace()); auto* count_data = context.template Alloc<IndexT>(counts);
thrust::fill(thrust::device, count_data, count_data + row, 0); thrust::fill(thrust::device, count_data, count_data + row, 0);
thrust::adjacent_difference(thrust::device, range_data_ptr + 1, thrust::adjacent_difference(
range_data_ptr + row + 1, count_data); thrust::device, range_data_ptr + 1, range_data_ptr + row + 1, count_data);
} }
// Calculate unique when 'axis' is set // Calculate unique when 'axis' is set
template <typename DeviceContext, typename InT, typename IndexT> template <typename Context, typename InT, typename IndexT>
static void UniqueDimsCUDATensor(const framework::ExecutionContext& context, static void UniqueDimsCUDATensor(const Context& context,
const Tensor& in, Tensor* out, const DenseTensor& in,
bool return_index, bool return_inverse, DenseTensor* out,
bool return_counts, int axis) { DenseTensor* indices,
DenseTensor* index,
DenseTensor* counts,
bool return_index,
bool return_inverse,
bool return_counts,
int axis) {
// 1. Transpose & reshape // 1. Transpose & reshape
// Transpose tensor: eg. axis=1, [dim0, dim1, dim2] -> [dim1, dim0, dim2] // Transpose tensor: eg. axis=1, [dim0, dim1, dim2] -> [dim1, dim0, dim2]
std::vector<int> permute(in.dims().size()); std::vector<int> permute(in.dims().size());
...@@ -307,19 +357,19 @@ static void UniqueDimsCUDATensor(const framework::ExecutionContext& context, ...@@ -307,19 +357,19 @@ static void UniqueDimsCUDATensor(const framework::ExecutionContext& context,
std::vector<int64_t> in_trans_dims_vec(phi::vectorize(in.dims())); std::vector<int64_t> in_trans_dims_vec(phi::vectorize(in.dims()));
in_trans_dims_vec[axis] = in.dims()[0]; in_trans_dims_vec[axis] = in.dims()[0];
in_trans_dims_vec[0] = in.dims()[axis]; in_trans_dims_vec[0] = in.dims()[axis];
framework::Tensor in_trans; DenseTensor in_trans;
framework::DDim in_trans_dims = phi::make_ddim(in_trans_dims_vec); auto in_trans_dims = phi::make_ddim(in_trans_dims_vec);
in_trans.Resize(in_trans_dims); in_trans.Resize(in_trans_dims);
in_trans.mutable_data<InT>(context.GetPlace()); context.template Alloc<InT>(&in_trans);
auto& dev_ctx = context.cuda_device_context(); phi::funcs::TransCompute<Context, InT>(
TransCompute<DeviceContext, InT>(in.dims().size(), // num of dims in.dims().size(), // num of dims
dev_ctx, // device context, // device
in, // original Tensor in, // original DenseTensor
&in_trans, // Tensor after reshape &in_trans, // DenseTensor after reshape
permute); // index of axis permute); // index of axis
// Reshape tensor: eg. [dim1, dim0, dim2] -> [dim1, dim0*dim2] // Reshape tensor: eg. [dim1, dim0, dim2] -> [dim1, dim0*dim2]
framework::DDim in_trans_flat_dims = phi::flatten_to_2d(in_trans_dims, 1); auto in_trans_flat_dims = phi::flatten_to_2d(in_trans_dims, 1);
in_trans.Resize(in_trans_flat_dims); in_trans.Resize(in_trans_flat_dims);
// now 'in_trans' is 2D // now 'in_trans' is 2D
...@@ -327,87 +377,129 @@ static void UniqueDimsCUDATensor(const framework::ExecutionContext& context, ...@@ -327,87 +377,129 @@ static void UniqueDimsCUDATensor(const framework::ExecutionContext& context,
int64_t row = in_trans.dims()[0]; int64_t row = in_trans.dims()[0];
const InT* in_trans_data = in_trans.data<InT>(); const InT* in_trans_data = in_trans.data<InT>();
Tensor* sorted_indices = context.Output<Tensor>("Indices"); indices->Resize(phi::make_ddim({row}));
sorted_indices->Resize(phi::make_ddim({row})); auto* sorted_indices_data = context.template Alloc<IndexT>(indices);
auto sorted_indices_data =
sorted_indices->mutable_data<IndexT>(context.GetPlace());
// 2. Calculate 'sorted_indices', 'inverse', 'counts' // 2. Calculate 'indices', 'inverse', 'counts'
// Init index and sort // Init index and sort
thrust::sequence(thrust::device, sorted_indices_data, thrust::sequence(
sorted_indices_data + row); thrust::device, sorted_indices_data, sorted_indices_data + row);
thrust::sort(thrust::device, sorted_indices_data, sorted_indices_data + row, thrust::sort(thrust::device,
sorted_indices_data,
sorted_indices_data + row,
LessThan<InT>(col, in_trans_data)); LessThan<InT>(col, in_trans_data));
ComputeUniqueDims<InT, IndexT>( ComputeUniqueDims<Context, InT, IndexT>(
context, sorted_indices, sorted_indices_data, out, return_index, context,
return_inverse, return_counts, BinaryEqual<InT>(col, in_trans_data), indices,
BinaryNotEqual<InT>(col, in_trans_data), row); sorted_indices_data,
out,
index,
counts,
return_index,
return_inverse,
return_counts,
BinaryEqual<InT>(col, in_trans_data),
BinaryNotEqual<InT>(col, in_trans_data),
row);
// 3. Select indices and reshape back to get 'out' // 3. Select indices and reshape back to get 'out'
Tensor out_trans; DenseTensor out_trans;
std::vector<int64_t> out_trans_dims_vec = in_trans_dims_vec; std::vector<int64_t> out_trans_dims_vec = in_trans_dims_vec;
out_trans_dims_vec[0] = sorted_indices->numel(); out_trans_dims_vec[0] = indices->numel();
out_trans.Resize(phi::make_ddim(out_trans_dims_vec)); out_trans.Resize(phi::make_ddim(out_trans_dims_vec));
out_trans.mutable_data<InT>(context.GetPlace()); context.template Alloc<InT>(&out_trans);
IndexSelect<InT, IndexT>(context, in_trans, *sorted_indices, &out_trans, 0); IndexSelect<Context, InT, IndexT>(context, in_trans, *indices, &out_trans, 0);
std::swap(out_trans_dims_vec[0], out_trans_dims_vec[axis]); std::swap(out_trans_dims_vec[0], out_trans_dims_vec[axis]);
out->Resize(phi::make_ddim(out_trans_dims_vec)); out->Resize(phi::make_ddim(out_trans_dims_vec));
out->mutable_data<InT>(context.GetPlace()); context.template Alloc<InT>(out);
std::vector<framework::Tensor> out_trans_unbind = Unbind(out_trans); std::vector<DenseTensor> out_trans_unbind = phi::funcs::Unbind(out_trans);
math::ConcatFunctor<DeviceContext, InT> concat_functor; phi::funcs::ConcatFunctor<Context, InT> concat_functor;
concat_functor(dev_ctx, out_trans_unbind, 0, &out_trans); concat_functor(context, out_trans_unbind, 0, &out_trans);
TransCompute<DeviceContext, InT>(out_trans.dims().size(), dev_ctx, out_trans, phi::funcs::TransCompute<Context, InT>(
out, permute); out_trans.dims().size(), context, out_trans, out, permute);
} }
// functor for processing a flattend Tensor // functor for processing a flattend DenseTensor
template <typename DeviceContext, typename InT> template <typename Context, typename InT>
struct UniqueFlattendCUDAFunctor { struct UniqueFlattendCUDAFunctor {
const framework::ExecutionContext& ctx_; const Context& ctx_;
const Tensor& in_; const DenseTensor& in_;
Tensor* out_; DenseTensor* out_;
DenseTensor* indices_;
DenseTensor* index_;
DenseTensor* counts_;
const bool return_index_; const bool return_index_;
const bool return_inverse_; const bool return_inverse_;
const bool return_counts_; const bool return_counts_;
UniqueFlattendCUDAFunctor(const framework::ExecutionContext& context, UniqueFlattendCUDAFunctor(const Context& context,
const Tensor& in, Tensor* out, bool return_index, const DenseTensor& in,
bool return_inverse, bool return_counts) DenseTensor* out,
DenseTensor* indices,
DenseTensor* index,
DenseTensor* counts,
bool return_index,
bool return_inverse,
bool return_counts)
: ctx_(context), : ctx_(context),
in_(in), in_(in),
out_(out), out_(out),
indices_(indices),
index_(index),
counts_(counts),
return_index_(return_index), return_index_(return_index),
return_inverse_(return_inverse), return_inverse_(return_inverse),
return_counts_(return_counts) {} return_counts_(return_counts) {}
template <typename IndexT> template <typename IndexT>
void apply() const { void apply() const {
UniqueFlattendCUDATensor<InT, IndexT>( UniqueFlattendCUDATensor<Context, InT, IndexT>(ctx_,
ctx_, in_, out_, return_index_, return_inverse_, return_counts_, in_,
thrust::equal_to<InT>(), thrust::not_equal_to<InT>(), in_.numel()); out_,
indices_,
index_,
counts_,
return_index_,
return_inverse_,
return_counts_,
thrust::equal_to<InT>(),
thrust::not_equal_to<InT>(),
in_.numel());
} }
}; };
// functor for processing a multi-dimentional Tensor // functor for processing a multi-dimentional DenseTensor
template <typename DeviceContext, typename InT> template <typename Context, typename InT>
struct UniqueDimsCUDAFunctor { struct UniqueDimsCUDAFunctor {
const framework::ExecutionContext& ctx_; const Context& ctx_;
const Tensor& in_; const DenseTensor& in_;
Tensor* out_; DenseTensor* out_;
DenseTensor* indices_;
DenseTensor* index_;
DenseTensor* counts_;
const int axis_; const int axis_;
const bool return_index_; const bool return_index_;
const bool return_inverse_; const bool return_inverse_;
const bool return_counts_; const bool return_counts_;
UniqueDimsCUDAFunctor(const framework::ExecutionContext& context, UniqueDimsCUDAFunctor(const Context& context,
const Tensor& in, Tensor* out, const int axis, const DenseTensor& in,
bool return_index, bool return_inverse, DenseTensor* out,
DenseTensor* indices,
DenseTensor* index,
DenseTensor* counts,
const int axis,
bool return_index,
bool return_inverse,
bool return_counts) bool return_counts)
: ctx_(context), : ctx_(context),
in_(in), in_(in),
out_(out), out_(out),
indices_(indices),
index_(index),
counts_(counts),
axis_(axis), axis_(axis),
return_index_(return_index), return_index_(return_index),
return_inverse_(return_inverse), return_inverse_(return_inverse),
...@@ -415,60 +507,109 @@ struct UniqueDimsCUDAFunctor { ...@@ -415,60 +507,109 @@ struct UniqueDimsCUDAFunctor {
template <typename IndexT> template <typename IndexT>
void apply() const { void apply() const {
UniqueDimsCUDATensor<DeviceContext, InT, IndexT>( UniqueDimsCUDATensor<Context, InT, IndexT>(ctx_,
ctx_, in_, out_, return_index_, return_inverse_, return_counts_, axis_); in_,
out_,
indices_,
index_,
counts_,
return_index_,
return_inverse_,
return_counts_,
axis_);
} }
}; };
// Unique_op CUDA implementation. template <typename T, typename Context>
template <typename InT> void UniqueRawKernel(const Context& context,
class UniqueKernel<platform::CUDADeviceContext, InT> const DenseTensor& x,
: public framework::OpKernel<InT> { bool return_index,
public: bool return_inverse,
void Compute(const framework::ExecutionContext& context) const override { bool return_counts,
auto* x = context.Input<framework::Tensor>("X"); const std::vector<int>& axis,
auto* out = context.Output<framework::Tensor>("Out"); DataType dtype,
auto data_type = static_cast<framework::proto::VarType::Type>( bool is_sorted,
context.Attr<int>("dtype")); DenseTensor* out,
if (data_type == framework::proto::VarType::INT32) { DenseTensor* indices,
PADDLE_ENFORCE_LE( DenseTensor* index,
x->numel() + 1, INT_MAX, DenseTensor* counts) {
platform::errors::InvalidArgument( if (dtype == phi::DataType::INT32) {
"The number of elements in Input(X) should be less than or " PADDLE_ENFORCE_LE(
"equal to INT_MAX, but received num is %d. Please set `dtype` to " x.numel() + 1,
"int64.", INT_MAX,
x->numel())); phi::errors::InvalidArgument(
} "The number of elements in Input(X) should be less than or "
"equal to INT_MAX, but received num is %d. Please set `dtype` to "
std::vector<int> axis_vec = context.Attr<std::vector<int>>("axis"); "int64.",
bool return_index = context.Attr<bool>("return_index"); x.numel()));
bool return_inverse = context.Attr<bool>("return_inverse");
bool return_counts = context.Attr<bool>("return_counts");
// if 'axis' is not required, flatten the Tensor.
if (axis_vec.empty()) {
framework::VisitDataTypeTiny(
data_type,
UniqueFlattendCUDAFunctor<platform::CUDADeviceContext, InT>(
context, *x, out, return_index, return_inverse, return_counts));
} else {
// 'axis' is required.
int axis = axis_vec[0];
framework::VisitDataTypeTiny(
data_type, UniqueDimsCUDAFunctor<platform::CUDADeviceContext, InT>(
context, *x, out, axis, return_index, return_inverse,
return_counts));
}
} }
}; // if 'axis' is not required, flatten the DenseTensor.
if (axis.empty()) {
phi::VisitDataTypeTiny(
dtype,
UniqueFlattendCUDAFunctor<Context, T>(context,
x,
out,
indices,
index,
counts,
return_index,
return_inverse,
return_counts));
} else {
// 'axis' is required.
int axis_value = axis[0];
phi::VisitDataTypeTiny(dtype,
UniqueDimsCUDAFunctor<Context, T>(context,
x,
out,
indices,
index,
counts,
axis_value,
return_index,
return_inverse,
return_counts));
}
}
template <typename T, typename Context>
void UniqueKernel(const Context& context,
const DenseTensor& x,
bool return_index,
bool return_inverse,
bool return_counts,
const std::vector<int>& axis,
DataType dtype,
DenseTensor* out,
DenseTensor* indices,
DenseTensor* index,
DenseTensor* counts) {
bool is_sorted = true;
UniqueRawKernel<T, Context>(context,
x,
return_index,
return_inverse,
return_counts,
axis,
dtype,
is_sorted,
out,
indices,
index,
counts);
}
} // namespace operators } // namespace phi
} // namespace paddle
namespace ops = paddle::operators; PD_REGISTER_KERNEL(
unique, GPU, ALL_LAYOUT, phi::UniqueKernel, float, double, int64_t, int) {}
REGISTER_OP_CUDA_KERNEL( PD_REGISTER_KERNEL(unique_raw,
unique, ops::UniqueKernel<paddle::platform::CUDADeviceContext, float>, GPU,
ops::UniqueKernel<paddle::platform::CUDADeviceContext, double>, ALL_LAYOUT,
ops::UniqueKernel<paddle::platform::CUDADeviceContext, int32_t>, phi::UniqueRawKernel,
ops::UniqueKernel<paddle::platform::CUDADeviceContext, int64_t>); float,
double,
int64_t,
int) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/unstack_grad_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/unstack_grad_kernel_impl.h"
PD_REGISTER_KERNEL(unstack_grad,
GPU,
ALL_LAYOUT,
phi::UnStackGradKernel,
float,
double,
int64_t,
int,
phi::dtype::float16) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/unstack_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/unstack_kernel_impl.h"
PD_REGISTER_KERNEL(unstack,
GPU,
ALL_LAYOUT,
phi::UnStackKernel,
float,
double,
int64_t,
int,
phi::dtype::float16) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/stack_functor.h"
#if defined(__NVCC__) || defined(__HIPCC__)
#include <thrust/device_vector.h>
#endif
namespace phi {
template <typename T, typename Context>
void UnStackGradKernel(const Context &dev_ctx,
const std::vector<const DenseTensor *> &x,
int axis,
DenseTensor *x_grad) {
if (axis < 0) axis += (x[0]->dims().size() + 1);
int n = static_cast<int>(x.size());
auto *x_grad_data = dev_ctx.template Alloc<T>(x_grad);
std::vector<const T *> x_datas(n);
for (int i = 0; i < n; i++) x_datas[i] = x[i]->data<T>();
int pre = 1;
int post = 1;
auto &dim = x[0]->dims();
for (auto i = 0; i < axis; ++i) pre *= dim[i];
for (auto i = axis; i < dim.size(); ++i) post *= dim[i];
#if defined(__NVCC__) || defined(__HIPCC__)
int total_num = pre * n * post;
thrust::device_vector<const T *> device_x_vec(x_datas);
auto x_data_arr = device_x_vec.data().get();
phi::funcs::StackFunctorForRange(
dev_ctx, x_data_arr, x_grad_data, total_num, n, post);
// Wait() must be called because device_x_vec may be destructed before
// kernel ends
dev_ctx.Wait();
#else
auto x_data_arr = x_datas.data();
size_t x_offset = 0;
size_t y_offset = 0;
for (int i = 0; i < pre; i++) {
for (int j = 0; j < n; j++) {
std::memcpy(
x_grad_data + y_offset, x_data_arr[j] + x_offset, post * sizeof(T));
y_offset += post;
}
x_offset += post;
}
#endif
}
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/stack_functor.h"
#if defined(__NVCC__) || defined(__HIPCC__)
#include <thrust/device_vector.h>
#endif
namespace phi {
template <typename T, typename Context>
void UnStackKernel(const Context &dev_ctx,
const DenseTensor &x,
int axis,
int num,
std::vector<DenseTensor *> outs) {
auto *dy = &x;
auto dx = outs;
if (axis < 0) axis += dy->dims().size();
int n = dy->dims()[axis];
std::vector<T *> dx_datas(n); // NOLINT
for (int i = 0; i < n; i++) {
dx_datas[i] = dev_ctx.template Alloc<T>(dx[i]);
}
auto dy_data = dy->data<T>();
if (dy->numel() == 0) return;
int pre = 1;
for (int i = 0; i < axis; ++i) pre *= dy->dims()[i];
int total_num = dy->numel();
int post = total_num / (n * pre);
#if defined(__NVCC__) || defined(__HIPCC__)
thrust::device_vector<T *> device_dx_vec(dx_datas);
auto dx_data_arr = device_dx_vec.data().get();
#else
auto dx_data_arr = dx_datas.data();
#endif
phi::funcs::StackGradFunctorForRange(
dev_ctx, dx_data_arr, dy_data, total_num, n, post);
#if defined(__NVCC__) || defined(__HIPCC__)
// Wait() must be called because device_dx_vec may be destructed before
// kernel ends
dev_ctx.Wait();
#endif
}
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void RangeKernel(const Context& dev_ctx,
const DenseTensor& start,
const DenseTensor& end,
const DenseTensor& step,
DenseTensor* out);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void StackGradKernel(const Context& dev_ctx,
const DenseTensor& out,
int axis,
std::vector<DenseTensor*> x_grad);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void StackKernel(const Context& dev_ctx,
const std::vector<const DenseTensor*>& x,
int axis,
DenseTensor* out);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void UniqueKernel(const Context& dev_ctx,
const DenseTensor& x,
bool return_index,
bool return_inverse,
bool return_counts,
const std::vector<int>& axis,
DataType dtype,
DenseTensor* out,
DenseTensor* indices,
DenseTensor* index,
DenseTensor* counts);
template <typename T, typename Context>
void UniqueRawKernel(const Context& dev_ctx,
const DenseTensor& x,
bool return_index,
bool return_inverse,
bool return_counts,
const std::vector<int>& axis,
DataType dtype,
bool is_sorted,
DenseTensor* out,
DenseTensor* indices,
DenseTensor* index,
DenseTensor* counts);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void UnStackGradKernel(const Context& dev_ctx,
const std::vector<const DenseTensor*>& x,
int axis,
DenseTensor* x_grad);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void UnStackKernel(const Context& dev_ctx,
const DenseTensor& x,
int axis,
int num,
std::vector<DenseTensor*> outs);
} // namespace phi
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature StackGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
return KernelSignature(
"stack_grad", {GradVarName("Y")}, {"axis"}, {GradVarName("X")});
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(stack_grad, phi::StackGradOpArgumentMapping);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature UniqueOpArgumentMapping(const ArgumentMappingContext& ctx) {
bool is_sorted = paddle::any_cast<bool>(ctx.Attr("is_sorted"));
if (is_sorted) {
return KernelSignature(
"unique",
{"X"},
{"return_index", "return_inverse", "return_counts", "axis", "dtype"},
{"Out", "Indices", "Index", "Counts"});
} else {
return KernelSignature("unique_raw",
{"X"},
{"return_index",
"return_inverse",
"return_counts",
"axis",
"dtype",
"is_sorted"},
{"Out", "Indices", "Index", "Counts"});
}
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(unique, phi::UniqueOpArgumentMapping);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature UnStackGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature(
"unstack_grad", {GradVarName("Y")}, {"axis"}, {GradVarName("X")});
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(unstack_grad, phi::UnStackGradOpArgumentMapping);
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册