From f147fc9987df03a481c1b5a1eb641ef983cbf2e9 Mon Sep 17 00:00:00 2001 From: Huihuang Zheng Date: Fri, 31 Dec 2021 11:00:27 +0800 Subject: [PATCH] Put_along_axis (based on PR #37921 by Xu Huang) (#38608) Paddle new APIs: put_along_axis. Xu Huang is on holiday so we created this PR to work on it. It is based on his PR: https://github.com/PaddlePaddle/Paddle/pull/37921 --- .../fluid/operators/gather_scatter_kernel.cc | 60 +++++- .../fluid/operators/gather_scatter_kernel.cu | 85 +++++++- .../fluid/operators/gather_scatter_kernel.h | 25 +++ paddle/fluid/operators/put_along_axis_op.cc | 138 +++++++++++++ paddle/fluid/operators/put_along_axis_op.cu | 134 ++++++++++++ paddle/fluid/operators/put_along_axis_op.h | 124 ++++++++++++ paddle/fluid/operators/take_along_axis_op.cu | 1 - python/paddle/__init__.py | 3 + .../paddle/fluid/tests/unittests/op_test.py | 4 +- .../tests/unittests/test_put_along_axis_op.py | 191 ++++++++++++++++++ .../unittests/test_take_along_axis_op.py | 2 +- python/paddle/tensor/__init__.py | 4 + python/paddle/tensor/manipulation.py | 104 +++++++++- 13 files changed, 858 insertions(+), 17 deletions(-) create mode 100644 paddle/fluid/operators/put_along_axis_op.cc create mode 100644 paddle/fluid/operators/put_along_axis_op.cu create mode 100644 paddle/fluid/operators/put_along_axis_op.h create mode 100644 python/paddle/fluid/tests/unittests/test_put_along_axis_op.py diff --git a/paddle/fluid/operators/gather_scatter_kernel.cc b/paddle/fluid/operators/gather_scatter_kernel.cc index 6c2ce23c368..285e857e976 100644 --- a/paddle/fluid/operators/gather_scatter_kernel.cc +++ b/paddle/fluid/operators/gather_scatter_kernel.cc @@ -34,9 +34,17 @@ class ReduceAdd { *self_data += *src_data; } }; - static ReduceAdd reduce_add; +class ReduceMultiply { + public: + template + void operator()(tensor_t* self_data, tensor_t* src_data) const { + *self_data *= *src_data; + } +}; +static ReduceMultiply reduce_mul; + template struct cpu_gather_scatter_functor { @@ -75,7 +83,6 @@ struct cpu_gather_scatter_functor { for (int i = dim + 1; i < index_dims.size(); i++) { outer_dim_size *= index_dims[i]; } - int64_t index_idx = 0; int64_t self_idx, src_idx; @@ -141,8 +148,55 @@ void cpu_scatter_add_kernel(Tensor self, int dim, const Tensor& index, self, dim, index, src, "scatter_add_cpu", reduce_add, ctx); } +template +void cpu_scatter_mul_kernel(Tensor self, int dim, const Tensor& index, + Tensor src, const platform::DeviceContext& ctx) { + cpu_gather_scatter_functor()( + self, dim, index, src, "scatter_mul_cpu", reduce_mul, ctx); +} + +template +void cpu_scatter_input_grad_kernel(Tensor self, int dim, const Tensor& index, + Tensor output, + const platform::DeviceContext& ctx) { + auto* index_data = index.data(); + auto* output_data = output.data(); + + auto index_dims = index.dims(); + auto output_dims = output.dims(); + + int64_t inner_dim_size = 1; + int64_t outer_dim_size = 1; + int select_dim_size = index_dims[dim]; + int output_select_dim_size = output_dims[dim]; + for (int64_t i = 0; i < dim; ++i) { + inner_dim_size *= index_dims[i]; + } + + for (int i = dim + 1; i < index_dims.size(); i++) { + outer_dim_size *= index_dims[i]; + } + + int64_t index_idx = 0; + for (int64_t i = 0; i < inner_dim_size; i++) { + for (int64_t j = 0; j < select_dim_size; j++) { + for (int64_t k = 0; k < outer_dim_size; k++) { + int64_t index = index_data[index_idx]; + int64_t replace_index = k + index * outer_dim_size + + i * outer_dim_size * output_select_dim_size; + output_data[replace_index] = 0; + index_idx++; + } + } + 
} +} + Instantiate_Template_Function(cpu_gather_kernel) - Instantiate_Template_Function(cpu_scatter_add_kernel) + Instantiate_Template_Function(cpu_scatter_assign_kernel) + Instantiate_Template_Function(cpu_scatter_add_kernel) + Instantiate_Template_Function(cpu_scatter_mul_kernel) + Instantiate_Template_Function(cpu_scatter_input_grad_kernel) } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/gather_scatter_kernel.cu b/paddle/fluid/operators/gather_scatter_kernel.cu index b94001a56db..dc87fc52aac 100644 --- a/paddle/fluid/operators/gather_scatter_kernel.cu +++ b/paddle/fluid/operators/gather_scatter_kernel.cu @@ -45,6 +45,16 @@ class ReduceAdd { }; static ReduceAdd reduce_add; +class ReduceMul { + public: + template + __device__ void operator()(tensor_t* self_data, tensor_t* src_data) const { + *self_data *= *src_data; + // TODO(huangxu96) platform::CudaAtomicMul(*self_data, *src_data); + } +}; +static ReduceMul reduce_mul; + template __global__ void GatherScatterGPUKernel( @@ -141,6 +151,14 @@ void gpu_gather_kernel(Tensor self, int dim, const Tensor& index, Tensor result, return; } +template +void gpu_scatter_assign_kernel(Tensor self, int dim, const Tensor& index, + Tensor src, const platform::DeviceContext& ctx) { + gpu_gather_scatter_functor()( + self, dim, index, src, "scatter_assign_gpu", tensor_assign, ctx); +} + template void gpu_scatter_add_kernel(Tensor self, int dim, const Tensor& index, Tensor src, const platform::DeviceContext& ctx) { @@ -149,9 +167,72 @@ void gpu_scatter_add_kernel(Tensor self, int dim, const Tensor& index, self, dim, index, src, "scatter_add_gpu", reduce_add, ctx); } -namespace plat = paddle::platform; +template +void gpu_scatter_mul_kernel(Tensor self, int dim, const Tensor& index, + Tensor src, const platform::DeviceContext& ctx) { + gpu_gather_scatter_functor()( + self, dim, index, src, "scatter_mul_gpu", reduce_mul, ctx); +} + +template +__global__ void ScatterInputGradGPUKernel( + tensor_t* grad_data, int dim, const index_t* index_data, + int64_t inner_dim_size, int select_dim_size, int grad_select_dim_size, + int64_t outer_dim_size, int64_t numel) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid >= numel) return; + int64_t i, j, k; + i = tid / (select_dim_size * outer_dim_size); + int64_t remind = tid % (select_dim_size * outer_dim_size); + j = remind / outer_dim_size; + k = remind % outer_dim_size; + index_t index = index_data[tid]; + int64_t replace_index = + k + index * outer_dim_size + i * outer_dim_size * grad_select_dim_size; + grad_data[replace_index] = 0; +} +template +void gpu_scatter_input_grad_kernel(Tensor self, int dim, const Tensor& index, + Tensor grad, + const platform::DeviceContext& ctx) { + auto* index_data = index.data(); + auto* grad_data = grad.data(); + + auto index_dims = index.dims(); + auto grad_dims = grad.dims(); + int64_t index_size = index.numel(); + + int64_t inner_dim_size = 1; + int64_t outer_dim_size = 1; + int select_dim_size = index_dims[dim]; + int grad_select_dim_size = grad_dims[dim]; + for (int64_t i = 0; i < dim; ++i) { + inner_dim_size *= index_dims[i]; + } + + for (int i = dim + 1; i < index_dims.size(); i++) { + outer_dim_size *= index_dims[i]; + } + + int64_t slice_size = 1; + for (int i = 1; i < grad_dims.size(); ++i) slice_size *= grad_dims[i]; + + int block = 512; + int64_t n = slice_size * index_size; + int64_t grid = (n + block - 1) / block; + auto stream = + reinterpret_cast(ctx).stream(); + + ScatterInputGradGPUKernel<<>>( + grad_data, dim, index_data, 
inner_dim_size, select_dim_size, + grad_select_dim_size, outer_dim_size, index_size); +} Instantiate_Template_Function(gpu_gather_kernel) - Instantiate_Template_Function(gpu_scatter_add_kernel) + Instantiate_Template_Function(gpu_scatter_assign_kernel) + Instantiate_Template_Function(gpu_scatter_add_kernel) + Instantiate_Template_Function(gpu_scatter_mul_kernel) + Instantiate_Template_Function(gpu_scatter_input_grad_kernel) } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/gather_scatter_kernel.h b/paddle/fluid/operators/gather_scatter_kernel.h index 28eea5f2a03..1cbc1896918 100644 --- a/paddle/fluid/operators/gather_scatter_kernel.h +++ b/paddle/fluid/operators/gather_scatter_kernel.h @@ -41,17 +41,42 @@ template void cpu_gather_kernel(Tensor self, int dim, const Tensor& index, Tensor result, const platform::DeviceContext& ctx); +template +void cpu_scatter_assign_kernel(Tensor self, int dim, const Tensor& index, + Tensor src, const platform::DeviceContext& ctx); + template void cpu_scatter_add_kernel(Tensor self, int dim, const Tensor& index, Tensor src, const platform::DeviceContext& ctx); +template +void cpu_scatter_mul_kernel(Tensor self, int dim, const Tensor& index, + Tensor src, const platform::DeviceContext& ctx); + +template +void cpu_scatter_input_grad_kernel(Tensor self, int dim, const Tensor& index, + Tensor result, + const platform::DeviceContext& ctx); + template void gpu_gather_kernel(Tensor self, int dim, const Tensor& index, Tensor result, const platform::DeviceContext& ctx); +template +void gpu_scatter_assign_kernel(Tensor self, int dim, const Tensor& index, + Tensor src, const platform::DeviceContext& ctx); + template void gpu_scatter_add_kernel(Tensor self, int dim, const Tensor& index, Tensor src, const platform::DeviceContext& ctx); +template +void gpu_scatter_mul_kernel(Tensor self, int dim, const Tensor& index, + Tensor src, const platform::DeviceContext& ctx); + +template +void gpu_scatter_input_grad_kernel(Tensor self, int dim, const Tensor& index, + Tensor result, + const platform::DeviceContext& ctx); } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/put_along_axis_op.cc b/paddle/fluid/operators/put_along_axis_op.cc new file mode 100644 index 00000000000..35570486727 --- /dev/null +++ b/paddle/fluid/operators/put_along_axis_op.cc @@ -0,0 +1,138 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/put_along_axis_op.h" +#include +#include +#include +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace operators { + +class PutAlongAxisOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "PutAlongAxis"); + OP_INOUT_CHECK(ctx->HasInput("Index"), "Input", "Index", "PutAlongAxis"); + OP_INOUT_CHECK(ctx->HasInput("Value"), "Input", "Value", "PutAlongAxis"); + OP_INOUT_CHECK(ctx->HasOutput("Result"), "Output", "Result", + "PutAlongAxis"); + + auto index_dim = ctx->GetInputDim("Index"); + + ctx->SetOutputDim("Result", index_dim); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "Input"), + ctx.device_context()); + } + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const framework::Tensor& tensor, + const framework::OpKernelType& expected_kernel_type) const override { + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); + } +}; + +class PutAlongAxisOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Input", "The input tensor of PutAlongAxisOp"); + AddInput("Index", "The index tensor of PutAlongAxisOp"); + AddInput("Value", "The value tensor of PutAlongAxisOp"); + AddOutput("Result", "The result tensor of PutAlongAxisOp"); + AddAttr("Axis", "The axis that we do PutAlongAxis operation"); + AddAttr("Reduce", "The reduce operation for scatter") + .SetDefault("assign"); + AddComment(R"DOC( + PutAlongAxis Operator.) 
+ )DOC"); + } +}; + +class PutAlongAxisGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + ctx->SetOutputDim(framework::GradVarName("Input"), + ctx->GetInputDim("Input")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Result")), + ctx.device_context()); + } + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const framework::Tensor& tensor, + const framework::OpKernelType& expected_kernel_type) const override { + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); + } +}; + +template +class PutAlongAxisGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("put_along_axis_grad"); + op->SetInput("Index", this->Input("Index")); + op->SetInput("Input", this->Input("Input")); + + op->SetInput(framework::GradVarName("Result"), this->OutputGrad("Result")); + op->SetOutput(framework::GradVarName("Input"), this->InputGrad("Input")); + op->SetOutput(framework::GradVarName("Value"), this->InputGrad("Value")); + op->SetAttrMap(this->Attrs()); + } +}; + +DECLARE_INPLACE_OP_INFERER(PutAlongAxisInplaceInferer, {"Input", "Result"}); + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(put_along_axis, ops::PutAlongAxisOp, ops::PutAlongAxisOpMaker, + ops::PutAlongAxisGradOpMaker, + ops::PutAlongAxisGradOpMaker, + paddle::operators::PutAlongAxisInplaceInferer); + +REGISTER_OPERATOR(put_along_axis_grad, ops::PutAlongAxisGradOp); + +REGISTER_OP_CPU_KERNEL(put_along_axis, ops::PutAlongAxisOpKernel, + ops::PutAlongAxisOpKernel, + ops::PutAlongAxisOpKernel, + ops::PutAlongAxisOpKernel, + ops::PutAlongAxisOpKernel); + +REGISTER_OP_CPU_KERNEL(put_along_axis_grad, + ops::PutAlongAxisGradOpKernel, + ops::PutAlongAxisGradOpKernel, + ops::PutAlongAxisGradOpKernel, + ops::PutAlongAxisGradOpKernel, + ops::PutAlongAxisGradOpKernel); diff --git a/paddle/fluid/operators/put_along_axis_op.cu b/paddle/fluid/operators/put_along_axis_op.cu new file mode 100644 index 00000000000..da36b564337 --- /dev/null +++ b/paddle/fluid/operators/put_along_axis_op.cu @@ -0,0 +1,134 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/put_along_axis_op.h" + +namespace paddle { +namespace operators { + +template +class PutAlongAxisCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, + platform::errors::PreconditionNotMet( + "PutAlongAxisCUDAKernel only runs on GPU device.")); + auto input = ctx.Input("Input"); + auto axis = ctx.Attr("Axis"); + auto value = ctx.Input("Value"); + auto index = ctx.Input("Index"); + auto reduce_op = ctx.Attr("Reduce"); + auto result = ctx.Output("Result"); + const platform::DeviceContext &device_ctx = ctx.device_context(); + + const auto &index_type = index->type(); + + framework::TensorCopy(*input, ctx.GetPlace(), result); + if (reduce_op == "add") { + if (index_type == framework::proto::VarType::INT32) { + gpu_scatter_add_kernel(*result, axis, *index, *value, + device_ctx); + } else if (index_type == framework::proto::VarType::INT64) { + gpu_scatter_add_kernel(*result, axis, *index, *value, + device_ctx); + } + } else if (reduce_op == "multiply" || reduce_op == "mul") { + if (index_type == framework::proto::VarType::INT32) { + gpu_scatter_mul_kernel(*result, axis, *index, *value, + device_ctx); + } else if (index_type == framework::proto::VarType::INT64) { + gpu_scatter_mul_kernel(*result, axis, *index, *value, + device_ctx); + } + } else if (reduce_op == "assign") { + if (index_type == framework::proto::VarType::INT32) { + gpu_scatter_assign_kernel(*result, axis, *index, *value, + device_ctx); + } else if (index_type == framework::proto::VarType::INT64) { + gpu_scatter_assign_kernel(*result, axis, *index, *value, + device_ctx); + } + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "can not support reduce_op: '%s' for scatter kernel, only " + "support reduce op: 'add‘, 'assign', 'mul' and 'multiply', the " + "defalut reduce op is 'assign' ", + reduce_op)); + return; + } + } +}; + +template +class PutAlongAxisGradOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, + platform::errors::PreconditionNotMet( + "PutAlongAxisGradOpCUDAKernel only runs on GPU.")); + + auto input_grad = ctx.Output(framework::GradVarName("Input")); + auto value_grad = ctx.Output(framework::GradVarName("Value")); + auto index = ctx.Input("Index"); + auto result_grad = ctx.Input(framework::GradVarName("Result")); + auto axis = ctx.Attr("Axis"); + + const auto &index_type = index->type(); + if (input_grad) { + framework::TensorCopy(*result_grad, ctx.GetPlace(), input_grad); + if (index_type == framework::proto::VarType::INT32) { + gpu_scatter_input_grad_kernel( + *result_grad, axis, *index, *input_grad, ctx.device_context()); + } else { + gpu_scatter_input_grad_kernel( + *result_grad, axis, *index, *input_grad, ctx.device_context()); + } + } + if (value_grad) { + value_grad->Resize(index->dims()); + value_grad->mutable_data(ctx.GetPlace()); + if (index_type == framework::proto::VarType::INT32) { + gpu_gather_kernel( + *result_grad, axis, *index, *value_grad, + ctx.device_context()); // the gradient of scatter is gather + } else if (index_type == framework::proto::VarType::INT64) { + gpu_gather_kernel(*result_grad, 
axis, *index, *value_grad, + ctx.device_context()); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_CUDA_KERNEL(put_along_axis, ops::PutAlongAxisCUDAKernel, + ops::PutAlongAxisCUDAKernel, + ops::PutAlongAxisCUDAKernel, + ops::PutAlongAxisCUDAKernel, + ops::PutAlongAxisCUDAKernel); +REGISTER_OP_CUDA_KERNEL(put_along_axis_grad, + ops::PutAlongAxisGradOpCUDAKernel, + ops::PutAlongAxisGradOpCUDAKernel, + ops::PutAlongAxisGradOpCUDAKernel, + ops::PutAlongAxisGradOpCUDAKernel, + ops::PutAlongAxisGradOpCUDAKernel); diff --git a/paddle/fluid/operators/put_along_axis_op.h b/paddle/fluid/operators/put_along_axis_op.h new file mode 100644 index 00000000000..f23ca177db9 --- /dev/null +++ b/paddle/fluid/operators/put_along_axis_op.h @@ -0,0 +1,124 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/gather_scatter_kernel.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class PutAlongAxisOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, + platform::errors::PreconditionNotMet( + "PutAlongAxisOpKernel only runs on CPU.")); + + auto input = ctx.Input("Input"); + auto axis = ctx.Attr("Axis"); + auto value = ctx.Input("Value"); + auto index = ctx.Input("Index"); + auto reduce_op = ctx.Attr("Reduce"); + auto result = ctx.Output("Result"); + + framework::TensorCopy(*input, ctx.GetPlace(), result); + const platform::DeviceContext &device_ctx = ctx.device_context(); + const auto &index_type = index->type(); + if (reduce_op == "add") { + if (index_type == framework::proto::VarType::INT32) { + cpu_scatter_add_kernel(*result, axis, *index, *value, + device_ctx); + } else if (index_type == framework::proto::VarType::INT64) { + cpu_scatter_add_kernel(*result, axis, *index, *value, + device_ctx); + } + } else if (reduce_op == "multiply" || reduce_op == "mul") { + if (index_type == framework::proto::VarType::INT32) { + cpu_scatter_mul_kernel(*result, axis, *index, *value, + device_ctx); + } else if (index_type == framework::proto::VarType::INT64) { + cpu_scatter_mul_kernel(*result, axis, *index, *value, + device_ctx); + } + } else if (reduce_op == "assign") { + if (index_type == framework::proto::VarType::INT32) { + cpu_scatter_assign_kernel(*result, axis, *index, *value, + device_ctx); + } else if (index_type == framework::proto::VarType::INT64) { + cpu_scatter_assign_kernel(*result, axis, *index, *value, + device_ctx); + } + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "can not support reduce_op: '%s' for scatter kernel, only " + "support reduce op: 'add‘, 'assign', 'mul' and 
'multiply', the " + "defalut reduce " + "op is 'assign' ", + reduce_op)); + return; + } + } +}; + +template +class PutAlongAxisGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, + platform::errors::PreconditionNotMet( + "PutAlongAxisGradOpKernel only runs on CPU.")); + + auto input_grad = ctx.Output(framework::GradVarName("Input")); + auto value_grad = ctx.Output(framework::GradVarName("Value")); + auto index = ctx.Input("Index"); + auto result_grad = ctx.Input(framework::GradVarName("Result")); + auto axis = ctx.Attr("Axis"); + const auto &index_type = index->type(); + + if (input_grad) { + framework::TensorCopy(*result_grad, ctx.GetPlace(), input_grad); + if (index_type == framework::proto::VarType::INT32) { + cpu_scatter_input_grad_kernel( + // Here passing an unused argument *result_grad, because it's + // convenient to instantiate a bunch of template function with the + // same arguments list. + *result_grad, axis, *index, *input_grad, ctx.device_context()); + } else { + cpu_scatter_input_grad_kernel( + *result_grad, axis, *index, *input_grad, ctx.device_context()); + } + } + + if (value_grad) { + value_grad->Resize(index->dims()); + value_grad->mutable_data(ctx.GetPlace()); + if (index_type == framework::proto::VarType::INT32) { + cpu_gather_kernel(*result_grad, axis, *index, *value_grad, + ctx.device_context()); + } else if (index_type == framework::proto::VarType::INT64) { + cpu_gather_kernel(*result_grad, axis, *index, *value_grad, + ctx.device_context()); + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/take_along_axis_op.cu b/paddle/fluid/operators/take_along_axis_op.cu index c705959390b..e9f9b187187 100644 --- a/paddle/fluid/operators/take_along_axis_op.cu +++ b/paddle/fluid/operators/take_along_axis_op.cu @@ -32,7 +32,6 @@ class TakeAlongAxisCUDAKernel : public framework::OpKernel { auto result = ctx.Output("Result"); result->Resize(index->dims()); result->mutable_data(ctx.GetPlace()); - const auto &index_type = index->type(); if (index_type == framework::proto::VarType::INT32) { gpu_gather_kernel(*input, axis, *index, *result, diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 67c514b56b9..f9467400bc8 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -159,6 +159,7 @@ from .tensor.manipulation import roll # noqa: F401 from .tensor.manipulation import chunk # noqa: F401 from .tensor.manipulation import tolist # noqa: F401 from .tensor.manipulation import take_along_axis # noqa: F401 +from .tensor.manipulation import put_along_axis # noqa: F401 from .tensor.manipulation import tensordot # noqa: F401 from .tensor.manipulation import as_complex # noqa: F401 from .tensor.manipulation import as_real # noqa: F401 @@ -611,4 +612,6 @@ __all__ = [ # noqa 'repeat_interleave', 'clone', 'renorm', + 'take_along_axis', + 'put_along_axis', ] diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index a3e1650c131..ec59c275583 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -139,8 +139,8 @@ def get_numeric_gradient(place, elif tensor_to_check_dtype == core.VarDesc.VarType.COMPLEX128: tensor_tp_check_dtype = np.complex128 else: - raise ValueError("Not supported data type " + str( - tensor_to_check_dtype)) + raise ValueError("Not supported 
data type " + str(tensor_to_check_dtype) + + ", tensor name : " + str(input_to_check)) def get_output(): sum = [] diff --git a/python/paddle/fluid/tests/unittests/test_put_along_axis_op.py b/python/paddle/fluid/tests/unittests/test_put_along_axis_op.py new file mode 100644 index 00000000000..7a7c2987f3b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_put_along_axis_op.py @@ -0,0 +1,191 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import copy +from op_test import OpTest +import paddle +import paddle.fluid as fluid +from paddle.framework import core +from paddle.fluid.dygraph.base import switch_to_static_graph + +paddle.enable_static() + + +class TestPutAlongAxisOp(OpTest): + def setUp(self): + self.init_data() + self.reduce_op = "assign" + self.dtype = 'float64' + self.op_type = "put_along_axis" + self.xnp = np.random.random(self.x_shape).astype(self.x_type) + # numpy put_along_axis is an inplace opearion. + self.xnp_result = copy.deepcopy(self.xnp) + np.put_along_axis(self.xnp_result, self.index, self.value, self.axis) + self.target = self.xnp_result + broadcast_shape_list = list(self.x_shape) + broadcast_shape_list[self.axis] = 1 + self.braodcast_shape = tuple(broadcast_shape_list) + self.index_broadcast = np.broadcast_to(self.index, self.braodcast_shape) + self.value_broadcast = np.broadcast_to(self.value, self.braodcast_shape) + self.inputs = { + 'Input': self.xnp, + 'Index': self.index_broadcast, + 'Value': self.value_broadcast + } + self.attrs = {'Axis': self.axis, 'Reduce': self.reduce_op} + self.outputs = {'Result': self.target} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(["Input", "Value"], "Result") + + def init_data(self): + self.x_type = "float64" + self.x_shape = (10, 10, 10) + self.value_type = "float64" + self.value = np.array([99]).astype(self.value_type) + self.index_type = "int32" + self.index = np.array([[[0]]]).astype(self.index_type) + self.axis = 1 + self.axis_type = "int64" + + +class TestPutAlongAxisAPI(unittest.TestCase): + def setUp(self): + np.random.seed(0) + self.shape = [1, 3] + self.index_shape = [1, 1] + self.index_np = np.array([[0]]).astype('int64') + self.x_np = np.random.random(self.shape).astype(np.float32) + self.place = [paddle.CPUPlace()] + self.axis = 0 + self.value_np = 99.0 + self.value_shape = [1] + self.x_feed = copy.deepcopy(self.x_np) + if core.is_compiled_with_cuda(): + self.place.append(paddle.CUDAPlace(0)) + + def test_api_static_case1(self): + paddle.enable_static() + + def run(place): + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.fluid.data('X', self.shape) + index = paddle.fluid.data('Index', self.index_shape, "int64") + value = paddle.fluid.data('Value', self.value_shape) + out = paddle.put_along_axis(x, index, value, self.axis) + exe = paddle.static.Executor(self.place[0]) + res = 
exe.run(feed={ + 'X': self.x_feed, + 'Value': self.value_np, + 'Index': self.index_np + }, + fetch_list=[out]) + + np.put_along_axis(self.x_np, self.index_np, self.value_np, + self.axis) + # numpy put_along_axis is an inplace opearion. + out_ref = self.x_np + + for out in res: + self.assertEqual(np.allclose(out, out_ref, rtol=1e-03), True) + + for place in self.place: + run(place) + + def test_api_dygraph_case1(self): + def run(place): + paddle.disable_static(place) + x_tensor = paddle.to_tensor(self.x_np) + index_tensor = paddle.to_tensor(self.index_np) + value_tensor = paddle.to_tensor(self.value_np) + out = paddle.put_along_axis(x_tensor, index_tensor, value_tensor, + self.axis) + np.array( + np.put_along_axis(self.x_np, self.index_np, self.value_np, + self.axis)) + out_ref = self.x_np + self.assertEqual( + np.allclose( + out.numpy(), out_ref, rtol=1e-03), True) + + # for ci coverage, numpy put_along_axis did not support argument of 'reduce' + paddle.put_along_axis(x_tensor, index_tensor, value_tensor, + self.axis, 'mul') + paddle.put_along_axis(x_tensor, index_tensor, value_tensor, + self.axis, 'add') + + paddle.enable_static() + + for place in self.place: + run(place) + + def test_api_dygraph_case2(self): + def run(place): + paddle.disable_static(place) + self.shape = [2, 2] + self.index_shape = [2, 2] + self.index_np = np.array([[0, 0], [1, 0]]).astype('int64') + self.x_np = np.random.random(self.shape).astype(np.float32) + + x_tensor = paddle.to_tensor(self.x_np) + index_tensor = paddle.to_tensor(self.index_np) + value_tensor = paddle.to_tensor(self.value_np) + out = paddle.put_along_axis(x_tensor, index_tensor, value_tensor, + self.axis) + np.array( + np.put_along_axis(self.x_np, self.index_np, self.value_np, + self.axis)) + out_ref = self.x_np + self.assertEqual( + np.allclose( + out.numpy(), out_ref, rtol=1e-03), True) + + paddle.enable_static() + + for place in self.place: + run(place) + + def test_inplace_dygraph_case3(self): + def run(place): + paddle.disable_static(place) + x_tensor = paddle.to_tensor(self.x_np) + index_tensor = paddle.to_tensor(self.index_np) + value_tensor = paddle.to_tensor(self.value_np) + + x_tensor.put_along_axis_(index_tensor, value_tensor, self.axis) + + np.array( + np.put_along_axis(self.x_np, self.index_np, self.value_np, + self.axis)) + out_ref = self.x_np + + self.assertEqual( + np.allclose( + x_tensor.numpy(), out_ref, rtol=1e-03), True) + paddle.enable_static() + + for place in self.place: + run(place) + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_take_along_axis_op.py b/python/paddle/fluid/tests/unittests/test_take_along_axis_op.py index 855a790cf05..97162eb9c70 100644 --- a/python/paddle/fluid/tests/unittests/test_take_along_axis_op.py +++ b/python/paddle/fluid/tests/unittests/test_take_along_axis_op.py @@ -16,7 +16,7 @@ from __future__ import print_function import unittest import numpy as np -from op_test import OpTest, skip_check_grad_ci +from op_test import OpTest import paddle import paddle.fluid as fluid from paddle.framework import core diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index ce70181096c..c01efc8738e 100755 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -120,6 +120,8 @@ from .manipulation import chunk # noqa: F401 from .manipulation import tensordot # noqa: F401 from .manipulation import as_complex # noqa: F401 from .manipulation import take_along_axis # noqa: F401 +from 
+from .manipulation import put_along_axis_  # noqa: F401
 from .manipulation import as_real  # noqa: F401
 from .manipulation import moveaxis  # noqa: F401
 from .manipulation import repeat_interleave  # noqa: F401
@@ -473,6 +475,8 @@ tensor_method_func = [ #noqa
     'moveaxis',
     'repeat_interleave',
     'take_along_axis',
+    'put_along_axis',
+    'put_along_axis_',
     'exponential_',
 ]
 
diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py
index d50a0c34ee6..b54c3596a26 100644
--- a/python/paddle/tensor/manipulation.py
+++ b/python/paddle/tensor/manipulation.py
@@ -2756,9 +2756,9 @@ def take_along_axis(arr, indices, axis):
     Take values from the input array by given indices matrix along the designated axis.
 
     Args:
-        arr (Tensor) : The input Tensor. supported data type are float32 and float64.
+        arr (Tensor) : The input Tensor. Supported data types are float32 and float64.
         indices (Tensor) : Indices to take along each 1d slice of arr. This must match the dimension of arr,
-            and need to broadcast against arr. Supported data type are int and int64.
+            and must be broadcastable against arr. Supported data types are int32 and int64.
         axis (int) : The axis to take 1d slices along.
 
     Returns:
@@ -2779,9 +2779,12 @@
         print(result)
         # [[1, 2, 3]]
     """
-    broadcast_shape_list = list(arr.shape)
-    broadcast_shape_list[axis] = 1
-    broadcast_shape = tuple(broadcast_shape_list)
+    if (arr.shape == indices.shape):
+        broadcast_shape = arr.shape
+    else:
+        broadcast_shape_list = list(arr.shape)
+        broadcast_shape_list[axis] = 1
+        broadcast_shape = tuple(broadcast_shape_list)
     if in_dygraph_mode():
         indices = paddle.broadcast_to(indices, broadcast_shape)
         return _C_ops.take_along_axis(arr, indices, 'Axis', axis)
@@ -2790,9 +2793,7 @@
                               'take_along_axis')
     check_variable_and_dtype(indices, 'index', ['int32', 'int64'],
                              'take_along_axis')
-    indices = paddle.broadcast_to(
-        indices,
-        broadcast_shape)  # broadcast to shape of the input array first.
+    indices = paddle.broadcast_to(indices, broadcast_shape)
     helper = LayerHelper('take_along_axis', **locals())
     dtype = helper.input_dtype()
     result = helper.create_variable_for_type_inference(dtype)
@@ -2803,3 +2804,90 @@
         attrs={"Axis": axis},
         outputs={"Result": result})
     return result
+
+
+def put_along_axis(arr, indices, values, axis, reduce='assign'):
+    """
+    Put values into the destination array by the given indices matrix along the designated axis.
+
+    Args:
+        arr (Tensor) : The destination Tensor. Supported data types are float32 and float64.
+        indices (Tensor) : Indices to put along each 1d slice of arr. This must match the dimension of arr,
+            and must be broadcastable against arr. Supported data types are int32 and int64.
+        axis (int) : The axis to put 1d slices along.
+        reduce (str, optional) : The reduce operation; the default is 'assign', and 'add', 'assign', 'mul' and 'multiply' are supported.
+
+    Returns:
+        Tensor: The destination Tensor with the values put along the given axis, with the same dtype as arr.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            x_np = np.array([[10, 30, 20], [60, 40, 50]])
+            index_np = np.array([[0]])
+            x = paddle.to_tensor(x_np)
+            index = paddle.to_tensor(index_np)
+            value = 99
+            axis = 0
+            result = paddle.put_along_axis(x, index, value, axis)
+            print(result)
+            # [[99, 99, 99],
+            #  [60, 40, 50]]
+
+    """
+    if (arr.shape == indices.shape):
+        broadcast_shape = arr.shape
+    else:
+        broadcast_shape_list = list(arr.shape)
+        broadcast_shape_list[axis] = 1
+        broadcast_shape = tuple(broadcast_shape_list)
+    if in_dygraph_mode():
+        indices = paddle.broadcast_to(indices, broadcast_shape)
+        values = paddle.to_tensor(values) if not isinstance(
+            values, paddle.Tensor) else values
+        values = paddle.broadcast_to(values, broadcast_shape)
+        return _C_ops.put_along_axis(arr, indices, values, "Axis", axis,
+                                     "Reduce", reduce)
+
+    check_variable_and_dtype(
+        arr, 'x', ['float16', 'float32', 'float64', 'int32', 'int64', 'uint8'],
+        'put_along_axis')
+    check_variable_and_dtype(indices, 'index', ['int32', 'int64'],
+                             'put_along_axis')
+    indices = paddle.broadcast_to(indices, broadcast_shape)
+    values = paddle.broadcast_to(values, broadcast_shape)
+    helper = LayerHelper('put_along_axis', **locals())
+    dtype = helper.input_dtype()
+    result = helper.create_variable_for_type_inference(dtype)
+    helper.append_op(
+        type="put_along_axis",
+        inputs={"Input": arr,
+                "Index": indices,
+                "Value": values},
+        attrs={"Axis": axis,
+               "Reduce": reduce},
+        outputs={"Result": result})
+    return result
+
+
+@inplace_apis_in_dygraph_only
+def put_along_axis_(arr, indices, values, axis, reduce='assign'):
+    r"""
+    Inplace version of the ``put_along_axis`` API; the result is written back into the input ``arr``.
+    Please refer to :ref:`api_tensor_put_along_axis`.
+    """
+    if (arr.shape == indices.shape):
+        broadcast_shape = arr.shape
+    else:
+        broadcast_shape_list = list(arr.shape)
+        broadcast_shape_list[axis] = 1
+        broadcast_shape = tuple(broadcast_shape_list)
+
+    indices = paddle.broadcast_to(indices, broadcast_shape)
+    values = paddle.to_tensor(values) if not isinstance(
+        values, paddle.Tensor) else values
+    values = paddle.broadcast_to(values, broadcast_shape)
+    return _C_ops.put_along_axis_(arr, indices, values, "Axis", axis, "Reduce",
+                                  reduce)
-- 
GitLab
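
A minimal usage sketch of the Python API this patch adds, assuming a Paddle build that already contains the put_along_axis operator and bindings above. The expected values in the comments follow the 'assign', 'add' and 'mul' reduce semantics implemented by the CPU/GPU scatter kernels and the index/value broadcasting performed in python/paddle/tensor/manipulation.py:

import numpy as np
import paddle

x = paddle.to_tensor(np.array([[10., 30., 20.], [60., 40., 50.]], dtype='float32'))
index = paddle.to_tensor(np.array([[0]], dtype='int64'))  # broadcast to [[0, 0, 0]] along axis 0
value = paddle.to_tensor(np.array([99.], dtype='float32'))

# 'assign' (default): overwrite the indexed row.
print(paddle.put_along_axis(x, index, value, 0))         # expected: [[99., 99., 99.], [60., 40., 50.]]

# 'add': accumulate the values into the indexed row.
print(paddle.put_along_axis(x, index, value, 0, 'add'))  # expected: [[109., 129., 119.], [60., 40., 50.]]

# 'mul' / 'multiply': scale the indexed row by the values.
print(paddle.put_along_axis(x, index, value, 0, 'mul'))  # expected: [[990., 2970., 1980.], [60., 40., 50.]]

# In-place variant (dygraph only), mirroring numpy.put_along_axis.
x.put_along_axis_(index, value, 0)
print(x)                                                  # expected: [[99., 99., 99.], [60., 40., 50.]]

In static-graph mode the same call is lowered to the put_along_axis op registered in put_along_axis_op.cc, which is what test_api_static_case1 above exercises.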