From affddfaa47d56666135a3b2e71b13bed75d226ae Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Wed, 23 Jun 2021 14:18:19 +0800 Subject: [PATCH] Add new operation: BroadcastTensorsOp (#33294) --- .../fluid/operators/broadcast_tensors_op.cc | 253 ++++++++++++++++ .../fluid/operators/broadcast_tensors_op.cu | 132 ++++++++ paddle/fluid/operators/broadcast_tensors_op.h | 282 ++++++++++++++++++ python/paddle/__init__.py | 4 +- .../unittests/test_broadcast_tensors_op.py | 196 ++++++++++++ python/paddle/tensor/__init__.py | 2 + python/paddle/tensor/manipulation.py | 95 ++++++ 7 files changed, 963 insertions(+), 1 deletion(-) create mode 100644 paddle/fluid/operators/broadcast_tensors_op.cc create mode 100644 paddle/fluid/operators/broadcast_tensors_op.cu create mode 100644 paddle/fluid/operators/broadcast_tensors_op.h create mode 100644 python/paddle/fluid/tests/unittests/test_broadcast_tensors_op.py diff --git a/paddle/fluid/operators/broadcast_tensors_op.cc b/paddle/fluid/operators/broadcast_tensors_op.cc new file mode 100644 index 00000000000..074607e05ea --- /dev/null +++ b/paddle/fluid/operators/broadcast_tensors_op.cc @@ -0,0 +1,253 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/broadcast_tensors_op.h" + +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/var_type_inference.h" + +namespace paddle { +namespace operators { +using framework::Tensor; +using framework::DDim; + +class BroadcastTensorsOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "broadcast_tensors"); + OP_INOUT_CHECK(ctx->HasOutputs("Out"), "Output", "Out", + "broadcast_tensors"); + + int target_rank = 0; + const auto& input_dims = ctx->GetInputsDim("X"); + // 1. Find Output rank = max(Inputs rank) + for (const auto& input_ddim : input_dims) { + target_rank = std::max(target_rank, input_ddim.size()); + } + + PADDLE_ENFORCE_GT( + target_rank, 0, + platform::errors::InvalidArgument( + "BroadcastTensorsOp requires at least one input tensor" + "to have rank greater than zero")); + + std::vector target_dims(target_rank, 0); + // 2. Output dim(axis=x) = max(Inputs dim(axis=x)) + for (int index = 0; index < target_rank; index++) { + // Loop axes in reverse order, + // For each axis, take the maximum as target size + // Fill size = 1 if shape vector exhausts + int target_dim_size = 1; + for (const auto& input_ddim : input_dims) { + // Reversed order + int axis = static_cast(input_ddim.size()) - index - 1; + int dim_size = 1; + if (axis >= 0) { + dim_size = input_ddim[axis]; + } + + // We performed bcast semantics check at python level + // So input tensors should all have legal shape + target_dim_size = std::max(target_dim_size, dim_size); + } + target_dims[target_rank - index - 1] = target_dim_size; + } + + // 3. 
Set Output Dim
+    std::vector<framework::DDim> output_ddims;
+    for (size_t i = 0; i < input_dims.size(); i++) {
+      output_ddims.emplace_back(framework::make_ddim(target_dims));
+    }
+    ctx->SetOutputsDim("Out", output_ddims);
+    ctx->ShareAllLoD("X", /*->*/ "Out");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    // Broadcast semantics enforces all input variables having the same
+    // DataType/VarType
+    // This condition is also checked during VarType Inference
+    // Here we simply copy input type to output
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
+  }
+};
+
+class BroadcastTensorsOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "A Variable list. The shape and data type of the list elements "
+             "should be consistent. Variable can be a multi-dimensional Tensor "
+             "or LoDTensor, and data types can be: bool, float16, float32, "
+             "float64, int32, "
+             "int64.")
+        .AsDuplicable();
+    AddOutput("Out",
+              "The output list of tensors broadcasted from input :code:`x`, "
+              "each with the broadcasted shape. Their data types are "
+              "consistent with :code:`x`.")
+        .AsDuplicable();
+    AddComment(
+        R"DOC(This OP is used to broadcast a vector of inputs
+                     with Tensor or LoDTensor type, following broadcast semantics.)DOC");
+  }
+};
+
+class BroadcastTensorsOpVarTypeInference : public framework::VarTypeInference {
+ public:
+  void operator()(framework::InferVarTypeContext* ctx) const override {
+    // We need at least one input tensor to satisfy broadcast semantics
+    size_t input_size = ctx->InputSize("X");
+    PADDLE_ENFORCE_GT(
+        input_size, 0,
+        platform::errors::InvalidArgument(
+            "BroadcastTensorsOp should have at least one input variable, "
+            "but only received %d.",
+            input_size));
+
+    // BroadcastTensorsOp takes a vector of variables named "X"
+    // Here we loop through input variables,
+    // and check if their DataType/VarType are the same
+    auto var_type = ctx->GetInputType("X", 0);
+    auto data_type = ctx->GetInputDataType("X", 0);
+    for (size_t ind = 1; ind < input_size; ind++) {
+      auto cur_var_type = ctx->GetInputType("X", ind);
+      PADDLE_ENFORCE_EQ(
+          var_type, cur_var_type,
+          platform::errors::InvalidArgument(
+              "inputs to BroadcastTensorsOp should have the same variable "
+              "type, but detected %s vs %s",
+              framework::ToTypeName(var_type),
+              framework::ToTypeName(cur_var_type)));
+
+      auto cur_data_type = ctx->GetInputDataType("X", ind);
+      PADDLE_ENFORCE_EQ(
+          data_type, cur_data_type,
+          platform::errors::InvalidArgument(
+              "inputs to BroadcastTensorsOp should have the same data type, "
+              "but detected %s vs %s",
+              framework::DataTypeToString(data_type),
+              framework::DataTypeToString(cur_data_type)));
+    }
+
+    // Outputs having the same DataType/VarType as inputs
+    ctx->SetOutputType("Out", var_type, framework::ALL_ELEMENTS);
+    ctx->SetOutputDataType("Out", data_type, framework::ALL_ELEMENTS);
+  }
+};
+
+/* ------ BroadcastTensorsGradOp ------ */
+class BroadcastTensorsGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasOutputs(framework::GradVarName("X")), "Output",
+                   "X@grad", "broadcast_tensors");
+    OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "broadcast_tensors");
+    OP_INOUT_CHECK(ctx->HasInputs(framework::GradVarName("Out")), "Input",
+                   "Out@grad", "broadcast_tensors");
+
+    const auto& forward_input_dims = ctx->GetInputsDim("X");
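+    // Note: each gradient output dX[i] simply takes the shape of the
+    // corresponding forward input X[i]; the grad kernels then reduce
+    // dOut[i] back down to that shape along the broadcasted axes.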
ctx->SetOutputsDim(framework::GradVarName("X"), forward_input_dims); + ctx->ShareAllLoD("X", /*->*/ framework::GradVarName("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")), + ctx.device_context()); + } +}; + +template +class BroadcastTensorsGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + void Apply(GradOpPtr grad_op) const override { + grad_op->SetType("broadcast_tensors_grad"); + // We need "X" only for backward shape inference + grad_op->SetInput("X", this->Input("X")); + grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + grad_op->SetOutput(framework::GradVarName("X"), + this->InputGrad("X", /* drop_empty_grad */ false)); + grad_op->SetAttrMap(this->Attrs()); + } +}; + +class BroadcastTensorsGradOpVarTypeInference + : public framework::VarTypeInference { + public: + void operator()(framework::InferVarTypeContext* ctx) const override { + auto var_type = ctx->GetInputType("X", 0); + auto data_type = ctx->GetInputDataType("X", 0); + + ctx->SetOutputType(framework::GradVarName("X"), var_type, + framework::ALL_ELEMENTS); + ctx->SetOutputDataType(framework::GradVarName("X"), data_type, + framework::ALL_ELEMENTS); + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERER(BroadcastTensorsGradNoNeedBufVarsInferer, + "X"); + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OPERATOR(broadcast_tensors, ops::BroadcastTensorsOp, + ops::BroadcastTensorsOpMaker, + ops::BroadcastTensorsGradOpMaker, + ops::BroadcastTensorsGradOpMaker, + ops::BroadcastTensorsOpVarTypeInference); + +REGISTER_OPERATOR(broadcast_tensors_grad, ops::BroadcastTensorsGradOp, + ops::BroadcastTensorsGradOpVarTypeInference, + ops::BroadcastTensorsGradNoNeedBufVarsInferer); + +REGISTER_OP_CPU_KERNEL( + broadcast_tensors, + ops::BroadcastTensorsOpKernel, + ops::BroadcastTensorsOpKernel, + ops::BroadcastTensorsOpKernel, + ops::BroadcastTensorsOpKernel, + ops::BroadcastTensorsOpKernel, + ops::BroadcastTensorsOpKernel); + +REGISTER_OP_CPU_KERNEL( + broadcast_tensors_grad, + ops::BroadcastTensorsGradOpKernel, + ops::BroadcastTensorsGradOpKernel, + ops::BroadcastTensorsGradOpKernel, + ops::BroadcastTensorsGradOpKernel, + ops::BroadcastTensorsGradOpKernel); diff --git a/paddle/fluid/operators/broadcast_tensors_op.cu b/paddle/fluid/operators/broadcast_tensors_op.cu new file mode 100644 index 00000000000..d670e1b333d --- /dev/null +++ b/paddle/fluid/operators/broadcast_tensors_op.cu @@ -0,0 +1,132 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/broadcast_tensors_op.h" + +#include +#include +#include +#include +#include + +#include "paddle/fluid/operators/reduce_ops/cub_reduce.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; +using framework::DDim; + +template +struct IdentityFunctor { + HOSTDEVICE explicit inline IdentityFunctor() {} + + template + HOSTDEVICE inline Tout operator()(const U& x) const { + return static_cast(x); + } +}; + +template +class CUDABroadcastTensorsGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + // Find reduce dimensions + const auto& in_tensors = + context.MultiInput(framework::GradVarName("Out")); + auto out_tensors = context.MultiOutput(framework::GradVarName("X")); + + size_t num_ins = in_tensors.size(); + + PADDLE_ENFORCE_GT( + num_ins, 1, + platform::errors::InvalidArgument( + "Expected at least 2 input tensors, but only received d%.", + in_tensors.size())); + + PADDLE_ENFORCE_EQ( + num_ins, out_tensors.size(), + platform::errors::InvalidArgument( + "BroadcastTensorsOp expects equal number of inputs and outputs," + "but received: %d inputs v.s %d outputs", + num_ins, out_tensors.size())); + + // For each In-Out tensor pair, + // Prepare and apply broadcast dims array + for (size_t i = 0; i < num_ins; i++) { + auto* input_tensor = in_tensors[i]; + auto* output_tensor = out_tensors[i]; + + const DDim& input_dims = input_tensor->dims(); + const DDim& output_dims = output_tensor->dims(); + + int in_rank = input_dims.size(); + int out_rank = output_dims.size(); + + // Collect reduce_dims + // Example: + // dX = [1,1,1,1] + // dOut = [1,1,1,4] + // + // reduce_dims = [3] // reduce along the broadcasted axis + std::vector reduce_dims_vec; + for (int j = 0; j < in_rank; j++) { + int out_axis = out_rank - j - 1; + int in_axis = in_rank - j - 1; + + if (out_axis < 0 || output_dims[out_axis] != input_dims[in_axis]) { + reduce_dims_vec.push_back(in_axis); + } + } + + bool just_copy = (reduce_dims_vec.size() == 0); + output_tensor->mutable_data(context.GetPlace()); + if (just_copy) { + // Turns out to be a No-Op, simply copy tensors + framework::TensorCopy(*input_tensor, context.GetPlace(), + context.device_context(), output_tensor); + } else { + // reduce_sum implementation on CUDA + auto stream = context.cuda_device_context().stream(); + TensorReduce>( + *input_tensor, output_tensor, reduce_dims_vec, static_cast(0), + cub::Sum(), IdentityFunctor(), stream); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL( + broadcast_tensors, + ops::BroadcastTensorsOpKernel, + ops::BroadcastTensorsOpKernel, + ops::BroadcastTensorsOpKernel, + ops::BroadcastTensorsOpKernel, + ops::BroadcastTensorsOpKernel, + ops::BroadcastTensorsOpKernel); + +REGISTER_OP_CUDA_KERNEL(broadcast_tensors_grad, + ops::CUDABroadcastTensorsGradOpKernel, + ops::CUDABroadcastTensorsGradOpKernel, + ops::CUDABroadcastTensorsGradOpKernel, + ops::CUDABroadcastTensorsGradOpKernel, + ops::CUDABroadcastTensorsGradOpKernel); diff --git a/paddle/fluid/operators/broadcast_tensors_op.h b/paddle/fluid/operators/broadcast_tensors_op.h new file mode 100644 index 00000000000..0eeb9234df0 --- /dev/null +++ b/paddle/fluid/operators/broadcast_tensors_op.h @@ -0,0 +1,282 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" +#include "paddle/fluid/operators/math/math_function.h" + +#define SWITCH_OUT_RANK_CASE(n) \ + case n: { \ + ApplyBroadcast(context, in_tensors[i], out_tensors[i]); \ + break; \ + } + +namespace paddle { +namespace operators { + +using framework::Tensor; +using framework::DDim; +using framework::EigenTensor; + +template +class BroadcastTensorsOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const auto& in_tensors = context.MultiInput("X"); + auto out_tensors = context.MultiOutput("Out"); + + size_t num_ins = in_tensors.size(); + + PADDLE_ENFORCE_GT( + num_ins, 1, + platform::errors::InvalidArgument( + "Expected at least 2 input tensors, but only received d%.", + in_tensors.size())); + + PADDLE_ENFORCE_EQ( + num_ins, out_tensors.size(), + platform::errors::InvalidArgument( + "BroadcastTensorsOp expects equal number of inputs and outputs," + "but received: %d inputs v.s %d outputs", + num_ins, out_tensors.size())); + + // Eigen has no support for dynamic ranked tensor + // Thus we perform static expansion for each possible ranks + for (size_t i = 0; i < num_ins; i++) { + int out_rank = out_tensors[i]->dims().size(); + switch (out_rank) { + SWITCH_OUT_RANK_CASE(1) + SWITCH_OUT_RANK_CASE(2) + SWITCH_OUT_RANK_CASE(3) + SWITCH_OUT_RANK_CASE(4) + SWITCH_OUT_RANK_CASE(5) + default: { + PADDLE_THROW(platform::errors::InvalidArgument( + "Target tensor rank out of range" + "Maximum supported rank for broadcast is: 5")); + } + } + } + } + + template + void ApplyBroadcast(const framework::ExecutionContext& context, + const Tensor* input_tensor, Tensor* output_tensor) const { + const auto& input_dims = input_tensor->dims(); + const auto& output_dims = output_tensor->dims(); + + int in_rank = input_dims.size(); + int out_rank = output_dims.size(); + + // 1. Collect bcast_dims, each element of which indicates how many + // times we need to replicate along the corresponding dimension + // 2. Collect new_input_dims_vec. 
Eigen::broadcast requires same rank for + // both input and output tensors, so we need to initialize input X with + // expanded dims: "new_input_dims_vec" + Eigen::DSizes bcast_dims; + std::vector new_input_dims_vec(out_rank); + for (int j = 0; j < out_rank; j++) { + int out_axis = out_rank - j - 1; + int in_axis = in_rank - j - 1; + + bcast_dims[out_axis] = output_dims[out_axis]; + new_input_dims_vec[out_axis] = 1; + if (in_axis >= 0 && input_dims[in_axis] == output_dims[out_axis]) { + bcast_dims[out_axis] = 1; + new_input_dims_vec[out_axis] = input_dims[in_axis]; + } + } + auto new_input_dims = framework::make_ddim(new_input_dims_vec); + + // Initialize input X with new_input_dims_vec, so it's rank-aligned with the + // output + auto x = EigenTensor::From(*input_tensor, new_input_dims); + + output_tensor->mutable_data(context.GetPlace()); + auto y = EigenTensor::From(*output_tensor, output_dims); + + auto& place = + *context.template device_context().eigen_device(); + EigenBroadcast, T, OutRank>::Eval(place, y, x, + bcast_dims); + } +}; + +#define SWITCH_RESHAPE_DIMS(n) \ + case n: { \ + Eigen::DSizes reshape_dims; \ + for (size_t i = 0; i < reshape_dims_vec.size(); ++i) { \ + reshape_dims[i] = reshape_dims_vec[i]; \ + } \ + dX.device(place) = \ + dOut.reshape(reshape_dims).sum(reduce_dims).reshape(dX.dimensions()); \ + break; \ + } + +#define UPPER_SWITCH_REDUCE_DIMS(m) \ + case m: { \ + Eigen::DSizes reduce_dims; \ + for (size_t i = 0; i < reduce_dims_vec.size(); ++i) { \ + reduce_dims[i] = reduce_dims_vec[i]; \ + } \ + switch (reshape_size) { +#define LOWER_SWITCH_REDUCE_DIMS \ + default: { \ + PADDLE_THROW(platform::errors::InvalidArgument( \ + "Detected reshape size: %d out of range" \ + "Minimum value should be larger than reduce size %d" \ + "While maximum supported is: 5", \ + reshape_size, reduce_size)); \ + } \ + } \ + break; \ + } + +/* ----- GradOpKernel ----- */ +template +class BroadcastTensorsGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + // Find reduce dimensions + const auto& in_tensors = + context.MultiInput(framework::GradVarName("Out")); + auto out_tensors = context.MultiOutput(framework::GradVarName("X")); + + size_t num_ins = in_tensors.size(); + + PADDLE_ENFORCE_GT( + num_ins, 1, + platform::errors::InvalidArgument( + "Expected at least 2 input tensors, but only received d%.", + in_tensors.size())); + + PADDLE_ENFORCE_EQ( + num_ins, out_tensors.size(), + platform::errors::InvalidArgument( + "BroadcastTensorsOp expects equal number of inputs and outputs," + "but received: %d inputs v.s %d outputs", + num_ins, out_tensors.size())); + + // For each In-Out tensor pair, + // Prepare and apply broadcast dims array + for (size_t i = 0; i < num_ins; i++) { + const auto* input_tensor = in_tensors[i]; + auto* output_tensor = out_tensors[i]; + + const auto& input_dims = input_tensor->dims(); + const auto& output_dims = output_tensor->dims(); + + int in_rank = input_dims.size(); + int out_rank = output_dims.size(); + + // BroadcastTensorsGrad is simply a reduce_sum along broadcasted axes + // Here we perform the following Eigen operations: + // dOut(Flattened) -> reshape(reshape_dims) -> reduce(reduce_dims) -> + // reshape(dX_shape) -> dX + // Note the last "reshape(dX_shape)" will be performed implicitly, + // and we only need to collect reduce_dims and reshape_dims + std::vector reduce_dims_vec; + std::vector reshape_dims_vec; + for (int j = 0; j < in_rank; j++) { + int out_axis = out_rank - 
j - 1; + int in_axis = in_rank - j - 1; + + reshape_dims_vec.push_back(input_dims[j]); + if (out_axis < 0 || output_dims[out_axis] != input_dims[in_axis]) { + reduce_dims_vec.push_back(in_axis); + } + } + + size_t reduce_size = reduce_dims_vec.size(); + size_t reshape_size = reshape_dims_vec.size(); + bool just_copy = (reduce_dims_vec.size() == 0); + output_tensor->mutable_data(context.GetPlace()); + if (just_copy) { + // If this turns out to be a No-Op, simply perform a tensor copy + framework::TensorCopy(*input_tensor, context.GetPlace(), + context.device_context(), output_tensor); + } else { + PADDLE_ENFORCE_GE(reduce_dims_vec.size(), 1, + platform::errors::InvalidArgument( + "The number of dimensions of the input " + "'Out@GRAD' for Op(broadcast_tensors)" + " must be greater than or equal to 1, but " + "the value received is %d.", + reduce_dims_vec.size())); + PADDLE_ENFORCE_LE( + reduce_dims_vec.size(), 5, + platform::errors::InvalidArgument( + "The number of dimensions of the input 'Out@GRAD' " + "for Op(broadcast_tensors) must be less than or equal " + "to 5, but the value received is %d.", + reduce_dims_vec.size())); + + // Overall: + // dOut(Flattened) -> reshape(reshape_dims) -> reduce(reduce_dims) -> + // reshape(dX_shape) -> dX + auto dX = framework::EigenVector::Flatten(*output_tensor); + auto dOut = framework::EigenVector::Flatten(*input_tensor); + auto& place = + *context.template device_context().eigen_device(); + + // Expand ReduceSize and ReshapeSize into static values + switch (reduce_size) { + UPPER_SWITCH_REDUCE_DIMS(1) + SWITCH_RESHAPE_DIMS(1) + SWITCH_RESHAPE_DIMS(2) + SWITCH_RESHAPE_DIMS(3) + SWITCH_RESHAPE_DIMS(4) + SWITCH_RESHAPE_DIMS(5) + LOWER_SWITCH_REDUCE_DIMS + + UPPER_SWITCH_REDUCE_DIMS(2) + SWITCH_RESHAPE_DIMS(2) + SWITCH_RESHAPE_DIMS(3) + SWITCH_RESHAPE_DIMS(4) + SWITCH_RESHAPE_DIMS(5) + LOWER_SWITCH_REDUCE_DIMS + + UPPER_SWITCH_REDUCE_DIMS(3) + SWITCH_RESHAPE_DIMS(3) + SWITCH_RESHAPE_DIMS(4) + SWITCH_RESHAPE_DIMS(5) + LOWER_SWITCH_REDUCE_DIMS + + UPPER_SWITCH_REDUCE_DIMS(4) + SWITCH_RESHAPE_DIMS(4) + SWITCH_RESHAPE_DIMS(5) + LOWER_SWITCH_REDUCE_DIMS + + UPPER_SWITCH_REDUCE_DIMS(5) + SWITCH_RESHAPE_DIMS(5) + LOWER_SWITCH_REDUCE_DIMS + + default: { + PADDLE_THROW(platform::errors::InvalidArgument( + "Detected reduce size: %d out of range" + "While maximum supported is: 5", + reduce_size)); + } + } + } + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index c7fc74deec0..773ae61a691 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -118,6 +118,7 @@ from .tensor.logic import equal_all # noqa: F401 from .tensor.logic import is_tensor # noqa: F401 from .tensor.manipulation import cast # noqa: F401 from .tensor.manipulation import concat # noqa: F401 +from .tensor.manipulation import broadcast_tensors # noqa: F401 from .tensor.manipulation import expand # noqa: F401 from .tensor.manipulation import broadcast_to # noqa: F401 from .tensor.manipulation import expand_as # noqa: F401 @@ -505,5 +506,6 @@ __all__ = [ # noqa 'trunc', 'digamma', 'standard_normal', - 'diagonal' + 'diagonal', + 'broadcast_tensors', ] diff --git a/python/paddle/fluid/tests/unittests/test_broadcast_tensors_op.py b/python/paddle/fluid/tests/unittests/test_broadcast_tensors_op.py new file mode 100644 index 00000000000..602c5bae8f8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_broadcast_tensors_op.py @@ -0,0 +1,196 @@ +# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.fluid.core as core +from op_test import OpTest +from test_collective_base import TestDistBase + +import random +random.seed(2021) + +paddle.enable_static() + + +def find_output_shape(input_list): + """Infer output tensor shape according to bcast semantics""" + output_rank = 0 + for x in input_list: + rank = len(x.shape) + output_rank = max(output_rank, rank) + + output_shape = [0 for i in range(output_rank)] + for i in range(output_rank): + for x in input_list: + shape = list(reversed(x.shape)) + size = 1 + if i < len(shape): + size = shape[i] + output_shape[i] = max(output_shape[i], size) + + return list(reversed(output_shape)) + + +def make_inputs_outputs(input_shapes, dtype): + """Automatically generate formatted inputs and outputs from input_shapes""" + input_list = [ + np.random.random(shape).astype(dtype) for shape in input_shapes + ] + output_shape = find_output_shape(input_list) + output_list = [ + x + np.zeros(output_shape).astype(x.dtype) for x in input_list + ] + + output_formatted = { + "Out": [(f"out{i}", output_list[i]) for i in range(len(output_list))] + } + input_formatted = { + "X": [(f"x{i}", input_list[i]) for i in range(len(input_list))] + } + + return input_formatted, output_formatted + + +def gen_rank_diff_test(dtype): + input_shapes = [(2, 60, 1), (6, 2, 1, 10)] + return make_inputs_outputs(input_shapes, dtype) + + +def gen_no_broadcast_test(dtype): + input_shapes = [(12, 1, 10, 1), (12, 1, 10, 1)] + return make_inputs_outputs(input_shapes, dtype) + + +def gen_mixed_tensors_test(dtype): + input_shapes = [(2, 60, 1), (2, 2, 1, 30), (1, 2, 60, 1)] + return make_inputs_outputs(input_shapes, dtype) + + +class TestCPUBroadcastTensorsOp(OpTest): + def set_place(self): + self.place = core.CPUPlace() + + def set_dtypes(self): + self.dtypes = ['float64'] + + def setUp(self): + self.op_type = "broadcast_tensors" + self.use_mkldnn = False + self.attrs = {'use_mkldnn': self.use_mkldnn} + self.test_gen_func_list = [ + gen_rank_diff_test, gen_no_broadcast_test, gen_mixed_tensors_test + ] + self.set_place() + self.set_dtypes() + + def run_test(self, test_func, args): + for dtype in self.dtypes: + for gen_func in self.test_gen_func_list: + self.inputs, self.outputs = gen_func(dtype) + test_func(**args) + + def test_check_output(self): + self.run_test(self.check_output_with_place, + {"place": self.place, + "atol": 1e-1}) + + def test_check_grad_normal(self): + self.run_test(self.check_grad_with_place, { + "place": self.place, + "inputs_to_check": ['x0', 'x1'], + "output_names": ['out0', 'out1'], + "max_relative_error": 0.05, + }) + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestCUDABroadcastTensorsOp(TestCPUBroadcastTensorsOp): + def set_place(self): + self.place = core.CUDAPlace(0) + + def set_dtypes(self): + self.dtypes = ['float64'] + if 
core.is_float16_supported(self.place):
+            self.dtypes.append('float16')
+
+
+class TestBroadcastTensorsAPI(unittest.TestCase):
+    def test_api(self):
+        def test_static():
+            inputs = [
+                paddle.fluid.layers.data(
+                    shape=[4, 1, 4, 1], dtype='float32', name="x0"),
+                paddle.fluid.layers.data(
+                    shape=[1, 4, 1, 4], dtype='float32', name="x1")
+            ]
+            paddle.broadcast_tensors(inputs)
+
+        def test_dynamic():
+            paddle.disable_static()
+            try:
+                inputs = [
+                    paddle.to_tensor(
+                        np.random.random([4, 1, 4, 1]).astype("float32")),
+                    paddle.to_tensor(
+                        np.random.random([1, 4, 1, 4]).astype("float32"))
+                ]
+                paddle.broadcast_tensors(inputs)
+            finally:
+                paddle.enable_static()
+
+        test_static()
+        test_dynamic()
+
+
+class TestRaiseBroadcastTensorsError(unittest.TestCase):
+    def test_errors(self):
+        def test_type():
+            inputs = [
+                paddle.fluid.layers.data(
+                    shape=[1, 1, 1, 1], dtype='float32', name="x4"),
+                paddle.fluid.layers.data(
+                    shape=[1, 4, 1, 1], dtype='float64', name="x5")
+            ]
+            paddle.broadcast_tensors(inputs)
+
+        def test_dtype():
+            inputs = [
+                paddle.fluid.layers.data(
+                    shape=[1, 1, 1, 1], dtype='int8', name="x6"),
+                paddle.fluid.layers.data(
+                    shape=[1, 4, 1, 1], dtype='int8', name="x7")
+            ]
+            paddle.broadcast_tensors(inputs)
+
+        def test_bcast_semantics():
+            inputs = [
+                paddle.fluid.layers.data(
+                    shape=[1, 3, 1, 1], dtype='float32', name="x9"),
+                paddle.fluid.layers.data(
+                    shape=[1, 8, 1, 1], dtype='float32', name="x10")
+            ]
+            paddle.broadcast_tensors(inputs)
+
+        self.assertRaises(TypeError, test_type)
+        self.assertRaises(TypeError, test_dtype)
+        self.assertRaises(TypeError, test_bcast_semantics)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py
index 98d033ecec3..2d4c97212be 100755
--- a/python/paddle/tensor/__init__.py
+++ b/python/paddle/tensor/__init__.py
@@ -66,6 +66,7 @@ from .manipulation import cast  # noqa: F401
 from .manipulation import concat  # noqa: F401
 from .manipulation import expand  # noqa: F401
 from .manipulation import broadcast_to  # noqa: F401
+from .manipulation import broadcast_tensors  # noqa: F401
 from .manipulation import expand_as  # noqa: F401
 from .manipulation import tile  # noqa: F401
 from .manipulation import flatten  # noqa: F401
@@ -363,6 +364,7 @@ tensor_method_func = [  #noqa
     'bitwise_or',
     'bitwise_xor',
     'bitwise_not',
+    'broadcast_tensors',
 ]
 
 #this list used in math_op_patch.py for magic_method bind
diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py
index 1c33d19db4b..981baecb644 100644
--- a/python/paddle/tensor/manipulation.py
+++ b/python/paddle/tensor/manipulation.py
@@ -120,6 +120,101 @@ def concat(x, axis=0, name=None):
     return paddle.fluid.layers.concat(input=x, axis=axis, name=name)
 
 
+def broadcast_tensors(input, name=None):
+    """
+    This OP broadcasts a list of tensors following broadcast semantics.
+
+    .. note::
+        If you want to know more about broadcasting, please refer to :ref:`user_guide_broadcasting`.
+
+    Args:
+        input(list|tuple): ``input`` is a Tensor list or Tensor tuple with data type bool,
+            float16, float32, float64, int32, int64. All the Tensors in ``input`` must have the same data type.
+            Currently we only support tensors with rank no greater than 5.
+
+        name (str, optional): The default value is None. Normally there is no need for user to set this property.
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        list(Tensor): The list of broadcasted tensors following the same order as ``input``.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            x1 = paddle.rand([1, 2, 3, 4]).astype('float32')
+            x2 = paddle.rand([1, 2, 1, 4]).astype('float32')
+            x3 = paddle.rand([1, 1, 3, 1]).astype('float32')
+            out1, out2, out3 = paddle.broadcast_tensors(input=[x1, x2, x3])
+            # out1, out2, out3: tensors broadcasted from x1, x2, x3 with shape [1,2,3,4]
+    """
+
+    num_inputs = len(input)
+    if in_dygraph_mode():
+        return core.ops.broadcast_tensors(input, num_inputs)
+
+    check_type(input, 'input', (list, tuple), 'broadcast_tensors')
+    if num_inputs < 1:
+        raise TypeError(
+            "At least 1 tensor is needed to perform broadcast_tensors")
+
+    # Check input types
+    for id, x in enumerate(input):
+        check_variable_and_dtype(
+            x, 'input[' + str(id) + ']',
+            ['bool', 'float32', 'float64', 'int32', 'int64'],
+            'broadcast_tensors')
+        if x.dtype != input[0].dtype:
+            raise TypeError(
+                "All the Tensors in the input must have the same data type.")
+
+    # Check bcast semantics
+    output_shape_r_last_tensor_index = []
+    output_shape_r = []
+
+    # Use while loop due to weird behaviour of "range()"
+    j = 0
+    while j < len(input):
+        tensor = input[j]
+        shape = list(reversed(tensor.shape))
+
+        i = 0
+        while i < len(shape):
+            if len(output_shape_r) <= i:
+                output_shape_r.append(shape[i])
+                output_shape_r_last_tensor_index.append(j)
+            else:
+                invalid = (output_shape_r[i] != shape[i] and
+                           output_shape_r[i] != 1 and shape[i] != 1)
+                if invalid:
+                    last_index = output_shape_r_last_tensor_index[i]
+                    raise TypeError(
+                        "Input tensors to broadcast_tensors do not follow bcast semantics. "
+                        f"Tensor {last_index} conflicts with Tensor {j} in reversed dimension {i}."
+                    )
+                if output_shape_r[i] <= shape[i]:
+                    output_shape_r[i] = shape[i]
+                    output_shape_r_last_tensor_index[i] = j
+            i += 1  # while i < len(shape)
+        j += 1  # while j < len(input)
+
+    helper = LayerHelper('broadcast_tensors', **locals())
+    i = 0
+    out = []
+    while i < num_inputs:
+        out.append(
+            helper.create_variable_for_type_inference(
+                dtype=helper.input_dtype()))
+        i += 1
+
+    inputs = {'X': input}
+    helper.append_op(
+        type='broadcast_tensors', inputs=inputs, outputs={'Out': out},
+        attrs={})
+
+    return out
+
+
 def flip(x, axis, name=None):
     """
     Reverse the order of a n-D tensor along given axis in axis.
-- 
GitLab
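As a quick reference for the shape inference implemented in broadcast_tensors_op.cc and mirrored by find_output_shape in the unit test, here is a small standalone Python sketch; the helper name broadcast_output_shape is illustrative only and is not part of the patch.

def broadcast_output_shape(shapes):
    """Compute the broadcasted output shape for a list of input shapes."""
    out_rank = max(len(s) for s in shapes)
    out_shape_r = []  # reversed output shape, built from the trailing axes
    for i in range(out_rank):
        size = 1
        for s in shapes:
            axis = len(s) - i - 1
            dim = s[axis] if axis >= 0 else 1
            if dim != 1 and size != 1 and dim != size:
                raise ValueError(
                    "incompatible sizes %d and %d at reversed axis %d" % (dim, size, i))
            size = max(size, dim)
        out_shape_r.append(size)
    return list(reversed(out_shape_r))

# Matches the docstring example: all inputs broadcast to [1, 2, 3, 4]
assert broadcast_output_shape([[1, 2, 3, 4], [1, 2, 1, 4], [1, 1, 3, 1]]) == [1, 2, 3, 4]
# Inputs of different ranks, as in gen_rank_diff_test
assert broadcast_output_shape([(2, 60, 1), (6, 2, 1, 10)]) == [6, 2, 60, 10]

Unlike this sketch, the C++ InferShape does not re-validate compatibility of the dimensions: as its comment notes, the broadcast semantics check has already been performed at the Python level.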