diff --git a/paddle/fluid/operators/broadcast_tensors_op.cc b/paddle/fluid/operators/broadcast_tensors_op.cc
index 27b1107675d4e722f9a2e25801ecc4dfb206cce5..c3917fad555cb4633d4d958abcde0244fae13cae 100644
--- a/paddle/fluid/operators/broadcast_tensors_op.cc
+++ b/paddle/fluid/operators/broadcast_tensors_op.cc
@@ -12,15 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/broadcast_tensors_op.h"
-
-#include <algorithm>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/var_type_inference.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/multiary.h"
 
 namespace paddle {
 namespace operators {
@@ -31,64 +27,6 @@ class BroadcastTensorsOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "broadcast_tensors");
-    OP_INOUT_CHECK(ctx->HasOutputs("Out"), "Output", "Out",
-                   "broadcast_tensors");
-
-    int target_rank = 0;
-    const auto& input_dims = ctx->GetInputsDim("X");
-
-    // 1. Find Output rank = max(Inputs rank)
-    for (const auto& input_ddim : input_dims) {
-      target_rank = std::max(target_rank, input_ddim.size());
-    }
-
-    PADDLE_ENFORCE_GT(
-        target_rank, 0,
-        platform::errors::InvalidArgument(
-            "BroadcastTensorsOp requires at least one input tensor"
-            "to have rank greater than zero"));
-
-    std::vector<int> target_dims(target_rank, 0);
-    // 2. Output dim(axis=x) = max(Inputs dim(axis=x))
-    for (int index = 0; index < target_rank; index++) {
-      // Loop axes in reverse order,
-      // For each axis, take the maximum as target size
-      // Fill size = 1 if shape vector exhausts
-      int target_dim_size = 1;
-      for (const auto& input_ddim : input_dims) {
-        // Reversed order
-        int axis = static_cast<int>(input_ddim.size()) - index - 1;
-        int dim_size = 1;
-        if (axis >= 0) {
-          dim_size = input_ddim[axis];
-        }
-
-        if (target_dim_size != 1 && dim_size != 1 &&
-            target_dim_size != dim_size) {
-          PADDLE_THROW(platform::errors::InvalidArgument(
-              "BroadcastTensorsOp inputs does not satisfy bcast semantics,"
-              "Please check axis = %d in reverse order",
-              index));
-        }
-
-        // We performed bcast semantics check at python level
-        // So input tensors should all have legal shape
-        target_dim_size = std::max(target_dim_size, dim_size);
-      }
-      target_dims[target_rank - index - 1] = target_dim_size;
-    }
-
-    // 3. Set Output Dim
-    std::vector<framework::DDim> output_ddims;
-    for (size_t i = 0; i < input_dims.size(); i++) {
-      output_ddims.emplace_back(phi::make_ddim(target_dims));
-    }
-    ctx->SetOutputsDim("Out", output_ddims);
-    ctx->ShareAllLoD("X", /*->*/ "Out");
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -229,34 +167,17 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(BroadcastTensorsGradNoNeedBufVarsInferer,
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 
+DECLARE_INFER_SHAPE_FUNCTOR(broadcast_tensors,
+                            BroadcastTensorsInferShapeFunctor,
+                            PT_INFER_META(phi::BroadcastTensorsInferMeta));
+
 REGISTER_OPERATOR(broadcast_tensors, ops::BroadcastTensorsOp,
                   ops::BroadcastTensorsOpMaker,
                   ops::BroadcastTensorsGradOpMaker<paddle::framework::OpDesc>,
                   ops::BroadcastTensorsGradOpMaker<paddle::imperative::OpBase>,
-                  ops::BroadcastTensorsOpVarTypeInference);
+                  ops::BroadcastTensorsOpVarTypeInference,
+                  BroadcastTensorsInferShapeFunctor);
 
 REGISTER_OPERATOR(broadcast_tensors_grad, ops::BroadcastTensorsGradOp,
                   ops::BroadcastTensorsGradOpVarTypeInference,
                   ops::BroadcastTensorsGradNoNeedBufVarsInferer);
-
-REGISTER_OP_CPU_KERNEL(
-    broadcast_tensors,
-    ops::BroadcastTensorsOpKernel<paddle::platform::CPUDeviceContext,
-                                  plat::float16>,
-    ops::BroadcastTensorsOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::BroadcastTensorsOpKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::BroadcastTensorsOpKernel<paddle::platform::CPUDeviceContext, bool>,
-    ops::BroadcastTensorsOpKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::BroadcastTensorsOpKernel<paddle::platform::CPUDeviceContext,
-                                  int64_t>);
-
-REGISTER_OP_CPU_KERNEL(
-    broadcast_tensors_grad,
-    ops::BroadcastTensorsGradOpKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::BroadcastTensorsGradOpKernel<paddle::platform::CPUDeviceContext,
-                                      int64_t>,
-    ops::BroadcastTensorsGradOpKernel<paddle::platform::CPUDeviceContext,
-                                      float>,
-    ops::BroadcastTensorsGradOpKernel<paddle::platform::CPUDeviceContext,
-                                      double>,
-    ops::BroadcastTensorsGradOpKernel<paddle::platform::CPUDeviceContext,
-                                      plat::float16>);
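A note on the hunk above: the hand-written InferShape override is replaced by a functor generated from phi::BroadcastTensorsInferMeta and registered alongside the op. A minimal, self-contained model of that wiring, with hypothetical names (Meta, ShapeRegistry, MyInferMeta) standing in for Paddle's real classes:

#include <functional>
#include <iostream>
#include <map>
#include <string>
#include <vector>

// Stand-in for a tensor's compile-time metadata (shape only).
struct Meta {
  std::vector<int64_t> dims;
};

// The "functor" is just a callable that rewrites output metadata in place.
using InferShapeFn =
    std::function<void(const std::vector<Meta*>&, std::vector<Meta*>)>;

// Registry mapping op name -> shape function, mimicking the
// DECLARE_INFER_SHAPE_FUNCTOR / REGISTER_OPERATOR wiring above.
std::map<std::string, InferShapeFn>& ShapeRegistry() {
  static std::map<std::string, InferShapeFn> reg;
  return reg;
}

// Same role as PT_INFER_META(phi::BroadcastTensorsInferMeta): adapt a free
// function into the registered callable (here trivially identity-shaped).
void MyInferMeta(const std::vector<Meta*>& x, std::vector<Meta*> out) {
  for (size_t i = 0; i < out.size(); ++i) out[i]->dims = x[i]->dims;
}

int main() {
  ShapeRegistry()["broadcast_tensors"] = MyInferMeta;  // registration analogue
  Meta in{{2, 3}}, result;
  std::vector<Meta*> xs{&in}, outs{&result};
  ShapeRegistry()["broadcast_tensors"](xs, outs);
  std::cout << result.dims[0] << "x" << result.dims[1] << "\n";  // prints 2x3
}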
diff --git a/paddle/fluid/operators/broadcast_tensors_op.cu b/paddle/fluid/operators/broadcast_tensors_op.cu
deleted file mode 100644
index 5882258317d7daa6c62905f8a76d5c68060787a8..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/broadcast_tensors_op.cu
+++ /dev/null
@@ -1,122 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/broadcast_tensors_op.h"
-
-#include <algorithm>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
-
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
-using framework::DDim;
-
-template <typename T>
-class CUDABroadcastTensorsGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    // Find reduce dimensions
-    const auto& in_tensors =
-        context.MultiInput<Tensor>(framework::GradVarName("Out"));
-    auto out_tensors = context.MultiOutput<Tensor>(framework::GradVarName("X"));
-
-    size_t num_ins = in_tensors.size();
-
-    PADDLE_ENFORCE_GT(
-        num_ins, 1,
-        platform::errors::InvalidArgument(
-            "Expected at least 2 input tensors, but only received d%.",
-            in_tensors.size()));
-
-    PADDLE_ENFORCE_EQ(
-        num_ins, out_tensors.size(),
-        platform::errors::InvalidArgument(
-            "BroadcastTensorsOp expects equal number of inputs and outputs,"
-            "but received: %d inputs v.s %d outputs",
-            num_ins, out_tensors.size()));
-
-    // For each In-Out tensor pair,
-    // Prepare and apply broadcast dims array
-    for (size_t i = 0; i < num_ins; i++) {
-      auto* input_tensor = in_tensors[i];
-      auto* output_tensor = out_tensors[i];
-
-      const DDim& input_dims = input_tensor->dims();
-      const DDim& output_dims = output_tensor->dims();
-
-      int in_rank = input_dims.size();
-      int out_rank = output_dims.size();
-
-      // Collect reduce_dims
-      // Example:
-      //    dX   = [1,1,1,1]
-      //    dOut = [1,1,1,4]
-      //
-      //    reduce_dims = [3]  // reduce along the broadcasted axis
-      std::vector<int> reduce_dims_vec;
-      for (int j = 0; j < in_rank; j++) {
-        int out_axis = out_rank - j - 1;
-        int in_axis = in_rank - j - 1;
-
-        if (out_axis < 0 || output_dims[out_axis] != input_dims[in_axis]) {
-          reduce_dims_vec.push_back(in_axis);
-        }
-      }
-
-      bool just_copy = (reduce_dims_vec.size() == 0);
-      output_tensor->mutable_data<T>(context.GetPlace());
-      if (just_copy) {
-        // Turns out to be a No-Op, simply copy tensors
-        framework::TensorCopy(*input_tensor, context.GetPlace(),
-                              context.device_context(), output_tensor);
-      } else {
-        // reduce_sum implementation on CUDA
-        auto stream = context.cuda_device_context().stream();
-        TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
-            context.cuda_device_context(), *input_tensor, output_tensor,
-            kps::IdentityFunctor<T>(), reduce_dims_vec, stream);
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-REGISTER_OP_CUDA_KERNEL(
-    broadcast_tensors,
-    ops::BroadcastTensorsOpKernel<paddle::platform::CUDADeviceContext,
-                                  plat::float16>,
-    ops::BroadcastTensorsOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::BroadcastTensorsOpKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::BroadcastTensorsOpKernel<paddle::platform::CUDADeviceContext, bool>,
-    ops::BroadcastTensorsOpKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::BroadcastTensorsOpKernel<paddle::platform::CUDADeviceContext,
-                                  int64_t>);
-
-REGISTER_OP_CUDA_KERNEL(broadcast_tensors_grad,
-                        ops::CUDABroadcastTensorsGradOpKernel<plat::float16>,
-                        ops::CUDABroadcastTensorsGradOpKernel<int>,
-                        ops::CUDABroadcastTensorsGradOpKernel<int64_t>,
-                        ops::CUDABroadcastTensorsGradOpKernel<float>,
-                        ops::CUDABroadcastTensorsGradOpKernel<double>);
diff --git a/paddle/fluid/operators/broadcast_tensors_op.h b/paddle/fluid/operators/broadcast_tensors_op.h
deleted file mode 100644
index 682f2e24769221d04317d0e53d02406c4c5a26eb..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/broadcast_tensors_op.h
+++ /dev/null
@@ -1,282 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <vector>
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/lod_tensor_array.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/eigen/eigen_function.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
-#define SWITCH_OUT_RANK_CASE(n)                                  \
-  case n: {                                                      \
-    ApplyBroadcast<n>(context, in_tensors[i], out_tensors[i]);   \
-    break;                                                       \
-  }
-
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
-using framework::DDim;
-using framework::EigenTensor;
-
-template <typename DeviceContext, typename T>
-class BroadcastTensorsOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const auto& in_tensors = context.MultiInput<Tensor>("X");
-    auto out_tensors = context.MultiOutput<Tensor>("Out");
-
-    size_t num_ins = in_tensors.size();
-
-    PADDLE_ENFORCE_GT(
-        num_ins, 1,
-        platform::errors::InvalidArgument(
-            "Expected at least 2 input tensors, but only received d%.",
-            in_tensors.size()));
-
-    PADDLE_ENFORCE_EQ(
-        num_ins, out_tensors.size(),
-        platform::errors::InvalidArgument(
-            "BroadcastTensorsOp expects equal number of inputs and outputs,"
-            "but received: %d inputs v.s %d outputs",
-            num_ins, out_tensors.size()));
-
-    // Eigen has no support for dynamic ranked tensor
-    // Thus we perform static expansion for each possible ranks
-    for (size_t i = 0; i < num_ins; i++) {
-      int out_rank = out_tensors[i]->dims().size();
-      switch (out_rank) {
-        SWITCH_OUT_RANK_CASE(1)
-        SWITCH_OUT_RANK_CASE(2)
-        SWITCH_OUT_RANK_CASE(3)
-        SWITCH_OUT_RANK_CASE(4)
-        SWITCH_OUT_RANK_CASE(5)
-        default: {
-          PADDLE_THROW(platform::errors::InvalidArgument(
-              "Target tensor rank out of range"
-              "Maximum supported rank for broadcast is: 5"));
-        }
-      }
-    }
-  }
-
-  template <int OutRank>
-  void ApplyBroadcast(const framework::ExecutionContext& context,
-                      const Tensor* input_tensor, Tensor* output_tensor) const {
-    const auto& input_dims = input_tensor->dims();
-    const auto& output_dims = output_tensor->dims();
-
-    int in_rank = input_dims.size();
-    int out_rank = output_dims.size();
-
-    // 1. Collect bcast_dims, each element of which indicates how many
-    // times we need to replicate along the corresponding dimension
-    // 2. Collect new_input_dims_vec. Eigen::broadcast requires same rank for
-    // both input and output tensors, so we need to initialize input X with
-    // expanded dims: "new_input_dims_vec"
-    Eigen::DSizes<Eigen::DenseIndex, OutRank> bcast_dims;
-    std::vector<int64_t> new_input_dims_vec(out_rank);
-    for (int j = 0; j < out_rank; j++) {
-      int out_axis = out_rank - j - 1;
-      int in_axis = in_rank - j - 1;
-
-      bcast_dims[out_axis] = output_dims[out_axis];
-      new_input_dims_vec[out_axis] = 1;
-      if (in_axis >= 0 && input_dims[in_axis] == output_dims[out_axis]) {
-        bcast_dims[out_axis] = 1;
-        new_input_dims_vec[out_axis] = input_dims[in_axis];
-      }
-    }
-    auto new_input_dims = phi::make_ddim(new_input_dims_vec);
-
-    // Initialize input X with new_input_dims_vec, so it's rank-aligned with the
-    // output
-    auto x = EigenTensor<T, OutRank>::From(*input_tensor, new_input_dims);
-
-    output_tensor->mutable_data<T>(context.GetPlace());
-    auto y = EigenTensor<T, OutRank>::From(*output_tensor, output_dims);
-
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-    EigenBroadcast<std::decay_t<decltype(place)>, T, OutRank>::Eval(place, y, x,
-                                                                    bcast_dims);
-  }
-};
-
-#define SWITCH_RESHAPE_DIMS(n)                                                \
-  case n: {                                                                   \
-    Eigen::DSizes<Eigen::DenseIndex, n> reshape_dims;                         \
-    for (size_t i = 0; i < reshape_dims_vec.size(); ++i) {                    \
-      reshape_dims[i] = reshape_dims_vec[i];                                  \
-    }                                                                         \
-    dX.device(place) =                                                        \
-        dOut.reshape(reshape_dims).sum(reduce_dims).reshape(dX.dimensions()); \
-    break;                                                                    \
-  }
-
-#define UPPER_SWITCH_REDUCE_DIMS(m)                           \
-  case m: {                                                   \
-    Eigen::DSizes<Eigen::DenseIndex, m> reduce_dims;          \
-    for (size_t i = 0; i < reduce_dims_vec.size(); ++i) {     \
-      reduce_dims[i] = reduce_dims_vec[i];                    \
-    }                                                         \
-    switch (reshape_size) {
-#define LOWER_SWITCH_REDUCE_DIMS                              \
-  default: {                                                  \
-    PADDLE_THROW(platform::errors::InvalidArgument(           \
-        "Detected reshape size: %d out of range"              \
-        "Minimum value should be larger than reduce size %d"  \
-        "While maximum supported is: 5",                      \
-        reshape_size, reduce_size));                          \
-  }                                                           \
-  }                                                           \
-  break;                                                      \
-  }
-
-/* ----- GradOpKernel ----- */
-template <typename DeviceContext, typename T>
-class BroadcastTensorsGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    // Find reduce dimensions
-    const auto& in_tensors =
-        context.MultiInput<Tensor>(framework::GradVarName("Out"));
-    auto out_tensors = context.MultiOutput<Tensor>(framework::GradVarName("X"));
-
-    size_t num_ins = in_tensors.size();
-
-    PADDLE_ENFORCE_GT(
-        num_ins, 1,
-        platform::errors::InvalidArgument(
-            "Expected at least 2 input tensors, but only received d%.",
-            in_tensors.size()));
-
-    PADDLE_ENFORCE_EQ(
-        num_ins, out_tensors.size(),
-        platform::errors::InvalidArgument(
-            "BroadcastTensorsOp expects equal number of inputs and outputs,"
-            "but received: %d inputs v.s %d outputs",
-            num_ins, out_tensors.size()));
-
-    // For each In-Out tensor pair,
-    // Prepare and apply broadcast dims array
-    for (size_t i = 0; i < num_ins; i++) {
-      const auto* input_tensor = in_tensors[i];
-      auto* output_tensor = out_tensors[i];
-
-      const auto& input_dims = input_tensor->dims();
-      const auto& output_dims = output_tensor->dims();
-
-      int in_rank = input_dims.size();
-      int out_rank = output_dims.size();
-
-      // BroadcastTensorsGrad is simply a reduce_sum along broadcasted axes
-      // Here we perform the following Eigen operations:
-      // dOut(Flattened) -> reshape(reshape_dims) -> reduce(reduce_dims) ->
-      // reshape(dX_shape) -> dX
-      // Note the last "reshape(dX_shape)" will be performed implicitly,
-      // and we only need to collect reduce_dims and reshape_dims
-      std::vector<int> reduce_dims_vec;
-      std::vector<int64_t> reshape_dims_vec;
-      for (int j = 0; j < in_rank; j++) {
-        int out_axis = out_rank - j - 1;
-        int in_axis = in_rank - j - 1;
-
-        reshape_dims_vec.push_back(input_dims[j]);
-        if (out_axis < 0 || output_dims[out_axis] != input_dims[in_axis]) {
-          reduce_dims_vec.push_back(in_axis);
-        }
-      }
-
-      size_t reduce_size = reduce_dims_vec.size();
-      size_t reshape_size = reshape_dims_vec.size();
-      bool just_copy = (reduce_dims_vec.size() == 0);
-      output_tensor->mutable_data<T>(context.GetPlace());
-      if (just_copy) {
-        // If this turns out to be a No-Op, simply perform a tensor copy
-        framework::TensorCopy(*input_tensor, context.GetPlace(),
-                              context.device_context(), output_tensor);
-      } else {
-        PADDLE_ENFORCE_GE(reduce_dims_vec.size(), 1,
-                          platform::errors::InvalidArgument(
-                              "The number of dimensions of the input "
-                              "'Out@GRAD' for Op(broadcast_tensors)"
-                              " must be greater than or equal to 1, but "
-                              "the value received is %d.",
-                              reduce_dims_vec.size()));
-        PADDLE_ENFORCE_LE(
-            reduce_dims_vec.size(), 5,
-            platform::errors::InvalidArgument(
-                "The number of dimensions of the input 'Out@GRAD' "
-                "for Op(broadcast_tensors) must be less than or equal "
-                "to 5, but the value received is %d.",
-                reduce_dims_vec.size()));
-
-        // Overall:
-        // dOut(Flattened) -> reshape(reshape_dims) -> reduce(reduce_dims) ->
-        // reshape(dX_shape) -> dX
-        auto dX = framework::EigenVector<T>::Flatten(*output_tensor);
-        auto dOut = framework::EigenVector<T>::Flatten(*input_tensor);
-        auto& place =
-            *context.template device_context<DeviceContext>().eigen_device();
-
-        // Expand ReduceSize and ReshapeSize into static values
-        switch (reduce_size) {
-          UPPER_SWITCH_REDUCE_DIMS(1)
-          SWITCH_RESHAPE_DIMS(1)
-          SWITCH_RESHAPE_DIMS(2)
-          SWITCH_RESHAPE_DIMS(3)
-          SWITCH_RESHAPE_DIMS(4)
-          SWITCH_RESHAPE_DIMS(5)
-          LOWER_SWITCH_REDUCE_DIMS
-
-          UPPER_SWITCH_REDUCE_DIMS(2)
-          SWITCH_RESHAPE_DIMS(2)
-          SWITCH_RESHAPE_DIMS(3)
-          SWITCH_RESHAPE_DIMS(4)
-          SWITCH_RESHAPE_DIMS(5)
-          LOWER_SWITCH_REDUCE_DIMS
-
-          UPPER_SWITCH_REDUCE_DIMS(3)
-          SWITCH_RESHAPE_DIMS(3)
-          SWITCH_RESHAPE_DIMS(4)
-          SWITCH_RESHAPE_DIMS(5)
-          LOWER_SWITCH_REDUCE_DIMS
-
-          UPPER_SWITCH_REDUCE_DIMS(4)
-          SWITCH_RESHAPE_DIMS(4)
-          SWITCH_RESHAPE_DIMS(5)
-          LOWER_SWITCH_REDUCE_DIMS
-
-          UPPER_SWITCH_REDUCE_DIMS(5)
-          SWITCH_RESHAPE_DIMS(5)
-          LOWER_SWITCH_REDUCE_DIMS
-
-          default: {
-            PADDLE_THROW(platform::errors::InvalidArgument(
-                "Detected reduce size: %d out of range"
-                "While maximum supported is: 5",
-                reduce_size));
-          }
-        }
-      }
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
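The two deleted fluid files above are re-homed under phi below, and the multiary.cc hunk that follows carries the same broadcast rule the deleted InferShape used: align shapes at the trailing axis, take the per-axis maximum, treat missing axes as 1, and reject any mismatch that does not involve a 1. For reference, a standalone sketch of that rule (BroadcastedShape is an illustrative helper, not a Paddle API):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <stdexcept>
#include <vector>

std::vector<int64_t> BroadcastedShape(
    const std::vector<std::vector<int64_t>>& shapes) {
  // Output rank = max rank over all inputs.
  size_t target_rank = 0;
  for (const auto& s : shapes) target_rank = std::max(target_rank, s.size());

  std::vector<int64_t> target(target_rank, 1);
  for (size_t i = 0; i < target_rank; ++i) {  // i-th axis, counted from the back
    for (const auto& s : shapes) {
      int64_t d = i < s.size() ? s[s.size() - 1 - i] : 1;  // missing axis -> 1
      int64_t& t = target[target_rank - 1 - i];
      if (t != 1 && d != 1 && t != d)
        throw std::invalid_argument("shapes are not broadcast-compatible");
      t = std::max(t, d);  // per-axis maximum
    }
  }
  return target;
}

int main() {
  // [2,1,4] and [3,1] broadcast to [2,3,4]; every output gets that shape.
  auto out = BroadcastedShape({{2, 1, 4}, {3, 1}});
  assert((out == std::vector<int64_t>{2, 3, 4}));
}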
diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc
index 7634e5e01aca4cdaf7fb46399f9594897f2d0e36..dc5478e8afb981defa9bc493cb440cead4f5965f 100644
--- a/paddle/phi/infermeta/multiary.cc
+++ b/paddle/phi/infermeta/multiary.cc
@@ -13,11 +13,21 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/phi/infermeta/multiary.h"
-
+#include <vector>
 #include "paddle/phi/common/scalar.h"
+#include "paddle/phi/core/meta_tensor.h"
 #include "paddle/phi/kernels/funcs/concat_funcs.h"
 
 namespace phi {
+std::vector<DDim> GetMetaTensorsDim(const std::vector<MetaTensor*>& tensors) {
+  std::vector<DDim> dims;
+  dims.reserve(tensors.size());
+  for (const MetaTensor* tensor : tensors) {
+    dims.emplace_back(tensor->dims());
+  }
+  return dims;
+}
+
 void BilinearTensorProductInferMeta(const MetaTensor& x,
                                     const MetaTensor& y,
                                     const MetaTensor& weight,
@@ -84,6 +94,60 @@ void BilinearTensorProductInferMeta(const MetaTensor& x,
   out->set_dtype(x.dtype());
 }
 
+void BroadcastTensorsInferMeta(const std::vector<MetaTensor*>& x,
+                               std::vector<MetaTensor*> out) {
+  int target_rank = 0;
+  const auto& input_dims = GetMetaTensorsDim(x);
+
+  // 1. Find Output rank = max(Inputs rank)
+  for (const auto& input_ddim : input_dims) {
+    target_rank = std::max(target_rank, input_ddim.size());
+  }
+
+  PADDLE_ENFORCE_GT(target_rank,
+                    0,
+                    errors::InvalidArgument("BroadcastTensorsOp requires at "
+                                            "least one input tensor to have "
+                                            "rank greater than zero"));
+
+  std::vector<int> target_dims(target_rank, 0);
+  // 2. Output dim(axis=x) = max(Inputs dim(axis=x))
+  for (int index = 0; index < target_rank; index++) {
+    // Loop axes in reverse order,
+    // For each axis, take the maximum as target size
+    // Fill size = 1 if shape vector exhausts
+    int target_dim_size = 1;
+    for (const auto& input_ddim : input_dims) {
+      // Reversed order
+      int axis = static_cast<int>(input_ddim.size()) - index - 1;
+      int dim_size = 1;
+      if (axis >= 0) {
+        dim_size = input_ddim[axis];
+      }
+
+      if (target_dim_size != 1 && dim_size != 1 &&
+          target_dim_size != dim_size) {
+        PADDLE_THROW(errors::InvalidArgument(
+            "BroadcastTensorsOp inputs does not satisfy bcast semantics, "
+            "please check axis = %d in reverse order",
+            index));
+      }
+
+      // We performed bcast semantics check at python level
+      // So input tensors should all have legal shape
+      target_dim_size = std::max(target_dim_size, dim_size);
+    }
+    target_dims[target_rank - index - 1] = target_dim_size;
+  }
+
+  // 3. Set Output Dim
+  for (size_t i = 0; i < out.size(); i++) {
+    out[i]->set_dims(phi::make_ddim(target_dims));
+    out[i]->share_lod(*(x[i]));
+    out[i]->set_dtype(x[i]->dtype());
+  }
+}
+
 void ConcatInferMeta(const std::vector<MetaTensor*>& x,
                      const Scalar& axis_scalar,
                      MetaTensor* out,
diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h
index 2afb79daa355cc897e3bf4076003e9a41de8b96c..51738c5e08e9842c7cffcdd1a2ce7ee3764d6412 100644
--- a/paddle/phi/infermeta/multiary.h
+++ b/paddle/phi/infermeta/multiary.h
@@ -18,6 +18,8 @@ limitations under the License. */
 #include "paddle/phi/core/meta_tensor.h"
 
 namespace phi {
+std::vector<DDim> GetMetaTensorsDim(const std::vector<MetaTensor*>& tensors);
+
 void BilinearTensorProductInferMeta(const MetaTensor& x,
                                     const MetaTensor& y,
                                     const MetaTensor& weight,
@@ -25,6 +27,9 @@ void BilinearTensorProductInferMeta(const MetaTensor& x,
                                     MetaTensor* out,
                                     MetaConfig config = MetaConfig());
 
+void BroadcastTensorsInferMeta(const std::vector<MetaTensor*>& x,
+                               std::vector<MetaTensor*> out);
+
 void ConcatInferMeta(const std::vector<MetaTensor*>& x,
                      const Scalar& axis_scalar,
                      MetaTensor* out,
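The new kernel headers that follow use phi's multi-input/multi-output convention: inputs arrive as a value vector, outputs as a vector of mutable pointers, one output per input. A toy model of consuming such a signature (Tensor and CopyAll are stand-ins, not phi types):

#include <cassert>
#include <vector>

struct Tensor {
  std::vector<float> data;
};

// Mirrors the shape of:
//   BroadcastTensorsKernel(ctx, const std::vector<DenseTensor>& x,
//                          std::vector<DenseTensor*> out);
void CopyAll(const std::vector<Tensor>& x, std::vector<Tensor*> out) {
  assert(x.size() == out.size());
  for (size_t i = 0; i < x.size(); ++i) *out[i] = x[i];  // per-pair kernel body
}

int main() {
  std::vector<Tensor> ins{{{1, 2}}, {{3}}};
  Tensor a, b;
  CopyAll(ins, {&a, &b});
  assert(a.data.size() == 2 && b.data.size() == 1);
}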
diff --git a/paddle/phi/kernels/broadcast_tensors_grad_kernel.h b/paddle/phi/kernels/broadcast_tensors_grad_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..5ec2e35cc9b0cfe09fd281605984e72a603b8f5e
--- /dev/null
+++ b/paddle/phi/kernels/broadcast_tensors_grad_kernel.h
@@ -0,0 +1,27 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <vector>
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void BroadcastTensorsGradKernel(const Context& ctx,
+                                const std::vector<DenseTensor>& dout,
+                                std::vector<DenseTensor*> dx);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/broadcast_tensors_kernel.h b/paddle/phi/kernels/broadcast_tensors_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..fb2a6f1136c26cb1bee1ca26ae7d214566862709
--- /dev/null
+++ b/paddle/phi/kernels/broadcast_tensors_kernel.h
@@ -0,0 +1,27 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <vector>
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void BroadcastTensorsKernel(const Context& ctx,
+                            const std::vector<DenseTensor>& x,
+                            std::vector<DenseTensor*> out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/complex_grad_kernel.h b/paddle/phi/kernels/complex_grad_kernel.h
index 505d4d374424141ad71da863d1fd7a6424fb35ef..be13e2826ea81455fd811143dde02f2d11cfdae2 100644
--- a/paddle/phi/kernels/complex_grad_kernel.h
+++ b/paddle/phi/kernels/complex_grad_kernel.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
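The phi grad kernels introduced in the following hunks rest on the identity that the gradient of a broadcast is a sum-reduction over the broadcast axes. A self-contained numeric check of that identity, using plain arrays rather than Paddle types:

#include <cassert>

int main() {
  // Forward: x of shape [1,3] broadcast to out of shape [2,3].
  // Backward: dx[j] = sum over the broadcast axis of dout[i][j].
  const float dout[2][3] = {{1, 2, 3}, {4, 5, 6}};
  float dx[3] = {0, 0, 0};
  for (int i = 0; i < 2; ++i)
    for (int j = 0; j < 3; ++j) dx[j] += dout[i][j];
  assert(dx[0] == 5 && dx[1] == 7 && dx[2] == 9);  // column sums
}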
diff --git a/paddle/phi/kernels/complex_kernel.h b/paddle/phi/kernels/complex_kernel.h
index 44bfae9820aa84cb33784f108ace6aa0ab8b5281..3b3003392d37f384416643a3b8a52b4a6809216d 100644
--- a/paddle/phi/kernels/complex_kernel.h
+++ b/paddle/phi/kernels/complex_kernel.h
@@ -50,14 +50,10 @@ DenseTensor Conj(const Context& dev_ctx, const DenseTensor& x) {
   return x;
 }
 
-template <typename T, typename DeviceContext>
-void RealKernel(const DeviceContext& dev_ctx,
-                const DenseTensor& x,
-                DenseTensor* out);
-
-template <typename T, typename DeviceContext>
-void ImagKernel(const DeviceContext& dev_ctx,
-                const DenseTensor& x,
-                DenseTensor* out);
+template <typename T, typename Context>
+void RealKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out);
+
+template <typename T, typename Context>
+void ImagKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out);
 
 }  // namespace phi
diff --git a/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc b/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7a97f8c2189736452a722882f8d86a6cfaeae0f5
--- /dev/null
+++ b/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc
@@ -0,0 +1,201 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/broadcast_tensors_grad_kernel.h"
+
+#include <vector>
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/phi/common/float16.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/enforce.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+#define SWITCH_RESHAPE_DIMS(n)                                                \
+  case n: {                                                                   \
+    Eigen::DSizes<Eigen::DenseIndex, n> reshape_dims;                         \
+    for (size_t i = 0; i < reshape_dims_vec.size(); ++i) {                    \
+      reshape_dims[i] = reshape_dims_vec[i];                                  \
+    }                                                                         \
+    dX.device(place) =                                                        \
+        dOut.reshape(reshape_dims).sum(reduce_dims).reshape(dX.dimensions()); \
+    break;                                                                    \
+  }
+
+#define UPPER_SWITCH_REDUCE_DIMS(m)                                 \
+  case m: {                                                         \
+    Eigen::DSizes<Eigen::DenseIndex, m> reduce_dims;                \
+    for (size_t i = 0; i < reduce_dims_vec.size(); ++i) {           \
+      reduce_dims[i] = reduce_dims_vec[i];                          \
+    }                                                               \
+    switch (reshape_size) {
+#define LOWER_SWITCH_REDUCE_DIMS                                    \
+  default: {                                                        \
+    PADDLE_THROW(errors::InvalidArgument(                           \
+        "Detected reshape size: %d out of range. "                  \
+        "Minimum value should be larger than reduce size %d. "      \
+        "While maximum supported is: 5",                            \
+        reshape_size,                                               \
+        reduce_size));                                              \
+  }                                                                 \
+  }                                                                 \
+  break;                                                            \
+  }
+
+namespace phi {
+
+template <typename T, typename Context>
+void BroadcastTensorsGradKernel(const Context& ctx,
+                                const std::vector<DenseTensor>& dout,
+                                std::vector<DenseTensor*> dx) {
+  // Find reduce dimensions
+  const auto& in_tensors = dout;
+  auto& out_tensors = dx;
+
+  size_t num_ins = in_tensors.size();
+
+  PADDLE_ENFORCE_GT(
+      num_ins,
+      1,
+      errors::InvalidArgument(
+          "Expected at least 2 input tensors, but only received %d.",
+          in_tensors.size()));
+
+  PADDLE_ENFORCE_EQ(num_ins,
+                    out_tensors.size(),
+                    errors::InvalidArgument(
and " + "outputs, but received: %d inputs v.s %d outputs", + num_ins, + out_tensors.size())); + + // For each In-Out tensor pair, + // Prepare and apply broadcast dims array + for (size_t i = 0; i < num_ins; i++) { + const auto* input_tensor = &in_tensors[i]; + auto* output_tensor = out_tensors[i]; + + const auto& input_dims = input_tensor->dims(); + const auto& output_dims = output_tensor->dims(); + + int in_rank = input_dims.size(); + int out_rank = output_dims.size(); + + // BroadcastTensorsGrad is simply a reduce_sum along broadcasted axes + // Here we perform the following Eigen operations: + // dOut(Flattened) -> reshape(reshape_dims) -> reduce(reduce_dims) -> + // reshape(dX_shape) -> dX + // Note the last "reshape(dX_shape)" will be performed implicitly, + // and we only need to collect reduce_dims and reshape_dims + std::vector reduce_dims_vec; + std::vector reshape_dims_vec; + for (int j = 0; j < in_rank; j++) { + int out_axis = out_rank - j - 1; + int in_axis = in_rank - j - 1; + + reshape_dims_vec.push_back(input_dims[j]); + if (out_axis < 0 || output_dims[out_axis] != input_dims[in_axis]) { + reduce_dims_vec.push_back(in_axis); + } + } + + size_t reduce_size = reduce_dims_vec.size(); + size_t reshape_size = reshape_dims_vec.size(); + bool just_copy = (reduce_dims_vec.size() == 0); + ctx.template Alloc(output_tensor); + if (just_copy) { + // If this turns out to be a No-Op, simply perform a tensor copy + paddle::framework::TensorCopy( + *input_tensor, ctx.GetPlace(), ctx, output_tensor); + } else { + PADDLE_ENFORCE_GE( + reduce_dims_vec.size(), + 1, + errors::InvalidArgument("The number of dimensions of the input " + "'Out@GRAD' for Op(broadcast_tensors)" + " must be greater than or equal to 1, but " + "the value received is %d.", + reduce_dims_vec.size())); + PADDLE_ENFORCE_LE( + reduce_dims_vec.size(), + 5, + errors::InvalidArgument( + "The number of dimensions of the input 'Out@GRAD' " + "for Op(broadcast_tensors) must be less than or equal " + "to 5, but the value received is %d.", + reduce_dims_vec.size())); + + // Overall: + // dOut(Flattened) -> reshape(reshape_dims) -> reduce(reduce_dims) -> + // reshape(dX_shape) -> dX + auto dX = EigenVector::Flatten(*output_tensor); + auto dOut = EigenVector::Flatten(*input_tensor); + auto& place = *ctx.eigen_device(); + + // Expand ReduceSize and ReshapeSize into static values + switch (reduce_size) { + UPPER_SWITCH_REDUCE_DIMS(1) + SWITCH_RESHAPE_DIMS(1) + SWITCH_RESHAPE_DIMS(2) + SWITCH_RESHAPE_DIMS(3) + SWITCH_RESHAPE_DIMS(4) + SWITCH_RESHAPE_DIMS(5) + LOWER_SWITCH_REDUCE_DIMS + + UPPER_SWITCH_REDUCE_DIMS(2) + SWITCH_RESHAPE_DIMS(2) + SWITCH_RESHAPE_DIMS(3) + SWITCH_RESHAPE_DIMS(4) + SWITCH_RESHAPE_DIMS(5) + LOWER_SWITCH_REDUCE_DIMS + + UPPER_SWITCH_REDUCE_DIMS(3) + SWITCH_RESHAPE_DIMS(3) + SWITCH_RESHAPE_DIMS(4) + SWITCH_RESHAPE_DIMS(5) + LOWER_SWITCH_REDUCE_DIMS + + UPPER_SWITCH_REDUCE_DIMS(4) + SWITCH_RESHAPE_DIMS(4) + SWITCH_RESHAPE_DIMS(5) + LOWER_SWITCH_REDUCE_DIMS + + UPPER_SWITCH_REDUCE_DIMS(5) + SWITCH_RESHAPE_DIMS(5) + LOWER_SWITCH_REDUCE_DIMS + + default: { + PADDLE_THROW( + errors::InvalidArgument("Detected reduce size: %d out of range" + "While maximum supported is: 5", + reduce_size)); + } + } + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(broadcast_tensors_grad, + CPU, + ALL_LAYOUT, + phi::BroadcastTensorsGradKernel, + int, + int64_t, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/cpu/broadcast_tensors_kernel.cc b/paddle/phi/kernels/cpu/broadcast_tensors_kernel.cc new file mode 
diff --git a/paddle/phi/kernels/cpu/broadcast_tensors_kernel.cc b/paddle/phi/kernels/cpu/broadcast_tensors_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4cb6db876927142baac0ba0cde3438a4e3b00159
--- /dev/null
+++ b/paddle/phi/kernels/cpu/broadcast_tensors_kernel.cc
@@ -0,0 +1,30 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/broadcast_tensors_kernel.h"
+#include "paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h"
+
+#include "paddle/phi/common/float16.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_REGISTER_KERNEL(broadcast_tensors,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::BroadcastTensorsKernel,
+                   bool,
+                   int,
+                   int64_t,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu b/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..6fb24d72145c67be2ad1d25620e7886326e8cd6f
--- /dev/null
+++ b/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu
@@ -0,0 +1,111 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/broadcast_tensors_grad_kernel.h"
+
+#include <vector>
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/phi/common/float16.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/enforce.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/gpu/reduce.h"
+#include "paddle/phi/kernels/primitive/functor_primitives.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void BroadcastTensorsGradKernel(const Context& ctx,
+                                const std::vector<DenseTensor>& dout,
+                                std::vector<DenseTensor*> dx) {
+  // Find reduce dimensions
+  const auto& in_tensors = dout;
+  auto& out_tensors = dx;
+
+  size_t num_ins = in_tensors.size();
+
+  PADDLE_ENFORCE_GT(
+      num_ins,
+      1,
+      errors::InvalidArgument(
+          "Expected at least 2 input tensors, but only received %d.",
+          in_tensors.size()));
+
+  PADDLE_ENFORCE_EQ(
+      num_ins,
+      out_tensors.size(),
+      errors::InvalidArgument(
+          "BroadcastTensorsOp expects equal number of inputs and outputs, "
+          "but received: %d inputs v.s %d outputs",
+          num_ins,
+          out_tensors.size()));
+
+  // For each In-Out tensor pair,
+  // Prepare and apply broadcast dims array
+  for (size_t i = 0; i < num_ins; i++) {
+    auto* input_tensor = &in_tensors[i];
+    auto* output_tensor = out_tensors[i];
+
+    const DDim& input_dims = input_tensor->dims();
+    const DDim& output_dims = output_tensor->dims();
+
+    int in_rank = input_dims.size();
+    int out_rank = output_dims.size();
+
+    // Collect reduce_dims
+    // Example:
+    //    dX   = [1,1,1,1]
+    //    dOut = [1,1,1,4]
+    //
+    //    reduce_dims = [3]  // reduce along the broadcasted axis
+    std::vector<int> reduce_dims_vec;
+    for (int j = 0; j < in_rank; j++) {
+      int out_axis = out_rank - j - 1;
+      int in_axis = in_rank - j - 1;
+
+      if (out_axis < 0 || output_dims[out_axis] != input_dims[in_axis]) {
+        reduce_dims_vec.push_back(in_axis);
+      }
+    }
+
+    bool just_copy = (reduce_dims_vec.size() == 0);
+    ctx.template Alloc<T>(output_tensor);
+    if (just_copy) {
+      // Turns out to be a No-Op, simply copy tensors
+      paddle::framework::TensorCopy(
+          *input_tensor, ctx.GetPlace(), ctx, output_tensor);
+    } else {
+      // reduce_sum implementation on CUDA
+      kernels::TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+          ctx,
+          *input_tensor,
+          output_tensor,
+          kps::IdentityFunctor<T>(),
+          reduce_dims_vec,
+          ctx.stream());
+    }
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(broadcast_tensors_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::BroadcastTensorsGradKernel,
+                   int,
+                   int64_t,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
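Both the Eigen and the CUDA grad paths above pick reduce axes the same way: walk the dout axes from the back and mark one for reduction when the matching dx axis is missing or differs in extent. The same logic extracted as a standalone function with a small check (ReduceDims is illustrative, not a Paddle helper):

#include <cassert>
#include <cstdint>
#include <vector>

std::vector<int> ReduceDims(const std::vector<int64_t>& dout_dims,
                            const std::vector<int64_t>& dx_dims) {
  int in_rank = static_cast<int>(dout_dims.size());
  int out_rank = static_cast<int>(dx_dims.size());
  std::vector<int> reduce_dims;
  for (int j = 0; j < in_rank; ++j) {
    int out_axis = out_rank - j - 1;  // dx axis, counted from the back
    int in_axis = in_rank - j - 1;    // dout axis, counted from the back
    if (out_axis < 0 || dx_dims[out_axis] != dout_dims[in_axis])
      reduce_dims.push_back(in_axis);  // axis was created or stretched
  }
  return reduce_dims;
}

int main() {
  // dout = [2,3,4] was broadcast from dx = [3,1]: reduce dout axes 2 and 0.
  assert((ReduceDims({2, 3, 4}, {3, 1}) == std::vector<int>{2, 0}));
}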
diff --git a/paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu b/paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..aa45bd3c4389177a07b5228319940e9b840fe1b2
--- /dev/null
+++ b/paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu
@@ -0,0 +1,30 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/broadcast_tensors_kernel.h"
+#include "paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h"
+
+#include "paddle/phi/common/float16.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_REGISTER_KERNEL(broadcast_tensors,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::BroadcastTensorsKernel,
+                   bool,
+                   int,
+                   int64_t,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h b/paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..eb01b83377cb62c7dc6147cd57edcd3c9c047f78
--- /dev/null
+++ b/paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h
@@ -0,0 +1,118 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/kernels/broadcast_tensors_kernel.h"
+
+#include <vector>
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/enforce.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+#define SWITCH_OUT_RANK_CASE(n)                                         \
+  case n: {                                                             \
+    ApplyBroadcast<Context, T, n>(ctx, &in_tensors[i], out_tensors[i]); \
+    break;                                                              \
+  }
+
+namespace phi {
+
+template <typename Context, typename T, int OutRank>
+void ApplyBroadcast(const Context& ctx,
+                    const DenseTensor* input_tensor,
+                    DenseTensor* output_tensor) {
+  const auto& input_dims = input_tensor->dims();
+  const auto& output_dims = output_tensor->dims();
+
+  int in_rank = input_dims.size();
+  int out_rank = output_dims.size();
+
+  // 1. Collect bcast_dims, each element of which indicates how many
+  // times we need to replicate along the corresponding dimension
+  // 2. Collect new_input_dims_vec. Eigen::broadcast requires same rank for
+  // both input and output tensors, so we need to initialize input X with
+  // expanded dims: "new_input_dims_vec"
+  Eigen::DSizes<Eigen::DenseIndex, OutRank> bcast_dims;
+  std::vector<int64_t> new_input_dims_vec(out_rank);
+  for (int j = 0; j < out_rank; j++) {
+    int out_axis = out_rank - j - 1;
+    int in_axis = in_rank - j - 1;
+
+    bcast_dims[out_axis] = output_dims[out_axis];
+    new_input_dims_vec[out_axis] = 1;
+    if (in_axis >= 0 && input_dims[in_axis] == output_dims[out_axis]) {
+      bcast_dims[out_axis] = 1;
+      new_input_dims_vec[out_axis] = input_dims[in_axis];
+    }
+  }
+  auto new_input_dims = phi::make_ddim(new_input_dims_vec);
+
+  // Initialize input X with new_input_dims_vec, so it's rank-aligned with the
+  // output
+  auto x = EigenTensor<T, OutRank>::From(*input_tensor, new_input_dims);
+
+  ctx.template Alloc<T>(output_tensor);
+  auto y = EigenTensor<T, OutRank>::From(*output_tensor, output_dims);
+
+  auto& place = *ctx.eigen_device();
+  funcs::EigenBroadcast<std::decay_t<decltype(place)>, T, OutRank>::Eval(
+      place, y, x, bcast_dims);
+}
+
+template <typename T, typename Context>
+void BroadcastTensorsKernel(const Context& ctx,
+                            const std::vector<DenseTensor>& x,
+                            std::vector<DenseTensor*> out) {
+  const auto& in_tensors = x;
+  auto out_tensors = out;
+  size_t num_ins = in_tensors.size();
+
+  PADDLE_ENFORCE_GT(
+      num_ins,
+      1,
+      errors::InvalidArgument(
+          "Expected at least 2 input tensors, but only received %d.",
+          in_tensors.size()));
+
+  PADDLE_ENFORCE_EQ(num_ins,
+                    out_tensors.size(),
+                    errors::InvalidArgument(
+                        "BroadcastTensorsOp expects equal number of inputs and "
+                        "outputs, but received: %d inputs v.s %d outputs",
+                        num_ins,
+                        out_tensors.size()));
+
+  // Eigen has no support for dynamic ranked tensor
+  // Thus we perform static expansion for each possible ranks
+  for (size_t i = 0; i < num_ins; i++) {
+    int out_rank = out_tensors[i]->dims().size();
+    switch (out_rank) {
+      SWITCH_OUT_RANK_CASE(1)
+      SWITCH_OUT_RANK_CASE(2)
+      SWITCH_OUT_RANK_CASE(3)
+      SWITCH_OUT_RANK_CASE(4)
+      SWITCH_OUT_RANK_CASE(5)
+      default: {
+        PADDLE_THROW(paddle::platform::errors::InvalidArgument(
+            "Target tensor rank out of range. "
+            "Maximum supported rank for broadcast is: 5"));
+      }
+    }
+  }
+}
+
+}  // namespace phi
diff --git a/paddle/phi/ops/compat/broadcast_tensors_sig.cc b/paddle/phi/ops/compat/broadcast_tensors_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2c979c4aedcc88c3b6bc6664de9ae3175272eec6
--- /dev/null
+++ b/paddle/phi/ops/compat/broadcast_tensors_sig.cc
@@ -0,0 +1,28 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature BroadcastTensorsGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "broadcast_tensors_grad", {GradVarName("Out")}, {}, {GradVarName("X")});
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(broadcast_tensors_grad,
+                           phi::BroadcastTensorsGradOpArgumentMapping);
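Only the grad op receives an explicit argument mapping above, because its fluid-side names (Out@GRAD and X@GRAD, which is what GradVarName expands to) must be routed to the phi kernel's parameters; the forward op's X/Out presumably map by the default rules. A toy model of what such a signature records (this KernelSignature struct is a stand-in, not phi's class):

#include <iostream>
#include <string>
#include <vector>

struct KernelSignature {
  std::string kernel_name;
  std::vector<std::string> inputs;
  std::vector<std::string> attrs;
  std::vector<std::string> outputs;
};

KernelSignature BroadcastTensorsGradMapping() {
  // Mirrors the mapping registered above:
  // {"broadcast_tensors_grad", {GradVarName("Out")}, {}, {GradVarName("X")}}
  return {"broadcast_tensors_grad", {"Out@GRAD"}, {}, {"X@GRAD"}};
}

int main() {
  auto sig = BroadcastTensorsGradMapping();
  std::cout << sig.kernel_name << ": " << sig.inputs[0] << " -> "
            << sig.outputs[0] << "\n";
}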