Unverified commit 2a5590a1, authored by From00, committed by GitHub

Move BroadcastTensors OP to phi (#40047)

* Move BroadcastTensors OP to phi

* Remove mutable_data in impl

* Move BilinearTensorProductInferMeta to multiary.h/cc
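Note on the "Remove mutable_data in impl" bullet: in the phi kernels below, outputs are allocated through the device context instead of calling mutable_data on the tensor. A minimal sketch of the change (the concrete call sites are in the kernel files further down):
// fluid (removed): output_tensor->mutable_data<T>(context.GetPlace());
// phi (added):     ctx.template Alloc<T>(output_tensor);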
Parent 8492d3bb
@@ -12,15 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/broadcast_tensors_op.h"
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include <algorithm>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/var_type_inference.h" #include "paddle/fluid/framework/var_type_inference.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/multiary.h"
namespace paddle {
namespace operators {
@@ -31,64 +27,6 @@ class BroadcastTensorsOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "broadcast_tensors");
OP_INOUT_CHECK(ctx->HasOutputs("Out"), "Output", "Out",
"broadcast_tensors");
int target_rank = 0;
const auto& input_dims = ctx->GetInputsDim("X");
// 1. Find Output rank = max(Inputs rank)
for (const auto& input_ddim : input_dims) {
target_rank = std::max(target_rank, input_ddim.size());
}
PADDLE_ENFORCE_GT(
target_rank, 0,
platform::errors::InvalidArgument(
"BroadcastTensorsOp requires at least one input tensor"
"to have rank greater than zero"));
std::vector<int64_t> target_dims(target_rank, 0);
// 2. Output dim(axis=x) = max(Inputs dim(axis=x))
for (int index = 0; index < target_rank; index++) {
// Loop axes in reverse order,
// For each axis, take the maximum as target size
// Fill size = 1 if shape vector exhausts
int target_dim_size = 1;
for (const auto& input_ddim : input_dims) {
// Reversed order
int axis = static_cast<int>(input_ddim.size()) - index - 1;
int dim_size = 1;
if (axis >= 0) {
dim_size = input_ddim[axis];
}
if (target_dim_size != 1 && dim_size != 1 &&
target_dim_size != dim_size) {
PADDLE_THROW(platform::errors::InvalidArgument(
"BroadcastTensorsOp inputs does not satisfy bcast semantics,"
"Please check axis = %d in reverse order",
index));
}
// We performed bcast semantics check at python level
// So input tensors should all have legal shape
target_dim_size = std::max(target_dim_size, dim_size);
}
target_dims[target_rank - index - 1] = target_dim_size;
}
// 3. Set Output Dim
std::vector<DDim> output_ddims;
for (size_t i = 0; i < input_dims.size(); i++) {
output_ddims.emplace_back(phi::make_ddim(target_dims));
}
ctx->SetOutputsDim("Out", output_ddims);
ctx->ShareAllLoD("X", /*->*/ "Out");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
@@ -229,34 +167,17 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(BroadcastTensorsGradNoNeedBufVarsInferer,
namespace ops = paddle::operators;
namespace plat = paddle::platform;
DELCARE_INFER_SHAPE_FUNCTOR(broadcast_tensors,
BroadcastTensorsInferShapeFunctor,
PT_INFER_META(phi::BroadcastTensorsInferMeta));
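// The functor declared above routes shape inference for broadcast_tensors to
// phi::BroadcastTensorsInferMeta (see paddle/phi/infermeta/multiary.cc below),
// replacing the hand-written InferShape removed by this commit; it is attached
// to the operator through the extra REGISTER_OPERATOR argument that follows.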
REGISTER_OPERATOR(broadcast_tensors, ops::BroadcastTensorsOp,
ops::BroadcastTensorsOpMaker,
ops::BroadcastTensorsGradOpMaker<paddle::framework::OpDesc>,
ops::BroadcastTensorsGradOpMaker<paddle::imperative::OpBase>,
ops::BroadcastTensorsOpVarTypeInference,
BroadcastTensorsInferShapeFunctor);
REGISTER_OPERATOR(broadcast_tensors_grad, ops::BroadcastTensorsGradOp,
ops::BroadcastTensorsGradOpVarTypeInference,
ops::BroadcastTensorsGradNoNeedBufVarsInferer);
REGISTER_OP_CPU_KERNEL(
broadcast_tensors,
ops::BroadcastTensorsOpKernel<paddle::platform::CPUDeviceContext,
plat::float16>,
ops::BroadcastTensorsOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::BroadcastTensorsOpKernel<paddle::platform::CPUDeviceContext, double>,
ops::BroadcastTensorsOpKernel<paddle::platform::CPUDeviceContext, bool>,
ops::BroadcastTensorsOpKernel<paddle::platform::CPUDeviceContext, int>,
ops::BroadcastTensorsOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
REGISTER_OP_CPU_KERNEL(
broadcast_tensors_grad,
ops::BroadcastTensorsGradOpKernel<paddle::platform::CPUDeviceContext,
plat::float16>,
ops::BroadcastTensorsGradOpKernel<paddle::platform::CPUDeviceContext,
float>,
ops::BroadcastTensorsGradOpKernel<paddle::platform::CPUDeviceContext,
double>,
ops::BroadcastTensorsGradOpKernel<paddle::platform::CPUDeviceContext, int>,
ops::BroadcastTensorsGradOpKernel<paddle::platform::CPUDeviceContext,
int64_t>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/broadcast_tensors_op.h"
#include <algorithm>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
namespace paddle {
namespace operators {
using framework::Tensor;
using framework::DDim;
template <typename T>
class CUDABroadcastTensorsGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
// Find reduce dimensions
const auto& in_tensors =
context.MultiInput<Tensor>(framework::GradVarName("Out"));
auto out_tensors = context.MultiOutput<Tensor>(framework::GradVarName("X"));
size_t num_ins = in_tensors.size();
PADDLE_ENFORCE_GT(
num_ins, 1,
platform::errors::InvalidArgument(
"Expected at least 2 input tensors, but only received d%.",
in_tensors.size()));
PADDLE_ENFORCE_EQ(
num_ins, out_tensors.size(),
platform::errors::InvalidArgument(
"BroadcastTensorsOp expects equal number of inputs and outputs,"
"but received: %d inputs v.s %d outputs",
num_ins, out_tensors.size()));
// For each In-Out tensor pair,
// Prepare and apply broadcast dims array
for (size_t i = 0; i < num_ins; i++) {
auto* input_tensor = in_tensors[i];
auto* output_tensor = out_tensors[i];
const DDim& input_dims = input_tensor->dims();
const DDim& output_dims = output_tensor->dims();
int in_rank = input_dims.size();
int out_rank = output_dims.size();
// Collect reduce_dims
// Example:
// dX = [1,1,1,1]
// dOut = [1,1,1,4]
//
// reduce_dims = [3] // reduce along the broadcasted axis
std::vector<int> reduce_dims_vec;
for (int j = 0; j < in_rank; j++) {
int out_axis = out_rank - j - 1;
int in_axis = in_rank - j - 1;
if (out_axis < 0 || output_dims[out_axis] != input_dims[in_axis]) {
reduce_dims_vec.push_back(in_axis);
}
}
bool just_copy = (reduce_dims_vec.size() == 0);
output_tensor->mutable_data<T>(context.GetPlace());
if (just_copy) {
// Turns out to be a No-Op, simply copy tensors
framework::TensorCopy(*input_tensor, context.GetPlace(),
context.device_context(), output_tensor);
} else {
// reduce_sum implementation on CUDA
auto stream = context.cuda_device_context().stream();
TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
context.cuda_device_context(), *input_tensor, output_tensor,
kps::IdentityFunctor<T>(), reduce_dims_vec, stream);
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
broadcast_tensors,
ops::BroadcastTensorsOpKernel<paddle::platform::CUDADeviceContext,
plat::float16>,
ops::BroadcastTensorsOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::BroadcastTensorsOpKernel<paddle::platform::CUDADeviceContext, double>,
ops::BroadcastTensorsOpKernel<paddle::platform::CUDADeviceContext, bool>,
ops::BroadcastTensorsOpKernel<paddle::platform::CUDADeviceContext, int>,
ops::BroadcastTensorsOpKernel<paddle::platform::CUDADeviceContext,
int64_t>);
REGISTER_OP_CUDA_KERNEL(broadcast_tensors_grad,
ops::CUDABroadcastTensorsGradOpKernel<plat::float16>,
ops::CUDABroadcastTensorsGradOpKernel<float>,
ops::CUDABroadcastTensorsGradOpKernel<double>,
ops::CUDABroadcastTensorsGradOpKernel<int>,
ops::CUDABroadcastTensorsGradOpKernel<int64_t>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/eigen/eigen_function.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#define SWITCH_OUT_RANK_CASE(n) \
case n: { \
ApplyBroadcast<n>(context, in_tensors[i], out_tensors[i]); \
break; \
}
namespace paddle {
namespace operators {
using framework::Tensor;
using framework::DDim;
using framework::EigenTensor;
template <typename DeviceContext, typename T>
class BroadcastTensorsOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const auto& in_tensors = context.MultiInput<Tensor>("X");
auto out_tensors = context.MultiOutput<Tensor>("Out");
size_t num_ins = in_tensors.size();
PADDLE_ENFORCE_GT(
num_ins, 1,
platform::errors::InvalidArgument(
"Expected at least 2 input tensors, but only received d%.",
in_tensors.size()));
PADDLE_ENFORCE_EQ(
num_ins, out_tensors.size(),
platform::errors::InvalidArgument(
"BroadcastTensorsOp expects equal number of inputs and outputs,"
"but received: %d inputs v.s %d outputs",
num_ins, out_tensors.size()));
// Eigen has no support for dynamic ranked tensor
// Thus we perform static expansion for each possible rank
for (size_t i = 0; i < num_ins; i++) {
int out_rank = out_tensors[i]->dims().size();
switch (out_rank) {
SWITCH_OUT_RANK_CASE(1)
SWITCH_OUT_RANK_CASE(2)
SWITCH_OUT_RANK_CASE(3)
SWITCH_OUT_RANK_CASE(4)
SWITCH_OUT_RANK_CASE(5)
default: {
PADDLE_THROW(platform::errors::InvalidArgument(
"Target tensor rank out of range"
"Maximum supported rank for broadcast is: 5"));
}
}
}
}
template <int OutRank>
void ApplyBroadcast(const framework::ExecutionContext& context,
const Tensor* input_tensor, Tensor* output_tensor) const {
const auto& input_dims = input_tensor->dims();
const auto& output_dims = output_tensor->dims();
int in_rank = input_dims.size();
int out_rank = output_dims.size();
// 1. Collect bcast_dims, each element of which indicates how many
// times we need to replicate along the corresponding dimension
// 2. Collect new_input_dims_vec. Eigen::broadcast requires same rank for
// both input and output tensors, so we need to initialize input X with
// expanded dims: "new_input_dims_vec"
Eigen::DSizes<Eigen::DenseIndex, OutRank> bcast_dims;
std::vector<int64_t> new_input_dims_vec(out_rank);
for (int j = 0; j < out_rank; j++) {
int out_axis = out_rank - j - 1;
int in_axis = in_rank - j - 1;
bcast_dims[out_axis] = output_dims[out_axis];
new_input_dims_vec[out_axis] = 1;
if (in_axis >= 0 && input_dims[in_axis] == output_dims[out_axis]) {
bcast_dims[out_axis] = 1;
new_input_dims_vec[out_axis] = input_dims[in_axis];
}
}
auto new_input_dims = phi::make_ddim(new_input_dims_vec);
// Initialize input X with new_input_dims_vec, so it's rank-aligned with the
// output
auto x = EigenTensor<T, OutRank>::From(*input_tensor, new_input_dims);
output_tensor->mutable_data<T>(context.GetPlace());
auto y = EigenTensor<T, OutRank>::From(*output_tensor, output_dims);
auto& place =
*context.template device_context<DeviceContext>().eigen_device();
EigenBroadcast<std::decay_t<decltype(place)>, T, OutRank>::Eval(place, y, x,
bcast_dims);
}
};
#define SWITCH_RESHAPE_DIMS(n) \
case n: { \
Eigen::DSizes<Eigen::DenseIndex, n> reshape_dims; \
for (size_t i = 0; i < reshape_dims_vec.size(); ++i) { \
reshape_dims[i] = reshape_dims_vec[i]; \
} \
dX.device(place) = \
dOut.reshape(reshape_dims).sum(reduce_dims).reshape(dX.dimensions()); \
break; \
}
#define UPPER_SWITCH_REDUCE_DIMS(m) \
case m: { \
Eigen::DSizes<Eigen::DenseIndex, m> reduce_dims; \
for (size_t i = 0; i < reduce_dims_vec.size(); ++i) { \
reduce_dims[i] = reduce_dims_vec[i]; \
} \
switch (reshape_size) {
#define LOWER_SWITCH_REDUCE_DIMS \
default: { \
PADDLE_THROW(platform::errors::InvalidArgument( \
"Detected reshape size: %d out of range" \
"Minimum value should be larger than reduce size %d" \
"While maximum supported is: 5", \
reshape_size, reduce_size)); \
} \
} \
break; \
}
/* ----- GradOpKernel ----- */
template <typename DeviceContext, typename T>
class BroadcastTensorsGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
// Find reduce dimensions
const auto& in_tensors =
context.MultiInput<Tensor>(framework::GradVarName("Out"));
auto out_tensors = context.MultiOutput<Tensor>(framework::GradVarName("X"));
size_t num_ins = in_tensors.size();
PADDLE_ENFORCE_GT(
num_ins, 1,
platform::errors::InvalidArgument(
"Expected at least 2 input tensors, but only received d%.",
in_tensors.size()));
PADDLE_ENFORCE_EQ(
num_ins, out_tensors.size(),
platform::errors::InvalidArgument(
"BroadcastTensorsOp expects equal number of inputs and outputs,"
"but received: %d inputs v.s %d outputs",
num_ins, out_tensors.size()));
// For each In-Out tensor pair,
// Prepare and apply broadcast dims array
for (size_t i = 0; i < num_ins; i++) {
const auto* input_tensor = in_tensors[i];
auto* output_tensor = out_tensors[i];
const auto& input_dims = input_tensor->dims();
const auto& output_dims = output_tensor->dims();
int in_rank = input_dims.size();
int out_rank = output_dims.size();
// BroadcastTensorsGrad is simply a reduce_sum along broadcasted axes
// Here we perform the following Eigen operations:
// dOut(Flattened) -> reshape(reshape_dims) -> reduce(reduce_dims) ->
// reshape(dX_shape) -> dX
// Note the last "reshape(dX_shape)" will be performed implicitly,
// and we only need to collect reduce_dims and reshape_dims
std::vector<int> reduce_dims_vec;
std::vector<int> reshape_dims_vec;
for (int j = 0; j < in_rank; j++) {
int out_axis = out_rank - j - 1;
int in_axis = in_rank - j - 1;
reshape_dims_vec.push_back(input_dims[j]);
if (out_axis < 0 || output_dims[out_axis] != input_dims[in_axis]) {
reduce_dims_vec.push_back(in_axis);
}
}
size_t reduce_size = reduce_dims_vec.size();
size_t reshape_size = reshape_dims_vec.size();
bool just_copy = (reduce_dims_vec.size() == 0);
output_tensor->mutable_data<T>(context.GetPlace());
if (just_copy) {
// If this turns out to be a No-Op, simply perform a tensor copy
framework::TensorCopy(*input_tensor, context.GetPlace(),
context.device_context(), output_tensor);
} else {
PADDLE_ENFORCE_GE(reduce_dims_vec.size(), 1,
platform::errors::InvalidArgument(
"The number of dimensions of the input "
"'Out@GRAD' for Op(broadcast_tensors)"
" must be greater than or equal to 1, but "
"the value received is %d.",
reduce_dims_vec.size()));
PADDLE_ENFORCE_LE(
reduce_dims_vec.size(), 5,
platform::errors::InvalidArgument(
"The number of dimensions of the input 'Out@GRAD' "
"for Op(broadcast_tensors) must be less than or equal "
"to 5, but the value received is %d.",
reduce_dims_vec.size()));
// Overall:
// dOut(Flattened) -> reshape(reshape_dims) -> reduce(reduce_dims) ->
// reshape(dX_shape) -> dX
auto dX = framework::EigenVector<T>::Flatten(*output_tensor);
auto dOut = framework::EigenVector<T>::Flatten(*input_tensor);
auto& place =
*context.template device_context<DeviceContext>().eigen_device();
// Expand ReduceSize and ReshapeSize into static values
switch (reduce_size) {
UPPER_SWITCH_REDUCE_DIMS(1)
SWITCH_RESHAPE_DIMS(1)
SWITCH_RESHAPE_DIMS(2)
SWITCH_RESHAPE_DIMS(3)
SWITCH_RESHAPE_DIMS(4)
SWITCH_RESHAPE_DIMS(5)
LOWER_SWITCH_REDUCE_DIMS
UPPER_SWITCH_REDUCE_DIMS(2)
SWITCH_RESHAPE_DIMS(2)
SWITCH_RESHAPE_DIMS(3)
SWITCH_RESHAPE_DIMS(4)
SWITCH_RESHAPE_DIMS(5)
LOWER_SWITCH_REDUCE_DIMS
UPPER_SWITCH_REDUCE_DIMS(3)
SWITCH_RESHAPE_DIMS(3)
SWITCH_RESHAPE_DIMS(4)
SWITCH_RESHAPE_DIMS(5)
LOWER_SWITCH_REDUCE_DIMS
UPPER_SWITCH_REDUCE_DIMS(4)
SWITCH_RESHAPE_DIMS(4)
SWITCH_RESHAPE_DIMS(5)
LOWER_SWITCH_REDUCE_DIMS
UPPER_SWITCH_REDUCE_DIMS(5)
SWITCH_RESHAPE_DIMS(5)
LOWER_SWITCH_REDUCE_DIMS
default: {
PADDLE_THROW(platform::errors::InvalidArgument(
"Detected reduce size: %d out of range"
"While maximum supported is: 5",
reduce_size));
}
}
}
}
}
};
} // namespace operators
} // namespace paddle
@@ -13,11 +13,21 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/infermeta/multiary.h"
#include <vector>
#include "paddle/phi/common/scalar.h" #include "paddle/phi/common/scalar.h"
#include "paddle/phi/core/meta_tensor.h"
#include "paddle/phi/kernels/funcs/concat_funcs.h" #include "paddle/phi/kernels/funcs/concat_funcs.h"
namespace phi { namespace phi {
std::vector<DDim> GetMetaTensorsDim(const std::vector<MetaTensor*>& tensors) {
std::vector<DDim> dims;
dims.reserve(tensors.size());
for (const MetaTensor* tensor : tensors) {
dims.emplace_back(tensor->dims());
}
return dims;
}
void BilinearTensorProductInferMeta(const MetaTensor& x,
const MetaTensor& y,
const MetaTensor& weight,
@@ -84,6 +94,60 @@ void BilinearTensorProductInferMeta(const MetaTensor& x,
out->set_dtype(x.dtype());
}
void BroadcastTensorsInferMeta(const std::vector<MetaTensor*>& x,
std::vector<MetaTensor*> out) {
int target_rank = 0;
const auto& input_dims = GetMetaTensorsDim(x);
// 1. Find Output rank = max(Inputs rank)
for (const auto& input_ddim : input_dims) {
target_rank = std::max(target_rank, input_ddim.size());
}
PADDLE_ENFORCE_GT(target_rank,
0,
errors::InvalidArgument("BroadcastTensorsOp requires at "
"least one input tensor to have "
"rank greater than zero"));
std::vector<int64_t> target_dims(target_rank, 0);
// 2. Output dim(axis=x) = max(Inputs dim(axis=x))
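// A worked example with hypothetical shapes: x0.dims = [2, 1, 4], x1.dims = [3, 1]
// -> target_rank = 3 and, after the loop below, target_dims = [2, 3, 4].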
for (int index = 0; index < target_rank; index++) {
// Loop axes in reverse order,
// For each axis, take the maximum as target size
// Fill size = 1 if shape vector exhausts
int target_dim_size = 1;
for (const auto& input_ddim : input_dims) {
// Reversed order
int axis = static_cast<int>(input_ddim.size()) - index - 1;
int dim_size = 1;
if (axis >= 0) {
dim_size = input_ddim[axis];
}
if (target_dim_size != 1 && dim_size != 1 &&
target_dim_size != dim_size) {
PADDLE_THROW(errors::InvalidArgument(
"BroadcastTensorsOp inputs does not satisfy bcast semantics, "
"please check axis = %d in reverse order",
index));
}
// We performed bcast semantics check at python level
// So input tensors should all have legal shape
target_dim_size = std::max(target_dim_size, dim_size);
}
target_dims[target_rank - index - 1] = target_dim_size;
}
// 3. Set Output Dim
for (size_t i = 0; i < out.size(); i++) {
out[i]->set_dims(phi::make_ddim(target_dims));
out[i]->share_lod(*(x[i]));
out[i]->set_dtype(x[i]->dtype());
}
}
void ConcatInferMeta(const std::vector<MetaTensor*>& x,
const Scalar& axis_scalar,
MetaTensor* out,
......
@@ -18,6 +18,8 @@ limitations under the License. */
#include "paddle/phi/core/meta_tensor.h"
namespace phi {
std::vector<DDim> GetMetaTensorsDim(const std::vector<MetaTensor*>& tensors);
void BilinearTensorProductInferMeta(const MetaTensor& x,
const MetaTensor& y,
const MetaTensor& weight,
@@ -25,6 +27,9 @@ void BilinearTensorProductInferMeta(const MetaTensor& x,
MetaTensor* out,
MetaConfig config = MetaConfig());
void BroadcastTensorsInferMeta(const std::vector<MetaTensor*>& x,
std::vector<MetaTensor*> out);
void ConcatInferMeta(const std::vector<MetaTensor*>& x,
const Scalar& axis_scalar,
MetaTensor* out,
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void BroadcastTensorsGradKernel(const Context& ctx,
const std::vector<DenseTensor>& dout,
std::vector<DenseTensor*> dx);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void BroadcastTensorsKernel(const Context& ctx,
const std::vector<DenseTensor>& x,
std::vector<DenseTensor*> out);
} // namespace phi
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......
@@ -50,14 +50,10 @@ DenseTensor Conj(const Context& dev_ctx, const DenseTensor& x) {
return x;
}
template <typename T, typename DeviceContext>
void RealKernel(const DeviceContext& dev_ctx,
const DenseTensor& x,
DenseTensor* out);
template <typename T, typename DeviceContext>
void ImagKernel(const DeviceContext& dev_ctx,
const DenseTensor& x,
DenseTensor* out);
template <typename T, typename Context>
void RealKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out);
template <typename T, typename Context>
void ImagKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/broadcast_tensors_grad_kernel.h"
#include <vector>
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#define SWITCH_RESHAPE_DIMS(n) \
case n: { \
Eigen::DSizes<Eigen::DenseIndex, n> reshape_dims; \
for (size_t i = 0; i < reshape_dims_vec.size(); ++i) { \
reshape_dims[i] = reshape_dims_vec[i]; \
} \
dX.device(place) = \
dOut.reshape(reshape_dims).sum(reduce_dims).reshape(dX.dimensions()); \
break; \
}
#define UPPER_SWITCH_REDUCE_DIMS(m) \
case m: { \
Eigen::DSizes<Eigen::DenseIndex, m> reduce_dims; \
for (size_t i = 0; i < reduce_dims_vec.size(); ++i) { \
reduce_dims[i] = reduce_dims_vec[i]; \
} \
switch (reshape_size) {
#define LOWER_SWITCH_REDUCE_DIMS \
default: { \
PADDLE_THROW(errors::InvalidArgument( \
"Detected reshape size: %d out of range" \
"Minimum value should be larger than reduce size %d" \
"While maximum supported is: 5", \
reshape_size, \
reduce_size)); \
} \
} \
break; \
}
namespace phi {
template <typename T, typename Context>
void BroadcastTensorsGradKernel(const Context& ctx,
const std::vector<DenseTensor>& dout,
std::vector<DenseTensor*> dx) {
// Find reduce dimensions
const auto& in_tensors = dout;
auto& out_tensors = dx;
size_t num_ins = in_tensors.size();
PADDLE_ENFORCE_GT(
num_ins,
1,
errors::InvalidArgument(
"Expected at least 2 input tensors, but only received d%.",
in_tensors.size()));
PADDLE_ENFORCE_EQ(num_ins,
out_tensors.size(),
errors::InvalidArgument(
"BroadcastTensorsOp expects equal number of inputs and "
"outputs, but received: %d inputs v.s %d outputs",
num_ins,
out_tensors.size()));
// For each In-Out tensor pair,
// Prepare and apply broadcast dims array
for (size_t i = 0; i < num_ins; i++) {
const auto* input_tensor = &in_tensors[i];
auto* output_tensor = out_tensors[i];
const auto& input_dims = input_tensor->dims();
const auto& output_dims = output_tensor->dims();
int in_rank = input_dims.size();
int out_rank = output_dims.size();
// BroadcastTensorsGrad is simply a reduce_sum along broadcasted axes
// Here we perform the following Eigen operations:
// dOut(Flattened) -> reshape(reshape_dims) -> reduce(reduce_dims) ->
// reshape(dX_shape) -> dX
// Note the last "reshape(dX_shape)" will be performed implicitly,
// and we only need to collect reduce_dims and reshape_dims
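// A hypothetical example: dout[i].dims = [2, 3, 4] broadcast from dx[i].dims = [3, 1]
// -> reshape_dims_vec = [2, 3, 4], reduce_dims_vec = [2, 0];
// summing dout over axes {2, 0} yields shape [3], and the implicit reshape restores [3, 1].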
std::vector<int> reduce_dims_vec;
std::vector<int> reshape_dims_vec;
for (int j = 0; j < in_rank; j++) {
int out_axis = out_rank - j - 1;
int in_axis = in_rank - j - 1;
reshape_dims_vec.push_back(input_dims[j]);
if (out_axis < 0 || output_dims[out_axis] != input_dims[in_axis]) {
reduce_dims_vec.push_back(in_axis);
}
}
size_t reduce_size = reduce_dims_vec.size();
size_t reshape_size = reshape_dims_vec.size();
bool just_copy = (reduce_dims_vec.size() == 0);
ctx.template Alloc<T>(output_tensor);
if (just_copy) {
// If this turns out to be a No-Op, simply perform a tensor copy
paddle::framework::TensorCopy(
*input_tensor, ctx.GetPlace(), ctx, output_tensor);
} else {
PADDLE_ENFORCE_GE(
reduce_dims_vec.size(),
1,
errors::InvalidArgument("The number of dimensions of the input "
"'Out@GRAD' for Op(broadcast_tensors)"
" must be greater than or equal to 1, but "
"the value received is %d.",
reduce_dims_vec.size()));
PADDLE_ENFORCE_LE(
reduce_dims_vec.size(),
5,
errors::InvalidArgument(
"The number of dimensions of the input 'Out@GRAD' "
"for Op(broadcast_tensors) must be less than or equal "
"to 5, but the value received is %d.",
reduce_dims_vec.size()));
// Overall:
// dOut(Flattened) -> reshape(reshape_dims) -> reduce(reduce_dims) ->
// reshape(dX_shape) -> dX
auto dX = EigenVector<T>::Flatten(*output_tensor);
auto dOut = EigenVector<T>::Flatten(*input_tensor);
auto& place = *ctx.eigen_device();
// Expand ReduceSize and ReshapeSize into static values
switch (reduce_size) {
UPPER_SWITCH_REDUCE_DIMS(1)
SWITCH_RESHAPE_DIMS(1)
SWITCH_RESHAPE_DIMS(2)
SWITCH_RESHAPE_DIMS(3)
SWITCH_RESHAPE_DIMS(4)
SWITCH_RESHAPE_DIMS(5)
LOWER_SWITCH_REDUCE_DIMS
UPPER_SWITCH_REDUCE_DIMS(2)
SWITCH_RESHAPE_DIMS(2)
SWITCH_RESHAPE_DIMS(3)
SWITCH_RESHAPE_DIMS(4)
SWITCH_RESHAPE_DIMS(5)
LOWER_SWITCH_REDUCE_DIMS
UPPER_SWITCH_REDUCE_DIMS(3)
SWITCH_RESHAPE_DIMS(3)
SWITCH_RESHAPE_DIMS(4)
SWITCH_RESHAPE_DIMS(5)
LOWER_SWITCH_REDUCE_DIMS
UPPER_SWITCH_REDUCE_DIMS(4)
SWITCH_RESHAPE_DIMS(4)
SWITCH_RESHAPE_DIMS(5)
LOWER_SWITCH_REDUCE_DIMS
UPPER_SWITCH_REDUCE_DIMS(5)
SWITCH_RESHAPE_DIMS(5)
LOWER_SWITCH_REDUCE_DIMS
default: {
PADDLE_THROW(
errors::InvalidArgument("Detected reduce size: %d out of range"
"While maximum supported is: 5",
reduce_size));
}
}
}
}
}
} // namespace phi
PD_REGISTER_KERNEL(broadcast_tensors_grad,
CPU,
ALL_LAYOUT,
phi::BroadcastTensorsGradKernel,
int,
int64_t,
float,
double,
phi::dtype::float16) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/broadcast_tensors_kernel.h"
#include "paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/kernel_registry.h"
PD_REGISTER_KERNEL(broadcast_tensors,
CPU,
ALL_LAYOUT,
phi::BroadcastTensorsKernel,
bool,
int,
int64_t,
float,
double,
phi::dtype::float16) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/broadcast_tensors_grad_kernel.h"
#include <vector>
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/gpu/reduce.h"
#include "paddle/phi/kernels/primitive/functor_primitives.h"
namespace phi {
template <typename T, typename Context>
void BroadcastTensorsGradKernel(const Context& ctx,
const std::vector<DenseTensor>& dout,
std::vector<DenseTensor*> dx) {
// Find reduce dimensions
const auto& in_tensors = dout;
auto& out_tensors = dx;
size_t num_ins = in_tensors.size();
PADDLE_ENFORCE_GT(
num_ins,
1,
errors::InvalidArgument(
"Expected at least 2 input tensors, but only received d%.",
in_tensors.size()));
PADDLE_ENFORCE_EQ(
num_ins,
out_tensors.size(),
errors::InvalidArgument(
"BroadcastTensorsOp expects equal number of inputs and outputs,"
"but received: %d inputs v.s %d outputs",
num_ins,
out_tensors.size()));
// For each In-Out tensor pair,
// Prepare and apply broadcast dims array
for (size_t i = 0; i < num_ins; i++) {
auto* input_tensor = &in_tensors[i];
auto* output_tensor = out_tensors[i];
const DDim& input_dims = input_tensor->dims();
const DDim& output_dims = output_tensor->dims();
int in_rank = input_dims.size();
int out_rank = output_dims.size();
// Collect reduce_dims
// Example:
// dX = [1,1,1,1]
// dOut = [1,1,1,4]
//
// reduce_dims = [3] // reduce along the broadcasted axis
std::vector<int> reduce_dims_vec;
for (int j = 0; j < in_rank; j++) {
int out_axis = out_rank - j - 1;
int in_axis = in_rank - j - 1;
if (out_axis < 0 || output_dims[out_axis] != input_dims[in_axis]) {
reduce_dims_vec.push_back(in_axis);
}
}
bool just_copy = (reduce_dims_vec.size() == 0);
ctx.template Alloc<T>(output_tensor);
if (just_copy) {
// Turns out to be a No-Op, simply copy tensors
paddle::framework::TensorCopy(
*input_tensor, ctx.GetPlace(), ctx, output_tensor);
} else {
// reduce_sum implementation on CUDA
kernels::TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
ctx,
*input_tensor,
output_tensor,
kps::IdentityFunctor<T>(),
reduce_dims_vec,
ctx.stream());
}
}
}
} // namespace phi
PD_REGISTER_KERNEL(broadcast_tensors_grad,
GPU,
ALL_LAYOUT,
phi::BroadcastTensorsGradKernel,
int,
int64_t,
float,
double,
phi::dtype::float16) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/broadcast_tensors_kernel.h"
#include "paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/kernel_registry.h"
PD_REGISTER_KERNEL(broadcast_tensors,
GPU,
ALL_LAYOUT,
phi::BroadcastTensorsKernel,
bool,
int,
int64_t,
float,
double,
phi::dtype::float16) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/kernels/broadcast_tensors_kernel.h"
#include <vector>
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#define SWITCH_OUT_RANK_CASE(n) \
case n: { \
ApplyBroadcast<T, Context, n>(ctx, &in_tensors[i], out_tensors[i]); \
break; \
}
namespace phi {
template <typename T, typename Context, int OutRank>
void ApplyBroadcast(const Context& ctx,
const DenseTensor* input_tensor,
DenseTensor* output_tensor) {
const auto& input_dims = input_tensor->dims();
const auto& output_dims = output_tensor->dims();
int in_rank = input_dims.size();
int out_rank = output_dims.size();
// 1. Collect bcast_dims, each element of which indicates how many
// times we need to replicate along the corresponding dimension
// 2. Collect new_input_dims_vec. Eigen::broadcast requires same rank for
// both input and output tensors, so we need to initialize input X with
// expanded dims: "new_input_dims_vec"
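// A hypothetical example: input dims [3, 1] broadcast to output dims [2, 3, 4]
// -> new_input_dims_vec = [1, 3, 1] and bcast_dims = [2, 1, 4]
// (axes whose size already matches the output keep their size and are not replicated).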
Eigen::DSizes<Eigen::DenseIndex, OutRank> bcast_dims;
std::vector<int64_t> new_input_dims_vec(out_rank);
for (int j = 0; j < out_rank; j++) {
int out_axis = out_rank - j - 1;
int in_axis = in_rank - j - 1;
bcast_dims[out_axis] = output_dims[out_axis];
new_input_dims_vec[out_axis] = 1;
if (in_axis >= 0 && input_dims[in_axis] == output_dims[out_axis]) {
bcast_dims[out_axis] = 1;
new_input_dims_vec[out_axis] = input_dims[in_axis];
}
}
auto new_input_dims = phi::make_ddim(new_input_dims_vec);
// Initialize input X with new_input_dims_vec, so it's rank-aligned with the
// output
auto x = EigenTensor<T, OutRank>::From(*input_tensor, new_input_dims);
ctx.template Alloc<T>(output_tensor);
auto y = EigenTensor<T, OutRank>::From(*output_tensor, output_dims);
auto& place = *ctx.eigen_device();
funcs::EigenBroadcast<std::decay_t<decltype(place)>, T, OutRank>::Eval(
place, y, x, bcast_dims);
}
template <typename T, typename Context>
void BroadcastTensorsKernel(const Context& ctx,
const std::vector<DenseTensor>& x,
std::vector<DenseTensor*> out) {
const auto& in_tensors = x;
auto out_tensors = out;
size_t num_ins = in_tensors.size();
PADDLE_ENFORCE_GT(
num_ins,
1,
errors::InvalidArgument(
"Expected at least 2 input tensors, but only received d%.",
in_tensors.size()));
PADDLE_ENFORCE_EQ(num_ins,
out_tensors.size(),
errors::InvalidArgument(
"BroadcastTensorsOp expects equal number of inputs and "
"outputs,but received: %d inputs v.s %d outputs",
num_ins,
out_tensors.size()));
// Eigen has no support for dynamic ranked tensor
// Thus we perform static expansion for each possible rank
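// For instance, SWITCH_OUT_RANK_CASE(3) below expands (roughly) to:
//   case 3: { ApplyBroadcast<T, Context, 3>(ctx, &in_tensors[i], out_tensors[i]); break; }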
for (size_t i = 0; i < num_ins; i++) {
int out_rank = out_tensors[i]->dims().size();
switch (out_rank) {
SWITCH_OUT_RANK_CASE(1)
SWITCH_OUT_RANK_CASE(2)
SWITCH_OUT_RANK_CASE(3)
SWITCH_OUT_RANK_CASE(4)
SWITCH_OUT_RANK_CASE(5)
default: {
PADDLE_THROW(paddle::platform::errors::InvalidArgument(
"Target tensor rank out of range"
"Maximum supported rank for broadcast is: 5"));
}
}
}
}
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature BroadcastTensorsGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature(
"broadcast_tensors_grad", {GradVarName("Out")}, {}, {GradVarName("X")});
}
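// That is, the fluid op's Out@GRAD input maps to the phi kernel's `dout` argument
// and X@GRAD to `dx`, matching the BroadcastTensorsGradKernel signature declared
// in broadcast_tensors_grad_kernel.h.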
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(broadcast_tensors_grad,
phi::BroadcastTensorsGradOpArgumentMapping);