Unverified commit affddfaa, authored by Zhanlue Yang, committed by GitHub

Add new operation: BroadcastTensorsOp (#33294)

Parent f9420e83
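For orientation, this commit also exposes the operator as `paddle.broadcast_tensors` (see the Python diffs further down). A minimal usage sketch mirroring the docstring added in `python/paddle/tensor/manipulation.py` below:

```python
import paddle

# Shapes are mutually broadcastable; every output gets the common shape [1, 2, 3, 4].
x1 = paddle.rand([1, 2, 3, 4]).astype('float32')
x2 = paddle.rand([1, 2, 1, 4]).astype('float32')
x3 = paddle.rand([1, 1, 3, 1]).astype('float32')

out1, out2, out3 = paddle.broadcast_tensors(input=[x1, x2, x3])
print(out1.shape, out2.shape, out3.shape)  # [1, 2, 3, 4] for each output
```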
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/broadcast_tensors_op.h"
#include <algorithm>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/var_type_inference.h"
namespace paddle {
namespace operators {
using framework::Tensor;
using framework::DDim;
class BroadcastTensorsOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "broadcast_tensors");
OP_INOUT_CHECK(ctx->HasOutputs("Out"), "Output", "Out",
"broadcast_tensors");
int target_rank = 0;
const auto& input_dims = ctx->GetInputsDim("X");
// 1. Find Output rank = max(Inputs rank)
for (const auto& input_ddim : input_dims) {
target_rank = std::max(target_rank, input_ddim.size());
}
PADDLE_ENFORCE_GT(
target_rank, 0,
platform::errors::InvalidArgument(
"BroadcastTensorsOp requires at least one input tensor"
"to have rank greater than zero"));
std::vector<int64_t> target_dims(target_rank, 0);
// 2. Output dim(axis=x) = max(Inputs dim(axis=x))
for (int index = 0; index < target_rank; index++) {
// Loop axes in reverse order,
// For each axis, take the maximum as target size
// Fill size = 1 if shape vector exhausts
int target_dim_size = 1;
for (const auto& input_ddim : input_dims) {
// Reversed order
int axis = static_cast<int>(input_ddim.size()) - index - 1;
int dim_size = 1;
if (axis >= 0) {
dim_size = input_ddim[axis];
}
// We performed bcast semantics check at python level
// So input tensors should all have legal shape
target_dim_size = std::max(target_dim_size, dim_size);
}
target_dims[target_rank - index - 1] = target_dim_size;
}
// 3. Set Output Dim
std::vector<DDim> output_ddims;
for (size_t i = 0; i < input_dims.size(); i++) {
output_ddims.emplace_back(framework::make_ddim(target_dims));
}
ctx->SetOutputsDim("Out", output_ddims);
ctx->ShareAllLoD("X", /*->*/ "Out");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
// Broadcast semantics requires all input variables to have the same
// DataType/VarType
// This condition is also checked during VarType Inference
// Here we simply copy input type to output
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
}
};
class BroadcastTensorsOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"A Varaible list. The shape and data type of the list elements"
"should be consistent. Variable can be multi-dimensional Tensor"
"or LoDTensor, and data types can be: bool, float16, float32, "
"float64, int32, "
"int64.")
.AsDuplicable();
AddOutput("Out",
"the sum of input :code:`x`. its shape and data types are "
"consistent with :code:`x`.")
.AsDuplicable();
AddComment(
R"DOC(This OP is used to broadcast a vector of inputs
with Tensor or LoDTensor type, following broadcast semantics.)DOC");
}
};
class BroadcastTensorsOpVarTypeInference : public framework::VarTypeInference {
public:
void operator()(framework::InferVarTypeContext* ctx) const override {
// BroadcastTensorsOp requires at least one input variable
size_t input_size = ctx->InputSize("X");
PADDLE_ENFORCE_GT(
input_size, 0,
platform::errors::InvalidArgument(
"BroadcastTensorsOp should have at least one input variables,"
"but only received %d ",
input_size));
// BroadcastTensorsOp takes a vector of variables named "X"
// Here we loop through input variables,
// and check if their DataType/VarType are the same
auto var_type = ctx->GetInputType("X", 0);
auto data_type = ctx->GetInputDataType("X", 0);
for (size_t ind = 1; ind < input_size; ind++) {
auto cur_var_type = ctx->GetInputType("X", ind);
PADDLE_ENFORCE_EQ(
var_type, cur_var_type,
platform::errors::InvalidArgument(
"inputs to BroadcastTensorsOp should have the same variable type,"
"but detected %d v.s %d ",
framework::ToTypeName(var_type),
framework::ToTypeName(cur_var_type)));
auto cur_data_type = ctx->GetInputDataType("X", ind);
PADDLE_ENFORCE_EQ(
data_type, cur_data_type,
platform::errors::InvalidArgument(
"inputs to BroadcastTensorsOp should have the same data type,"
"but detected %d v.s %d ",
framework::ToTypeName(var_type),
framework::ToTypeName(cur_var_type)));
}
// Outputs have the same DataType/VarType as the inputs
ctx->SetOutputType("Out", var_type, framework::ALL_ELEMENTS);
ctx->SetOutputDataType("Out", data_type, framework::ALL_ELEMENTS);
}
};
/* ------ BroadcastTensorsGradOp ------ */
class BroadcastTensorsGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasOutputs(framework::GradVarName("X")), "Output",
"X@grad", "broadcast_tensors");
OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "broadcast_tensors");
OP_INOUT_CHECK(ctx->HasInputs(framework::GradVarName("Out")), "Input",
"Out@grad", "broadcast_tensors");
const auto& forward_input_dims = ctx->GetInputsDim("X");
ctx->SetOutputsDim(framework::GradVarName("X"), forward_input_dims);
ctx->ShareAllLoD("X", /*->*/ framework::GradVarName("X"));
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType(
ctx, framework::GradVarName("Out")),
ctx.device_context());
}
};
template <typename T>
class BroadcastTensorsGradOpMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
void Apply(GradOpPtr<T> grad_op) const override {
grad_op->SetType("broadcast_tensors_grad");
// We need "X" only for backward shape inference
grad_op->SetInput("X", this->Input("X"));
grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
grad_op->SetOutput(framework::GradVarName("X"),
this->InputGrad("X", /* drop_empty_grad */ false));
grad_op->SetAttrMap(this->Attrs());
}
};
class BroadcastTensorsGradOpVarTypeInference
: public framework::VarTypeInference {
public:
void operator()(framework::InferVarTypeContext* ctx) const override {
auto var_type = ctx->GetInputType("X", 0);
auto data_type = ctx->GetInputDataType("X", 0);
ctx->SetOutputType(framework::GradVarName("X"), var_type,
framework::ALL_ELEMENTS);
ctx->SetOutputDataType(framework::GradVarName("X"), data_type,
framework::ALL_ELEMENTS);
}
};
DECLARE_NO_NEED_BUFFER_VARS_INFERER(BroadcastTensorsGradNoNeedBufVarsInferer,
"X");
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OPERATOR(broadcast_tensors, ops::BroadcastTensorsOp,
ops::BroadcastTensorsOpMaker,
ops::BroadcastTensorsGradOpMaker<paddle::framework::OpDesc>,
ops::BroadcastTensorsGradOpMaker<paddle::imperative::OpBase>,
ops::BroadcastTensorsOpVarTypeInference);
REGISTER_OPERATOR(broadcast_tensors_grad, ops::BroadcastTensorsGradOp,
ops::BroadcastTensorsGradOpVarTypeInference,
ops::BroadcastTensorsGradNoNeedBufVarsInferer);
REGISTER_OP_CPU_KERNEL(
broadcast_tensors,
ops::BroadcastTensorsOpKernel<paddle::platform::CPUDeviceContext,
plat::float16>,
ops::BroadcastTensorsOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::BroadcastTensorsOpKernel<paddle::platform::CPUDeviceContext, double>,
ops::BroadcastTensorsOpKernel<paddle::platform::CPUDeviceContext, bool>,
ops::BroadcastTensorsOpKernel<paddle::platform::CPUDeviceContext, int>,
ops::BroadcastTensorsOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
REGISTER_OP_CPU_KERNEL(
broadcast_tensors_grad,
ops::BroadcastTensorsGradOpKernel<paddle::platform::CPUDeviceContext,
plat::float16>,
ops::BroadcastTensorsGradOpKernel<paddle::platform::CPUDeviceContext,
float>,
ops::BroadcastTensorsGradOpKernel<paddle::platform::CPUDeviceContext,
double>,
ops::BroadcastTensorsGradOpKernel<paddle::platform::CPUDeviceContext, int>,
ops::BroadcastTensorsGradOpKernel<paddle::platform::CPUDeviceContext,
int64_t>);
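As a reference for the `InferShape` logic above (right-align shapes and take the per-axis maximum), here is an illustrative Python sketch, similar in spirit to the `find_output_shape` helper in the unit test further below; `infer_broadcast_shape` is a hypothetical name used only for explanation:

```python
def infer_broadcast_shape(shapes):
    """Right-align the input shapes and take the per-axis maximum,
    mirroring BroadcastTensorsOp::InferShape. Compatibility of the
    shapes is assumed (it is checked at the Python level)."""
    target_rank = max(len(s) for s in shapes)
    target = [1] * target_rank
    for i in range(target_rank):          # i counts axes from the end
        for s in shapes:
            axis = len(s) - i - 1
            dim = s[axis] if axis >= 0 else 1
            target[target_rank - i - 1] = max(target[target_rank - i - 1], dim)
    return target

# Matches the gen_rank_diff_test shapes used in the unit test below.
assert infer_broadcast_shape([(2, 60, 1), (6, 2, 1, 10)]) == [6, 2, 60, 10]
```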
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/broadcast_tensors_op.h"
#include <algorithm>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/operators/reduce_ops/cub_reduce.h"
namespace paddle {
namespace operators {
using framework::Tensor;
using framework::DDim;
template <typename Tout>
struct IdentityFunctor {
HOSTDEVICE explicit inline IdentityFunctor() {}
template <typename U>
HOSTDEVICE inline Tout operator()(const U& x) const {
return static_cast<Tout>(x);
}
};
template <typename T>
class CUDABroadcastTensorsGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
// Find reduce dimensions
const auto& in_tensors =
context.MultiInput<Tensor>(framework::GradVarName("Out"));
auto out_tensors = context.MultiOutput<Tensor>(framework::GradVarName("X"));
size_t num_ins = in_tensors.size();
PADDLE_ENFORCE_GT(
num_ins, 1,
platform::errors::InvalidArgument(
"Expected at least 2 input tensors, but only received d%.",
in_tensors.size()));
PADDLE_ENFORCE_EQ(
num_ins, out_tensors.size(),
platform::errors::InvalidArgument(
"BroadcastTensorsOp expects equal number of inputs and outputs,"
"but received: %d inputs v.s %d outputs",
num_ins, out_tensors.size()));
// For each In-Out tensor pair,
// Prepare and apply broadcast dims array
for (size_t i = 0; i < num_ins; i++) {
auto* input_tensor = in_tensors[i];
auto* output_tensor = out_tensors[i];
const DDim& input_dims = input_tensor->dims();
const DDim& output_dims = output_tensor->dims();
int in_rank = input_dims.size();
int out_rank = output_dims.size();
// Collect reduce_dims
// Example:
// dX = [1,1,1,1]
// dOut = [1,1,1,4]
//
// reduce_dims = [3] // reduce along the broadcasted axis
std::vector<int> reduce_dims_vec;
for (int j = 0; j < in_rank; j++) {
int out_axis = out_rank - j - 1;
int in_axis = in_rank - j - 1;
if (out_axis < 0 || output_dims[out_axis] != input_dims[in_axis]) {
reduce_dims_vec.push_back(in_axis);
}
}
bool just_copy = (reduce_dims_vec.size() == 0);
output_tensor->mutable_data<T>(context.GetPlace());
if (just_copy) {
// Turns out to be a No-Op, simply copy tensors
framework::TensorCopy(*input_tensor, context.GetPlace(),
context.device_context(), output_tensor);
} else {
// reduce_sum implementation on CUDA
auto stream = context.cuda_device_context().stream();
TensorReduce<T, T, cub::Sum, IdentityFunctor<T>>(
*input_tensor, output_tensor, reduce_dims_vec, static_cast<T>(0),
cub::Sum(), IdentityFunctor<T>(), stream);
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
broadcast_tensors,
ops::BroadcastTensorsOpKernel<paddle::platform::CUDADeviceContext,
plat::float16>,
ops::BroadcastTensorsOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::BroadcastTensorsOpKernel<paddle::platform::CUDADeviceContext, double>,
ops::BroadcastTensorsOpKernel<paddle::platform::CUDADeviceContext, bool>,
ops::BroadcastTensorsOpKernel<paddle::platform::CUDADeviceContext, int>,
ops::BroadcastTensorsOpKernel<paddle::platform::CUDADeviceContext,
int64_t>);
REGISTER_OP_CUDA_KERNEL(broadcast_tensors_grad,
ops::CUDABroadcastTensorsGradOpKernel<plat::float16>,
ops::CUDABroadcastTensorsGradOpKernel<float>,
ops::CUDABroadcastTensorsGradOpKernel<double>,
ops::CUDABroadcastTensorsGradOpKernel<int>,
ops::CUDABroadcastTensorsGradOpKernel<int64_t>);
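The CUDA grad kernel above reduces `dOut` with a sum over every axis that was broadcast in the forward pass. The same computation, sketched in NumPy for illustration (`broadcast_grad` is a hypothetical name, not part of the patch):

```python
import numpy as np

def broadcast_grad(dout, x_shape):
    """Sum dOut over the broadcast axes so the result has x_shape,
    mirroring the reduce_dims_vec collection in the CUDA grad kernel."""
    in_rank, out_rank = dout.ndim, len(x_shape)   # dOut rank, dX rank
    reduce_axes = []
    for j in range(in_rank):
        out_axis = out_rank - j - 1               # axis in dX
        in_axis = in_rank - j - 1                 # axis in dOut
        if out_axis < 0 or x_shape[out_axis] != dout.shape[in_axis]:
            reduce_axes.append(in_axis)
    dx = dout.sum(axis=tuple(reduce_axes)) if reduce_axes else dout.copy()
    return dx.reshape(x_shape)

dout = np.ones((2, 3, 4))
print(broadcast_grad(dout, (3, 1)).shape)  # (3, 1): axes 0 and 2 were summed out
```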
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/eigen/eigen_function.h"
#include "paddle/fluid/operators/math/math_function.h"
#define SWITCH_OUT_RANK_CASE(n) \
case n: { \
ApplyBroadcast<n>(context, in_tensors[i], out_tensors[i]); \
break; \
}
namespace paddle {
namespace operators {
using framework::Tensor;
using framework::DDim;
using framework::EigenTensor;
template <typename DeviceContext, typename T>
class BroadcastTensorsOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const auto& in_tensors = context.MultiInput<Tensor>("X");
auto out_tensors = context.MultiOutput<Tensor>("Out");
size_t num_ins = in_tensors.size();
PADDLE_ENFORCE_GT(
num_ins, 1,
platform::errors::InvalidArgument(
"Expected at least 2 input tensors, but only received d%.",
in_tensors.size()));
PADDLE_ENFORCE_EQ(
num_ins, out_tensors.size(),
platform::errors::InvalidArgument(
"BroadcastTensorsOp expects equal number of inputs and outputs,"
"but received: %d inputs v.s %d outputs",
num_ins, out_tensors.size()));
// Eigen has no support for dynamically-ranked tensors,
// so we perform static expansion for each possible rank
for (size_t i = 0; i < num_ins; i++) {
int out_rank = out_tensors[i]->dims().size();
switch (out_rank) {
SWITCH_OUT_RANK_CASE(1)
SWITCH_OUT_RANK_CASE(2)
SWITCH_OUT_RANK_CASE(3)
SWITCH_OUT_RANK_CASE(4)
SWITCH_OUT_RANK_CASE(5)
default: {
PADDLE_THROW(platform::errors::InvalidArgument(
"Target tensor rank out of range"
"Maximum supported rank for broadcast is: 5"));
}
}
}
}
template <int OutRank>
void ApplyBroadcast(const framework::ExecutionContext& context,
const Tensor* input_tensor, Tensor* output_tensor) const {
const auto& input_dims = input_tensor->dims();
const auto& output_dims = output_tensor->dims();
int in_rank = input_dims.size();
int out_rank = output_dims.size();
// 1. Collect bcast_dims, each element of which indicates how many
// times we need to replicate along the corresponding dimension
// 2. Collect new_input_dims_vec. Eigen::broadcast requires same rank for
// both input and output tensors, so we need to initialize input X with
// expanded dims: "new_input_dims_vec"
Eigen::DSizes<Eigen::DenseIndex, OutRank> bcast_dims;
std::vector<int64_t> new_input_dims_vec(out_rank);
for (int j = 0; j < out_rank; j++) {
int out_axis = out_rank - j - 1;
int in_axis = in_rank - j - 1;
bcast_dims[out_axis] = output_dims[out_axis];
new_input_dims_vec[out_axis] = 1;
if (in_axis >= 0 && input_dims[in_axis] == output_dims[out_axis]) {
bcast_dims[out_axis] = 1;
new_input_dims_vec[out_axis] = input_dims[in_axis];
}
}
auto new_input_dims = framework::make_ddim(new_input_dims_vec);
// Initialize input X with new_input_dims_vec, so it's rank-aligned with the
// output
auto x = EigenTensor<T, OutRank>::From(*input_tensor, new_input_dims);
output_tensor->mutable_data<T>(context.GetPlace());
auto y = EigenTensor<T, OutRank>::From(*output_tensor, output_dims);
auto& place =
*context.template device_context<DeviceContext>().eigen_device();
EigenBroadcast<std::decay_t<decltype(place)>, T, OutRank>::Eval(place, y, x,
bcast_dims);
}
};
#define SWITCH_RESHAPE_DIMS(n) \
case n: { \
Eigen::DSizes<Eigen::DenseIndex, n> reshape_dims; \
for (size_t i = 0; i < reshape_dims_vec.size(); ++i) { \
reshape_dims[i] = reshape_dims_vec[i]; \
} \
dX.device(place) = \
dOut.reshape(reshape_dims).sum(reduce_dims).reshape(dX.dimensions()); \
break; \
}
#define UPPER_SWITCH_REDUCE_DIMS(m) \
case m: { \
Eigen::DSizes<Eigen::DenseIndex, m> reduce_dims; \
for (size_t i = 0; i < reduce_dims_vec.size(); ++i) { \
reduce_dims[i] = reduce_dims_vec[i]; \
} \
switch (reshape_size) {
#define LOWER_SWITCH_REDUCE_DIMS \
default: { \
PADDLE_THROW(platform::errors::InvalidArgument( \
"Detected reshape size: %d out of range" \
"Minimum value should be larger than reduce size %d" \
"While maximum supported is: 5", \
reshape_size, reduce_size)); \
} \
} \
break; \
}
/* ----- GradOpKernel ----- */
template <typename DeviceContext, typename T>
class BroadcastTensorsGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
// Find reduce dimensions
const auto& in_tensors =
context.MultiInput<Tensor>(framework::GradVarName("Out"));
auto out_tensors = context.MultiOutput<Tensor>(framework::GradVarName("X"));
size_t num_ins = in_tensors.size();
PADDLE_ENFORCE_GT(
num_ins, 1,
platform::errors::InvalidArgument(
"Expected at least 2 input tensors, but only received d%.",
in_tensors.size()));
PADDLE_ENFORCE_EQ(
num_ins, out_tensors.size(),
platform::errors::InvalidArgument(
"BroadcastTensorsOp expects equal number of inputs and outputs,"
"but received: %d inputs v.s %d outputs",
num_ins, out_tensors.size()));
// For each In-Out tensor pair,
// Prepare and apply broadcast dims array
for (size_t i = 0; i < num_ins; i++) {
const auto* input_tensor = in_tensors[i];
auto* output_tensor = out_tensors[i];
const auto& input_dims = input_tensor->dims();
const auto& output_dims = output_tensor->dims();
int in_rank = input_dims.size();
int out_rank = output_dims.size();
// BroadcastTensorsGrad is simply a reduce_sum along broadcasted axes
// Here we perform the following Eigen operations:
// dOut(Flattened) -> reshape(reshape_dims) -> reduce(reduce_dims) ->
// reshape(dX_shape) -> dX
// Note the last "reshape(dX_shape)" will be performed implicitly,
// and we only need to collect reduce_dims and reshape_dims
std::vector<int> reduce_dims_vec;
std::vector<int> reshape_dims_vec;
for (int j = 0; j < in_rank; j++) {
int out_axis = out_rank - j - 1;
int in_axis = in_rank - j - 1;
reshape_dims_vec.push_back(input_dims[j]);
if (out_axis < 0 || output_dims[out_axis] != input_dims[in_axis]) {
reduce_dims_vec.push_back(in_axis);
}
}
size_t reduce_size = reduce_dims_vec.size();
size_t reshape_size = reshape_dims_vec.size();
bool just_copy = (reduce_dims_vec.size() == 0);
output_tensor->mutable_data<T>(context.GetPlace());
if (just_copy) {
// If this turns out to be a No-Op, simply perform a tensor copy
framework::TensorCopy(*input_tensor, context.GetPlace(),
context.device_context(), output_tensor);
} else {
PADDLE_ENFORCE_GE(reduce_dims_vec.size(), 1,
platform::errors::InvalidArgument(
"The number of dimensions of the input "
"'Out@GRAD' for Op(broadcast_tensors)"
" must be greater than or equal to 1, but "
"the value received is %d.",
reduce_dims_vec.size()));
PADDLE_ENFORCE_LE(
reduce_dims_vec.size(), 5,
platform::errors::InvalidArgument(
"The number of dimensions of the input 'Out@GRAD' "
"for Op(broadcast_tensors) must be less than or equal "
"to 5, but the value received is %d.",
reduce_dims_vec.size()));
// Overall:
// dOut(Flattened) -> reshape(reshape_dims) -> reduce(reduce_dims) ->
// reshape(dX_shape) -> dX
auto dX = framework::EigenVector<T>::Flatten(*output_tensor);
auto dOut = framework::EigenVector<T>::Flatten(*input_tensor);
auto& place =
*context.template device_context<DeviceContext>().eigen_device();
// Expand ReduceSize and ReshapeSize into static values
switch (reduce_size) {
UPPER_SWITCH_REDUCE_DIMS(1)
SWITCH_RESHAPE_DIMS(1)
SWITCH_RESHAPE_DIMS(2)
SWITCH_RESHAPE_DIMS(3)
SWITCH_RESHAPE_DIMS(4)
SWITCH_RESHAPE_DIMS(5)
LOWER_SWITCH_REDUCE_DIMS
UPPER_SWITCH_REDUCE_DIMS(2)
SWITCH_RESHAPE_DIMS(2)
SWITCH_RESHAPE_DIMS(3)
SWITCH_RESHAPE_DIMS(4)
SWITCH_RESHAPE_DIMS(5)
LOWER_SWITCH_REDUCE_DIMS
UPPER_SWITCH_REDUCE_DIMS(3)
SWITCH_RESHAPE_DIMS(3)
SWITCH_RESHAPE_DIMS(4)
SWITCH_RESHAPE_DIMS(5)
LOWER_SWITCH_REDUCE_DIMS
UPPER_SWITCH_REDUCE_DIMS(4)
SWITCH_RESHAPE_DIMS(4)
SWITCH_RESHAPE_DIMS(5)
LOWER_SWITCH_REDUCE_DIMS
UPPER_SWITCH_REDUCE_DIMS(5)
SWITCH_RESHAPE_DIMS(5)
LOWER_SWITCH_REDUCE_DIMS
default: {
PADDLE_THROW(platform::errors::InvalidArgument(
"Detected reduce size: %d out of range"
"While maximum supported is: 5",
reduce_size));
}
}
}
}
}
};
} // namespace operators
} // namespace paddle
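For completeness, the `bcast_dims`/`new_input_dims_vec` construction in `ApplyBroadcast` above corresponds to the following NumPy sketch (illustrative only; Eigen's `broadcast` plays the role of `np.tile` here):

```python
import numpy as np

def apply_broadcast(x, out_shape):
    """Right-align x with out_shape, insert size-1 axes where the ranks
    differ (new_input_shape), then replicate along every axis whose input
    size does not match the output (bcast_dims), as ApplyBroadcast does."""
    out_rank, in_rank = len(out_shape), x.ndim
    new_input_shape = [1] * out_rank
    bcast_dims = [1] * out_rank
    for j in range(out_rank):
        out_axis = out_rank - j - 1
        in_axis = in_rank - j - 1
        bcast_dims[out_axis] = out_shape[out_axis]
        if in_axis >= 0 and x.shape[in_axis] == out_shape[out_axis]:
            bcast_dims[out_axis] = 1
            new_input_shape[out_axis] = x.shape[in_axis]
    return np.tile(x.reshape(new_input_shape), bcast_dims)

x = np.arange(3).reshape(3, 1)
print(apply_broadcast(x, (2, 3, 4)).shape)  # (2, 3, 4)
```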
@@ -118,6 +118,7 @@ from .tensor.logic import equal_all # noqa: F401
from .tensor.logic import is_tensor # noqa: F401
from .tensor.manipulation import cast # noqa: F401
from .tensor.manipulation import concat # noqa: F401
from .tensor.manipulation import broadcast_tensors # noqa: F401
from .tensor.manipulation import expand # noqa: F401
from .tensor.manipulation import broadcast_to # noqa: F401
from .tensor.manipulation import expand_as # noqa: F401
@@ -505,5 +506,6 @@ __all__ = [ # noqa
    'trunc',
    'digamma',
    'standard_normal',
    'diagonal',
    'broadcast_tensors',
]
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import paddle
import paddle.fluid.core as core
from op_test import OpTest
from test_collective_base import TestDistBase
import random
random.seed(2021)
paddle.enable_static()
def find_output_shape(input_list):
"""Infer output tensor shape according to bcast semantics"""
output_rank = 0
for x in input_list:
rank = len(x.shape)
output_rank = max(output_rank, rank)
output_shape = [0 for i in range(output_rank)]
for i in range(output_rank):
for x in input_list:
shape = list(reversed(x.shape))
size = 1
if i < len(shape):
size = shape[i]
output_shape[i] = max(output_shape[i], size)
return list(reversed(output_shape))
def make_inputs_outputs(input_shapes, dtype):
"""Automatically generate formatted inputs and outputs from input_shapes"""
input_list = [
np.random.random(shape).astype(dtype) for shape in input_shapes
]
output_shape = find_output_shape(input_list)
output_list = [
x + np.zeros(output_shape).astype(x.dtype) for x in input_list
]
output_formatted = {
"Out": [(f"out{i}", output_list[i]) for i in range(len(output_list))]
}
input_formatted = {
"X": [(f"x{i}", input_list[i]) for i in range(len(input_list))]
}
return input_formatted, output_formatted
def gen_rank_diff_test(dtype):
input_shapes = [(2, 60, 1), (6, 2, 1, 10)]
return make_inputs_outputs(input_shapes, dtype)
def gen_no_broadcast_test(dtype):
input_shapes = [(12, 1, 10, 1), (12, 1, 10, 1)]
return make_inputs_outputs(input_shapes, dtype)
def gen_mixed_tensors_test(dtype):
input_shapes = [(2, 60, 1), (2, 2, 1, 30), (1, 2, 60, 1)]
return make_inputs_outputs(input_shapes, dtype)
class TestCPUBroadcastTensorsOp(OpTest):
def set_place(self):
self.place = core.CPUPlace()
def set_dtypes(self):
self.dtypes = ['float64']
def setUp(self):
self.op_type = "broadcast_tensors"
self.use_mkldnn = False
self.attrs = {'use_mkldnn': self.use_mkldnn}
self.test_gen_func_list = [
gen_rank_diff_test, gen_no_broadcast_test, gen_mixed_tensors_test
]
self.set_place()
self.set_dtypes()
def run_test(self, test_func, args):
for dtype in self.dtypes:
for gen_func in self.test_gen_func_list:
self.inputs, self.outputs = gen_func(dtype)
test_func(**args)
def test_check_output(self):
self.run_test(self.check_output_with_place,
{"place": self.place,
"atol": 1e-1})
def test_check_grad_normal(self):
self.run_test(self.check_grad_with_place, {
"place": self.place,
"inputs_to_check": ['x0', 'x1'],
"output_names": ['out0', 'out1'],
"max_relative_error": 0.05,
})
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestCUDABroadcastTensorsOp(TestCPUBroadcastTensorsOp):
def set_place(self):
self.place = core.CUDAPlace(0)
def set_dtypes(self):
self.dtypes = ['float64']
if core.is_float16_supported(self.place):
self.dtypes.append('float16')
class TestBroadcastTensorsAPI(unittest.TestCase):
def test_api(self):
def test_static():
inputs = [
paddle.fluid.layers.data(
shape=[4, 1, 4, 1], dtype='float32', name="x0"),
paddle.fluid.layers.data(
shape=[1, 4, 1, 4], dtype='float32', name="x1")
]
paddle.broadcast_tensors(inputs)
def test_dynamic():
paddle.disable_static()
try:
inputs = [
paddle.to_tensor(
np.random.random([4, 1, 4, 1]).astype("float32")),
paddle.to_tensor(
np.random.random([1, 4, 1, 4]).astype("float32"))
]
paddle.broadcast_tensors(inputs)
finally:
paddle.enable_static()
test_static()
test_dynamic()
class TestRaiseBroadcastTensorsError(unittest.TestCase):
def test_errors(self):
def test_type():
inputs = [
paddle.fluid.layers.data(
shape=[1, 1, 1, 1], dtype='float32', name="x4"),
paddle.fluid.layers.data(
shape=[1, 4, 1, 1], dtype='float64', name="x5")
]
paddle.broadcast_tensors(inputs)
def test_dtype():
inputs = [
paddle.fluid.layers.data(
shape=[1, 1, 1, 1], dtype='int8', name="x6"),
paddle.fluid.layers.data(
shape=[1, 4, 1, 1], dtype='int8', name="x7")
]
paddle.broadcast_tensors(inputs)
def test_bcast_semantics():
inputs = [
paddle.fluid.layers.data(
shape=[1, 3, 1, 1], dtype='float32', name="x9"),
paddle.fluid.layers.data(
shape=[1, 8, 1, 1], dtype='float32', name="x10")
]
paddle.broadcast_tensors(inputs)
self.assertRaises(TypeError, test_type)
self.assertRaises(TypeError, test_dtype)
self.assertRaises(TypeError, test_bcast_semantics)
if __name__ == '__main__':
unittest.main()
@@ -66,6 +66,7 @@ from .manipulation import cast # noqa: F401
from .manipulation import concat # noqa: F401
from .manipulation import expand # noqa: F401
from .manipulation import broadcast_to # noqa: F401
from .manipulation import broadcast_tensors # noqa: F401
from .manipulation import expand_as # noqa: F401
from .manipulation import tile # noqa: F401
from .manipulation import flatten # noqa: F401
@@ -363,6 +364,7 @@ tensor_method_func = [ #noqa
    'bitwise_or',
    'bitwise_xor',
    'bitwise_not',
    'broadcast_tensors',
]
#this list used in math_op_patch.py for magic_method bind
...
@@ -120,6 +120,101 @@ def concat(x, axis=0, name=None):
return paddle.fluid.layers.concat(input=x, axis=axis, name=name)
def broadcast_tensors(input, name=None):
"""
This OP broadcasts a list of tensors following broadcast semantics.
.. note::
If you want to know more about broadcasting, please refer to :ref:`user_guide_broadcasting`.
Args:
input(list|tuple): ``input`` is a Tensor list or Tensor tuple with data type bool,
float16, float32, float64, int32 or int64. All the Tensors in ``input`` must have the same data type.
Currently we only support tensors with rank no greater than 5.
name (str, optional): The default value is None. Normally there is no need for the user to set this property.
For more information, please refer to :ref:`api_guide_Name`.
Returns:
list(Tensor): The list of broadcasted tensors following the same order as ``input``.
Examples:
.. code-block:: python
import paddle
x1 = paddle.rand([1, 2, 3, 4]).astype('float32')
x2 = paddle.rand([1, 2, 1, 4]).astype('float32')
x3 = paddle.rand([1, 1, 3, 1]).astype('float32')
out1, out2, out3 = paddle.broadcast_tensors(input=[x1, x2, x3])
# out1, out2, out3: tensors broadcasted from x1, x2, x3 with shape [1,2,3,4]
"""
num_inputs = len(input)
if in_dygraph_mode():
return core.ops.broadcast_tensors(input, num_inputs)
check_type(input, 'input', (list, tuple), 'broadcast_tensors')
if num_inputs < 1:
raise TypeError(
"At least 1 tensor is needed to perform broadcast_tensors")
# Check input types
for id, x in enumerate(input):
check_variable_and_dtype(
x, 'input[' + str(id) + ']',
['bool', 'float32', 'float64', 'int32', 'int64'],
'broadcast_tensors')
if x.dtype != input[0].dtype:
raise TypeError(
"All the Tensors in the input must have the same data type.")
# Check bcast semantics
output_shape_r_last_tensor_index = []
output_shape_r = []
# Use while loop due to weird behaviour of "range()"
j = 0
while j < len(input):
tensor = input[j]
shape = list(reversed(tensor.shape))
i = 0
while i < len(shape):
if len(output_shape_r) <= i:
output_shape_r.append(shape[i])
output_shape_r_last_tensor_index.append(j)
else:
invalid = (output_shape_r[i] != shape[i] and
output_shape_r[i] != 1 and shape[i] != 1)
if invalid:
last_index = output_shape_r_last_tensor_index[i]
raise TypeError(
"Input tensors to broadcast_tensors does not follow bcast semantics"
f"Tensor {last_index} conflicts with Tensor {j} in reversed dimension {i}"
)
if output_shape_r[i] <= shape[i]:
output_shape_r[i] = shape[i]
output_shape_r_last_tensor_index[i] = j
i += 1 # while i < len(shape)
j += 1 # while j < len(input)
helper = LayerHelper('broadcast_tensors', **locals())
i = 0
out = []
while i < num_inputs:
out.append(
helper.create_variable_for_type_inference(dtype=helper.input_dtype(
)))
i += 1
inputs = {'X': input}
helper.append_op(
type='broadcast_tensors', inputs=inputs, outputs={'Out': out},
attrs={})
return out
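The compatibility loop above rejects any pair of inputs whose right-aligned dimensions differ while both are larger than one; `TestRaiseBroadcastTensorsError` below exercises this. A minimal static-graph sketch of the rejection (variable names chosen for illustration, behavior per that test):

```python
import paddle

paddle.enable_static()
xa = paddle.fluid.layers.data(shape=[1, 3, 1, 1], dtype='float32', name="xa")
xb = paddle.fluid.layers.data(shape=[1, 8, 1, 1], dtype='float32', name="xb")
try:
    paddle.broadcast_tensors([xa, xb])  # 3 vs 8 on the same axis: not broadcastable
except TypeError as e:
    print("rejected:", e)
```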
def flip(x, axis, name=None):
"""
Reverse the order of a n-D tensor along given axis in axis.
...