From affddfaa47d56666135a3b2e71b13bed75d226ae Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Wed, 23 Jun 2021 14:18:19 +0800 Subject: [PATCH] Add new operation: BroadcastTensorsOp (#33294) --- .../fluid/operators/broadcast_tensors_op.cc | 253 ++++++++++++++++ .../fluid/operators/broadcast_tensors_op.cu | 132 ++++++++ paddle/fluid/operators/broadcast_tensors_op.h | 282 ++++++++++++++++++ python/paddle/__init__.py | 4 +- .../unittests/test_broadcast_tensors_op.py | 196 ++++++++++++ python/paddle/tensor/__init__.py | 2 + python/paddle/tensor/manipulation.py | 95 ++++++ 7 files changed, 963 insertions(+), 1 deletion(-) create mode 100644 paddle/fluid/operators/broadcast_tensors_op.cc create mode 100644 paddle/fluid/operators/broadcast_tensors_op.cu create mode 100644 paddle/fluid/operators/broadcast_tensors_op.h create mode 100644 python/paddle/fluid/tests/unittests/test_broadcast_tensors_op.py diff --git a/paddle/fluid/operators/broadcast_tensors_op.cc b/paddle/fluid/operators/broadcast_tensors_op.cc new file mode 100644 index 00000000000..074607e05ea --- /dev/null +++ b/paddle/fluid/operators/broadcast_tensors_op.cc @@ -0,0 +1,253 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/broadcast_tensors_op.h" + +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/var_type_inference.h" + +namespace paddle { +namespace operators { +using framework::Tensor; +using framework::DDim; + +class BroadcastTensorsOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "broadcast_tensors"); + OP_INOUT_CHECK(ctx->HasOutputs("Out"), "Output", "Out", + "broadcast_tensors"); + + int target_rank = 0; + const auto& input_dims = ctx->GetInputsDim("X"); + // 1. Find Output rank = max(Inputs rank) + for (const auto& input_ddim : input_dims) { + target_rank = std::max(target_rank, input_ddim.size()); + } + + PADDLE_ENFORCE_GT( + target_rank, 0, + platform::errors::InvalidArgument( + "BroadcastTensorsOp requires at least one input tensor" + "to have rank greater than zero")); + + std::vector target_dims(target_rank, 0); + // 2. Output dim(axis=x) = max(Inputs dim(axis=x)) + for (int index = 0; index < target_rank; index++) { + // Loop axes in reverse order, + // For each axis, take the maximum as target size + // Fill size = 1 if shape vector exhausts + int target_dim_size = 1; + for (const auto& input_ddim : input_dims) { + // Reversed order + int axis = static_cast(input_ddim.size()) - index - 1; + int dim_size = 1; + if (axis >= 0) { + dim_size = input_ddim[axis]; + } + + // We performed bcast semantics check at python level + // So input tensors should all have legal shape + target_dim_size = std::max(target_dim_size, dim_size); + } + target_dims[target_rank - index - 1] = target_dim_size; + } + + // 3. 
Set Output Dim
+    std::vector<framework::DDim> output_ddims;
+    for (size_t i = 0; i < input_dims.size(); i++) {
+      output_ddims.emplace_back(framework::make_ddim(target_dims));
+    }
+    ctx->SetOutputsDim("Out", output_ddims);
+    ctx->ShareAllLoD("X", /*->*/ "Out");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    // Broadcast semantics enforces all input variables having the same
+    // DataType/VarType
+    // This condition is also checked during VarType Inference
+    // Here we simply copy input type to output
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
+  }
+};
+
+class BroadcastTensorsOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "A Variable list. The shape and data type of the list elements "
+             "should be consistent. Variable can be a multi-dimensional Tensor "
+             "or LoDTensor, and data types can be: bool, float16, float32, "
+             "float64, int32, "
+             "int64.")
+        .AsDuplicable();
+    AddOutput("Out",
+              "The output list of tensors broadcasted from input :code:`x`, "
+              "each with the broadcasted shape. Their data types are "
+              "consistent with :code:`x`.")
+        .AsDuplicable();
+    AddComment(
+        R"DOC(This OP is used to broadcast a vector of inputs
+                     with Tensor or LoDTensor type, following broadcast semantics.)DOC");
+  }
+};
+
+class BroadcastTensorsOpVarTypeInference : public framework::VarTypeInference {
+ public:
+  void operator()(framework::InferVarTypeContext* ctx) const override {
+    // We need at least one input tensor to satisfy broadcast semantics
+    size_t input_size = ctx->InputSize("X");
+    PADDLE_ENFORCE_GT(
+        input_size, 0,
+        platform::errors::InvalidArgument(
+            "BroadcastTensorsOp should have at least one input variable, "
+            "but only received %d.",
+            input_size));
+
+    // BroadcastTensorsOp takes a vector of variables named "X"
+    // Here we loop through input variables,
+    // and check if their DataType/VarType are the same
+    auto var_type = ctx->GetInputType("X", 0);
+    auto data_type = ctx->GetInputDataType("X", 0);
+    for (size_t ind = 1; ind < input_size; ind++) {
+      auto cur_var_type = ctx->GetInputType("X", ind);
+      PADDLE_ENFORCE_EQ(
+          var_type, cur_var_type,
+          platform::errors::InvalidArgument(
+              "inputs to BroadcastTensorsOp should have the same variable "
+              "type, but detected %s vs %s",
+              framework::ToTypeName(var_type),
+              framework::ToTypeName(cur_var_type)));
+
+      auto cur_data_type = ctx->GetInputDataType("X", ind);
+      PADDLE_ENFORCE_EQ(
+          data_type, cur_data_type,
+          platform::errors::InvalidArgument(
+              "inputs to BroadcastTensorsOp should have the same data type, "
+              "but detected %s vs %s",
+              framework::DataTypeToString(data_type),
+              framework::DataTypeToString(cur_data_type)));
+    }
+
+    // Outputs having the same DataType/VarType as inputs
+    ctx->SetOutputType("Out", var_type, framework::ALL_ELEMENTS);
+    ctx->SetOutputDataType("Out", data_type, framework::ALL_ELEMENTS);
+  }
+};
+
+/* ------ BroadcastTensorsGradOp ------ */
+class BroadcastTensorsGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasOutputs(framework::GradVarName("X")), "Output",
+                   "X@grad", "broadcast_tensors");
+    OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "broadcast_tensors");
+    OP_INOUT_CHECK(ctx->HasInputs(framework::GradVarName("Out")), "Input",
+                   "Out@grad", "broadcast_tensors");
+
+    const auto& forward_input_dims = ctx->GetInputsDim("X");
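+    // Note: each gradient output dX[i] simply takes the shape of the
+    // corresponding forward input X[i]; the grad kernels then reduce
+    // dOut[i] back down to that shape along the broadcasted axes.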
ctx->SetOutputsDim(framework::GradVarName("X"), forward_input_dims); + ctx->ShareAllLoD("X", /*->*/ framework::GradVarName("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")), + ctx.device_context()); + } +}; + +template +class BroadcastTensorsGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + void Apply(GradOpPtr grad_op) const override { + grad_op->SetType("broadcast_tensors_grad"); + // We need "X" only for backward shape inference + grad_op->SetInput("X", this->Input("X")); + grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + grad_op->SetOutput(framework::GradVarName("X"), + this->InputGrad("X", /* drop_empty_grad */ false)); + grad_op->SetAttrMap(this->Attrs()); + } +}; + +class BroadcastTensorsGradOpVarTypeInference + : public framework::VarTypeInference { + public: + void operator()(framework::InferVarTypeContext* ctx) const override { + auto var_type = ctx->GetInputType("X", 0); + auto data_type = ctx->GetInputDataType("X", 0); + + ctx->SetOutputType(framework::GradVarName("X"), var_type, + framework::ALL_ELEMENTS); + ctx->SetOutputDataType(framework::GradVarName("X"), data_type, + framework::ALL_ELEMENTS); + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERER(BroadcastTensorsGradNoNeedBufVarsInferer, + "X"); + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OPERATOR(broadcast_tensors, ops::BroadcastTensorsOp, + ops::BroadcastTensorsOpMaker, + ops::BroadcastTensorsGradOpMaker, + ops::BroadcastTensorsGradOpMaker, + ops::BroadcastTensorsOpVarTypeInference); + +REGISTER_OPERATOR(broadcast_tensors_grad, ops::BroadcastTensorsGradOp, + ops::BroadcastTensorsGradOpVarTypeInference, + ops::BroadcastTensorsGradNoNeedBufVarsInferer); + +REGISTER_OP_CPU_KERNEL( + broadcast_tensors, + ops::BroadcastTensorsOpKernel, + ops::BroadcastTensorsOpKernel, + ops::BroadcastTensorsOpKernel, + ops::BroadcastTensorsOpKernel, + ops::BroadcastTensorsOpKernel, + ops::BroadcastTensorsOpKernel); + +REGISTER_OP_CPU_KERNEL( + broadcast_tensors_grad, + ops::BroadcastTensorsGradOpKernel, + ops::BroadcastTensorsGradOpKernel, + ops::BroadcastTensorsGradOpKernel, + ops::BroadcastTensorsGradOpKernel, + ops::BroadcastTensorsGradOpKernel); diff --git a/paddle/fluid/operators/broadcast_tensors_op.cu b/paddle/fluid/operators/broadcast_tensors_op.cu new file mode 100644 index 00000000000..d670e1b333d --- /dev/null +++ b/paddle/fluid/operators/broadcast_tensors_op.cu @@ -0,0 +1,132 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/broadcast_tensors_op.h" + +#include +#include +#include +#include +#include + +#include "paddle/fluid/operators/reduce_ops/cub_reduce.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; +using framework::DDim; + +template +struct IdentityFunctor { + HOSTDEVICE explicit inline IdentityFunctor() {} + + template + HOSTDEVICE inline Tout operator()(const U& x) const { + return static_cast(x); + } +}; + +template +class CUDABroadcastTensorsGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + // Find reduce dimensions + const auto& in_tensors = + context.MultiInput(framework::GradVarName("Out")); + auto out_tensors = context.MultiOutput(framework::GradVarName("X")); + + size_t num_ins = in_tensors.size(); + + PADDLE_ENFORCE_GT( + num_ins, 1, + platform::errors::InvalidArgument( + "Expected at least 2 input tensors, but only received d%.", + in_tensors.size())); + + PADDLE_ENFORCE_EQ( + num_ins, out_tensors.size(), + platform::errors::InvalidArgument( + "BroadcastTensorsOp expects equal number of inputs and outputs," + "but received: %d inputs v.s %d outputs", + num_ins, out_tensors.size())); + + // For each In-Out tensor pair, + // Prepare and apply broadcast dims array + for (size_t i = 0; i < num_ins; i++) { + auto* input_tensor = in_tensors[i]; + auto* output_tensor = out_tensors[i]; + + const DDim& input_dims = input_tensor->dims(); + const DDim& output_dims = output_tensor->dims(); + + int in_rank = input_dims.size(); + int out_rank = output_dims.size(); + + // Collect reduce_dims + // Example: + // dX = [1,1,1,1] + // dOut = [1,1,1,4] + // + // reduce_dims = [3] // reduce along the broadcasted axis + std::vector reduce_dims_vec; + for (int j = 0; j < in_rank; j++) { + int out_axis = out_rank - j - 1; + int in_axis = in_rank - j - 1; + + if (out_axis < 0 || output_dims[out_axis] != input_dims[in_axis]) { + reduce_dims_vec.push_back(in_axis); + } + } + + bool just_copy = (reduce_dims_vec.size() == 0); + output_tensor->mutable_data(context.GetPlace()); + if (just_copy) { + // Turns out to be a No-Op, simply copy tensors + framework::TensorCopy(*input_tensor, context.GetPlace(), + context.device_context(), output_tensor); + } else { + // reduce_sum implementation on CUDA + auto stream = context.cuda_device_context().stream(); + TensorReduce>( + *input_tensor, output_tensor, reduce_dims_vec, static_cast(0), + cub::Sum(), IdentityFunctor(), stream); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL( + broadcast_tensors, + ops::BroadcastTensorsOpKernel, + ops::BroadcastTensorsOpKernel, + ops::BroadcastTensorsOpKernel, + ops::BroadcastTensorsOpKernel, + ops::BroadcastTensorsOpKernel, + ops::BroadcastTensorsOpKernel); + +REGISTER_OP_CUDA_KERNEL(broadcast_tensors_grad, + ops::CUDABroadcastTensorsGradOpKernel, + ops::CUDABroadcastTensorsGradOpKernel, + ops::CUDABroadcastTensorsGradOpKernel, + ops::CUDABroadcastTensorsGradOpKernel, + ops::CUDABroadcastTensorsGradOpKernel); diff --git a/paddle/fluid/operators/broadcast_tensors_op.h b/paddle/fluid/operators/broadcast_tensors_op.h new file mode 100644 index 00000000000..0eeb9234df0 --- /dev/null +++ b/paddle/fluid/operators/broadcast_tensors_op.h @@ -0,0 +1,282 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" +#include "paddle/fluid/operators/math/math_function.h" + +#define SWITCH_OUT_RANK_CASE(n) \ + case n: { \ + ApplyBroadcast(context, in_tensors[i], out_tensors[i]); \ + break; \ + } + +namespace paddle { +namespace operators { + +using framework::Tensor; +using framework::DDim; +using framework::EigenTensor; + +template +class BroadcastTensorsOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const auto& in_tensors = context.MultiInput("X"); + auto out_tensors = context.MultiOutput("Out"); + + size_t num_ins = in_tensors.size(); + + PADDLE_ENFORCE_GT( + num_ins, 1, + platform::errors::InvalidArgument( + "Expected at least 2 input tensors, but only received d%.", + in_tensors.size())); + + PADDLE_ENFORCE_EQ( + num_ins, out_tensors.size(), + platform::errors::InvalidArgument( + "BroadcastTensorsOp expects equal number of inputs and outputs," + "but received: %d inputs v.s %d outputs", + num_ins, out_tensors.size())); + + // Eigen has no support for dynamic ranked tensor + // Thus we perform static expansion for each possible ranks + for (size_t i = 0; i < num_ins; i++) { + int out_rank = out_tensors[i]->dims().size(); + switch (out_rank) { + SWITCH_OUT_RANK_CASE(1) + SWITCH_OUT_RANK_CASE(2) + SWITCH_OUT_RANK_CASE(3) + SWITCH_OUT_RANK_CASE(4) + SWITCH_OUT_RANK_CASE(5) + default: { + PADDLE_THROW(platform::errors::InvalidArgument( + "Target tensor rank out of range" + "Maximum supported rank for broadcast is: 5")); + } + } + } + } + + template + void ApplyBroadcast(const framework::ExecutionContext& context, + const Tensor* input_tensor, Tensor* output_tensor) const { + const auto& input_dims = input_tensor->dims(); + const auto& output_dims = output_tensor->dims(); + + int in_rank = input_dims.size(); + int out_rank = output_dims.size(); + + // 1. Collect bcast_dims, each element of which indicates how many + // times we need to replicate along the corresponding dimension + // 2. Collect new_input_dims_vec. 
Eigen::broadcast requires same rank for + // both input and output tensors, so we need to initialize input X with + // expanded dims: "new_input_dims_vec" + Eigen::DSizes bcast_dims; + std::vector new_input_dims_vec(out_rank); + for (int j = 0; j < out_rank; j++) { + int out_axis = out_rank - j - 1; + int in_axis = in_rank - j - 1; + + bcast_dims[out_axis] = output_dims[out_axis]; + new_input_dims_vec[out_axis] = 1; + if (in_axis >= 0 && input_dims[in_axis] == output_dims[out_axis]) { + bcast_dims[out_axis] = 1; + new_input_dims_vec[out_axis] = input_dims[in_axis]; + } + } + auto new_input_dims = framework::make_ddim(new_input_dims_vec); + + // Initialize input X with new_input_dims_vec, so it's rank-aligned with the + // output + auto x = EigenTensor::From(*input_tensor, new_input_dims); + + output_tensor->mutable_data(context.GetPlace()); + auto y = EigenTensor::From(*output_tensor, output_dims); + + auto& place = + *context.template device_context().eigen_device(); + EigenBroadcast, T, OutRank>::Eval(place, y, x, + bcast_dims); + } +}; + +#define SWITCH_RESHAPE_DIMS(n) \ + case n: { \ + Eigen::DSizes reshape_dims; \ + for (size_t i = 0; i < reshape_dims_vec.size(); ++i) { \ + reshape_dims[i] = reshape_dims_vec[i]; \ + } \ + dX.device(place) = \ + dOut.reshape(reshape_dims).sum(reduce_dims).reshape(dX.dimensions()); \ + break; \ + } + +#define UPPER_SWITCH_REDUCE_DIMS(m) \ + case m: { \ + Eigen::DSizes reduce_dims; \ + for (size_t i = 0; i < reduce_dims_vec.size(); ++i) { \ + reduce_dims[i] = reduce_dims_vec[i]; \ + } \ + switch (reshape_size) { +#define LOWER_SWITCH_REDUCE_DIMS \ + default: { \ + PADDLE_THROW(platform::errors::InvalidArgument( \ + "Detected reshape size: %d out of range" \ + "Minimum value should be larger than reduce size %d" \ + "While maximum supported is: 5", \ + reshape_size, reduce_size)); \ + } \ + } \ + break; \ + } + +/* ----- GradOpKernel ----- */ +template +class BroadcastTensorsGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + // Find reduce dimensions + const auto& in_tensors = + context.MultiInput(framework::GradVarName("Out")); + auto out_tensors = context.MultiOutput(framework::GradVarName("X")); + + size_t num_ins = in_tensors.size(); + + PADDLE_ENFORCE_GT( + num_ins, 1, + platform::errors::InvalidArgument( + "Expected at least 2 input tensors, but only received d%.", + in_tensors.size())); + + PADDLE_ENFORCE_EQ( + num_ins, out_tensors.size(), + platform::errors::InvalidArgument( + "BroadcastTensorsOp expects equal number of inputs and outputs," + "but received: %d inputs v.s %d outputs", + num_ins, out_tensors.size())); + + // For each In-Out tensor pair, + // Prepare and apply broadcast dims array + for (size_t i = 0; i < num_ins; i++) { + const auto* input_tensor = in_tensors[i]; + auto* output_tensor = out_tensors[i]; + + const auto& input_dims = input_tensor->dims(); + const auto& output_dims = output_tensor->dims(); + + int in_rank = input_dims.size(); + int out_rank = output_dims.size(); + + // BroadcastTensorsGrad is simply a reduce_sum along broadcasted axes + // Here we perform the following Eigen operations: + // dOut(Flattened) -> reshape(reshape_dims) -> reduce(reduce_dims) -> + // reshape(dX_shape) -> dX + // Note the last "reshape(dX_shape)" will be performed implicitly, + // and we only need to collect reduce_dims and reshape_dims + std::vector reduce_dims_vec; + std::vector reshape_dims_vec; + for (int j = 0; j < in_rank; j++) { + int out_axis = out_rank - 
j - 1; + int in_axis = in_rank - j - 1; + + reshape_dims_vec.push_back(input_dims[j]); + if (out_axis < 0 || output_dims[out_axis] != input_dims[in_axis]) { + reduce_dims_vec.push_back(in_axis); + } + } + + size_t reduce_size = reduce_dims_vec.size(); + size_t reshape_size = reshape_dims_vec.size(); + bool just_copy = (reduce_dims_vec.size() == 0); + output_tensor->mutable_data(context.GetPlace()); + if (just_copy) { + // If this turns out to be a No-Op, simply perform a tensor copy + framework::TensorCopy(*input_tensor, context.GetPlace(), + context.device_context(), output_tensor); + } else { + PADDLE_ENFORCE_GE(reduce_dims_vec.size(), 1, + platform::errors::InvalidArgument( + "The number of dimensions of the input " + "'Out@GRAD' for Op(broadcast_tensors)" + " must be greater than or equal to 1, but " + "the value received is %d.", + reduce_dims_vec.size())); + PADDLE_ENFORCE_LE( + reduce_dims_vec.size(), 5, + platform::errors::InvalidArgument( + "The number of dimensions of the input 'Out@GRAD' " + "for Op(broadcast_tensors) must be less than or equal " + "to 5, but the value received is %d.", + reduce_dims_vec.size())); + + // Overall: + // dOut(Flattened) -> reshape(reshape_dims) -> reduce(reduce_dims) -> + // reshape(dX_shape) -> dX + auto dX = framework::EigenVector::Flatten(*output_tensor); + auto dOut = framework::EigenVector::Flatten(*input_tensor); + auto& place = + *context.template device_context().eigen_device(); + + // Expand ReduceSize and ReshapeSize into static values + switch (reduce_size) { + UPPER_SWITCH_REDUCE_DIMS(1) + SWITCH_RESHAPE_DIMS(1) + SWITCH_RESHAPE_DIMS(2) + SWITCH_RESHAPE_DIMS(3) + SWITCH_RESHAPE_DIMS(4) + SWITCH_RESHAPE_DIMS(5) + LOWER_SWITCH_REDUCE_DIMS + + UPPER_SWITCH_REDUCE_DIMS(2) + SWITCH_RESHAPE_DIMS(2) + SWITCH_RESHAPE_DIMS(3) + SWITCH_RESHAPE_DIMS(4) + SWITCH_RESHAPE_DIMS(5) + LOWER_SWITCH_REDUCE_DIMS + + UPPER_SWITCH_REDUCE_DIMS(3) + SWITCH_RESHAPE_DIMS(3) + SWITCH_RESHAPE_DIMS(4) + SWITCH_RESHAPE_DIMS(5) + LOWER_SWITCH_REDUCE_DIMS + + UPPER_SWITCH_REDUCE_DIMS(4) + SWITCH_RESHAPE_DIMS(4) + SWITCH_RESHAPE_DIMS(5) + LOWER_SWITCH_REDUCE_DIMS + + UPPER_SWITCH_REDUCE_DIMS(5) + SWITCH_RESHAPE_DIMS(5) + LOWER_SWITCH_REDUCE_DIMS + + default: { + PADDLE_THROW(platform::errors::InvalidArgument( + "Detected reduce size: %d out of range" + "While maximum supported is: 5", + reduce_size)); + } + } + } + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index c7fc74deec0..773ae61a691 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -118,6 +118,7 @@ from .tensor.logic import equal_all # noqa: F401 from .tensor.logic import is_tensor # noqa: F401 from .tensor.manipulation import cast # noqa: F401 from .tensor.manipulation import concat # noqa: F401 +from .tensor.manipulation import broadcast_tensors # noqa: F401 from .tensor.manipulation import expand # noqa: F401 from .tensor.manipulation import broadcast_to # noqa: F401 from .tensor.manipulation import expand_as # noqa: F401 @@ -505,5 +506,6 @@ __all__ = [ # noqa 'trunc', 'digamma', 'standard_normal', - 'diagonal' + 'diagonal', + 'broadcast_tensors', ] diff --git a/python/paddle/fluid/tests/unittests/test_broadcast_tensors_op.py b/python/paddle/fluid/tests/unittests/test_broadcast_tensors_op.py new file mode 100644 index 00000000000..602c5bae8f8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_broadcast_tensors_op.py @@ -0,0 +1,196 @@ +# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.fluid.core as core +from op_test import OpTest +from test_collective_base import TestDistBase + +import random +random.seed(2021) + +paddle.enable_static() + + +def find_output_shape(input_list): + """Infer output tensor shape according to bcast semantics""" + output_rank = 0 + for x in input_list: + rank = len(x.shape) + output_rank = max(output_rank, rank) + + output_shape = [0 for i in range(output_rank)] + for i in range(output_rank): + for x in input_list: + shape = list(reversed(x.shape)) + size = 1 + if i < len(shape): + size = shape[i] + output_shape[i] = max(output_shape[i], size) + + return list(reversed(output_shape)) + + +def make_inputs_outputs(input_shapes, dtype): + """Automatically generate formatted inputs and outputs from input_shapes""" + input_list = [ + np.random.random(shape).astype(dtype) for shape in input_shapes + ] + output_shape = find_output_shape(input_list) + output_list = [ + x + np.zeros(output_shape).astype(x.dtype) for x in input_list + ] + + output_formatted = { + "Out": [(f"out{i}", output_list[i]) for i in range(len(output_list))] + } + input_formatted = { + "X": [(f"x{i}", input_list[i]) for i in range(len(input_list))] + } + + return input_formatted, output_formatted + + +def gen_rank_diff_test(dtype): + input_shapes = [(2, 60, 1), (6, 2, 1, 10)] + return make_inputs_outputs(input_shapes, dtype) + + +def gen_no_broadcast_test(dtype): + input_shapes = [(12, 1, 10, 1), (12, 1, 10, 1)] + return make_inputs_outputs(input_shapes, dtype) + + +def gen_mixed_tensors_test(dtype): + input_shapes = [(2, 60, 1), (2, 2, 1, 30), (1, 2, 60, 1)] + return make_inputs_outputs(input_shapes, dtype) + + +class TestCPUBroadcastTensorsOp(OpTest): + def set_place(self): + self.place = core.CPUPlace() + + def set_dtypes(self): + self.dtypes = ['float64'] + + def setUp(self): + self.op_type = "broadcast_tensors" + self.use_mkldnn = False + self.attrs = {'use_mkldnn': self.use_mkldnn} + self.test_gen_func_list = [ + gen_rank_diff_test, gen_no_broadcast_test, gen_mixed_tensors_test + ] + self.set_place() + self.set_dtypes() + + def run_test(self, test_func, args): + for dtype in self.dtypes: + for gen_func in self.test_gen_func_list: + self.inputs, self.outputs = gen_func(dtype) + test_func(**args) + + def test_check_output(self): + self.run_test(self.check_output_with_place, + {"place": self.place, + "atol": 1e-1}) + + def test_check_grad_normal(self): + self.run_test(self.check_grad_with_place, { + "place": self.place, + "inputs_to_check": ['x0', 'x1'], + "output_names": ['out0', 'out1'], + "max_relative_error": 0.05, + }) + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestCUDABroadcastTensorsOp(TestCPUBroadcastTensorsOp): + def set_place(self): + self.place = core.CUDAPlace(0) + + def set_dtypes(self): + self.dtypes = ['float64'] + if 
core.is_float16_supported(self.place):
+            self.dtypes.append('float16')
+
+
+class TestBroadcastTensorsAPI(unittest.TestCase):
+    def test_api(self):
+        def test_static():
+            inputs = [
+                paddle.fluid.layers.data(
+                    shape=[4, 1, 4, 1], dtype='float32', name="x0"),
+                paddle.fluid.layers.data(
+                    shape=[1, 4, 1, 4], dtype='float32', name="x1")
+            ]
+            paddle.broadcast_tensors(inputs)
+
+        def test_dynamic():
+            paddle.disable_static()
+            try:
+                inputs = [
+                    paddle.to_tensor(
+                        np.random.random([4, 1, 4, 1]).astype("float32")),
+                    paddle.to_tensor(
+                        np.random.random([1, 4, 1, 4]).astype("float32"))
+                ]
+                paddle.broadcast_tensors(inputs)
+            finally:
+                paddle.enable_static()
+
+        test_static()
+        test_dynamic()
+
+
+class TestRaiseBroadcastTensorsError(unittest.TestCase):
+    def test_errors(self):
+        def test_type():
+            inputs = [
+                paddle.fluid.layers.data(
+                    shape=[1, 1, 1, 1], dtype='float32', name="x4"),
+                paddle.fluid.layers.data(
+                    shape=[1, 4, 1, 1], dtype='float64', name="x5")
+            ]
+            paddle.broadcast_tensors(inputs)
+
+        def test_dtype():
+            inputs = [
+                paddle.fluid.layers.data(
+                    shape=[1, 1, 1, 1], dtype='int8', name="x6"),
+                paddle.fluid.layers.data(
+                    shape=[1, 4, 1, 1], dtype='int8', name="x7")
+            ]
+            paddle.broadcast_tensors(inputs)
+
+        def test_bcast_semantics():
+            inputs = [
+                paddle.fluid.layers.data(
+                    shape=[1, 3, 1, 1], dtype='float32', name="x9"),
+                paddle.fluid.layers.data(
+                    shape=[1, 8, 1, 1], dtype='float32', name="x10")
+            ]
+            paddle.broadcast_tensors(inputs)
+
+        self.assertRaises(TypeError, test_type)
+        self.assertRaises(TypeError, test_dtype)
+        self.assertRaises(TypeError, test_bcast_semantics)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py
index 98d033ecec3..2d4c97212be 100755
--- a/python/paddle/tensor/__init__.py
+++ b/python/paddle/tensor/__init__.py
@@ -66,6 +66,7 @@ from .manipulation import cast  # noqa: F401
 from .manipulation import concat  # noqa: F401
 from .manipulation import expand  # noqa: F401
 from .manipulation import broadcast_to  # noqa: F401
+from .manipulation import broadcast_tensors  # noqa: F401
 from .manipulation import expand_as  # noqa: F401
 from .manipulation import tile  # noqa: F401
 from .manipulation import flatten  # noqa: F401
@@ -363,6 +364,7 @@ tensor_method_func = [  #noqa
     'bitwise_or',
     'bitwise_xor',
     'bitwise_not',
+    'broadcast_tensors',
 ]
 
 #this list used in math_op_patch.py for magic_method bind
diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py
index 1c33d19db4b..981baecb644 100644
--- a/python/paddle/tensor/manipulation.py
+++ b/python/paddle/tensor/manipulation.py
@@ -120,6 +120,101 @@ def concat(x, axis=0, name=None):
     return paddle.fluid.layers.concat(input=x, axis=axis, name=name)
 
 
+def broadcast_tensors(input, name=None):
+    """
+    This OP broadcasts a list of tensors following broadcast semantics.
+
+    .. note::
+        If you want to know more about broadcasting, please refer to :ref:`user_guide_broadcasting`.
+
+    Args:
+        input(list|tuple): ``input`` is a Tensor list or Tensor tuple with data type bool,
+            float16, float32, float64, int32, int64. All the Tensors in ``input`` must have the same data type.
+            Currently we only support tensors with rank no greater than 5.
+
+        name (str, optional): The default value is None. Normally there is no need for user to set this property.
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        list(Tensor): The list of broadcasted tensors following the same order as ``input``.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            x1 = paddle.rand([1, 2, 3, 4]).astype('float32')
+            x2 = paddle.rand([1, 2, 1, 4]).astype('float32')
+            x3 = paddle.rand([1, 1, 3, 1]).astype('float32')
+            out1, out2, out3 = paddle.broadcast_tensors(input=[x1, x2, x3])
+            # out1, out2, out3: tensors broadcasted from x1, x2, x3 with shape [1,2,3,4]
+    """
+
+    num_inputs = len(input)
+    if in_dygraph_mode():
+        return core.ops.broadcast_tensors(input, num_inputs)
+
+    check_type(input, 'input', (list, tuple), 'broadcast_tensors')
+    if num_inputs < 1:
+        raise TypeError(
+            "At least 1 tensor is needed to perform broadcast_tensors")
+
+    # Check input types
+    for id, x in enumerate(input):
+        check_variable_and_dtype(
+            x, 'input[' + str(id) + ']',
+            ['bool', 'float32', 'float64', 'int32', 'int64'],
+            'broadcast_tensors')
+        if x.dtype != input[0].dtype:
+            raise TypeError(
+                "All the Tensors in the input must have the same data type.")
+
+    # Check bcast semantics
+    output_shape_r_last_tensor_index = []
+    output_shape_r = []
+
+    # Use while loop due to weird behaviour of "range()"
+    j = 0
+    while j < len(input):
+        tensor = input[j]
+        shape = list(reversed(tensor.shape))
+
+        i = 0
+        while i < len(shape):
+            if len(output_shape_r) <= i:
+                output_shape_r.append(shape[i])
+                output_shape_r_last_tensor_index.append(j)
+            else:
+                invalid = (output_shape_r[i] != shape[i] and
+                           output_shape_r[i] != 1 and shape[i] != 1)
+                if invalid:
+                    last_index = output_shape_r_last_tensor_index[i]
+                    raise TypeError(
+                        "Input tensors to broadcast_tensors do not follow bcast semantics. "
+                        f"Tensor {last_index} conflicts with Tensor {j} in reversed dimension {i}."
+                    )
+                if output_shape_r[i] <= shape[i]:
+                    output_shape_r[i] = shape[i]
+                    output_shape_r_last_tensor_index[i] = j
+            i += 1  # while i < len(shape)
+        j += 1  # while j < len(input)
+
+    helper = LayerHelper('broadcast_tensors', **locals())
+    i = 0
+    out = []
+    while i < num_inputs:
+        out.append(
+            helper.create_variable_for_type_inference(
+                dtype=helper.input_dtype()))
+        i += 1
+
+    inputs = {'X': input}
+    helper.append_op(
+        type='broadcast_tensors', inputs=inputs, outputs={'Out': out},
+        attrs={})
+
+    return out
+
+
 def flip(x, axis, name=None):
     """
     Reverse the order of a n-D tensor along given axis in axis.
-- 
GitLab
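As a quick reference for the shape inference implemented in broadcast_tensors_op.cc and mirrored by find_output_shape in the unit test, here is a small standalone Python sketch; the helper name broadcast_output_shape is illustrative only and is not part of the patch.

def broadcast_output_shape(shapes):
    """Compute the broadcasted output shape for a list of input shapes."""
    out_rank = max(len(s) for s in shapes)
    out_shape_r = []  # reversed output shape, built from the trailing axes
    for i in range(out_rank):
        size = 1
        for s in shapes:
            axis = len(s) - i - 1
            dim = s[axis] if axis >= 0 else 1
            if dim != 1 and size != 1 and dim != size:
                raise ValueError(
                    "incompatible sizes %d and %d at reversed axis %d" % (dim, size, i))
            size = max(size, dim)
        out_shape_r.append(size)
    return list(reversed(out_shape_r))

# Matches the docstring example: all inputs broadcast to [1, 2, 3, 4]
assert broadcast_output_shape([[1, 2, 3, 4], [1, 2, 1, 4], [1, 1, 3, 1]]) == [1, 2, 3, 4]
# Inputs of different ranks, as in gen_rank_diff_test
assert broadcast_output_shape([(2, 60, 1), (6, 2, 1, 10)]) == [6, 2, 60, 10]

Unlike this sketch, the C++ InferShape does not re-validate compatibility of the dimensions: as its comment notes, the broadcast semantics check has already been performed at the Python level.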