diff --git a/paddle/fluid/operators/broadcast_tensors_op.cc b/paddle/fluid/operators/broadcast_tensors_op.cc
index 27b1107675d4e722f9a2e25801ecc4dfb206cce5..c3917fad555cb4633d4d958abcde0244fae13cae 100644
--- a/paddle/fluid/operators/broadcast_tensors_op.cc
+++ b/paddle/fluid/operators/broadcast_tensors_op.cc
@@ -12,15 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/broadcast_tensors_op.h"
-
-#include <algorithm>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/var_type_inference.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/multiary.h"
 
 namespace paddle {
 namespace operators {
@@ -31,64 +27,6 @@ class BroadcastTensorsOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "broadcast_tensors");
-    OP_INOUT_CHECK(ctx->HasOutputs("Out"), "Output", "Out",
-                   "broadcast_tensors");
-
-    int target_rank = 0;
-    const auto& input_dims = ctx->GetInputsDim("X");
-
-    // 1. Find Output rank = max(Inputs rank)
-    for (const auto& input_ddim : input_dims) {
-      target_rank = std::max(target_rank, input_ddim.size());
-    }
-
-    PADDLE_ENFORCE_GT(
-        target_rank, 0,
-        platform::errors::InvalidArgument(
-            "BroadcastTensorsOp requires at least one input tensor"
-            "to have rank greater than zero"));
-
-    std::vector<int> target_dims(target_rank, 0);
-    // 2. Output dim(axis=x) = max(Inputs dim(axis=x))
-    for (int index = 0; index < target_rank; index++) {
-      // Loop axes in reverse order,
-      // For each axis, take the maximum as target size
-      // Fill size = 1 if shape vector exhausts
-      int target_dim_size = 1;
-      for (const auto& input_ddim : input_dims) {
-        // Reversed order
-        int axis = static_cast<int>(input_ddim.size()) - index - 1;
-        int dim_size = 1;
-        if (axis >= 0) {
-          dim_size = input_ddim[axis];
-        }
-
-        if (target_dim_size != 1 && dim_size != 1 &&
-            target_dim_size != dim_size) {
-          PADDLE_THROW(platform::errors::InvalidArgument(
-              "BroadcastTensorsOp inputs does not satisfy bcast semantics,"
-              "Please check axis = %d in reverse order",
-              index));
-        }
-
-        // We performed bcast semantics check at python level
-        // So input tensors should all have legal shape
-        target_dim_size = std::max(target_dim_size, dim_size);
-      }
-      target_dims[target_rank - index - 1] = target_dim_size;
-    }
-
-    // 3. Set Output Dim
-    std::vector<framework::DDim> output_ddims;
-    for (size_t i = 0; i < input_dims.size(); i++) {
-      output_ddims.emplace_back(phi::make_ddim(target_dims));
-    }
-    ctx->SetOutputsDim("Out", output_ddims);
-    ctx->ShareAllLoD("X", /*->*/ "Out");
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -229,34 +167,17 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(BroadcastTensorsGradNoNeedBufVarsInferer,
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 
+DECLARE_INFER_SHAPE_FUNCTOR(broadcast_tensors,
+                            BroadcastTensorsInferShapeFunctor,
+                            PT_INFER_META(phi::BroadcastTensorsInferMeta));
+
 REGISTER_OPERATOR(broadcast_tensors, ops::BroadcastTensorsOp,
                   ops::BroadcastTensorsOpMaker,
                   ops::BroadcastTensorsGradOpMaker<paddle::framework::OpDesc>,
                   ops::BroadcastTensorsGradOpMaker<paddle::imperative::OpBase>,
-                  ops::BroadcastTensorsOpVarTypeInference);
+                  ops::BroadcastTensorsOpVarTypeInference,
+                  BroadcastTensorsInferShapeFunctor);
 
 REGISTER_OPERATOR(broadcast_tensors_grad, ops::BroadcastTensorsGradOp,
                   ops::BroadcastTensorsGradOpVarTypeInference,
                   ops::BroadcastTensorsGradNoNeedBufVarsInferer);
-
-REGISTER_OP_CPU_KERNEL(
-    broadcast_tensors,
-    ops::BroadcastTensorsOpKernel<paddle::platform::CPUDeviceContext,
-                                  plat::float16>,
-    ops::BroadcastTensorsOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::BroadcastTensorsOpKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::BroadcastTensorsOpKernel<paddle::platform::CPUDeviceContext, bool>,
-    ops::BroadcastTensorsOpKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::BroadcastTensorsOpKernel<paddle::platform::CPUDeviceContext,
-                                  int64_t>);
-
-REGISTER_OP_CPU_KERNEL(
-    broadcast_tensors_grad,
-    ops::BroadcastTensorsGradOpKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::BroadcastTensorsGradOpKernel<paddle::platform::CPUDeviceContext,
-                                      int64_t>,
-    ops::BroadcastTensorsGradOpKernel<paddle::platform::CPUDeviceContext,
-                                      float>,
-    ops::BroadcastTensorsGradOpKernel<paddle::platform::CPUDeviceContext,
-                                      double>,
-    ops::BroadcastTensorsGradOpKernel<paddle::platform::CPUDeviceContext,
-                                      plat::float16>);
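A note on the hunk above: the hand-written InferShape override is replaced by a functor generated from phi::BroadcastTensorsInferMeta and registered alongside the op. A minimal, self-contained model of that wiring, with hypothetical names (Meta, ShapeRegistry, MyInferMeta) standing in for Paddle's real classes:

#include <functional>
#include <iostream>
#include <map>
#include <string>
#include <vector>

// Stand-in for a tensor's compile-time metadata (shape only).
struct Meta {
  std::vector<int64_t> dims;
};

// The "functor" is just a callable that rewrites output metadata in place.
using InferShapeFn =
    std::function<void(const std::vector<Meta*>&, std::vector<Meta*>)>;

// Registry mapping op name -> shape function, mimicking the
// DECLARE_INFER_SHAPE_FUNCTOR / REGISTER_OPERATOR wiring above.
std::map<std::string, InferShapeFn>& ShapeRegistry() {
  static std::map<std::string, InferShapeFn> reg;
  return reg;
}

// Same role as PT_INFER_META(phi::BroadcastTensorsInferMeta): adapt a free
// function into the registered callable (here trivially identity-shaped).
void MyInferMeta(const std::vector<Meta*>& x, std::vector<Meta*> out) {
  for (size_t i = 0; i < out.size(); ++i) out[i]->dims = x[i]->dims;
}

int main() {
  ShapeRegistry()["broadcast_tensors"] = MyInferMeta;  // registration analogue
  Meta in{{2, 3}}, result;
  std::vector<Meta*> xs{&in}, outs{&result};
  ShapeRegistry()["broadcast_tensors"](xs, outs);
  std::cout << result.dims[0] << "x" << result.dims[1] << "\n";  // prints 2x3
}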
diff --git a/paddle/fluid/operators/broadcast_tensors_op.cu b/paddle/fluid/operators/broadcast_tensors_op.cu
deleted file mode 100644
index 5882258317d7daa6c62905f8a76d5c68060787a8..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/broadcast_tensors_op.cu
+++ /dev/null
@@ -1,122 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/broadcast_tensors_op.h"
-
-#include <algorithm>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
-
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
-using framework::DDim;
-
-template <typename T>
-class CUDABroadcastTensorsGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    // Find reduce dimensions
-    const auto& in_tensors =
-        context.MultiInput<Tensor>(framework::GradVarName("Out"));
-    auto out_tensors = context.MultiOutput<Tensor>(framework::GradVarName("X"));
-
-    size_t num_ins = in_tensors.size();
-
-    PADDLE_ENFORCE_GT(
-        num_ins, 1,
-        platform::errors::InvalidArgument(
-            "Expected at least 2 input tensors, but only received d%.",
-            in_tensors.size()));
-
-    PADDLE_ENFORCE_EQ(
-        num_ins, out_tensors.size(),
-        platform::errors::InvalidArgument(
-            "BroadcastTensorsOp expects equal number of inputs and outputs,"
-            "but received: %d inputs v.s %d outputs",
-            num_ins, out_tensors.size()));
-
-    // For each In-Out tensor pair,
-    // Prepare and apply broadcast dims array
-    for (size_t i = 0; i < num_ins; i++) {
-      auto* input_tensor = in_tensors[i];
-      auto* output_tensor = out_tensors[i];
-
-      const DDim& input_dims = input_tensor->dims();
-      const DDim& output_dims = output_tensor->dims();
-
-      int in_rank = input_dims.size();
-      int out_rank = output_dims.size();
-
-      // Collect reduce_dims
-      // Example:
-      //    dX   = [1,1,1,1]
-      //    dOut = [1,1,1,4]
-      //
-      //    reduce_dims = [3]  // reduce along the broadcasted axis
-      std::vector<int> reduce_dims_vec;
-      for (int j = 0; j < in_rank; j++) {
-        int out_axis = out_rank - j - 1;
-        int in_axis = in_rank - j - 1;
-
-        if (out_axis < 0 || output_dims[out_axis] != input_dims[in_axis]) {
-          reduce_dims_vec.push_back(in_axis);
-        }
-      }
-
-      bool just_copy = (reduce_dims_vec.size() == 0);
-      output_tensor->mutable_data<T>(context.GetPlace());
-      if (just_copy) {
-        // Turns out to be a No-Op, simply copy tensors
-        framework::TensorCopy(*input_tensor, context.GetPlace(),
-                              context.device_context(), output_tensor);
-      } else {
-        // reduce_sum implementation on CUDA
-        auto stream = context.cuda_device_context().stream();
-        TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
-            context.cuda_device_context(), *input_tensor, output_tensor,
-            kps::IdentityFunctor<T>(), reduce_dims_vec, stream);
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-REGISTER_OP_CUDA_KERNEL(
-    broadcast_tensors,
-    ops::BroadcastTensorsOpKernel<paddle::platform::CUDADeviceContext,
-                                  plat::float16>,
-    ops::BroadcastTensorsOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::BroadcastTensorsOpKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::BroadcastTensorsOpKernel<paddle::platform::CUDADeviceContext, bool>,
-    ops::BroadcastTensorsOpKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::BroadcastTensorsOpKernel<paddle::platform::CUDADeviceContext,
-                                  int64_t>);
-
-REGISTER_OP_CUDA_KERNEL(broadcast_tensors_grad,
-                        ops::CUDABroadcastTensorsGradOpKernel<plat::float16>,
-                        ops::CUDABroadcastTensorsGradOpKernel<int>,
-                        ops::CUDABroadcastTensorsGradOpKernel<int64_t>,
-                        ops::CUDABroadcastTensorsGradOpKernel<float>,
-                        ops::CUDABroadcastTensorsGradOpKernel<double>);
diff --git a/paddle/fluid/operators/broadcast_tensors_op.h b/paddle/fluid/operators/broadcast_tensors_op.h
deleted file mode 100644
index 682f2e24769221d04317d0e53d02406c4c5a26eb..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/broadcast_tensors_op.h
+++ /dev/null
@@ -1,282 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <vector>
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/lod_tensor_array.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/eigen/eigen_function.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
-#define SWITCH_OUT_RANK_CASE(n)                                  \
-  case n: {                                                      \
-    ApplyBroadcast<n>(context, in_tensors[i], out_tensors[i]);   \
-    break;                                                       \
-  }
-
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
-using framework::DDim;
-using framework::EigenTensor;
-
-template <typename DeviceContext, typename T>
-class BroadcastTensorsOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const auto& in_tensors = context.MultiInput<Tensor>("X");
-    auto out_tensors = context.MultiOutput<Tensor>("Out");
-
-    size_t num_ins = in_tensors.size();
-
-    PADDLE_ENFORCE_GT(
-        num_ins, 1,
-        platform::errors::InvalidArgument(
-            "Expected at least 2 input tensors, but only received d%.",
-            in_tensors.size()));
-
-    PADDLE_ENFORCE_EQ(
-        num_ins, out_tensors.size(),
-        platform::errors::InvalidArgument(
-            "BroadcastTensorsOp expects equal number of inputs and outputs,"
-            "but received: %d inputs v.s %d outputs",
-            num_ins, out_tensors.size()));
-
-    // Eigen has no support for dynamic ranked tensor
-    // Thus we perform static expansion for each possible ranks
-    for (size_t i = 0; i < num_ins; i++) {
-      int out_rank = out_tensors[i]->dims().size();
-      switch (out_rank) {
-        SWITCH_OUT_RANK_CASE(1)
-        SWITCH_OUT_RANK_CASE(2)
-        SWITCH_OUT_RANK_CASE(3)
-        SWITCH_OUT_RANK_CASE(4)
-        SWITCH_OUT_RANK_CASE(5)
-        default: {
-          PADDLE_THROW(platform::errors::InvalidArgument(
-              "Target tensor rank out of range"
-              "Maximum supported rank for broadcast is: 5"));
-        }
-      }
-    }
-  }
-
-  template <int OutRank>
-  void ApplyBroadcast(const framework::ExecutionContext& context,
-                      const Tensor* input_tensor, Tensor* output_tensor) const {
-    const auto& input_dims = input_tensor->dims();
-    const auto& output_dims = output_tensor->dims();
-
-    int in_rank = input_dims.size();
-    int out_rank = output_dims.size();
-
-    // 1. Collect bcast_dims, each element of which indicates how many
-    // times we need to replicate along the corresponding dimension
-    // 2. Collect new_input_dims_vec. Eigen::broadcast requires same rank for
-    // both input and output tensors, so we need to initialize input X with
-    // expanded dims: "new_input_dims_vec"
-    Eigen::DSizes<Eigen::DenseIndex, OutRank> bcast_dims;
-    std::vector<int64_t> new_input_dims_vec(out_rank);
-    for (int j = 0; j < out_rank; j++) {
-      int out_axis = out_rank - j - 1;
-      int in_axis = in_rank - j - 1;
-
-      bcast_dims[out_axis] = output_dims[out_axis];
-      new_input_dims_vec[out_axis] = 1;
-      if (in_axis >= 0 && input_dims[in_axis] == output_dims[out_axis]) {
-        bcast_dims[out_axis] = 1;
-        new_input_dims_vec[out_axis] = input_dims[in_axis];
-      }
-    }
-    auto new_input_dims = phi::make_ddim(new_input_dims_vec);
-
-    // Initialize input X with new_input_dims_vec, so it's rank-aligned with the
-    // output
-    auto x = EigenTensor<T, OutRank>::From(*input_tensor, new_input_dims);
-
-    output_tensor->mutable_data<T>(context.GetPlace());
-    auto y = EigenTensor<T, OutRank>::From(*output_tensor, output_dims);
-
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-    EigenBroadcast<std::decay_t<decltype(place)>, T, OutRank>::Eval(place, y, x,
-                                                                    bcast_dims);
-  }
-};
-
-#define SWITCH_RESHAPE_DIMS(n)                                                \
-  case n: {                                                                   \
-    Eigen::DSizes<Eigen::DenseIndex, n> reshape_dims;                         \
-    for (size_t i = 0; i < reshape_dims_vec.size(); ++i) {                    \
-      reshape_dims[i] = reshape_dims_vec[i];                                  \
-    }                                                                         \
-    dX.device(place) =                                                        \
-        dOut.reshape(reshape_dims).sum(reduce_dims).reshape(dX.dimensions()); \
-    break;                                                                    \
-  }
-
-#define UPPER_SWITCH_REDUCE_DIMS(m)                           \
-  case m: {                                                   \
-    Eigen::DSizes<Eigen::DenseIndex, m> reduce_dims;          \
-    for (size_t i = 0; i < reduce_dims_vec.size(); ++i) {     \
-      reduce_dims[i] = reduce_dims_vec[i];                    \
-    }                                                         \
-    switch (reshape_size) {
-#define LOWER_SWITCH_REDUCE_DIMS                              \
-  default: {                                                  \
-    PADDLE_THROW(platform::errors::InvalidArgument(           \
-        "Detected reshape size: %d out of range"              \
-        "Minimum value should be larger than reduce size %d"  \
-        "While maximum supported is: 5",                      \
-        reshape_size, reduce_size));                          \
-  }                                                           \
-  }                                                           \
-  break;                                                      \
-  }
-
-/* ----- GradOpKernel ----- */
-template <typename DeviceContext, typename T>
-class BroadcastTensorsGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    // Find reduce dimensions
-    const auto& in_tensors =
-        context.MultiInput<Tensor>(framework::GradVarName("Out"));
-    auto out_tensors = context.MultiOutput<Tensor>(framework::GradVarName("X"));
-
-    size_t num_ins = in_tensors.size();
-
-    PADDLE_ENFORCE_GT(
-        num_ins, 1,
-        platform::errors::InvalidArgument(
-            "Expected at least 2 input tensors, but only received d%.",
-            in_tensors.size()));
-
-    PADDLE_ENFORCE_EQ(
-        num_ins, out_tensors.size(),
-        platform::errors::InvalidArgument(
-            "BroadcastTensorsOp expects equal number of inputs and outputs,"
-            "but received: %d inputs v.s %d outputs",
-            num_ins, out_tensors.size()));
-
-    // For each In-Out tensor pair,
-    // Prepare and apply broadcast dims array
-    for (size_t i = 0; i < num_ins; i++) {
-      const auto* input_tensor = in_tensors[i];
-      auto* output_tensor = out_tensors[i];
-
-      const auto& input_dims = input_tensor->dims();
-      const auto& output_dims = output_tensor->dims();
-
-      int in_rank = input_dims.size();
-      int out_rank = output_dims.size();
-
-      // BroadcastTensorsGrad is simply a reduce_sum along broadcasted axes
-      // Here we perform the following Eigen operations:
-      // dOut(Flattened) -> reshape(reshape_dims) -> reduce(reduce_dims) ->
-      // reshape(dX_shape) -> dX
-      // Note the last "reshape(dX_shape)" will be performed implicitly,
-      // and we only need to collect reduce_dims and reshape_dims
-      std::vector<int> reduce_dims_vec;
-      std::vector<int64_t> reshape_dims_vec;
-      for (int j = 0; j < in_rank; j++) {
-        int out_axis = out_rank - j - 1;
-        int in_axis = in_rank - j - 1;
-
-        reshape_dims_vec.push_back(input_dims[j]);
-        if (out_axis < 0 || output_dims[out_axis] != input_dims[in_axis]) {
-          reduce_dims_vec.push_back(in_axis);
-        }
-      }
-
-      size_t reduce_size = reduce_dims_vec.size();
-      size_t reshape_size = reshape_dims_vec.size();
-      bool just_copy = (reduce_dims_vec.size() == 0);
-      output_tensor->mutable_data<T>(context.GetPlace());
-      if (just_copy) {
-        // If this turns out to be a No-Op, simply perform a tensor copy
-        framework::TensorCopy(*input_tensor, context.GetPlace(),
-                              context.device_context(), output_tensor);
-      } else {
-        PADDLE_ENFORCE_GE(reduce_dims_vec.size(), 1,
-                          platform::errors::InvalidArgument(
-                              "The number of dimensions of the input "
-                              "'Out@GRAD' for Op(broadcast_tensors)"
-                              " must be greater than or equal to 1, but "
-                              "the value received is %d.",
-                              reduce_dims_vec.size()));
-        PADDLE_ENFORCE_LE(
-            reduce_dims_vec.size(), 5,
-            platform::errors::InvalidArgument(
-                "The number of dimensions of the input 'Out@GRAD' "
-                "for Op(broadcast_tensors) must be less than or equal "
-                "to 5, but the value received is %d.",
-                reduce_dims_vec.size()));
-
-        // Overall:
-        // dOut(Flattened) -> reshape(reshape_dims) -> reduce(reduce_dims) ->
-        // reshape(dX_shape) -> dX
-        auto dX = framework::EigenVector<T>::Flatten(*output_tensor);
-        auto dOut = framework::EigenVector<T>::Flatten(*input_tensor);
-        auto& place =
-            *context.template device_context<DeviceContext>().eigen_device();
-
-        // Expand ReduceSize and ReshapeSize into static values
-        switch (reduce_size) {
-          UPPER_SWITCH_REDUCE_DIMS(1)
-          SWITCH_RESHAPE_DIMS(1)
-          SWITCH_RESHAPE_DIMS(2)
-          SWITCH_RESHAPE_DIMS(3)
-          SWITCH_RESHAPE_DIMS(4)
-          SWITCH_RESHAPE_DIMS(5)
-          LOWER_SWITCH_REDUCE_DIMS
-
-          UPPER_SWITCH_REDUCE_DIMS(2)
-          SWITCH_RESHAPE_DIMS(2)
-          SWITCH_RESHAPE_DIMS(3)
-          SWITCH_RESHAPE_DIMS(4)
-          SWITCH_RESHAPE_DIMS(5)
-          LOWER_SWITCH_REDUCE_DIMS
-
-          UPPER_SWITCH_REDUCE_DIMS(3)
-          SWITCH_RESHAPE_DIMS(3)
-          SWITCH_RESHAPE_DIMS(4)
-          SWITCH_RESHAPE_DIMS(5)
-          LOWER_SWITCH_REDUCE_DIMS
-
-          UPPER_SWITCH_REDUCE_DIMS(4)
-          SWITCH_RESHAPE_DIMS(4)
-          SWITCH_RESHAPE_DIMS(5)
-          LOWER_SWITCH_REDUCE_DIMS
-
-          UPPER_SWITCH_REDUCE_DIMS(5)
-          SWITCH_RESHAPE_DIMS(5)
-          LOWER_SWITCH_REDUCE_DIMS
-
-          default: {
-            PADDLE_THROW(platform::errors::InvalidArgument(
-                "Detected reduce size: %d out of range"
-                "While maximum supported is: 5",
-                reduce_size));
-          }
-        }
-      }
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
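The two deleted fluid files above are re-homed under phi below, and the multiary.cc hunk that follows carries the same broadcast rule the deleted InferShape used: align shapes at the trailing axis, take the per-axis maximum, treat missing axes as 1, and reject any mismatch that does not involve a 1. For reference, a standalone sketch of that rule (BroadcastedShape is an illustrative helper, not a Paddle API):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <stdexcept>
#include <vector>

std::vector<int64_t> BroadcastedShape(
    const std::vector<std::vector<int64_t>>& shapes) {
  // Output rank = max rank over all inputs.
  size_t target_rank = 0;
  for (const auto& s : shapes) target_rank = std::max(target_rank, s.size());

  std::vector<int64_t> target(target_rank, 1);
  for (size_t i = 0; i < target_rank; ++i) {  // i-th axis, counted from the back
    for (const auto& s : shapes) {
      int64_t d = i < s.size() ? s[s.size() - 1 - i] : 1;  // missing axis -> 1
      int64_t& t = target[target_rank - 1 - i];
      if (t != 1 && d != 1 && t != d)
        throw std::invalid_argument("shapes are not broadcast-compatible");
      t = std::max(t, d);  // per-axis maximum
    }
  }
  return target;
}

int main() {
  // [2,1,4] and [3,1] broadcast to [2,3,4]; every output gets that shape.
  auto out = BroadcastedShape({{2, 1, 4}, {3, 1}});
  assert((out == std::vector<int64_t>{2, 3, 4}));
}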
diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc
index 7634e5e01aca4cdaf7fb46399f9594897f2d0e36..dc5478e8afb981defa9bc493cb440cead4f5965f 100644
--- a/paddle/phi/infermeta/multiary.cc
+++ b/paddle/phi/infermeta/multiary.cc
@@ -13,11 +13,21 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/phi/infermeta/multiary.h"
-
+#include <vector>
 #include "paddle/phi/common/scalar.h"
+#include "paddle/phi/core/meta_tensor.h"
 #include "paddle/phi/kernels/funcs/concat_funcs.h"
 
 namespace phi {
+std::vector<DDim> GetMetaTensorsDim(const std::vector<MetaTensor*>& tensors) {
+  std::vector<DDim> dims;
+  dims.reserve(tensors.size());
+  for (const MetaTensor* tensor : tensors) {
+    dims.emplace_back(tensor->dims());
+  }
+  return dims;
+}
+
 void BilinearTensorProductInferMeta(const MetaTensor& x,
                                     const MetaTensor& y,
                                     const MetaTensor& weight,
@@ -84,6 +94,60 @@ void BilinearTensorProductInferMeta(const MetaTensor& x,
   out->set_dtype(x.dtype());
 }
 
+void BroadcastTensorsInferMeta(const std::vector<MetaTensor*>& x,
+                               std::vector<MetaTensor*> out) {
+  int target_rank = 0;
+  const auto& input_dims = GetMetaTensorsDim(x);
+
+  // 1. Find Output rank = max(Inputs rank)
+  for (const auto& input_ddim : input_dims) {
+    target_rank = std::max(target_rank, input_ddim.size());
+  }
+
+  PADDLE_ENFORCE_GT(target_rank,
+                    0,
+                    errors::InvalidArgument("BroadcastTensorsOp requires at "
+                                            "least one input tensor to have "
+                                            "rank greater than zero"));
+
+  std::vector<int> target_dims(target_rank, 0);
+  // 2. Output dim(axis=x) = max(Inputs dim(axis=x))
+  for (int index = 0; index < target_rank; index++) {
+    // Loop axes in reverse order,
+    // For each axis, take the maximum as target size
+    // Fill size = 1 if shape vector exhausts
+    int target_dim_size = 1;
+    for (const auto& input_ddim : input_dims) {
+      // Reversed order
+      int axis = static_cast<int>(input_ddim.size()) - index - 1;
+      int dim_size = 1;
+      if (axis >= 0) {
+        dim_size = input_ddim[axis];
+      }
+
+      if (target_dim_size != 1 && dim_size != 1 &&
+          target_dim_size != dim_size) {
+        PADDLE_THROW(errors::InvalidArgument(
+            "BroadcastTensorsOp inputs does not satisfy bcast semantics, "
+            "please check axis = %d in reverse order",
+            index));
+      }
+
+      // We performed bcast semantics check at python level
+      // So input tensors should all have legal shape
+      target_dim_size = std::max(target_dim_size, dim_size);
+    }
+    target_dims[target_rank - index - 1] = target_dim_size;
+  }
+
+  // 3. Set Output Dim
+  for (size_t i = 0; i < out.size(); i++) {
+    out[i]->set_dims(phi::make_ddim(target_dims));
+    out[i]->share_lod(*(x[i]));
+    out[i]->set_dtype(x[i]->dtype());
+  }
+}
+
 void ConcatInferMeta(const std::vector<MetaTensor*>& x,
                      const Scalar& axis_scalar,
                      MetaTensor* out,
diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h
index 2afb79daa355cc897e3bf4076003e9a41de8b96c..51738c5e08e9842c7cffcdd1a2ce7ee3764d6412 100644
--- a/paddle/phi/infermeta/multiary.h
+++ b/paddle/phi/infermeta/multiary.h
@@ -18,6 +18,8 @@ limitations under the License. */
 #include "paddle/phi/core/meta_tensor.h"
 
 namespace phi {
+std::vector<DDim> GetMetaTensorsDim(const std::vector<MetaTensor*>& tensors);
+
 void BilinearTensorProductInferMeta(const MetaTensor& x,
                                     const MetaTensor& y,
                                     const MetaTensor& weight,
@@ -25,6 +27,9 @@ void BilinearTensorProductInferMeta(const MetaTensor& x,
                                     MetaTensor* out,
                                     MetaConfig config = MetaConfig());
 
+void BroadcastTensorsInferMeta(const std::vector<MetaTensor*>& x,
+                               std::vector<MetaTensor*> out);
+
 void ConcatInferMeta(const std::vector<MetaTensor*>& x,
                      const Scalar& axis_scalar,
                      MetaTensor* out,
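The new kernel headers that follow use phi's multi-input/multi-output convention: inputs arrive as a value vector, outputs as a vector of mutable pointers, one output per input. A toy model of consuming such a signature (Tensor and CopyAll are stand-ins, not phi types):

#include <cassert>
#include <vector>

struct Tensor {
  std::vector<float> data;
};

// Mirrors the shape of:
//   BroadcastTensorsKernel(ctx, const std::vector<DenseTensor>& x,
//                          std::vector<DenseTensor*> out);
void CopyAll(const std::vector<Tensor>& x, std::vector<Tensor*> out) {
  assert(x.size() == out.size());
  for (size_t i = 0; i < x.size(); ++i) *out[i] = x[i];  // per-pair kernel body
}

int main() {
  std::vector<Tensor> ins{{{1, 2}}, {{3}}};
  Tensor a, b;
  CopyAll(ins, {&a, &b});
  assert(a.data.size() == 2 && b.data.size() == 1);
}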
diff --git a/paddle/phi/kernels/broadcast_tensors_grad_kernel.h b/paddle/phi/kernels/broadcast_tensors_grad_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..5ec2e35cc9b0cfe09fd281605984e72a603b8f5e
--- /dev/null
+++ b/paddle/phi/kernels/broadcast_tensors_grad_kernel.h
@@ -0,0 +1,27 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <vector>
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void BroadcastTensorsGradKernel(const Context& ctx,
+                                const std::vector<DenseTensor>& dout,
+                                std::vector<DenseTensor*> dx);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/broadcast_tensors_kernel.h b/paddle/phi/kernels/broadcast_tensors_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..fb2a6f1136c26cb1bee1ca26ae7d214566862709
--- /dev/null
+++ b/paddle/phi/kernels/broadcast_tensors_kernel.h
@@ -0,0 +1,27 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <vector>
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void BroadcastTensorsKernel(const Context& ctx,
+                            const std::vector<DenseTensor>& x,
+                            std::vector<DenseTensor*> out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/complex_grad_kernel.h b/paddle/phi/kernels/complex_grad_kernel.h
index 505d4d374424141ad71da863d1fd7a6424fb35ef..be13e2826ea81455fd811143dde02f2d11cfdae2 100644
--- a/paddle/phi/kernels/complex_grad_kernel.h
+++ b/paddle/phi/kernels/complex_grad_kernel.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
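The phi grad kernels introduced in the following hunks rest on the identity that the gradient of a broadcast is a sum-reduction over the broadcast axes. A self-contained numeric check of that identity, using plain arrays rather than Paddle types:

#include <cassert>

int main() {
  // Forward: x of shape [1,3] broadcast to out of shape [2,3].
  // Backward: dx[j] = sum over the broadcast axis of dout[i][j].
  const float dout[2][3] = {{1, 2, 3}, {4, 5, 6}};
  float dx[3] = {0, 0, 0};
  for (int i = 0; i < 2; ++i)
    for (int j = 0; j < 3; ++j) dx[j] += dout[i][j];
  assert(dx[0] == 5 && dx[1] == 7 && dx[2] == 9);  // column sums
}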
diff --git a/paddle/phi/kernels/complex_kernel.h b/paddle/phi/kernels/complex_kernel.h
index 44bfae9820aa84cb33784f108ace6aa0ab8b5281..3b3003392d37f384416643a3b8a52b4a6809216d 100644
--- a/paddle/phi/kernels/complex_kernel.h
+++ b/paddle/phi/kernels/complex_kernel.h
@@ -50,14 +50,10 @@ DenseTensor Conj(const Context& dev_ctx, const DenseTensor& x) {
   return x;
 }
 
-template <typename T, typename DeviceContext>
-void RealKernel(const DeviceContext& dev_ctx,
-                const DenseTensor& x,
-                DenseTensor* out);
-
-template <typename T, typename DeviceContext>
-void ImagKernel(const DeviceContext& dev_ctx,
-                const DenseTensor& x,
-                DenseTensor* out);
+template <typename T, typename Context>
+void RealKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out);
+
+template <typename T, typename Context>
+void ImagKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out);
 
 }  // namespace phi
diff --git a/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc b/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7a97f8c2189736452a722882f8d86a6cfaeae0f5
--- /dev/null
+++ b/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc
@@ -0,0 +1,201 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/broadcast_tensors_grad_kernel.h"
+
+#include <vector>
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/phi/common/float16.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/enforce.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+#define SWITCH_RESHAPE_DIMS(n)                                                \
+  case n: {                                                                   \
+    Eigen::DSizes<Eigen::DenseIndex, n> reshape_dims;                         \
+    for (size_t i = 0; i < reshape_dims_vec.size(); ++i) {                    \
+      reshape_dims[i] = reshape_dims_vec[i];                                  \
+    }                                                                         \
+    dX.device(place) =                                                        \
+        dOut.reshape(reshape_dims).sum(reduce_dims).reshape(dX.dimensions()); \
+    break;                                                                    \
+  }
+
+#define UPPER_SWITCH_REDUCE_DIMS(m)                                 \
+  case m: {                                                         \
+    Eigen::DSizes<Eigen::DenseIndex, m> reduce_dims;                \
+    for (size_t i = 0; i < reduce_dims_vec.size(); ++i) {           \
+      reduce_dims[i] = reduce_dims_vec[i];                          \
+    }                                                               \
+    switch (reshape_size) {
+#define LOWER_SWITCH_REDUCE_DIMS                                    \
+  default: {                                                        \
+    PADDLE_THROW(errors::InvalidArgument(                           \
+        "Detected reshape size: %d out of range. "                  \
+        "Minimum value should be larger than reduce size %d. "      \
+        "While maximum supported is: 5",                            \
+        reshape_size,                                               \
+        reduce_size));                                              \
+  }                                                                 \
+  }                                                                 \
+  break;                                                            \
+  }
+
+namespace phi {
+
+template <typename T, typename Context>
+void BroadcastTensorsGradKernel(const Context& ctx,
+                                const std::vector<DenseTensor>& dout,
+                                std::vector<DenseTensor*> dx) {
+  // Find reduce dimensions
+  const auto& in_tensors = dout;
+  auto& out_tensors = dx;
+
+  size_t num_ins = in_tensors.size();
+
+  PADDLE_ENFORCE_GT(
+      num_ins,
+      1,
+      errors::InvalidArgument(
+          "Expected at least 2 input tensors, but only received %d.",
+          in_tensors.size()));
+
+  PADDLE_ENFORCE_EQ(num_ins,
+                    out_tensors.size(),
+                    errors::InvalidArgument(
and " + "outputs, but received: %d inputs v.s %d outputs", + num_ins, + out_tensors.size())); + + // For each In-Out tensor pair, + // Prepare and apply broadcast dims array + for (size_t i = 0; i < num_ins; i++) { + const auto* input_tensor = &in_tensors[i]; + auto* output_tensor = out_tensors[i]; + + const auto& input_dims = input_tensor->dims(); + const auto& output_dims = output_tensor->dims(); + + int in_rank = input_dims.size(); + int out_rank = output_dims.size(); + + // BroadcastTensorsGrad is simply a reduce_sum along broadcasted axes + // Here we perform the following Eigen operations: + // dOut(Flattened) -> reshape(reshape_dims) -> reduce(reduce_dims) -> + // reshape(dX_shape) -> dX + // Note the last "reshape(dX_shape)" will be performed implicitly, + // and we only need to collect reduce_dims and reshape_dims + std::vector reduce_dims_vec; + std::vector reshape_dims_vec; + for (int j = 0; j < in_rank; j++) { + int out_axis = out_rank - j - 1; + int in_axis = in_rank - j - 1; + + reshape_dims_vec.push_back(input_dims[j]); + if (out_axis < 0 || output_dims[out_axis] != input_dims[in_axis]) { + reduce_dims_vec.push_back(in_axis); + } + } + + size_t reduce_size = reduce_dims_vec.size(); + size_t reshape_size = reshape_dims_vec.size(); + bool just_copy = (reduce_dims_vec.size() == 0); + ctx.template Alloc(output_tensor); + if (just_copy) { + // If this turns out to be a No-Op, simply perform a tensor copy + paddle::framework::TensorCopy( + *input_tensor, ctx.GetPlace(), ctx, output_tensor); + } else { + PADDLE_ENFORCE_GE( + reduce_dims_vec.size(), + 1, + errors::InvalidArgument("The number of dimensions of the input " + "'Out@GRAD' for Op(broadcast_tensors)" + " must be greater than or equal to 1, but " + "the value received is %d.", + reduce_dims_vec.size())); + PADDLE_ENFORCE_LE( + reduce_dims_vec.size(), + 5, + errors::InvalidArgument( + "The number of dimensions of the input 'Out@GRAD' " + "for Op(broadcast_tensors) must be less than or equal " + "to 5, but the value received is %d.", + reduce_dims_vec.size())); + + // Overall: + // dOut(Flattened) -> reshape(reshape_dims) -> reduce(reduce_dims) -> + // reshape(dX_shape) -> dX + auto dX = EigenVector::Flatten(*output_tensor); + auto dOut = EigenVector::Flatten(*input_tensor); + auto& place = *ctx.eigen_device(); + + // Expand ReduceSize and ReshapeSize into static values + switch (reduce_size) { + UPPER_SWITCH_REDUCE_DIMS(1) + SWITCH_RESHAPE_DIMS(1) + SWITCH_RESHAPE_DIMS(2) + SWITCH_RESHAPE_DIMS(3) + SWITCH_RESHAPE_DIMS(4) + SWITCH_RESHAPE_DIMS(5) + LOWER_SWITCH_REDUCE_DIMS + + UPPER_SWITCH_REDUCE_DIMS(2) + SWITCH_RESHAPE_DIMS(2) + SWITCH_RESHAPE_DIMS(3) + SWITCH_RESHAPE_DIMS(4) + SWITCH_RESHAPE_DIMS(5) + LOWER_SWITCH_REDUCE_DIMS + + UPPER_SWITCH_REDUCE_DIMS(3) + SWITCH_RESHAPE_DIMS(3) + SWITCH_RESHAPE_DIMS(4) + SWITCH_RESHAPE_DIMS(5) + LOWER_SWITCH_REDUCE_DIMS + + UPPER_SWITCH_REDUCE_DIMS(4) + SWITCH_RESHAPE_DIMS(4) + SWITCH_RESHAPE_DIMS(5) + LOWER_SWITCH_REDUCE_DIMS + + UPPER_SWITCH_REDUCE_DIMS(5) + SWITCH_RESHAPE_DIMS(5) + LOWER_SWITCH_REDUCE_DIMS + + default: { + PADDLE_THROW( + errors::InvalidArgument("Detected reduce size: %d out of range" + "While maximum supported is: 5", + reduce_size)); + } + } + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(broadcast_tensors_grad, + CPU, + ALL_LAYOUT, + phi::BroadcastTensorsGradKernel, + int, + int64_t, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/cpu/broadcast_tensors_kernel.cc b/paddle/phi/kernels/cpu/broadcast_tensors_kernel.cc new file mode 
diff --git a/paddle/phi/kernels/cpu/broadcast_tensors_kernel.cc b/paddle/phi/kernels/cpu/broadcast_tensors_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4cb6db876927142baac0ba0cde3438a4e3b00159
--- /dev/null
+++ b/paddle/phi/kernels/cpu/broadcast_tensors_kernel.cc
@@ -0,0 +1,30 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/broadcast_tensors_kernel.h"
+#include "paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h"
+
+#include "paddle/phi/common/float16.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_REGISTER_KERNEL(broadcast_tensors,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::BroadcastTensorsKernel,
+                   bool,
+                   int,
+                   int64_t,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu b/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..6fb24d72145c67be2ad1d25620e7886326e8cd6f
--- /dev/null
+++ b/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu
@@ -0,0 +1,111 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/broadcast_tensors_grad_kernel.h"
+
+#include <vector>
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/phi/common/float16.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/enforce.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/gpu/reduce.h"
+#include "paddle/phi/kernels/primitive/functor_primitives.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void BroadcastTensorsGradKernel(const Context& ctx,
+                                const std::vector<DenseTensor>& dout,
+                                std::vector<DenseTensor*> dx) {
+  // Find reduce dimensions
+  const auto& in_tensors = dout;
+  auto& out_tensors = dx;
+
+  size_t num_ins = in_tensors.size();
+
+  PADDLE_ENFORCE_GT(
+      num_ins,
+      1,
+      errors::InvalidArgument(
+          "Expected at least 2 input tensors, but only received %d.",
+          in_tensors.size()));
+
+  PADDLE_ENFORCE_EQ(
+      num_ins,
+      out_tensors.size(),
+      errors::InvalidArgument(
+          "BroadcastTensorsOp expects equal number of inputs and outputs, "
+          "but received: %d inputs v.s %d outputs",
+          num_ins,
+          out_tensors.size()));
+
+  // For each In-Out tensor pair,
+  // Prepare and apply broadcast dims array
+  for (size_t i = 0; i < num_ins; i++) {
+    auto* input_tensor = &in_tensors[i];
+    auto* output_tensor = out_tensors[i];
+
+    const DDim& input_dims = input_tensor->dims();
+    const DDim& output_dims = output_tensor->dims();
+
+    int in_rank = input_dims.size();
+    int out_rank = output_dims.size();
+
+    // Collect reduce_dims
+    // Example:
+    //    dX   = [1,1,1,1]
+    //    dOut = [1,1,1,4]
+    //
+    //    reduce_dims = [3]  // reduce along the broadcasted axis
+    std::vector<int> reduce_dims_vec;
+    for (int j = 0; j < in_rank; j++) {
+      int out_axis = out_rank - j - 1;
+      int in_axis = in_rank - j - 1;
+
+      if (out_axis < 0 || output_dims[out_axis] != input_dims[in_axis]) {
+        reduce_dims_vec.push_back(in_axis);
+      }
+    }
+
+    bool just_copy = (reduce_dims_vec.size() == 0);
+    ctx.template Alloc<T>(output_tensor);
+    if (just_copy) {
+      // Turns out to be a No-Op, simply copy tensors
+      paddle::framework::TensorCopy(
+          *input_tensor, ctx.GetPlace(), ctx, output_tensor);
+    } else {
+      // reduce_sum implementation on CUDA
+      kernels::TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+          ctx,
+          *input_tensor,
+          output_tensor,
+          kps::IdentityFunctor<T>(),
+          reduce_dims_vec,
+          ctx.stream());
+    }
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(broadcast_tensors_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::BroadcastTensorsGradKernel,
+                   int,
+                   int64_t,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
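Both the Eigen and the CUDA grad paths above pick reduce axes the same way: walk the dout axes from the back and mark one for reduction when the matching dx axis is missing or differs in extent. The same logic extracted as a standalone function with a small check (ReduceDims is illustrative, not a Paddle helper):

#include <cassert>
#include <cstdint>
#include <vector>

std::vector<int> ReduceDims(const std::vector<int64_t>& dout_dims,
                            const std::vector<int64_t>& dx_dims) {
  int in_rank = static_cast<int>(dout_dims.size());
  int out_rank = static_cast<int>(dx_dims.size());
  std::vector<int> reduce_dims;
  for (int j = 0; j < in_rank; ++j) {
    int out_axis = out_rank - j - 1;  // dx axis, counted from the back
    int in_axis = in_rank - j - 1;    // dout axis, counted from the back
    if (out_axis < 0 || dx_dims[out_axis] != dout_dims[in_axis])
      reduce_dims.push_back(in_axis);  // axis was created or stretched
  }
  return reduce_dims;
}

int main() {
  // dout = [2,3,4] was broadcast from dx = [3,1]: reduce dout axes 2 and 0.
  assert((ReduceDims({2, 3, 4}, {3, 1}) == std::vector<int>{2, 0}));
}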
diff --git a/paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu b/paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..aa45bd3c4389177a07b5228319940e9b840fe1b2
--- /dev/null
+++ b/paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu
@@ -0,0 +1,30 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/broadcast_tensors_kernel.h"
+#include "paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h"
+
+#include "paddle/phi/common/float16.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_REGISTER_KERNEL(broadcast_tensors,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::BroadcastTensorsKernel,
+                   bool,
+                   int,
+                   int64_t,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h b/paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..eb01b83377cb62c7dc6147cd57edcd3c9c047f78
--- /dev/null
+++ b/paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h
@@ -0,0 +1,118 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/kernels/broadcast_tensors_kernel.h"
+
+#include <vector>
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/enforce.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+#define SWITCH_OUT_RANK_CASE(n)                                         \
+  case n: {                                                             \
+    ApplyBroadcast<Context, T, n>(ctx, &in_tensors[i], out_tensors[i]); \
+    break;                                                              \
+  }
+
+namespace phi {
+
+template <typename Context, typename T, int OutRank>
+void ApplyBroadcast(const Context& ctx,
+                    const DenseTensor* input_tensor,
+                    DenseTensor* output_tensor) {
+  const auto& input_dims = input_tensor->dims();
+  const auto& output_dims = output_tensor->dims();
+
+  int in_rank = input_dims.size();
+  int out_rank = output_dims.size();
+
+  // 1. Collect bcast_dims, each element of which indicates how many
+  // times we need to replicate along the corresponding dimension
+  // 2. Collect new_input_dims_vec. Eigen::broadcast requires same rank for
+  // both input and output tensors, so we need to initialize input X with
+  // expanded dims: "new_input_dims_vec"
+  Eigen::DSizes<Eigen::DenseIndex, OutRank> bcast_dims;
+  std::vector<int64_t> new_input_dims_vec(out_rank);
+  for (int j = 0; j < out_rank; j++) {
+    int out_axis = out_rank - j - 1;
+    int in_axis = in_rank - j - 1;
+
+    bcast_dims[out_axis] = output_dims[out_axis];
+    new_input_dims_vec[out_axis] = 1;
+    if (in_axis >= 0 && input_dims[in_axis] == output_dims[out_axis]) {
+      bcast_dims[out_axis] = 1;
+      new_input_dims_vec[out_axis] = input_dims[in_axis];
+    }
+  }
+  auto new_input_dims = phi::make_ddim(new_input_dims_vec);
+
+  // Initialize input X with new_input_dims_vec, so it's rank-aligned with the
+  // output
+  auto x = EigenTensor<T, OutRank>::From(*input_tensor, new_input_dims);
+
+  ctx.template Alloc<T>(output_tensor);
+  auto y = EigenTensor<T, OutRank>::From(*output_tensor, output_dims);
+
+  auto& place = *ctx.eigen_device();
+  funcs::EigenBroadcast<std::decay_t<decltype(place)>, T, OutRank>::Eval(
+      place, y, x, bcast_dims);
+}
+
+template <typename T, typename Context>
+void BroadcastTensorsKernel(const Context& ctx,
+                            const std::vector<DenseTensor>& x,
+                            std::vector<DenseTensor*> out) {
+  const auto& in_tensors = x;
+  auto out_tensors = out;
+  size_t num_ins = in_tensors.size();
+
+  PADDLE_ENFORCE_GT(
+      num_ins,
+      1,
+      errors::InvalidArgument(
+          "Expected at least 2 input tensors, but only received %d.",
+          in_tensors.size()));
+
+  PADDLE_ENFORCE_EQ(num_ins,
+                    out_tensors.size(),
+                    errors::InvalidArgument(
+                        "BroadcastTensorsOp expects equal number of inputs and "
+                        "outputs, but received: %d inputs v.s %d outputs",
+                        num_ins,
+                        out_tensors.size()));
+
+  // Eigen has no support for dynamic ranked tensor
+  // Thus we perform static expansion for each possible ranks
+  for (size_t i = 0; i < num_ins; i++) {
+    int out_rank = out_tensors[i]->dims().size();
+    switch (out_rank) {
+      SWITCH_OUT_RANK_CASE(1)
+      SWITCH_OUT_RANK_CASE(2)
+      SWITCH_OUT_RANK_CASE(3)
+      SWITCH_OUT_RANK_CASE(4)
+      SWITCH_OUT_RANK_CASE(5)
+      default: {
+        PADDLE_THROW(paddle::platform::errors::InvalidArgument(
+            "Target tensor rank out of range. "
+            "Maximum supported rank for broadcast is: 5"));
+      }
+    }
+  }
+}
+
+}  // namespace phi
diff --git a/paddle/phi/ops/compat/broadcast_tensors_sig.cc b/paddle/phi/ops/compat/broadcast_tensors_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2c979c4aedcc88c3b6bc6664de9ae3175272eec6
--- /dev/null
+++ b/paddle/phi/ops/compat/broadcast_tensors_sig.cc
@@ -0,0 +1,28 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature BroadcastTensorsGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "broadcast_tensors_grad", {GradVarName("Out")}, {}, {GradVarName("X")});
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(broadcast_tensors_grad,
+                           phi::BroadcastTensorsGradOpArgumentMapping);
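Only the grad op receives an explicit argument mapping above, because its fluid-side names (Out@GRAD and X@GRAD, which is what GradVarName expands to) must be routed to the phi kernel's parameters; the forward op's X/Out presumably map by the default rules. A toy model of what such a signature records (this KernelSignature struct is a stand-in, not phi's class):

#include <iostream>
#include <string>
#include <vector>

struct KernelSignature {
  std::string kernel_name;
  std::vector<std::string> inputs;
  std::vector<std::string> attrs;
  std::vector<std::string> outputs;
};

KernelSignature BroadcastTensorsGradMapping() {
  // Mirrors the mapping registered above:
  // {"broadcast_tensors_grad", {GradVarName("Out")}, {}, {GradVarName("X")}}
  return {"broadcast_tensors_grad", {"Out@GRAD"}, {}, {"X@GRAD"}};
}

int main() {
  auto sig = BroadcastTensorsGradMapping();
  std::cout << sig.kernel_name << ": " << sig.inputs[0] << " -> "
            << sig.outputs[0] << "\n";
}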