diff --git a/paddle/phi/kernels/xpu/masked_select_kernel.cc b/paddle/phi/kernels/xpu/masked_select_kernel.cc
index 43b8d2cba25f28d79b838c78445951be38458a1d..0f142e852a9a7fd9f77ff10fc439938b89c7b3c4 100644
--- a/paddle/phi/kernels/xpu/masked_select_kernel.cc
+++ b/paddle/phi/kernels/xpu/masked_select_kernel.cc
@@ -62,14 +62,16 @@ void MaskedSelectKernel(const Context& dev_ctx,
   auto input_shape = vectorize<int>(input_dim);
   auto mask_shape = vectorize<int>(mask_dim);
 
-  PADDLE_ENFORCE_XDNN_SUCCESS(xpu::masked_select(dev_ctx.x_context(),
-                                                 input_data,
-                                                 mask_data,
-                                                 out_data,
-                                                 input_shape,
-                                                 mask_shape,
-                                                 out_size_cpu),
-                              "masked_select");
+  if (out_size_cpu > 0) {
+    PADDLE_ENFORCE_XDNN_SUCCESS(xpu::masked_select(dev_ctx.x_context(),
+                                                   input_data,
+                                                   mask_data,
+                                                   out_data,
+                                                   input_shape,
+                                                   mask_shape,
+                                                   out_size_cpu),
+                                "masked_select");
+  }
 }
 
 }  // namespace phi
diff --git a/paddle/phi/kernels/xpu/matmul_grad_kernel.cc b/paddle/phi/kernels/xpu/matmul_grad_kernel.cc
index dfdac5a552aaf97dcbf946b23b7823987af69b7b..07f93dc2d6a78b16ecd51ed0c908997cec822449 100644
--- a/paddle/phi/kernels/xpu/matmul_grad_kernel.cc
+++ b/paddle/phi/kernels/xpu/matmul_grad_kernel.cc
@@ -56,6 +56,15 @@ void MatmulGradKernel(const Context& dev_ctx,
                               : reinterpret_cast<XPUType*>(dx->data<T>());
   XPUType* c_2 = (dy == NULL) ? reinterpret_cast<XPUType*>(NULL)
                               : reinterpret_cast<XPUType*>(dy->data<T>());
+
+  if (info_forward.is_x_need_broadcast) {
+    XPUType* new_c_1 = nullptr;
+    new_c_1 = RAII_GUARD.alloc_l3_or_gm<XPUType>(
+        info_forward.bs * info_forward.m * info_forward.k);
+    PADDLE_ENFORCE_XDNN_NOT_NULL(new_c_1);
+    c_1 = new_c_1;
+  }
+
   XpuFcInfo info_dx;
   XpuFcInfo info_dy;
   std::tuple<XpuFcInfo,
@@ -75,6 +84,15 @@ void MatmulGradKernel(const Context& dev_ctx,
   std::tie(info_dx, info_dy, a_1, b_1, a_2, b_2) = fc_info;
   if (dx) {
     MatMulXPUFunction<XPUType>(xpu_ctx, a_1, b_1, c_1, info_dx, 1.0f);
+    if (info_forward.is_x_need_broadcast) {
+      int r = xpu::reduce_sum<XPUType>(
+          xpu_ctx,
+          c_1,
+          reinterpret_cast<XPUType*>(dx->data<T>()),
+          {info_forward.bs, info_forward.m, info_forward.k},
+          {0});
+      PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_sum");
+    }
   }
   if (dy) {
     MatMulXPUFunction<XPUType>(xpu_ctx, a_2, b_2, c_2, info_dy, 1.0f);
diff --git a/paddle/phi/kernels/xpu/xpu_api_wrapper.h b/paddle/phi/kernels/xpu/xpu_api_wrapper.h
index 0ebed7f449e3f26d814a96203c6bd5a0bebc84c0..8fefbd84c65d35b31fc7339a28ae30abf23825b2 100644
--- a/paddle/phi/kernels/xpu/xpu_api_wrapper.h
+++ b/paddle/phi/kernels/xpu/xpu_api_wrapper.h
@@ -58,6 +58,7 @@ struct XpuFcInfo {
   float* max_x;
   float* max_y;
   float* max_out;
+  bool is_x_need_broadcast;
   XpuFcInfo()
       : bs(0),
         m(0),
@@ -70,7 +71,8 @@ struct XpuFcInfo {
         stride_out(0),
         max_x(nullptr),
         max_y(nullptr),
-        max_out(nullptr) {}
+        max_out(nullptr),
+        is_x_need_broadcast(false) {}
   void InitFcInfo(int bs,
                   int m,
                   int n,
@@ -145,8 +147,12 @@ static void GetFCInfo(const phi::DDim& x_dims,
             y_dims.to_str(),
             mat_dim_a.trans_,
             mat_dim_b.trans_));
-    mat_dim_b.height_ *= mat_dim_b.batch_size_;
-    mat_dim_b.batch_size_ = 0;
+    if (mat_dim_a.width_ == mat_dim_b.batch_size_ * mat_dim_b.height_) {
+      mat_dim_b.height_ *= mat_dim_b.batch_size_;
+      mat_dim_b.batch_size_ = 0;
+    } else {
+      info->is_x_need_broadcast = true;
+    }
   }
 
   if (mat_dim_a.width_ == mat_dim_b.height_) {
@@ -171,7 +177,7 @@ static void GetFCInfo(const phi::DDim& x_dims,
   info->m = mat_dim_a.height_;
   info->n = mat_dim_b.width_;
   info->k = mat_dim_a.width_;
-  info->bs = mat_dim_a.batch_size_;
+  info->bs = std::max(mat_dim_a.batch_size_, mat_dim_b.batch_size_);
   info->trans_x = trans_x;
   info->trans_y = trans_y;
 
@@ -406,6 +412,7 @@ static void MatMulXPUFunction(xpu::Context* xpu_ctx,
   float* max_x = fcinfo.max_x;
   float* max_y = fcinfo.max_y;
   float* max_out = fcinfo.max_out;
+  bool is_x_need_broadcast = fcinfo.is_x_need_broadcast;
 
   if (batch_size <= 1) {
     fc_api(xpu_ctx,
@@ -428,6 +435,19 @@ static void MatMulXPUFunction(xpu::Context* xpu_ctx,
            nullptr,
            xpu::Activation_t::LINEAR);
   } else {
+    const XPUType* x_data = reinterpret_cast<const XPUType*>(x);
+    if (is_x_need_broadcast) {
+      XPUType* x_broadcast_data = nullptr;
+      xpu::ctx_guard RAII_GUARD(xpu_ctx);
+      x_broadcast_data = RAII_GUARD.alloc_l3_or_gm<XPUType>(batch_size * m * k);
+      PADDLE_ENFORCE_XDNN_NOT_NULL(x_broadcast_data);
+      std::vector<int> x_shape = {1, m, k};
+      std::vector<int> new_x_shape = {batch_size, m, k};
+      int r = xpu::broadcast<XPUType>(
+          xpu_ctx, x_data, x_broadcast_data, x_shape, new_x_shape);
+      PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast");
+      x_data = x_broadcast_data;
+    }
     // batch matmul
     fc_batch_api(xpu_ctx,                              // Context* ctx,
                  batch_size,                           // int batch_size,
@@ -437,7 +457,7 @@ static void MatMulXPUFunction(xpu::Context* xpu_ctx,
                  n,                                    // int n,
                  k,                                    // int k,
                  alpha,                                // float alpha,
-                 reinterpret_cast<const XPUType*>(x),  // const TX* x,
+                 x_data,                               // const TX* x,
                  ldx,                                  // int stride_a,
                  reinterpret_cast<const XPUType*>(y),  // const TW* w,
                  ldy,                                  // int stride_b,
@@ -554,6 +574,7 @@ MatmulGradFcInfo(xpu::Context* xpu_ctx,
                         nullptr,
                         max_dout,
                         nullptr);
+    dy_shape.is_x_need_broadcast = dout_shape.is_x_need_broadcast;
     dy_a = x, dy_b = dout_new;
   } else if (trans_y) {
     // dx = dout * y
@@ -600,6 +621,7 @@ MatmulGradFcInfo(xpu::Context* xpu_ctx,
                         nullptr,
                         max_dout,
                         nullptr);
+    dy_shape.is_x_need_broadcast = dout_shape.is_x_need_broadcast;
     dy_a = x, dy_b = dout_new;
   }
   std::tuple<XpuFcInfo, XpuFcInfo, const T*, const T*, const T*, const T*>
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py
index 3e873a965f6d96416c2f5ca3427148c6a67e1f91..c2a1ab4ee0b6216d08a4ca82afe5825f13578b95 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py
@@ -294,6 +294,30 @@ class XPUTestMatmulV2Op(XPUOpTestWrapper):
             self.trans_x = False
             self.trans_y = False
 
+    class TestMatMulOp19(TestMatMulV2Op):
+        """
+        case 19 : (x.ndim <= 2) && (y.ndim >= 3),
+                  x need to broadcast and trans_y is false
+        """
+
+        def config(self):
+            self.x_shape = (10, 20)
+            self.y_shape = (2, 20, 4)
+            self.trans_x = False
+            self.trans_y = False
+
+    class TestMatMulOp20(TestMatMulV2Op):
+        """
+        case 20 : (x.ndim <= 2) && (y.ndim >= 3),
+                  x need to broadcast and trans_y is false
+        """
+
+        def config(self):
+            self.x_shape = (20, 10)
+            self.y_shape = (2, 20, 4)
+            self.trans_x = True
+            self.trans_y = False
+
 
 support_types = get_xpu_op_support_types('matmul_v2')
 for stype in support_types: