diff --git a/paddle/phi/api/lib/scalar.cc b/paddle/phi/api/lib/scalar.cc
index 09b15f629d8040784c78918d109d85e5fa69b277..78207c7ae66c8fea071fda57a46f24e14082272e 100644
--- a/paddle/phi/api/lib/scalar.cc
+++ b/paddle/phi/api/lib/scalar.cc
@@ -31,7 +31,8 @@ ScalarBase::ScalarBase(const Tensor& tensor_in)
                         "now Tensor has `%d` elements",
                         tensor_in.numel()));
   auto tensor_in_place = tensor_in.place().GetType();
-  if (tensor_in_place == phi::AllocationType::GPU) {
+  if (tensor_in_place == phi::AllocationType::XPU ||
+      tensor_in_place == phi::AllocationType::GPU) {
     Tensor dst_tensor;
     copy(tensor_in, phi::CPUPlace(), true, &dst_tensor);
     GetDataFromTensor(dst_tensor);
diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py
index 38b03225616ed4c9280f909b073da470c3df8275..6a0a0b66cbeb25c6127e1895064c24bac6ddf3b0 100644
--- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py
+++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py
@@ -46,7 +46,7 @@ from .group_sharded_storage import ParamStorage, GradStorage
 from .group_sharded_utils import Type, device_guard, GroupShardedClipGrad
 
-# CUDA alignment 256 bytes, cpu alignment 4096 bytes
-alignment = {"gpu": 256, "cpu": 4096}
+# GPU/XPU alignment 256 bytes, CPU alignment 4096 bytes
+alignment = {"gpu": 256, "cpu": 4096, "xpu": 256}
 align = {
     Type.fp16.value: 2,
     Type.bf16.value: 2,
@@ -85,7 +85,9 @@ class GroupShardedOptimizerStage2(Optimizer):
     ):
         super().__init__(learning_rate=optim._learning_rate, parameters=params)
 
-        assert core.is_compiled_with_cuda(), "Only GPU is supported now"
+        assert (
+            core.is_compiled_with_cuda() or core.is_compiled_with_xpu()
+        ), "Only GPU and XPU are supported now"
 
         # Segmentation information
         self._dtype_rank_params = (
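Context for the two hunks above: the scalar.cc change routes XPU tensors through the same copy-to-CPU path already used for GPU, since Scalar construction must read the single element on the host; the stage-2 sharding change registers a 256-byte XPU alignment and relaxes the build-time assert. As a minimal sketch of how the alignment table is consumed when sizing fused buffers (the helper name `aligned_numel` is hypothetical, not Paddle's actual API; the real logic lives in ParamStorage/GradStorage):

```python
# Sketch only: round a parameter's byte size up to the device's alignment
# boundary before it is packed into a fused buffer. String keys stand in
# for Type.fp16.value etc. used by the real module.
alignment = {"gpu": 256, "cpu": 4096, "xpu": 256}  # bytes, as in the diff
align = {"fp16": 2, "bf16": 2, "fp32": 4}          # element size in bytes

def aligned_numel(numel: int, dtype: str, device: str) -> int:
    """Return `numel` rounded up so numel * elem_size is a multiple of the
    device's alignment (256 B on GPU/XPU, 4096 B on CPU)."""
    size = numel * align[dtype]
    remainder = size % alignment[device]
    if remainder:
        size += alignment[device] - remainder
    return size // align[dtype]

# 1000 fp32 elements = 4000 B; padded to 4096 B -> 1024 elements on xpu.
assert aligned_numel(1000, "fp32", "xpu") == 1024
```

Rounding each slice up to the device boundary keeps every rank's shard aligned after the fused buffer is partitioned, which is presumably why XPU reuses the same 256-byte value as CUDA here.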