diff --git a/paddle/phi/api/lib/scalar.cc b/paddle/phi/api/lib/scalar.cc
index 09b15f629d8040784c78918d109d85e5fa69b277..78207c7ae66c8fea071fda57a46f24e14082272e 100644
--- a/paddle/phi/api/lib/scalar.cc
+++ b/paddle/phi/api/lib/scalar.cc
@@ -31,7 +31,8 @@ ScalarBase::ScalarBase(const Tensor& tensor_in)
                         "now Tensor has `%d` elements",
                         tensor_in.numel()));
   auto tensor_in_place = tensor_in.place().GetType();
-  if (tensor_in_place == phi::AllocationType::GPU) {
+  if (tensor_in_place == phi::AllocationType::XPU ||
+      tensor_in_place == phi::AllocationType::GPU) {
     Tensor dst_tensor;
     copy(tensor_in, phi::CPUPlace(), true, &dst_tensor);
     GetDataFromTensor(dst_tensor);
diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py
index 38b03225616ed4c9280f909b073da470c3df8275..6a0a0b66cbeb25c6127e1895064c24bac6ddf3b0 100644
--- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py
+++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py
@@ -46,7 +46,7 @@ from .group_sharded_storage import ParamStorage, GradStorage
 from .group_sharded_utils import Type, device_guard, GroupShardedClipGrad
 
-# CUDA alignment 256 bytes, cpu alignment 4096 bytes
-alignment = {"gpu": 256, "cpu": 4096}
+# GPU/XPU alignment 256 bytes, CPU alignment 4096 bytes
+alignment = {"gpu": 256, "cpu": 4096, "xpu": 256}
 align = {
     Type.fp16.value: 2,
     Type.bf16.value: 2,
@@ -85,7 +85,9 @@ class GroupShardedOptimizerStage2(Optimizer):
     ):
         super().__init__(learning_rate=optim._learning_rate, parameters=params)
 
-        assert core.is_compiled_with_cuda(), "Only GPU is supported now"
+        assert (
+            core.is_compiled_with_cuda() or core.is_compiled_with_xpu()
+        ), "Only GPU and XPU are supported now"
 
         # Segmentation information
         self._dtype_rank_params = (
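Context for the two hunks above: the scalar.cc change routes XPU tensors through the same copy-to-CPU path already used for GPU, since Scalar construction must read the single element on the host; the stage-2 sharding change registers a 256-byte XPU alignment and relaxes the build-time assert. As a minimal sketch of how the alignment table is consumed when sizing fused buffers (the helper name `aligned_numel` is hypothetical, not Paddle's actual API; the real logic lives in ParamStorage/GradStorage):

```python
# Sketch only: round a parameter's byte size up to the device's alignment
# boundary before it is packed into a fused buffer. String keys stand in
# for Type.fp16.value etc. used by the real module.
alignment = {"gpu": 256, "cpu": 4096, "xpu": 256}  # bytes, as in the diff
align = {"fp16": 2, "bf16": 2, "fp32": 4}          # element size in bytes

def aligned_numel(numel: int, dtype: str, device: str) -> int:
    """Return `numel` rounded up so numel * elem_size is a multiple of the
    device's alignment (256 B on GPU/XPU, 4096 B on CPU)."""
    size = numel * align[dtype]
    remainder = size % alignment[device]
    if remainder:
        size += alignment[device] - remainder
    return size // align[dtype]

# 1000 fp32 elements = 4000 B; padded to 4096 B -> 1024 elements on xpu.
assert aligned_numel(1000, "fp32", "xpu") == 1024
```

Rounding each slice up to the device boundary keeps every rank's shard aligned after the fused buffer is partitioned, which is presumably why XPU reuses the same 256-byte value as CUDA here.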