diff --git a/python/paddle/fluid/incubate/fleet/base/role_maker.py b/python/paddle/fluid/incubate/fleet/base/role_maker.py index a5e508d0a0defc1b0ed19827ac820200a116fa55..77f9ab33c4c343f371c0a752ee2e48f32691abe7 100644 --- a/python/paddle/fluid/incubate/fleet/base/role_maker.py +++ b/python/paddle/fluid/incubate/fleet/base/role_maker.py @@ -383,7 +383,7 @@ class MPISymetricRoleMaker(MPIRoleMaker): return the current number of worker """ if self._check_role_generation(): - return self._get_size() / self._proc_per_node + return int(self._get_size() / self._proc_per_node) return 0 def _server_num(self): @@ -391,30 +391,30 @@ class MPISymetricRoleMaker(MPIRoleMaker): return the current number of server """ if self._check_role_generation(): - return self._get_size() / self._proc_per_node + return int(self._get_size() / self._proc_per_node) else: self.generate_role() - return self._get_size() / self._proc_per_node + return int(self._get_size() / self._proc_per_node) def worker_index(self): """ return the index of worker """ if self._check_role_generation(): - return self._rank / self._proc_per_node + return int(self._rank / self._proc_per_node) else: self.generate_role() - return self._get_size() / 2 + return int(self._get_size() / 2) def server_index(self): """ return the index of server """ if self._check_role_generation(): - return self._rank / self._proc_per_node + return int(self._rank / self._proc_per_node) else: self.generate_role() - return self._get_size() / self._proc_per_node + return int(self._get_size() / self._proc_per_node) def _all_reduce(self, input, output, mode="sum"): """ @@ -612,6 +612,7 @@ class GeneralRoleMaker(RoleMakerBase): # set running status of http server self._http_server_d["running"] = False self._iface = self.__get_default_iface() + self._iface = "" if self._iface == "lo" else self._iface # this environment variable can be empty self._prefix = os.getenv("SYS_JOB_ID", "") diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py index e8d9cc3b77b6a8902bebc6a18fcb783cbd368da2..d245ce222ca6cffbac6a37caaa2668443f649652 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py @@ -270,6 +270,7 @@ class PSLib(Fleet): self._role_maker._barrier_worker() if self._role_maker.is_first_worker(): self._fleet_ptr.stop_server() + if self._heter_ptr: self._heter_ptr.stop_xpu_service() self._role_maker._barrier_worker() self._role_maker._barrier_all() diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py index e2fb29c5439e111179e96a247443dbedaa2816c1..56d476210894e15e20c67c99525e55edb9d5e5f3 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py @@ -846,7 +846,7 @@ class DistributedAdam(DistributedOptimizerImplBase): "user_define_dump_filename", "") opt_info["dump_fields_path"] = strategy.get("dump_fields_path", "") opt_info["dump_param"] = strategy.get("dump_param", []) - gpus_env = os.getenv("FLAGS_selected_gpus") + gpus_env = os.getenv("FLAGS_selected_gpus", "0") opt_info["worker_places"] = [int(s) for s in gpus_env.split(",")] opt_info["use_ps_gpu"] = strategy.get("use_ps_gpu", False) if server._server.downpour_server_param.downpour_table_param[ diff --git a/python/paddle/fluid/incubate/fleet/utils/hdfs.py b/python/paddle/fluid/incubate/fleet/utils/hdfs.py index fe09692531ad3a80e06022cd02d84fe23f7bc6ae..e5b2129e857f4b27f52da6f06b34d47dc0c299ac 100644 --- a/python/paddle/fluid/incubate/fleet/utils/hdfs.py +++ b/python/paddle/fluid/incubate/fleet/utils/hdfs.py @@ -25,8 +25,8 @@ import errno import time import logging import six -from . import fs -from .fs import FS, LocalFS, FSFileExistsError, FSFileNotExistsError, ExecuteError, FSTimeOut, FSShellCmdAborted +#from . import fs +from paddle.distributed.fleet.utils.fs import FS, LocalFS, FSFileExistsError, FSFileNotExistsError, ExecuteError, FSTimeOut, FSShellCmdAborted from paddle.fluid import core import functools