diff --git a/fleet_rec/check.py b/fleet_rec/check.py old mode 100644 new mode 100755 diff --git a/fleet_rec/core/__init__.py b/fleet_rec/core/__init__.py old mode 100644 new mode 100755 diff --git a/fleet_rec/core/engine/__init__.py b/fleet_rec/core/engine/__init__.py old mode 100644 new mode 100755 diff --git a/fleet_rec/core/engine/engine.py b/fleet_rec/core/engine/engine.py old mode 100644 new mode 100755 diff --git a/fleet_rec/core/engine/local_cluster_engine.py b/fleet_rec/core/engine/local_cluster_engine.py old mode 100644 new mode 100755 index 6b178619f80a4a550adc0e2caa40e0bae48366d7..86a8d2497035c8a791f06b22137b067af9923ae1 --- a/fleet_rec/core/engine/local_cluster_engine.py +++ b/fleet_rec/core/engine/local_cluster_engine.py @@ -20,13 +20,13 @@ import os import copy from fleetrec.core.engine.engine import Engine - +from fleetrec.core.utils import envs class LocalClusterEngine(Engine): def start_procs(self): worker_num = self.envs["worker_num"] server_num = self.envs["server_num"] - start_port = self.envs["start_port"] + ports = [self.envs["start_port"]] logs_dir = self.envs["log_dir"] default_env = os.environ.copy() @@ -36,7 +36,13 @@ class LocalClusterEngine(Engine): current_env.pop("https_proxy", None) procs = [] log_fns = [] - ports = range(start_port, start_port + server_num, 1) + + for i in range(server_num - 1): + while True: + new_port = envs.find_free_port() + if new_port not in ports: + ports.append(new_port) + break user_endpoints = ",".join(["127.0.0.1:" + str(x) for x in ports]) user_endpoints_ips = [x.split(":")[0] for x in user_endpoints.split(",")] user_endpoints_port = [x.split(":")[1] for x in user_endpoints.split(",")] diff --git a/fleet_rec/core/engine/local_mpi_engine.py b/fleet_rec/core/engine/local_mpi_engine.py old mode 100644 new mode 100755 diff --git a/fleet_rec/core/factory.py b/fleet_rec/core/factory.py old mode 100644 new mode 100755 diff --git a/fleet_rec/core/layer.py b/fleet_rec/core/layer.py old mode 100644 new mode 100755 diff --git a/fleet_rec/core/metric.py b/fleet_rec/core/metric.py old mode 100644 new mode 100755 diff --git a/fleet_rec/core/metrics/auc_metrics.py b/fleet_rec/core/metrics/auc_metrics.py old mode 100644 new mode 100755 diff --git a/fleet_rec/core/model.py b/fleet_rec/core/model.py old mode 100644 new mode 100755 diff --git a/fleet_rec/core/modules/__init__.py b/fleet_rec/core/modules/__init__.py old mode 100644 new mode 100755 diff --git a/fleet_rec/core/modules/coding/__init__.py b/fleet_rec/core/modules/coding/__init__.py old mode 100644 new mode 100755 diff --git a/fleet_rec/core/modules/coding/layers.py b/fleet_rec/core/modules/coding/layers.py old mode 100644 new mode 100755 diff --git a/fleet_rec/core/modules/modul/__init__.py b/fleet_rec/core/modules/modul/__init__.py old mode 100644 new mode 100755 diff --git a/fleet_rec/core/modules/modul/build.py b/fleet_rec/core/modules/modul/build.py old mode 100644 new mode 100755 diff --git a/fleet_rec/core/modules/modul/layers.py b/fleet_rec/core/modules/modul/layers.py old mode 100644 new mode 100755 diff --git a/fleet_rec/core/reader.py b/fleet_rec/core/reader.py old mode 100644 new mode 100755 diff --git a/fleet_rec/core/trainers/cluster_trainer.py b/fleet_rec/core/trainers/cluster_trainer.py old mode 100644 new mode 100755 index bf59ab671f12b2ef424553e5108b1f7cd3c9abd9..4ad88c2dfba97c22c5768f4374844c43a2c4d8fa --- a/fleet_rec/core/trainers/cluster_trainer.py +++ b/fleet_rec/core/trainers/cluster_trainer.py @@ -40,7 +40,7 @@ class ClusterTrainer(TranspileTrainer): else: self.regist_context_processor('uninit', self.instance) self.regist_context_processor('init_pass', self.init) - + self.regist_context_processor('startup_pass', self.startup) if envs.get_platform() == "LINUX" and envs.get_global_env("dataset_class", None, "train.reader") != "DataLoader": self.regist_context_processor('train_pass', self.dataset_train) else: diff --git a/fleet_rec/core/trainers/single_trainer.py b/fleet_rec/core/trainers/single_trainer.py old mode 100644 new mode 100755 index b1cfba0f22ed763fb555ba2f4a0082e52c73e18d..56f081c25849af1e6a47c843ce091057599a4767 --- a/fleet_rec/core/trainers/single_trainer.py +++ b/fleet_rec/core/trainers/single_trainer.py @@ -33,7 +33,7 @@ class SingleTrainer(TranspileTrainer): def processor_register(self): self.regist_context_processor('uninit', self.instance) self.regist_context_processor('init_pass', self.init) - + self.regist_context_processor('startup_pass', self.startup) if envs.get_platform() == "LINUX" and envs.get_global_env("dataset_class", None, "train.reader") != "DataLoader": self.regist_context_processor('train_pass', self.dataset_train) else: diff --git a/fleet_rec/core/trainers/tdm_cluster_trainer.py b/fleet_rec/core/trainers/tdm_cluster_trainer.py old mode 100644 new mode 100755 diff --git a/fleet_rec/core/trainers/tdm_single_trainer.py b/fleet_rec/core/trainers/tdm_single_trainer.py old mode 100644 new mode 100755 diff --git a/fleet_rec/core/trainers/transpiler_trainer.py b/fleet_rec/core/trainers/transpiler_trainer.py old mode 100644 new mode 100755 diff --git a/fleet_rec/core/utils/__init__.py b/fleet_rec/core/utils/__init__.py old mode 100644 new mode 100755 diff --git a/fleet_rec/core/utils/dataloader_instance.py b/fleet_rec/core/utils/dataloader_instance.py old mode 100644 new mode 100755 diff --git a/fleet_rec/core/utils/dataset_instance.py b/fleet_rec/core/utils/dataset_instance.py old mode 100644 new mode 100755 diff --git a/fleet_rec/core/utils/envs.py b/fleet_rec/core/utils/envs.py old mode 100644 new mode 100755 index 22550a328d84bfb99cf90a64242b8fea4e030539..2d4cfadea305e2d42bd454f3ddc4048dd593945a --- a/fleet_rec/core/utils/envs.py +++ b/fleet_rec/core/utils/envs.py @@ -15,7 +15,8 @@ import os import copy import sys - +import socket +from contextlib import closing global_envs = {} @@ -170,3 +171,12 @@ def get_platform(): return "DARWIN" if 'Windows' in plats: return "WINDOWS" + +def find_free_port(): + def __free_port(): + with closing(socket.socket(socket.AF_INET, + socket.SOCK_STREAM)) as s: + s.bind(('', 0)) + return s.getsockname()[1] + new_port = __free_port() + return new_port diff --git a/fleet_rec/run.py b/fleet_rec/run.py old mode 100644 new mode 100755 index e4727d594b8b7fa9e6a15a964a9c0eb2bc9e27c6..ea822f81a6648e860b34600812aedf0253f5fc83 --- a/fleet_rec/run.py +++ b/fleet_rec/run.py @@ -139,7 +139,7 @@ def local_cluster_engine(args): cluster_envs = {} cluster_envs["server_num"] = 1 cluster_envs["worker_num"] = 1 - cluster_envs["start_port"] = 36001 + cluster_envs["start_port"] = envs.find_free_port() cluster_envs["log_dir"] = "logs" cluster_envs["train.trainer.trainer"] = trainer cluster_envs["train.trainer.strategy"] = "async" diff --git a/fleet_rec/tests/__init__.py b/fleet_rec/tests/__init__.py old mode 100644 new mode 100755