diff --git a/docs/requirements.txt b/docs/requirements.txt
index 90c6171099d9bb6e5ce32ac1164ba848c14426f4..c860963602bfd591ff57c8c8a722dc9670bc582d 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -3,3 +3,4 @@ mistune
 sphinx_rtd_theme
 paddlepaddle>=1.6
 zmq
+
diff --git a/docs/source/md/quick_start.md b/docs/source/md/quick_start.md
index bfdd9450d2d7801c60d4111a016f59983a76fc72..93731d6d656ce63cdd0f10f9acce411f57fba0e0 100644
--- a/docs/source/md/quick_start.md
+++ b/docs/source/md/quick_start.md
@@ -67,7 +67,8 @@ from paddle_fl.core.scheduler.agent_master import FLScheduler
 
 worker_num = 2
 server_num = 1
-scheduler = FLScheduler(worker_num,server_num)
+# Define the number of workers/servers and the port for the scheduler
+scheduler = FLScheduler(worker_num, server_num, port=9091)
 scheduler.set_sample_worker_num(worker_num)
 scheduler.init_env()
 print("init env done.")
@@ -94,6 +95,7 @@ trainer_id = int(sys.argv[1]) # trainer id for each guest
 job_path = "fl_job_config"
 job = FLRunTimeJob()
 job.load_trainer_job(job_path, trainer_id)
+job._scheduler_ep = "127.0.0.1:9091" # Scheduler endpoint for this trainer
 trainer = FLTrainerFactory().create_fl_trainer(job)
 trainer.start()
 
@@ -122,6 +124,8 @@ server_id = 0
 job_path = "fl_job_config"
 job = FLRunTimeJob()
 job.load_server_job(job_path, server_id)
+job._scheduler_ep = "127.0.0.1:9091" # Scheduler endpoint
 server.set_server_job(job)
+server._current_ep = "127.0.0.1:8181" # This server's endpoint
 server.start()
 ```
diff --git a/paddle_fl/core/scheduler/agent_master.py b/paddle_fl/core/scheduler/agent_master.py
index d436b822865fe0189bddb1c871b69a20dcdeae17..f26a85a7f44aca7e662dd3140b269944d89201f5 100644
--- a/paddle_fl/core/scheduler/agent_master.py
+++ b/paddle_fl/core/scheduler/agent_master.py
@@ -104,10 +104,7 @@ class FLScheduler(object):
 
     def start_fl_training(self):
         # loop until training is done
-        loop = 0
         while True:
-            if loop <= 1:
-                print(loop)
             random.shuffle(self.fl_workers)
             worker_dict = {}
             for worker in self.fl_workers[:self.sample_worker_num]:
@@ -143,4 +140,3 @@
                 if len(finish_training_dict) == len(worker_dict):
                     all_finish_training = True
             time.sleep(5)
-            loop += 1
diff --git a/paddle_fl/core/trainer/diffiehellman/diffiehellman.py b/paddle_fl/core/trainer/diffiehellman/diffiehellman.py
index 364d01b82d560627555e031873c1a07993d41165..cae4b26187fc548df7055922b6a08fa2d831427d 100644
--- a/paddle_fl/core/trainer/diffiehellman/diffiehellman.py
+++ b/paddle_fl/core/trainer/diffiehellman/diffiehellman.py
@@ -42,10 +42,8 @@ try:
     from ssl import RAND_bytes
     rng = RAND_bytes
 except(AttributeError, ImportError):
-    #python2
     rng = os.urandom
-    #raise RNGError
-
+
 class DiffieHellman:
     """
     Implements the Diffie-Hellman key exchange protocol.
@@ -115,13 +113,13 @@ class DiffieHellman:
         self.shared_secret = pow(other_public_key,
                                  self.private_key,
                                  self.prime)
-        #python2
-        #length = self.shared_secret.bit_length() // 8 + 1
-        #shared_secret_as_bytes = ('%%0%dx' % (length << 1) % self.shared_secret).decode('hex')[-length:]
-
-        #python3
-        shared_secret_as_bytes = self.shared_secret.to_bytes(self.shared_secret.bit_length() // 8 + 1, byteorder='big')
-
+        try:
+            # Python 3
+            shared_secret_as_bytes = self.shared_secret.to_bytes(self.shared_secret.bit_length() // 8 + 1, byteorder='big')
+        except AttributeError:
+            # Python 2: ints have no to_bytes(), so go through a hex string
+            length = self.shared_secret.bit_length() // 8 + 1
+            shared_secret_as_bytes = ('%%0%dx' % (length << 1) % self.shared_secret).decode('hex')[-length:]
 
         _h = sha256()
         _h.update(bytes(shared_secret_as_bytes))
diff --git a/paddle_fl/core/trainer/fl_trainer.py b/paddle_fl/core/trainer/fl_trainer.py
index 5e610e9e66bdaffa7d822813db169f252a039e6a..00cb67e72617c7edf73e4714b307090d78493e88 100755
--- a/paddle_fl/core/trainer/fl_trainer.py
+++ b/paddle_fl/core/trainer/fl_trainer.py
@@ -16,6 +16,7 @@ import logging
 from paddle_fl.core.scheduler.agent_master import FLWorkerAgent
 import numpy
 import hmac
+import hashlib
 from .diffiehellman.diffiehellman import DiffieHellman
 
 class FLTrainerFactory(object):
@@ -89,12 +90,12 @@ class FLTrainer(object):
         # TODO(guru4elephant): add connection with master
         if self.cur_step != 0:
             while not self.agent.finish_training():
-                print('wait others finish')
+                self._logger.debug("Waiting for others to finish")
                 continue
 
         while not self.agent.can_join_training():
-            print("wait permit")
+            self._logger.debug("Waiting for permission to join training")
             continue
 
-        print("ready to train")
+        self._logger.debug("Ready to train")
         return False
@@ -123,7 +124,6 @@ class FedAvgTrainer(FLTrainer):
         self.exe.run(self._recv_program)
         epoch = 0
         for i in range(num_epoch):
-            print(epoch)
             for data in reader():
                 self.exe.run(self._main_program,
                              feed=feeder.feed(data),
@@ -190,6 +190,8 @@ class SecAggTrainer(FLTrainer):
         self._step_id = s
 
     def start(self):
+        self.agent = FLWorkerAgent(self._scheduler_ep, self._current_ep)
+        self.agent.connect_scheduler()
         self.exe = fluid.Executor(fluid.CPUPlace())
         self.exe.run(self._startup_program)
         self.cur_step = 0
@@ -219,7 +221,7 @@
         self._logger.debug("begin to run send program")
         noise = 0.0
         scale = pow(10.0, 5)
-        digestmod="SHA256"
+        digestmod = hashlib.sha256
         # 1. load priv key and other's pub key
         dh = DiffieHellman(group=15, key_length=256)
         dh.load_private_key(self._key_dir + str(self._trainer_id) + "_priv_key.txt")
@@ -245,5 +247,3 @@
         self.cur_step += 1
         return loss
 
-    def stop(self):
-        return False
diff --git a/paddle_fl/examples/ctr_demo/fl_scheduler.py b/paddle_fl/examples/ctr_demo/fl_scheduler.py
index 7a0c8f5b3411aac07f7e50e3daf3a1c0a53144e4..9dc5d84497b376d1aa7fd5731771b0799343c2ec 100644
--- a/paddle_fl/examples/ctr_demo/fl_scheduler.py
+++ b/paddle_fl/examples/ctr_demo/fl_scheduler.py
@@ -2,7 +2,8 @@ from paddle_fl.core.scheduler.agent_master import FLScheduler
 
 worker_num = 2
 server_num = 1
-scheduler = FLScheduler(worker_num,server_num)
+# Define the number of workers/servers and the port for the scheduler
+scheduler = FLScheduler(worker_num, server_num, port=9091)
 scheduler.set_sample_worker_num(worker_num)
 scheduler.init_env()
 print("init env done.")
diff --git a/paddle_fl/examples/ctr_demo/fl_server.py b/paddle_fl/examples/ctr_demo/fl_server.py
index 3671838a60b678c8c29ad530888ec75252c5430a..529df8da4079fbbd217c58a857f7ab8a3c307586 100644
--- a/paddle_fl/examples/ctr_demo/fl_server.py
+++ b/paddle_fl/examples/ctr_demo/fl_server.py
@@ -21,8 +21,8 @@ server_id = 0
 job_path = "fl_job_config"
 job = FLRunTimeJob()
 job.load_server_job(job_path, server_id)
-job._scheduler_ep = "127.0.0.1:9091"
+job._scheduler_ep = "127.0.0.1:9091" # Scheduler endpoint
 server.set_server_job(job)
-server._current_ep = "127.0.0.1:8181"
+server._current_ep = "127.0.0.1:8181" # This server's endpoint
 server.start()
 print("connect")
diff --git a/paddle_fl/examples/ctr_demo/fl_trainer.py b/paddle_fl/examples/ctr_demo/fl_trainer.py
index b6c864b5ac9979b76f5246d5b2b2bc3b6a9896cc..0bcfb19544741eb1ef7158dcbf6136c2e34d1b9c 100644
--- a/paddle_fl/examples/ctr_demo/fl_trainer.py
+++ b/paddle_fl/examples/ctr_demo/fl_trainer.py
@@ -19,22 +19,22 @@ trainer_id = int(sys.argv[1]) # trainer id for each guest
 job_path = "fl_job_config"
 job = FLRunTimeJob()
 job.load_trainer_job(job_path, trainer_id)
-job._scheduler_ep = "127.0.0.1:9091"
+job._scheduler_ep = "127.0.0.1:9091" # Scheduler endpoint for this trainer
 trainer = FLTrainerFactory().create_fl_trainer(job)
 trainer._current_ep = "127.0.0.1:{}".format(9000+trainer_id)
 trainer.start()
 print(trainer._scheduler_ep, trainer._current_ep)
 output_folder = "fl_model"
-step_i = 0
+epoch_id = 0
 while not trainer.stop():
-    print("batch %d start train" % (step_i))
+    print("epoch %d start train" % (epoch_id))
     train_step = 0
     for data in reader():
         trainer.run(feed=data, fetch=[])
         train_step += 1
         if train_step == trainer._step:
             break
-    step_i += 1
-    if step_i % 100 == 0:
+    epoch_id += 1
+    if epoch_id % 5 == 0:
         trainer.save_inference_program(output_folder)
 
diff --git a/paddle_fl/examples/dpsgd_demo/fl_scheduler.py b/paddle_fl/examples/dpsgd_demo/fl_scheduler.py
index 0016fe70cc5d128404f6dc931028dedad1ee79e8..f8ea641e2fa356102ffc08eec179e84ca1993f7d 100644
--- a/paddle_fl/examples/dpsgd_demo/fl_scheduler.py
+++ b/paddle_fl/examples/dpsgd_demo/fl_scheduler.py
@@ -2,7 +2,8 @@ from paddle_fl.core.scheduler.agent_master import FLScheduler
 
 worker_num = 4
 server_num = 1
-scheduler = FLScheduler(worker_num,server_num)
+# Define the number of workers/servers and the port for the scheduler
+scheduler = FLScheduler(worker_num, server_num, port=9091)
 scheduler.set_sample_worker_num(4)
 scheduler.init_env()
 print("init env done.")
diff --git a/paddle_fl/examples/dpsgd_demo/fl_server.py b/paddle_fl/examples/dpsgd_demo/fl_server.py
index dbbeaa8d779ecb96d3c442c9565e088b23590764..39056e82d99fb924f52c201e0fb230b6bc1626a1 100644
--- a/paddle_fl/examples/dpsgd_demo/fl_server.py
+++ b/paddle_fl/examples/dpsgd_demo/fl_server.py
@@ -21,7 +21,7 @@ server_id = 0
 job_path = "fl_job_config"
 job = FLRunTimeJob()
 job.load_server_job(job_path, server_id)
-job._scheduler_ep = "127.0.0.1:9091"
+job._scheduler_ep = "127.0.0.1:9091" # Scheduler endpoint
 server.set_server_job(job)
-server._current_ep = "127.0.0.1:8181"
+server._current_ep = "127.0.0.1:8181" # This server's endpoint
 server.start()
diff --git a/paddle_fl/examples/dpsgd_demo/fl_trainer.py b/paddle_fl/examples/dpsgd_demo/fl_trainer.py
index 76f8841a1afed0decc03c5257a49f1609be21a6e..074368194e20defdea2f1151c9a0f940ed613189 100644
--- a/paddle_fl/examples/dpsgd_demo/fl_trainer.py
+++ b/paddle_fl/examples/dpsgd_demo/fl_trainer.py
@@ -13,7 +13,7 @@ trainer_id = int(sys.argv[1]) # trainer id for each guest
 job_path = "fl_job_config"
 job = FLRunTimeJob()
 job.load_trainer_job(job_path, trainer_id)
-job._scheduler_ep = "127.0.0.1:9091"
+job._scheduler_ep = "127.0.0.1:9091" # Scheduler endpoint for this trainer
 trainer = FLTrainerFactory().create_fl_trainer(job)
 trainer._current_ep = "127.0.0.1:{}".format(9000+trainer_id)
 trainer.start()
diff --git a/paddle_fl/examples/dpsgd_demo/run.sh b/paddle_fl/examples/dpsgd_demo/run.sh
index b17d100cb380749443380d275a91d2c8703a892b..9fb7ad97fef90531d9f52004ad36877aae669137 100644
--- a/paddle_fl/examples/dpsgd_demo/run.sh
+++ b/paddle_fl/examples/dpsgd_demo/run.sh
@@ -1,3 +1,5 @@
+unset http_proxy
+unset https_proxy
 python fl_master.py
 sleep 2
 python -u fl_scheduler.py >scheduler.log &
diff --git a/paddle_fl/examples/femnist_demo/fl_scheduler.py b/paddle_fl/examples/femnist_demo/fl_scheduler.py
index 0016fe70cc5d128404f6dc931028dedad1ee79e8..346529fd6caadef06d1e46078fa873da264a7507 100644
--- a/paddle_fl/examples/femnist_demo/fl_scheduler.py
+++ b/paddle_fl/examples/femnist_demo/fl_scheduler.py
@@ -2,7 +2,8 @@ from paddle_fl.core.scheduler.agent_master import FLScheduler
 
 worker_num = 4
 server_num = 1
-scheduler = FLScheduler(worker_num,server_num)
+# Define the number of workers/servers and the port for the scheduler
+scheduler = FLScheduler(worker_num, server_num, port=9091)
 scheduler.set_sample_worker_num(4)
 scheduler.init_env()
 print("init env done.")
diff --git a/paddle_fl/examples/femnist_demo/fl_server.py b/paddle_fl/examples/femnist_demo/fl_server.py
index 33c29d29855fe5f2f5ea02e358b9843970e0481e..cd558deb4ba577e5d0dbc74ecc90d0e22d278d8e 100644
--- a/paddle_fl/examples/femnist_demo/fl_server.py
+++ b/paddle_fl/examples/femnist_demo/fl_server.py
@@ -7,7 +7,7 @@ server_id = 0
 job_path = "fl_job_config"
 job = FLRunTimeJob()
 job.load_server_job(job_path, server_id)
-job._scheduler_ep = "127.0.0.1:9091"
+job._scheduler_ep = "127.0.0.1:9091" # Scheduler endpoint
 server.set_server_job(job)
-server._current_ep = "127.0.0.1:8181"
+server._current_ep = "127.0.0.1:8181" # This server's endpoint
 server.start()
diff --git a/paddle_fl/examples/femnist_demo/fl_trainer.py b/paddle_fl/examples/femnist_demo/fl_trainer.py
index b46661f2099cc7ef091c87404e21fdf5074e6b25..dce2c8af02ce3a3be58f0a6fe03eca18a882472a 100644
--- a/paddle_fl/examples/femnist_demo/fl_trainer.py
+++ b/paddle_fl/examples/femnist_demo/fl_trainer.py
@@ -14,7 +14,7 @@ trainer_id = int(sys.argv[1]) # trainer id for each guest
 job_path = "fl_job_config"
 job = FLRunTimeJob()
 job.load_trainer_job(job_path, trainer_id)
-job._scheduler_ep = "127.0.0.1:9091"
+job._scheduler_ep = "127.0.0.1:9091" # Scheduler endpoint for this trainer
 print(job._target_names)
 trainer = FLTrainerFactory().create_fl_trainer(job)
 trainer._current_ep = "127.0.0.1:{}".format(9000+trainer_id)
@@ -40,7 +40,7 @@ def train_test(train_test_program, train_test_feed, train_test_reader):
 epoch_id = 0
 step = 0
 epoch = 3000
-count_by_step = True
+count_by_step = False
 if count_by_step:
     output_folder = "model_node%d" % trainer_id
 else:
diff --git a/paddle_fl/examples/femnist_demo/run.sh b/paddle_fl/examples/femnist_demo/run.sh
index 4a32be225fe2145630b3ca581525da00faa58930..186ce530a0b4f8e6cf4c96fab2e40bc90cda9d63 100644
--- a/paddle_fl/examples/femnist_demo/run.sh
+++ b/paddle_fl/examples/femnist_demo/run.sh
@@ -1,3 +1,5 @@
+unset http_proxy
+unset https_proxy
 #killall python
 python fl_master.py
 sleep 2
diff --git a/paddle_fl/examples/gru4rec_demo/fl_scheduler.py b/paddle_fl/examples/gru4rec_demo/fl_scheduler.py
index 0016fe70cc5d128404f6dc931028dedad1ee79e8..346529fd6caadef06d1e46078fa873da264a7507 100644
--- a/paddle_fl/examples/gru4rec_demo/fl_scheduler.py
+++ b/paddle_fl/examples/gru4rec_demo/fl_scheduler.py
@@ -2,7 +2,8 @@ from paddle_fl.core.scheduler.agent_master import FLScheduler
 
 worker_num = 4
 server_num = 1
-scheduler = FLScheduler(worker_num,server_num)
+# Define the number of workers/servers and the port for the scheduler
+scheduler = FLScheduler(worker_num, server_num, port=9091)
 scheduler.set_sample_worker_num(4)
 scheduler.init_env()
 print("init env done.")
diff --git a/paddle_fl/examples/gru4rec_demo/fl_server.py b/paddle_fl/examples/gru4rec_demo/fl_server.py
index dbbeaa8d779ecb96d3c442c9565e088b23590764..39056e82d99fb924f52c201e0fb230b6bc1626a1 100644
--- a/paddle_fl/examples/gru4rec_demo/fl_server.py
+++ b/paddle_fl/examples/gru4rec_demo/fl_server.py
@@ -21,7 +21,7 @@ server_id = 0
 job_path = "fl_job_config"
 job = FLRunTimeJob()
 job.load_server_job(job_path, server_id)
-job._scheduler_ep = "127.0.0.1:9091"
+job._scheduler_ep = "127.0.0.1:9091" # Scheduler endpoint
 server.set_server_job(job)
-server._current_ep = "127.0.0.1:8181"
+server._current_ep = "127.0.0.1:8181" # This server's endpoint
 server.start()
diff --git a/paddle_fl/examples/gru4rec_demo/fl_trainer.py b/paddle_fl/examples/gru4rec_demo/fl_trainer.py
index 84410e9077c5c39963de6fee324ff77ed3b70b9b..b8416f739d1b0dc7cb493768cf1c9283214778c4 100644
--- a/paddle_fl/examples/gru4rec_demo/fl_trainer.py
+++ b/paddle_fl/examples/gru4rec_demo/fl_trainer.py
@@ -14,7 +14,7 @@ train_file_dir = "mid_data/node4/%d/" % trainer_id
 job_path = "fl_job_config"
 job = FLRunTimeJob()
 job.load_trainer_job(job_path, trainer_id)
-job._scheduler_ep = "127.0.0.1:9091"
+job._scheduler_ep = "127.0.0.1:9091" # Scheduler endpoint for this trainer
 trainer = FLTrainerFactory().create_fl_trainer(job)
 trainer._current_ep = "127.0.0.1:{}".format(9000+trainer_id)
 trainer.start()
diff --git a/paddle_fl/examples/secagg_demo/fl_scheduler.py b/paddle_fl/examples/secagg_demo/fl_scheduler.py
new file mode 100644
index 0000000000000000000000000000000000000000..06f26897e88e830b66fe6fda11d83c99463507ae
--- /dev/null
+++ b/paddle_fl/examples/secagg_demo/fl_scheduler.py
@@ -0,0 +1,10 @@
+from paddle_fl.core.scheduler.agent_master import FLScheduler
+
+worker_num = 2
+server_num = 1
+# Define the number of workers/servers and the port for the scheduler
+scheduler = FLScheduler(worker_num, server_num, port=9091)
+scheduler.set_sample_worker_num(worker_num)
+scheduler.init_env()
+print("init env done.")
+scheduler.start_fl_training()
diff --git a/paddle_fl/examples/secagg_demo/fl_server.py b/paddle_fl/examples/secagg_demo/fl_server.py
index 7ef353b8fe99c4b335dda4648977f292ffb07dae..529df8da4079fbbd217c58a857f7ab8a3c307586 100644
--- a/paddle_fl/examples/secagg_demo/fl_server.py
+++ b/paddle_fl/examples/secagg_demo/fl_server.py
@@ -21,5 +21,8 @@ server_id = 0
 job_path = "fl_job_config"
 job = FLRunTimeJob()
 job.load_server_job(job_path, server_id)
+job._scheduler_ep = "127.0.0.1:9091" # Scheduler endpoint
 server.set_server_job(job)
+server._current_ep = "127.0.0.1:8181" # This server's endpoint
 server.start()
+print("connect")
diff --git a/paddle_fl/examples/secagg_demo/fl_trainer.py b/paddle_fl/examples/secagg_demo/fl_trainer.py
index d6f9fdffb6e270674831a6c4f826f6673d073d44..7b08e8724a67219654c7f94809fc281353202ad2 100644
--- a/paddle_fl/examples/secagg_demo/fl_trainer.py
+++ b/paddle_fl/examples/secagg_demo/fl_trainer.py
@@ -28,8 +28,10 @@ trainer_id = int(sys.argv[1]) # trainer id for each guest
 job_path = "fl_job_config"
 job = FLRunTimeJob()
 job.load_trainer_job(job_path, trainer_id)
+job._scheduler_ep = "127.0.0.1:9091" # Scheduler endpoint for this trainer
 trainer = FLTrainerFactory().create_fl_trainer(job)
 trainer.trainer_id = trainer_id
+trainer._current_ep = "127.0.0.1:{}".format(9000+trainer_id)
 trainer.trainer_num = trainer_num
 trainer.key_dir = "./keys/"
 trainer.start()
@@ -73,6 +75,7 @@ while not trainer.stop():
     if step_i % 100 == 0:
         print("Epoch: {0}, step: {1}, accuracy: {2}".format(epoch_id, step_i, accuracy[0]))
+        print(step_i)
     avg_loss_val, acc_val = train_test(train_test_program=test_program,
                                        train_test_reader=test_reader,
                                        train_test_feed=feeder)
@@ -80,5 +83,5 @@
     if epoch_id > 40:
         break
-    if step_i % 100 == 0:
+    if epoch_id % 5 == 0:
         trainer.save_inference_program(output_folder)
 
diff --git a/paddle_fl/examples/secagg_demo/run.sh b/paddle_fl/examples/secagg_demo/run.sh
index 9d51fa27c464f3f18a10185e0adbb8166e0ea27b..1e40c1a3ea8301299ecc8bc52b78bcfca4d037bb 100644
--- a/paddle_fl/examples/secagg_demo/run.sh
+++ b/paddle_fl/examples/secagg_demo/run.sh
@@ -5,10 +5,12 @@
 if [ ! -d log ];then
     mkdir log
 fi
-python3 fl_master.py
+python fl_master.py
 sleep 2
-python3 -u fl_server.py >log/server0.log &
+python -u fl_server.py >log/server0.log &
 sleep 2
-python3 -u fl_trainer.py 0 >log/trainer0.log &
+python -u fl_scheduler.py > log/scheduler.log &
 sleep 2
-python3 -u fl_trainer.py 1 >log/trainer1.log &
+python -u fl_trainer.py 0 >log/trainer0.log &
+sleep 2
+python -u fl_trainer.py 1 >log/trainer1.log &
diff --git a/paddle_fl/examples/submitter_demo/conf.txt b/paddle_fl/examples/submitter_demo/conf.txt
index 4f952b7f9f9549cab96dd61f174111c41966dda7..0880739e809063bc21531a92bb205ee2df4f169a 100644
--- a/paddle_fl/examples/submitter_demo/conf.txt
+++ b/paddle_fl/examples/submitter_demo/conf.txt
@@ -2,7 +2,8 @@ task_name=test_fl_job_submit_jingqinghe
 hdfs_output=/user/feed/mlarch/sequence_generator/dongdaxiang/job_44
 train_cmd=python dist_trainer.py
-monitor_cmd=python system_monitor_app.py 10 100
+#monitor_cmd=python system_monitor_app.py 10 100
+monitor_cmd=
 
 #train_cmd=python test_hadoop.py
 hdfs_path=afs://xingtian.afs.baidu.com:9902
diff --git a/paddle_fl/examples/submitter_demo/kill.sh b/paddle_fl/examples/submitter_demo/kill.sh
index 44c26767d2d80cfc2a8950da0e822a06a691137e..3f2a3a9586028ee4e892a4b4aad5bb390e32d50a 100644
--- a/paddle_fl/examples/submitter_demo/kill.sh
+++ b/paddle_fl/examples/submitter_demo/kill.sh
@@ -1 +1,3 @@
-/home/jingqinghe/mpi_feed4/smart_client/bin/qdel $1".yq01-hpc-lvliang01-smart-master.dmop.baidu.com"
+unset http_proxy
+unset https_proxy
+/home/jingqinghe/tools/mpi_feed4/smart_client/bin/qdel $1".yq01-hpc-lvliang01-smart-master.dmop.baidu.com"
diff --git a/paddle_fl/examples/submitter_demo/scheduler_client.py b/paddle_fl/examples/submitter_demo/scheduler_client.py
index 59068a8d87b200ac6e4d375a40757c70afd5c0c7..b19c1793e401e473536c72f22fbbc1fe30d7d3a5 100644
--- a/paddle_fl/examples/submitter_demo/scheduler_client.py
+++ b/paddle_fl/examples/submitter_demo/scheduler_client.py
@@ -18,7 +18,8 @@ print(random_port)
 current_ip = socket.gethostbyname(socket.gethostname())
 endpoints = "{}:{}".format(current_ip, random_port)
 #start a web server for remote endpoints to download their config
-os.system("python -m SimpleHTTPServer 8080 &")
+#os.system("python -m SimpleHTTPServer 8080 &")
+os.system("python -m http.server 8080 &")
 if os.path.exists("job_config"):
     os.system("rm -rf job_config")
 if os.path.exists("package"):
@@ -120,10 +121,10 @@ print(ip_list)
 #allocate the role of each endpoint and their ids
 ip_role = {}
 for i in range(len(ip_list)):
-    if i < int(default_dict["server_nodes"]):
-        ip_role[ip_list[i]] = 'server%d' % i
+    if i < int(default_dict["server_nodes"]):
+        ip_role[ip_list[i]] = 'server%d' % i
     else:
-        ip_role[ip_list[i]] = 'trainer%d' % (i-int(default_dict["server_nodes"]))
+        ip_role[ip_list[i]] = 'trainer%d' % (i-int(default_dict["server_nodes"]))
 print(ip_role)
 
 def job_generate():
@@ -179,7 +180,7 @@ while not all_job_sent:
     message = zmq_socket.recv()
     group = message.split("\t")
     if group[0] == "GET_FL_JOB":
-        download_job.append(group[1])
+        download_job.append(group[1])
         zmq_socket.send(ip_role[group[1]])
     else:
         zmq_socket.send("WAIT\t0")
diff --git a/paddle_fl/examples/submitter_demo/train_program.py b/paddle_fl/examples/submitter_demo/train_program.py
index 7dae086f2a02c39a2166fd415fff66adebe1535a..22cef9f92012dc816906431a5671994f5fbb3363 100644
--- a/paddle_fl/examples/submitter_demo/train_program.py
+++ b/paddle_fl/examples/submitter_demo/train_program.py
@@ -89,15 +89,15 @@ else:
     trainer.start()
     print(trainer._scheduler_ep, trainer._current_ep)
     output_folder = "fl_model"
-    step_i = 0
+    epoch_id = 0
     while not trainer.stop():
-        print("batch %d start train" % (step_i))
+        print("epoch %d start train" % (epoch_id))
-        train_step = 0
+        step_i = 0
         for data in reader():
             trainer.run(feed=data, fetch=[])
-            train_step += 1
+            step_i += 1
-            if train_step == trainer._step:
+            if step_i == trainer._step:
                 break
-        step_i += 1
-        if step_i % 100 == 0:
+        epoch_id += 1
+        if epoch_id % 5 == 0:
             trainer.save_inference_program(output_folder)
diff --git a/paddle_fl/version.py b/paddle_fl/version.py
index 27fc069e27b5e7890d7f584ea5c57b7fcfe5a7ba..fb756cdd7e35182d10d99bbfe17005950c9745f2 100644
--- a/paddle_fl/version.py
+++ b/paddle_fl/version.py
@@ -12,5 +12,5 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """ PaddleFL version string """
-fl_version = "0.1.6"
-module_proto_version = "0.1.6"
+fl_version = "0.1.7"
+module_proto_version = "0.1.7"
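
A note on the `digestmod` change in `paddle_fl/core/trainer/fl_trainer.py` above: `hmac.new` only accepts a digest *name* string such as `"SHA256"` on Python 3.4+, while Python 2's `hmac` expects a hash constructor or module, so the old string value broke the secure-aggregation path there. Passing `hashlib.sha256` itself works on both interpreters, which is also why the patch adds `import hashlib`. A minimal sketch of the portable call (the key and message below are placeholders, not PaddleFL's actual key-exchange output or wire format):

```python
import hashlib
import hmac

key = b"0123456789abcdef"       # placeholder shared key
msg = b"masked-gradient-bytes"  # placeholder payload

# Portable across Python 2 and 3: pass the hash constructor itself.
mac = hmac.new(key, msg, digestmod=hashlib.sha256)
print(mac.hexdigest())

# Python 3.4+ only: the algorithm may also be named as a string.
# hmac.new(key, msg, digestmod="SHA256")
```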