diff --git a/paddle_fl/core/trainer/fl_trainer.py b/paddle_fl/core/trainer/fl_trainer.py index fdd5af6c435486141515d9c9dd9d496a4b747ca5..ea62466068d2c3290e795f8c1a4c05f523fd9401 100755 --- a/paddle_fl/core/trainer/fl_trainer.py +++ b/paddle_fl/core/trainer/fl_trainer.py @@ -100,13 +100,14 @@ class FedAvgTrainer(FLTrainer): self._logger.debug("begin to run recv program") self.exe.run(self._recv_program) self._logger.debug("begin to run current step") - self.exe.run(self._main_program, + loss = self.exe.run(self._main_program, feed=feed, fetch_list=fetch) if self.cur_step % self._step == 0: self._logger.debug("begin to run send program") self.exe.run(self._send_program) self.cur_step += 1 + return loss def stop(self): return False diff --git a/paddle_fl/examples/gru4rec_demo/fl_master.py b/paddle_fl/examples/gru4rec_demo/fl_master.py index 55f9c625c309a4f0b1e2e22114a5030f461f4bb4..c20aafe1ae711cfad7be56bdd41cd0cdc688cba7 100644 --- a/paddle_fl/examples/gru4rec_demo/fl_master.py +++ b/paddle_fl/examples/gru4rec_demo/fl_master.py @@ -9,7 +9,7 @@ class Model(object): def gru4rec_network(self, vocab_size=37483, - hid_size=10, + hid_size=100, init_low_bound=-0.04, init_high_bound=0.04): """ network definition """ @@ -29,7 +29,6 @@ class Model(object): initializer=fluid.initializer.Uniform( low=init_low_bound, high=init_high_bound), learning_rate=emb_lr_x), - #is_distributed=True, is_sparse=False) fc0 = fluid.layers.fc(input=emb, size=hid_size * 3, @@ -54,7 +53,7 @@ class Model(object): learning_rate=fc_lr_x)) cost = fluid.layers.cross_entropy( input=self.fc, label=self.dst_wordseq) - acc = fluid.layers.accuracy( + self.acc = fluid.layers.accuracy( input=self.fc, label=self.dst_wordseq, k=20) self.loss = fluid.layers.mean(x=cost) self.startup_program = fluid.default_startup_program() @@ -70,11 +69,11 @@ job_generator.set_optimizer(optimizer) job_generator.set_losses([model.loss]) job_generator.set_startup_program(model.startup_program) job_generator.set_infer_feed_and_target_names( - [model.src_wordseq.name, model.dst_wordseq.name], [model.fc.name]) + [model.src_wordseq.name, model.dst_wordseq.name], [model.loss.name, model.acc.name]) build_strategy = FLStrategyFactory() build_strategy.fed_avg = True -build_strategy.inner_step = 10 +build_strategy.inner_step = 1 strategy = build_strategy.create_fl_strategy() # endpoints will be collected through the cluster @@ -82,5 +81,5 @@ strategy = build_strategy.create_fl_strategy() endpoints = ["127.0.0.1:8181"] output = "fl_job_config" job_generator.generate_fl_job( - strategy, server_endpoints=endpoints, worker_num=2, output=output) + strategy, server_endpoints=endpoints, worker_num=4, output=output) # fl_job_config will be dispatched to workers diff --git a/paddle_fl/examples/gru4rec_demo/fl_trainer.py b/paddle_fl/examples/gru4rec_demo/fl_trainer.py index 6a4850b2707cc8b8d57a049001fd4b875b199672..f43374a2d02a2f11a63a45eda4b5d03c95f4c72f 100644 --- a/paddle_fl/examples/gru4rec_demo/fl_trainer.py +++ b/paddle_fl/examples/gru4rec_demo/fl_trainer.py @@ -10,7 +10,7 @@ logging.basicConfig(filename="test.log", filemode="w", format="%(asctime)s %(nam trainer_id = int(sys.argv[1]) # trainer id for each guest place = fluid.CPUPlace() -train_file_dir = "mid_data/node1/0/" +train_file_dir = "mid_data/node4/%d/" % trainer_id job_path = "fl_job_config" job = FLRunTimeJob() job.load_trainer_job(job_path, trainer_id) @@ -18,13 +18,23 @@ trainer = FLTrainerFactory().create_fl_trainer(job) trainer.start() r = Gru4rec_Reader() -train_reader = r.reader(train_file_dir, place) +train_reader = r.reader(train_file_dir, place, batch_size = 125) +output_folder = "model_node4" step_i = 0 while not trainer.stop(): step_i += 1 print("batch %d start train" % (step_i)) for data in train_reader(): - print(data) - trainer.run(feed=data, - fetch=[]) + #print(np.array(data['src_wordseq'])) + ret_avg_cost = trainer.run(feed=data, + fetch=["mean_0.tmp_0"]) + avg_ppl = np.exp(ret_avg_cost[0]) + newest_ppl = np.mean(avg_ppl) + print("ppl:%.3f" % (newest_ppl)) + save_dir = (output_folder + "/epoch_%d") % step_i + if trainer_id == 0: + print("start save") + trainer.save_inference_program(save_dir) + if step_i >= 40: + break diff --git a/paddle_fl/examples/gru4rec_demo/run.sh b/paddle_fl/examples/gru4rec_demo/run.sh index a8de7316f2957c15747aa7b60e1849c6cac644b2..17d233d93dbe35123213ec89f604ca0094e4fd02 100644 --- a/paddle_fl/examples/gru4rec_demo/run.sh +++ b/paddle_fl/examples/gru4rec_demo/run.sh @@ -7,3 +7,7 @@ sleep 2 python -u fl_trainer.py 0 >trainer0.log & sleep 2 python -u fl_trainer.py 1 >trainer1.log & +sleep 2 +python -u fl_trainer.py 2 >trainer2.log & +sleep 2 +python -u fl_trainer.py 3 >trainer3.log &