diff --git a/core/trainers/cluster_trainer.py b/core/trainers/cluster_trainer.py
index a23ba61c43545f7c42f9bcaac554f8df83e697a4..fa4238bf3e4e188df124c298e15211454f2c2ade 100755
--- a/core/trainers/cluster_trainer.py
+++ b/core/trainers/cluster_trainer.py
@@ -19,6 +19,8 @@ Training use fluid with one node only.
 from __future__ import print_function
 
 import os
+import time
+
 import paddle.fluid as fluid
 from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
 from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory
@@ -155,14 +157,21 @@ class ClusterTrainer(TranspileTrainer):
         fleet.init_worker()
 
         dataset = self._get_dataset()
+        ins = self._get_dataset_ins()
+
         epochs = envs.get_global_env("train.epochs")
 
         for i in range(epochs):
+            begin_time = time.time()
             self._exe.train_from_dataset(program=fluid.default_main_program(),
                                          dataset=dataset,
                                          fetch_list=self.fetch_vars,
                                          fetch_info=self.fetch_alias,
                                          print_period=self.fetch_period)
+            end_time = time.time()
+            times = end_time - begin_time
+            print("epoch {} using time {}, speed {:.2f} lines/s".format(i, times, ins / times))
+
             self.save(i, "train", is_fleet=True)
         fleet.stop_worker()
         context['status'] = 'infer_pass'
diff --git a/example/cloud/job.sh b/example/cloud/job.sh
index 3e47eb285eac1b956bf8f840b5c541fad961f950..5f001a6f3005983e189d7a066df997a2e6b77b01 100644
--- a/example/cloud/job.sh
+++ b/example/cloud/job.sh
@@ -9,9 +9,9 @@
 # ---------------------------------------------------------------------------- #
 #                               variable define                                #
 # ---------------------------------------------------------------------------- #
-CPU_NUM=16
-GLOG_v=0
-FLAGS_rpc_deadline=300000
+export CPU_NUM=16
+export GLOG_v=0
+export FLAGS_rpc_deadline=300000
 # ---------------------------------------------------------------------------- #
 
 python -m paddlerec.run -m paddle_rec_config.yaml -e cluster -r worker
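
Note (not part of the patch): the trainer hunk wraps each train_from_dataset pass in wall-clock timing and divides the instance count returned by _get_dataset_ins() by the elapsed seconds to log throughput. Below is a minimal, self-contained sketch of that pattern; timed_epochs, run_epoch, and num_instances are hypothetical names standing in for the trainer loop, self._exe.train_from_dataset(...), and the _get_dataset_ins() result, not PaddleRec APIs.

    import time

    def timed_epochs(num_epochs, num_instances, run_epoch):
        # Mirror the patched loop: time each epoch, then report
        # instances processed per second in the same log format.
        for i in range(num_epochs):
            begin_time = time.time()
            run_epoch()  # stands in for train_from_dataset(...)
            elapsed = time.time() - begin_time
            print("epoch {} using time {}, speed {:.2f} lines/s".format(
                i, elapsed, num_instances / elapsed))

    if __name__ == "__main__":
        # Fake a 0.1 s "epoch" over 10,000 instances to preview the log line.
        timed_epochs(2, 10000, lambda: time.sleep(0.1))

The job.sh hunk matters because a bare CPU_NUM=16 assignment stays local to the shell; only exported variables reach the "python -m paddlerec.run ..." child process, which is where CPU_NUM, GLOG_v, and FLAGS_rpc_deadline are read from the environment.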