From e2d2db2246deeda04c1d4381190c263f68811cb3 Mon Sep 17 00:00:00 2001 From: tangwei Date: Thu, 14 May 2020 15:08:56 +0800 Subject: [PATCH] add paddle cloud run --- core/trainers/cluster_trainer.py | 9 +++++++++ example/cloud/job.sh | 6 +++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/core/trainers/cluster_trainer.py b/core/trainers/cluster_trainer.py index a23ba61c..fa4238bf 100755 --- a/core/trainers/cluster_trainer.py +++ b/core/trainers/cluster_trainer.py @@ -19,6 +19,8 @@ Training use fluid with one node only. from __future__ import print_function import os +import time + import paddle.fluid as fluid from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory @@ -155,14 +157,21 @@ class ClusterTrainer(TranspileTrainer): fleet.init_worker() dataset = self._get_dataset() + ins = self._get_dataset_ins() + epochs = envs.get_global_env("train.epochs") for i in range(epochs): + begin_time = time.time() self._exe.train_from_dataset(program=fluid.default_main_program(), dataset=dataset, fetch_list=self.fetch_vars, fetch_info=self.fetch_alias, print_period=self.fetch_period) + end_time = time.time() + times = end_time-begin_time + print("epoch {} using time {}, speed {:.2f} lines/s".format(i, times, ins/times)) + self.save(i, "train", is_fleet=True) fleet.stop_worker() context['status'] = 'infer_pass' diff --git a/example/cloud/job.sh b/example/cloud/job.sh index 3e47eb28..5f001a6f 100644 --- a/example/cloud/job.sh +++ b/example/cloud/job.sh @@ -9,9 +9,9 @@ # ---------------------------------------------------------------------------- # # variable define # # ---------------------------------------------------------------------------- # -CPU_NUM=16 -GLOG_v=0 -FLAGS_rpc_deadline=300000 +export CPU_NUM=16 +export GLOG_v=0 +export FLAGS_rpc_deadline=300000 # ---------------------------------------------------------------------------- # python -m paddlerec.run -m paddle_rec_config.yaml -e cluster -r worker -- GitLab