diff --git a/core/trainers/framework/runner.py b/core/trainers/framework/runner.py index 4e64004350ac419a5c50516b4ee0a45459460f7b..839e3ed4d6e04b13f69e6c2cfc463e83aef130f7 100644 --- a/core/trainers/framework/runner.py +++ b/core/trainers/framework/runner.py @@ -209,12 +209,10 @@ class RunnerBase(object): if save_step_interval >= 1 and batch_id % save_step_interval == 0 and context[ "is_infer"] == False: - if context["fleet_mode"]: - if context["fleet_mode"].upper() == "PS": - train_prog = context["model"][model_dict[ - "name"]]["main_program"] - elif not context["is_fleet"] or context[ - "fleet_mode"].upper() == "COLLECTIVE": + if context["fleet_mode"].upper() == "PS": + train_prog = context["model"][model_dict["name"]][ + "main_program"] + else: train_prog = context["model"][model_dict["name"]][ "default_main_program"] startup_prog = context["model"][model_dict["name"]][ diff --git a/models/rank/dnn/config.yaml b/models/rank/dnn/config.yaml index f0c82462485cfda69882894d16cdfadffb872c89..75826684dbc0734e4acf40983bbc837c7b97ac84 100755 --- a/models/rank/dnn/config.yaml +++ b/models/rank/dnn/config.yaml @@ -114,15 +114,13 @@ runner: print_interval: 1 phases: [phase1] -- name: local_ps_train - class: local_cluster_train +- name: single_multi_gpu_train + class: train # num of epochs epochs: 1 # device to run training or infer - device: cpu - selected_gpus: "0" # 选择多卡执行训练 - work_num: 1 - server_num: 1 + device: gpu + selected_gpus: "0,1" # 选择多卡执行训练 save_checkpoint_interval: 1 # save model interval of epochs save_inference_interval: 4 # save inference save_step_interval: 1