From 1c7d15737b24d2dc427e4a97080a7e33bfe5bce1 Mon Sep 17 00:00:00 2001
From: Xi Chen
Date: Wed, 18 Apr 2018 15:01:53 -0700
Subject: [PATCH] fix training parameter issue

---
 tools/aws_benchmarking/README.md              |  3 ++-
 .../client/cluster_launcher.py                | 12 ++++++++--
 .../aws_benchmarking/server/cluster_master.py | 22 +++++++++++++++++--
 .../server/pserver.sh.template                |  2 +-
 .../server/trainer.sh.template                |  2 +-
 5 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/tools/aws_benchmarking/README.md b/tools/aws_benchmarking/README.md
index 837fcbb851..7d7ce7278e 100644
--- a/tools/aws_benchmarking/README.md
+++ b/tools/aws_benchmarking/README.md
@@ -84,7 +84,8 @@ putcn/paddle_aws_client \
 --security_group_id \
 --docker_image myreponame/paddle_benchmark \
 --pserver_count 2 \
---trainer_count 2
+--trainer_count 2 \
+--trainer_command batch_size:20,is_local:no
 ```
 
 Now just wait until you see this:
diff --git a/tools/aws_benchmarking/client/cluster_launcher.py b/tools/aws_benchmarking/client/cluster_launcher.py
index 594378ff8f..12333202b9 100644
--- a/tools/aws_benchmarking/client/cluster_launcher.py
+++ b/tools/aws_benchmarking/client/cluster_launcher.py
@@ -80,7 +80,11 @@ parser.add_argument(
     use ami-1ae93962 for us-east-2")
 
 parser.add_argument(
-    '--pserver_command', type=str, default="", help="pserver start command")
+    '--pserver_command',
+    type=str,
+    default="",
+    help="pserver start command, format example: python,vgg.py,batch_size:128,is_local:yes"
+)
 
 parser.add_argument(
     '--trainer_image_id',
@@ -90,7 +94,11 @@ parser.add_argument(
     use ami-1ae93962 for us-west-2")
 
 parser.add_argument(
-    '--trainer_command', type=str, default="", help="trainer start command")
+    '--trainer_command',
+    type=str,
+    default="",
+    help="trainer start command, format example: python,vgg.py,batch_size:128,is_local:yes"
+)
 
 parser.add_argument(
     '--availability_zone',
diff --git a/tools/aws_benchmarking/server/cluster_master.py b/tools/aws_benchmarking/server/cluster_master.py
index 21f85a5fc4..f3454a1b21 100644
--- a/tools/aws_benchmarking/server/cluster_master.py
+++ b/tools/aws_benchmarking/server/cluster_master.py
@@ -19,6 +19,7 @@ import math
 import time
 import threading
 import logging
+import copy
 
 import netaddr
 import boto3
@@ -334,6 +335,23 @@ def log_to_file(source, filename):
             log_file.write(line)
 
 
+def parse_command(command_raw, defaults={}):
+    if not command_raw:
+        return ""
+    commands_processed = []
+    parameter_map = copy.copy(defaults)
+    for seg in command_raw.split(","):
+        if ":" in seg:
+            parameters = seg.split(":")
+            parameter_map[parameters[0]] = parameters[1]
+            #seg = "--" + seg.replace(":", " ")
+        else:
+            commands_processed.append(seg)
+    for key, val in parameter_map.iteritems():
+        commands_processed.append("--" + key + " " + val)
+    return " ".join(commands_processed)
+
+
 def create_trainers(kickoff_cmd, pserver_endpoints_str):
     def create_and_start_trainer(trainer_index):
         logging.info("trainer " + str(trainer_index) + " is starting")
@@ -361,7 +379,7 @@ def create_trainers(kickoff_cmd, pserver_endpoints_str):
                 TRAINER_INDEX=str(trainer_index),
                 TASK_NAME=args.task_name,
                 TRAINER_COUNT=args.trainer_count,
-                COMMAND=args.trainer_command,
+                COMMAND=parse_command(args.trainer_command, {"device": "GPU"}),
                 MASTER_ENDPOINT=args.master_server_ip + ":" +
                 str(args.master_server_port))
             logging.info(cmd)
@@ -476,7 +494,7 @@ def kickoff_pserver(host, pserver_endpoints_str):
                 DOCKER_IMAGE=args.docker_image,
                 PSERVER_PORT=args.pserver_port,
                 TASK_NAME=args.task_name,
-                COMMAND=args.pserver_command,
+                COMMAND=parse_command(args.pserver_command, {"device": "CPU"}),
                 TRAINER_COUNT=args.trainer_count,
                 TRAINER_INDEX=0,
                 # there is no way to use 0.0.0.0:port to start pserver
diff --git a/tools/aws_benchmarking/server/pserver.sh.template b/tools/aws_benchmarking/server/pserver.sh.template
index 2612856d1e..8d7f9e84c7 100644
--- a/tools/aws_benchmarking/server/pserver.sh.template
+++ b/tools/aws_benchmarking/server/pserver.sh.template
@@ -1,2 +1,2 @@
 #!/bin/bash
-docker run --network="host" -i -e "SERVER_ENDPOINT={SERVER_ENDPOINT}" -e "MASTER_ENDPOINT={MASTER_ENDPOINT}" -e "TASK_NAME={TASK_NAME}" -e "TRAINER_INDEX={TRAINER_INDEX}" -e "TRAINING_ROLE=PSERVER" -e "TRAINER_COUNT={TRAINER_COUNT}" -e "TRAINERS={TRAINER_COUNT}" -e "PSERVER_HOSTS={PSERVER_HOSTS}" -e "PSERVERS={PSERVER_HOSTS}" {DOCKER_IMAGE} {COMMAND} --device CPU
\ No newline at end of file
+docker run --network="host" -i -e "SERVER_ENDPOINT={SERVER_ENDPOINT}" -e "MASTER_ENDPOINT={MASTER_ENDPOINT}" -e "TASK_NAME={TASK_NAME}" -e "TRAINER_INDEX={TRAINER_INDEX}" -e "TRAINING_ROLE=PSERVER" -e "TRAINER_COUNT={TRAINER_COUNT}" -e "TRAINERS={TRAINER_COUNT}" -e "PSERVER_HOSTS={PSERVER_HOSTS}" -e "PSERVERS={PSERVER_HOSTS}" {DOCKER_IMAGE} {COMMAND}
\ No newline at end of file
diff --git a/tools/aws_benchmarking/server/trainer.sh.template b/tools/aws_benchmarking/server/trainer.sh.template
index a4b2876b08..9b0aae9f7a 100644
--- a/tools/aws_benchmarking/server/trainer.sh.template
+++ b/tools/aws_benchmarking/server/trainer.sh.template
@@ -1,2 +1,2 @@
 #!/bin/bash
-nvidia-docker run --network="host" -i -e "MASTER_ENDPOINT={MASTER_ENDPOINT}" -e "TASK_NAME={TASK_NAME}" -e "TRAINER_COUNT={TRAINER_COUNT}" -e "TRAINERS={TRAINER_COUNT}" -e "TRAINER_INDEX={TRAINER_INDEX}" -e "PADDLE_INIT_TRAINER_ID={TRAINER_INDEX}" -e "TRAINING_ROLE=TRAINER" -e "PSERVER_HOSTS={PSERVER_HOSTS}" -e "PSERVERS={PSERVER_HOSTS}" {DOCKER_IMAGE} {COMMAND} --device GPU
\ No newline at end of file
+nvidia-docker run --network="host" -i -e "MASTER_ENDPOINT={MASTER_ENDPOINT}" -e "TASK_NAME={TASK_NAME}" -e "TRAINER_COUNT={TRAINER_COUNT}" -e "TRAINERS={TRAINER_COUNT}" -e "TRAINER_INDEX={TRAINER_INDEX}" -e "PADDLE_INIT_TRAINER_ID={TRAINER_INDEX}" -e "TRAINING_ROLE=TRAINER" -e "PSERVER_HOSTS={PSERVER_HOSTS}" -e "PSERVERS={PSERVER_HOSTS}" {DOCKER_IMAGE} {COMMAND}
\ No newline at end of file
--
GitLab
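
For reference, a minimal standalone sketch of the `parse_command` helper added in `cluster_master.py` above, ported to Python 3 for illustration (`dict.items()` replaces `iteritems()`, and the mutable default argument is avoided). The sample input reuses the `--trainer_command` value from the README change; the shown output assumes insertion-ordered dicts (Python 3.7+), so flag order may differ on other interpreters.

```python
# Standalone sketch (Python 3) of the parse_command helper from the patch above.
# It expands a comma-separated command string into a shell command line:
# plain segments are kept in order, key:value segments become "--key value"
# flags, and the defaults dict supplies flags that used to be hard-coded in
# the pserver/trainer shell templates (e.g. "--device GPU").
import copy


def parse_command(command_raw, defaults=None):
    if not command_raw:
        return ""
    commands_processed = []
    parameter_map = copy.copy(defaults or {})  # defaults=None avoids a mutable default argument
    for seg in command_raw.split(","):
        if ":" in seg:
            parameters = seg.split(":")
            parameter_map[parameters[0]] = parameters[1]
        else:
            commands_processed.append(seg)
    for key, val in parameter_map.items():  # .iteritems() in the original Python 2 code
        commands_processed.append("--" + key + " " + val)
    return " ".join(commands_processed)


# The trainer command from the README example, with the GPU default that
# trainer.sh.template previously appended as a literal "--device GPU".
print(parse_command("batch_size:20,is_local:no", {"device": "GPU"}))
# -> "--device GPU --batch_size 20 --is_local no"
```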