diff --git a/tools/aws_benchmarking/README.md b/tools/aws_benchmarking/README.md index 837fcbb8512bce027ecd09a7f39b806151e9154b..22a468466afbcbf7cc312e714e41a3b5adf1160c 100644 --- a/tools/aws_benchmarking/README.md +++ b/tools/aws_benchmarking/README.md @@ -84,7 +84,8 @@ putcn/paddle_aws_client \ --security_group_id \ --docker_image myreponame/paddle_benchmark \ --pserver_count 2 \ ---trainer_count 2 +--trainer_count 2 \ +--trainer_command batch_size:20,local:no,device:CPU ``` Now just wait until you see this: diff --git a/tools/aws_benchmarking/client/cluster_launcher.py b/tools/aws_benchmarking/client/cluster_launcher.py index 594378ff8fc0744a4b11b1c11e2e3b270be7aed0..12333202b9f003ae5109c7e9b825035ba8eb7d99 100644 --- a/tools/aws_benchmarking/client/cluster_launcher.py +++ b/tools/aws_benchmarking/client/cluster_launcher.py @@ -80,7 +80,11 @@ parser.add_argument( use ami-1ae93962 for us-east-2") parser.add_argument( - '--pserver_command', type=str, default="", help="pserver start command") + '--pserver_command', + type=str, + default="", + help="pserver start command, format example: python,vgg.py,batch_size:128,is_local:yes" +) parser.add_argument( '--trainer_image_id', @@ -90,7 +94,11 @@ parser.add_argument( use ami-1ae93962 for us-west-2") parser.add_argument( - '--trainer_command', type=str, default="", help="trainer start command") + '--trainer_command', + type=str, + default="", + help="trainer start command, format example: python,vgg.py,batch_size:128,is_local:yes" +) parser.add_argument( '--availability_zone', diff --git a/tools/aws_benchmarking/server/cluster_master.py b/tools/aws_benchmarking/server/cluster_master.py index 21f85a5fc43e951897eb6b785367630abda722c0..7952e61159ec31a4be5394b50f30cbc20f9b414e 100644 --- a/tools/aws_benchmarking/server/cluster_master.py +++ b/tools/aws_benchmarking/server/cluster_master.py @@ -19,6 +19,7 @@ import math import time import threading import logging +import copy import netaddr import boto3 @@ -257,6 +258,8 @@ def script_to_str(file_path): def run_instances(image_id, instance_type, count, role, cmd=""): + if count == 0: + return [] response = ec2client.run_instances( ImageId=image_id, InstanceType=instance_type, @@ -334,6 +337,22 @@ def log_to_file(source, filename): log_file.write(line) +def parse_command(command_raw, defaults={}): + if not command_raw: + command_raw = "" + commands_processed = [] + parameter_map = copy.copy(defaults) + for seg in command_raw.split(","): + if ":" in seg: + parameters = seg.split(":") + parameter_map[parameters[0]] = parameters[1] + else: + commands_processed.append(seg) + for key, val in parameter_map.iteritems(): + commands_processed.append("--" + key + " " + str(val)) + return " ".join(commands_processed) + + def create_trainers(kickoff_cmd, pserver_endpoints_str): def create_and_start_trainer(trainer_index): logging.info("trainer " + str(trainer_index) + " is starting") @@ -361,7 +380,7 @@ def create_trainers(kickoff_cmd, pserver_endpoints_str): TRAINER_INDEX=str(trainer_index), TASK_NAME=args.task_name, TRAINER_COUNT=args.trainer_count, - COMMAND=args.trainer_command, + COMMAND=parse_command(args.trainer_command, {"device": "GPU"}), MASTER_ENDPOINT=args.master_server_ip + ":" + str(args.master_server_port)) logging.info(cmd) @@ -476,7 +495,7 @@ def kickoff_pserver(host, pserver_endpoints_str): DOCKER_IMAGE=args.docker_image, PSERVER_PORT=args.pserver_port, TASK_NAME=args.task_name, - COMMAND=args.pserver_command, + COMMAND=parse_command(args.pserver_command, {"device": "CPU"}), TRAINER_COUNT=args.trainer_count, TRAINER_INDEX=0, # there is no way to use 0.0.0.0:port to start pserver diff --git a/tools/aws_benchmarking/server/pserver.sh.template b/tools/aws_benchmarking/server/pserver.sh.template index 2612856d1e6273fe2642f82e8c616eb9ff24f8a4..8d7f9e84c768b096537c92a448a117d91903f25b 100644 --- a/tools/aws_benchmarking/server/pserver.sh.template +++ b/tools/aws_benchmarking/server/pserver.sh.template @@ -1,2 +1,2 @@ #!/bin/bash -docker run --network="host" -i -e "SERVER_ENDPOINT={SERVER_ENDPOINT}" -e "MASTER_ENDPOINT={MASTER_ENDPOINT}" -e "TASK_NAME={TASK_NAME}" -e "TRAINER_INDEX={TRAINER_INDEX}" -e "TRAINING_ROLE=PSERVER" -e "TRAINER_COUNT={TRAINER_COUNT}" -e "TRAINERS={TRAINER_COUNT}" -e "PSERVER_HOSTS={PSERVER_HOSTS}" -e "PSERVERS={PSERVER_HOSTS}" {DOCKER_IMAGE} {COMMAND} --device CPU \ No newline at end of file +docker run --network="host" -i -e "SERVER_ENDPOINT={SERVER_ENDPOINT}" -e "MASTER_ENDPOINT={MASTER_ENDPOINT}" -e "TASK_NAME={TASK_NAME}" -e "TRAINER_INDEX={TRAINER_INDEX}" -e "TRAINING_ROLE=PSERVER" -e "TRAINER_COUNT={TRAINER_COUNT}" -e "TRAINERS={TRAINER_COUNT}" -e "PSERVER_HOSTS={PSERVER_HOSTS}" -e "PSERVERS={PSERVER_HOSTS}" {DOCKER_IMAGE} {COMMAND} \ No newline at end of file diff --git a/tools/aws_benchmarking/server/trainer.sh.template b/tools/aws_benchmarking/server/trainer.sh.template index a4b2876b08cdf05e90e50589f897d74ca5f90443..9b0aae9f7a7a879f164b380f719065302e0eb7e2 100644 --- a/tools/aws_benchmarking/server/trainer.sh.template +++ b/tools/aws_benchmarking/server/trainer.sh.template @@ -1,2 +1,2 @@ #!/bin/bash -nvidia-docker run --network="host" -i -e "MASTER_ENDPOINT={MASTER_ENDPOINT}" -e "TASK_NAME={TASK_NAME}" -e "TRAINER_COUNT={TRAINER_COUNT}" -e "TRAINERS={TRAINER_COUNT}" -e "TRAINER_INDEX={TRAINER_INDEX}" -e "PADDLE_INIT_TRAINER_ID={TRAINER_INDEX}" -e "TRAINING_ROLE=TRAINER" -e "PSERVER_HOSTS={PSERVER_HOSTS}" -e "PSERVERS={PSERVER_HOSTS}" {DOCKER_IMAGE} {COMMAND} --device GPU \ No newline at end of file +nvidia-docker run --network="host" -i -e "MASTER_ENDPOINT={MASTER_ENDPOINT}" -e "TASK_NAME={TASK_NAME}" -e "TRAINER_COUNT={TRAINER_COUNT}" -e "TRAINERS={TRAINER_COUNT}" -e "TRAINER_INDEX={TRAINER_INDEX}" -e "PADDLE_INIT_TRAINER_ID={TRAINER_INDEX}" -e "TRAINING_ROLE=TRAINER" -e "PSERVER_HOSTS={PSERVER_HOSTS}" -e "PSERVERS={PSERVER_HOSTS}" {DOCKER_IMAGE} {COMMAND} \ No newline at end of file