未验证 提交 6231035e 编写于 作者: D Darcy 提交者: GitHub

Merge pull request #10037 from putcn/fix-command-escape

fix training parameter issue
......@@ -84,7 +84,8 @@ putcn/paddle_aws_client \
--security_group_id <your security group id> \
--docker_image myreponame/paddle_benchmark \
--pserver_count 2 \
--trainer_count 2
--trainer_count 2 \
--trainer_command batch_size:20,local:no,device:CPU
```
Now just wait until you see this:
......
......@@ -80,7 +80,11 @@ parser.add_argument(
use ami-1ae93962 for us-east-2")
parser.add_argument(
'--pserver_command', type=str, default="", help="pserver start command")
'--pserver_command',
type=str,
default="",
help="pserver start command, format example: python,vgg.py,batch_size:128,is_local:yes"
)
parser.add_argument(
'--trainer_image_id',
......@@ -90,7 +94,11 @@ parser.add_argument(
use ami-1ae93962 for us-west-2")
parser.add_argument(
'--trainer_command', type=str, default="", help="trainer start command")
'--trainer_command',
type=str,
default="",
help="trainer start command, format example: python,vgg.py,batch_size:128,is_local:yes"
)
parser.add_argument(
'--availability_zone',
......
......@@ -19,6 +19,7 @@ import math
import time
import threading
import logging
import copy
import netaddr
import boto3
......@@ -257,6 +258,8 @@ def script_to_str(file_path):
def run_instances(image_id, instance_type, count, role, cmd=""):
if count == 0:
return []
response = ec2client.run_instances(
ImageId=image_id,
InstanceType=instance_type,
......@@ -334,6 +337,22 @@ def log_to_file(source, filename):
log_file.write(line)
def parse_command(command_raw, defaults=None):
    """Translate a comma-separated command spec into a command-line string.

    Segments that contain ":" are treated as ``key:value`` parameters and
    rendered as ``--key value``; every other segment is kept verbatim, in
    its original order, ahead of the parameters. For example
    ``python,vgg.py,batch_size:128`` becomes
    ``python vgg.py --batch_size 128``.

    Args:
        command_raw: The raw spec string. ``None`` or ``""`` contributes
            no segments.
        defaults: Optional mapping of default parameters. It is never
            modified; a parameter given in ``command_raw`` overrides the
            default with the same key.

    Returns:
        The verbatim segments followed by the ``--key value`` pairs, all
        joined by single spaces.
    """
    # Work on a private copy so the caller's defaults are never mutated.
    parameter_map = dict(defaults) if defaults else {}
    commands_processed = []
    for seg in (command_raw or "").split(","):
        if not seg:
            # "".split(",") yields [""]; dropping empty segments avoids a
            # stray leading/double space in the generated command.
            continue
        if ":" in seg:
            # Split on the first colon only so values such as
            # "127.0.0.1:6174" survive intact.
            key, _, val = seg.partition(":")
            parameter_map[key] = val
        else:
            commands_processed.append(seg)
    # items() exists on both Python 2 and 3; iteritems() is Python 2 only.
    for key, val in parameter_map.items():
        commands_processed.append("--" + key + " " + str(val))
    return " ".join(commands_processed)
def create_trainers(kickoff_cmd, pserver_endpoints_str):
def create_and_start_trainer(trainer_index):
logging.info("trainer " + str(trainer_index) + " is starting")
......@@ -361,7 +380,7 @@ def create_trainers(kickoff_cmd, pserver_endpoints_str):
TRAINER_INDEX=str(trainer_index),
TASK_NAME=args.task_name,
TRAINER_COUNT=args.trainer_count,
COMMAND=args.trainer_command,
COMMAND=parse_command(args.trainer_command, {"device": "GPU"}),
MASTER_ENDPOINT=args.master_server_ip + ":" +
str(args.master_server_port))
logging.info(cmd)
......@@ -476,7 +495,7 @@ def kickoff_pserver(host, pserver_endpoints_str):
DOCKER_IMAGE=args.docker_image,
PSERVER_PORT=args.pserver_port,
TASK_NAME=args.task_name,
COMMAND=args.pserver_command,
COMMAND=parse_command(args.pserver_command, {"device": "CPU"}),
TRAINER_COUNT=args.trainer_count,
TRAINER_INDEX=0,
# there is no way to use 0.0.0.0:port to start pserver
......
#!/bin/bash
docker run --network="host" -i -e "SERVER_ENDPOINT={SERVER_ENDPOINT}" -e "MASTER_ENDPOINT={MASTER_ENDPOINT}" -e "TASK_NAME={TASK_NAME}" -e "TRAINER_INDEX={TRAINER_INDEX}" -e "TRAINING_ROLE=PSERVER" -e "TRAINER_COUNT={TRAINER_COUNT}" -e "TRAINERS={TRAINER_COUNT}" -e "PSERVER_HOSTS={PSERVER_HOSTS}" -e "PSERVERS={PSERVER_HOSTS}" {DOCKER_IMAGE} {COMMAND} --device CPU
\ No newline at end of file
docker run --network="host" -i -e "SERVER_ENDPOINT={SERVER_ENDPOINT}" -e "MASTER_ENDPOINT={MASTER_ENDPOINT}" -e "TASK_NAME={TASK_NAME}" -e "TRAINER_INDEX={TRAINER_INDEX}" -e "TRAINING_ROLE=PSERVER" -e "TRAINER_COUNT={TRAINER_COUNT}" -e "TRAINERS={TRAINER_COUNT}" -e "PSERVER_HOSTS={PSERVER_HOSTS}" -e "PSERVERS={PSERVER_HOSTS}" {DOCKER_IMAGE} {COMMAND}
\ No newline at end of file
#!/bin/bash
nvidia-docker run --network="host" -i -e "MASTER_ENDPOINT={MASTER_ENDPOINT}" -e "TASK_NAME={TASK_NAME}" -e "TRAINER_COUNT={TRAINER_COUNT}" -e "TRAINERS={TRAINER_COUNT}" -e "TRAINER_INDEX={TRAINER_INDEX}" -e "PADDLE_INIT_TRAINER_ID={TRAINER_INDEX}" -e "TRAINING_ROLE=TRAINER" -e "PSERVER_HOSTS={PSERVER_HOSTS}" -e "PSERVERS={PSERVER_HOSTS}" {DOCKER_IMAGE} {COMMAND} --device GPU
\ No newline at end of file
nvidia-docker run --network="host" -i -e "MASTER_ENDPOINT={MASTER_ENDPOINT}" -e "TASK_NAME={TASK_NAME}" -e "TRAINER_COUNT={TRAINER_COUNT}" -e "TRAINERS={TRAINER_COUNT}" -e "TRAINER_INDEX={TRAINER_INDEX}" -e "PADDLE_INIT_TRAINER_ID={TRAINER_INDEX}" -e "TRAINING_ROLE=TRAINER" -e "PSERVER_HOSTS={PSERVER_HOSTS}" -e "PSERVERS={PSERVER_HOSTS}" {DOCKER_IMAGE} {COMMAND}
\ No newline at end of file
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册