提交 1c7d1573 编写于 作者: X Xi Chen

fix training parameter issue

上级 598035f9
...@@ -84,7 +84,8 @@ putcn/paddle_aws_client \ ...@@ -84,7 +84,8 @@ putcn/paddle_aws_client \
--security_group_id <your security group id> \ --security_group_id <your security group id> \
--docker_image myreponame/paddle_benchmark \ --docker_image myreponame/paddle_benchmark \
--pserver_count 2 \ --pserver_count 2 \
--trainer_count 2 --trainer_count 2 \
--trainer_command batch_size:20,is_local:no
``` ```
Now just wait until you see this: Now just wait until you see this:
......
...@@ -80,7 +80,11 @@ parser.add_argument( ...@@ -80,7 +80,11 @@ parser.add_argument(
use ami-1ae93962 for us-east-2") use ami-1ae93962 for us-east-2")
parser.add_argument( parser.add_argument(
'--pserver_command', type=str, default="", help="pserver start command") '--pserver_command',
type=str,
default="",
help="pserver start command, format example: python,vgg.py,batch_size:128,is_local:yes"
)
parser.add_argument( parser.add_argument(
'--trainer_image_id', '--trainer_image_id',
...@@ -90,7 +94,11 @@ parser.add_argument( ...@@ -90,7 +94,11 @@ parser.add_argument(
use ami-1ae93962 for us-west-2") use ami-1ae93962 for us-west-2")
parser.add_argument( parser.add_argument(
'--trainer_command', type=str, default="", help="trainer start command") '--trainer_command',
type=str,
default="",
help="trainer start command, format example: python,vgg.py,batch_size:128,is_local:yes"
)
parser.add_argument( parser.add_argument(
'--availability_zone', '--availability_zone',
......
...@@ -19,6 +19,7 @@ import math ...@@ -19,6 +19,7 @@ import math
import time import time
import threading import threading
import logging import logging
import copy
import netaddr import netaddr
import boto3 import boto3
...@@ -334,6 +335,23 @@ def log_to_file(source, filename): ...@@ -334,6 +335,23 @@ def log_to_file(source, filename):
log_file.write(line) log_file.write(line)
def parse_command(command_raw, defaults=None):
    """Translate a comma-separated command spec into a shell command string.

    Segments of ``command_raw`` are separated by ``,``. A segment containing
    ``:`` is treated as a ``key:value`` parameter and rendered as
    ``--key value``; any other segment is kept verbatim as a positional
    token. For example ``python,vgg.py,batch_size:128,is_local:yes``
    becomes ``python vgg.py --batch_size 128 --is_local yes``.

    Args:
        command_raw: The raw command spec. A falsy value (``None`` or an
            empty string) yields an empty string, and ``defaults`` are
            ignored in that case.
        defaults: Optional mapping of parameter names to values that are
            emitted unless ``command_raw`` overrides them. The mapping is
            never modified.

    Returns:
        The positional tokens followed by all ``--key value`` parameters,
        joined by single spaces.
    """
    if not command_raw:
        return ""

    # Work on a private copy so the caller's defaults are left untouched.
    parameter_map = dict(defaults) if defaults else {}
    commands_processed = []

    for seg in command_raw.split(","):
        if ":" in seg:
            # Split on the first colon only, so values that themselves
            # contain colons (e.g. host:port) are preserved in full.
            key, val = seg.split(":", 1)
            parameter_map[key] = val
        else:
            commands_processed.append(seg)

    # items() is available on both Python 2 and Python 3 dictionaries.
    for key, val in parameter_map.items():
        commands_processed.append("--" + key + " " + str(val))

    return " ".join(commands_processed)
def create_trainers(kickoff_cmd, pserver_endpoints_str): def create_trainers(kickoff_cmd, pserver_endpoints_str):
def create_and_start_trainer(trainer_index): def create_and_start_trainer(trainer_index):
logging.info("trainer " + str(trainer_index) + " is starting") logging.info("trainer " + str(trainer_index) + " is starting")
...@@ -361,7 +379,7 @@ def create_trainers(kickoff_cmd, pserver_endpoints_str): ...@@ -361,7 +379,7 @@ def create_trainers(kickoff_cmd, pserver_endpoints_str):
TRAINER_INDEX=str(trainer_index), TRAINER_INDEX=str(trainer_index),
TASK_NAME=args.task_name, TASK_NAME=args.task_name,
TRAINER_COUNT=args.trainer_count, TRAINER_COUNT=args.trainer_count,
COMMAND=args.trainer_command, COMMAND=parse_command(args.trainer_command, {"device": "GPU"}),
MASTER_ENDPOINT=args.master_server_ip + ":" + MASTER_ENDPOINT=args.master_server_ip + ":" +
str(args.master_server_port)) str(args.master_server_port))
logging.info(cmd) logging.info(cmd)
...@@ -476,7 +494,7 @@ def kickoff_pserver(host, pserver_endpoints_str): ...@@ -476,7 +494,7 @@ def kickoff_pserver(host, pserver_endpoints_str):
DOCKER_IMAGE=args.docker_image, DOCKER_IMAGE=args.docker_image,
PSERVER_PORT=args.pserver_port, PSERVER_PORT=args.pserver_port,
TASK_NAME=args.task_name, TASK_NAME=args.task_name,
COMMAND=args.pserver_command, COMMAND=parse_command(args.pserver_command, {"device": "CPU"}),
TRAINER_COUNT=args.trainer_count, TRAINER_COUNT=args.trainer_count,
TRAINER_INDEX=0, TRAINER_INDEX=0,
# there is no way to use 0.0.0.0:port to start pserver # there is no way to use 0.0.0.0:port to start pserver
......
#!/bin/bash #!/bin/bash
docker run --network="host" -i -e "SERVER_ENDPOINT={SERVER_ENDPOINT}" -e "MASTER_ENDPOINT={MASTER_ENDPOINT}" -e "TASK_NAME={TASK_NAME}" -e "TRAINER_INDEX={TRAINER_INDEX}" -e "TRAINING_ROLE=PSERVER" -e "TRAINER_COUNT={TRAINER_COUNT}" -e "TRAINERS={TRAINER_COUNT}" -e "PSERVER_HOSTS={PSERVER_HOSTS}" -e "PSERVERS={PSERVER_HOSTS}" {DOCKER_IMAGE} {COMMAND} --device CPU docker run --network="host" -i -e "SERVER_ENDPOINT={SERVER_ENDPOINT}" -e "MASTER_ENDPOINT={MASTER_ENDPOINT}" -e "TASK_NAME={TASK_NAME}" -e "TRAINER_INDEX={TRAINER_INDEX}" -e "TRAINING_ROLE=PSERVER" -e "TRAINER_COUNT={TRAINER_COUNT}" -e "TRAINERS={TRAINER_COUNT}" -e "PSERVER_HOSTS={PSERVER_HOSTS}" -e "PSERVERS={PSERVER_HOSTS}" {DOCKER_IMAGE} {COMMAND}
\ No newline at end of file \ No newline at end of file
#!/bin/bash #!/bin/bash
nvidia-docker run --network="host" -i -e "MASTER_ENDPOINT={MASTER_ENDPOINT}" -e "TASK_NAME={TASK_NAME}" -e "TRAINER_COUNT={TRAINER_COUNT}" -e "TRAINERS={TRAINER_COUNT}" -e "TRAINER_INDEX={TRAINER_INDEX}" -e "PADDLE_INIT_TRAINER_ID={TRAINER_INDEX}" -e "TRAINING_ROLE=TRAINER" -e "PSERVER_HOSTS={PSERVER_HOSTS}" -e "PSERVERS={PSERVER_HOSTS}" {DOCKER_IMAGE} {COMMAND} --device GPU nvidia-docker run --network="host" -i -e "MASTER_ENDPOINT={MASTER_ENDPOINT}" -e "TASK_NAME={TASK_NAME}" -e "TRAINER_COUNT={TRAINER_COUNT}" -e "TRAINERS={TRAINER_COUNT}" -e "TRAINER_INDEX={TRAINER_INDEX}" -e "PADDLE_INIT_TRAINER_ID={TRAINER_INDEX}" -e "TRAINING_ROLE=TRAINER" -e "PSERVER_HOSTS={PSERVER_HOSTS}" -e "PSERVERS={PSERVER_HOSTS}" {DOCKER_IMAGE} {COMMAND}
\ No newline at end of file \ No newline at end of file
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册