From b8577266c584e2a785d45a2756c3f4f6efc05a99 Mon Sep 17 00:00:00 2001 From: Xi Chen Date: Fri, 13 Apr 2018 15:31:41 -0700 Subject: [PATCH] change pserver to use regular docker and some other tweaks --- tools/aws_benchmarking/server/cluster_master.py | 5 ++++- tools/aws_benchmarking/server/pserver.sh.template | 2 +- tools/aws_benchmarking/server/trainer.sh.template | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/tools/aws_benchmarking/server/cluster_master.py b/tools/aws_benchmarking/server/cluster_master.py index 798228b35a..21f85a5fc4 100644 --- a/tools/aws_benchmarking/server/cluster_master.py +++ b/tools/aws_benchmarking/server/cluster_master.py @@ -478,6 +478,9 @@ def kickoff_pserver(host, pserver_endpoints_str): TASK_NAME=args.task_name, COMMAND=args.pserver_command, TRAINER_COUNT=args.trainer_count, + TRAINER_INDEX=0, + # there is no way to use 0.0.0.0:port to start pserver + # has to docker --network="host" with host ip to make this work SERVER_ENDPOINT=host + ":" + str(args.pserver_port), MASTER_ENDPOINT=args.master_server_ip + ":" + str(args.master_server_port)) @@ -588,7 +591,7 @@ def start_server(args): logging.info("Received request to return status") with open(args.log_path + "master.log", "r") as logfile: self.wfile.write(logfile.read().strip()) - elif request_path == "/list_logs": + elif request_path == "/list_logs" or request_path == "/logs": self._set_headers() self.wfile.write("\n".join(log_files)) elif "/log/" in request_path: diff --git a/tools/aws_benchmarking/server/pserver.sh.template b/tools/aws_benchmarking/server/pserver.sh.template index 5e46a4246f..e648ecaac1 100644 --- a/tools/aws_benchmarking/server/pserver.sh.template +++ b/tools/aws_benchmarking/server/pserver.sh.template @@ -1,2 +1,2 @@ #!/bin/bash -nvidia-docker run -i -p {PSERVER_PORT}:{PSERVER_PORT} -e "SERVER_ENDPOINT={SERVER_ENDPOINT}" -e "MASTER_ENDPOINT={MASTER_ENDPOINT}" -e "TASK_NAME={TASK_NAME}" -e "TRAINING_ROLE=PSERVER" -e "TRAINERS={TRAINER_COUNT}" -e "PSERVER_HOSTS={PSERVER_HOSTS}" -e "PSERVERS={PSERVER_HOSTS}" {DOCKER_IMAGE} {COMMAND} \ No newline at end of file +docker run --network="host" -i -p {PSERVER_PORT}:{PSERVER_PORT} -e "SERVER_ENDPOINT={SERVER_ENDPOINT}" -e "MASTER_ENDPOINT={MASTER_ENDPOINT}" -e "TASK_NAME={TASK_NAME}" -e "TRAINER_INDEX={TRAINER_INDEX}" -e "TRAINING_ROLE=PSERVER" -e "TRAINER_COUNT={TRAINER_COUNT}" -e "TRAINERS={TRAINER_COUNT}" -e "PSERVER_HOSTS={PSERVER_HOSTS}" -e "PSERVERS={PSERVER_HOSTS}" {DOCKER_IMAGE} {COMMAND} --device CPU \ No newline at end of file diff --git a/tools/aws_benchmarking/server/trainer.sh.template b/tools/aws_benchmarking/server/trainer.sh.template index 56405a8e31..4ece636a08 100644 --- a/tools/aws_benchmarking/server/trainer.sh.template +++ b/tools/aws_benchmarking/server/trainer.sh.template @@ -1,2 +1,2 @@ #!/bin/bash -nvidia-docker run -i -e "MASTER_ENDPOINT={MASTER_ENDPOINT}" -e "TASK_NAME={TASK_NAME}" -e "TRAINER_COUNT={TRAINER_COUNT}" -e "TRAINER_INDEX={TRAINER_INDEX}" -e "TRAINING_ROLE=TRAINER" -e "PSERVER_HOSTS={PSERVER_HOSTS}" {DOCKER_IMAGE} {COMMAND} \ No newline at end of file +nvidia-docker run -i -e "MASTER_ENDPOINT={MASTER_ENDPOINT}" -e "TASK_NAME={TASK_NAME}" -e "TRAINER_COUNT={TRAINER_COUNT}" -e "TRAINERS={TRAINER_COUNT}" -e "TRAINER_INDEX={TRAINER_INDEX}" -e "TRAINING_ROLE=TRAINER" -e "PSERVER_HOSTS={PSERVER_HOSTS}" {DOCKER_IMAGE} {COMMAND} --device GPU \ No newline at end of file -- GitLab