diff --git a/tools/aws_benchmarking/server/cluster_master.py b/tools/aws_benchmarking/server/cluster_master.py index 798228b35ae7aae0f159c540da77e926f138fa0f..21f85a5fc43e951897eb6b785367630abda722c0 100644 --- a/tools/aws_benchmarking/server/cluster_master.py +++ b/tools/aws_benchmarking/server/cluster_master.py @@ -478,6 +478,9 @@ def kickoff_pserver(host, pserver_endpoints_str): TASK_NAME=args.task_name, COMMAND=args.pserver_command, TRAINER_COUNT=args.trainer_count, + TRAINER_INDEX=0, + # the pserver cannot be started on 0.0.0.0:port, so its container is run with + # docker --network="host" and SERVER_ENDPOINT uses the host ip SERVER_ENDPOINT=host + ":" + str(args.pserver_port), MASTER_ENDPOINT=args.master_server_ip + ":" + str(args.master_server_port)) @@ -588,7 +591,7 @@ def start_server(args): logging.info("Received request to return status") with open(args.log_path + "master.log", "r") as logfile: self.wfile.write(logfile.read().strip()) - elif request_path == "/list_logs": + elif request_path == "/list_logs" or request_path == "/logs": self._set_headers() self.wfile.write("\n".join(log_files)) elif "/log/" in request_path: diff --git a/tools/aws_benchmarking/server/pserver.sh.template b/tools/aws_benchmarking/server/pserver.sh.template index 5e46a4246f1a31261654dc52d4ab30ccd89f1957..e648ecaac18ebb2969122622ed2a6adb64795dd1 100644 --- a/tools/aws_benchmarking/server/pserver.sh.template +++ b/tools/aws_benchmarking/server/pserver.sh.template @@ -1,2 +1,2 @@ #!/bin/bash -nvidia-docker run -i -p {PSERVER_PORT}:{PSERVER_PORT} -e "SERVER_ENDPOINT={SERVER_ENDPOINT}" -e "MASTER_ENDPOINT={MASTER_ENDPOINT}" -e "TASK_NAME={TASK_NAME}" -e "TRAINING_ROLE=PSERVER" -e "TRAINERS={TRAINER_COUNT}" -e "PSERVER_HOSTS={PSERVER_HOSTS}" -e "PSERVERS={PSERVER_HOSTS}" {DOCKER_IMAGE} {COMMAND} \ No newline at end of file +docker run --network="host" -i -e "SERVER_ENDPOINT={SERVER_ENDPOINT}" -e "MASTER_ENDPOINT={MASTER_ENDPOINT}" -e "TASK_NAME={TASK_NAME}" -e
"TRAINER_INDEX={TRAINER_INDEX}" -e "TRAINING_ROLE=PSERVER" -e "TRAINER_COUNT={TRAINER_COUNT}" -e "TRAINERS={TRAINER_COUNT}" -e "PSERVER_HOSTS={PSERVER_HOSTS}" -e "PSERVERS={PSERVER_HOSTS}" {DOCKER_IMAGE} {COMMAND} --device CPU \ No newline at end of file diff --git a/tools/aws_benchmarking/server/trainer.sh.template b/tools/aws_benchmarking/server/trainer.sh.template index 56405a8e31d0c89c90168ce5126afc5b3206da0f..4ece636a087adc7628c216859bcf9446e861ae7e 100644 --- a/tools/aws_benchmarking/server/trainer.sh.template +++ b/tools/aws_benchmarking/server/trainer.sh.template @@ -1,2 +1,2 @@ #!/bin/bash -nvidia-docker run -i -e "MASTER_ENDPOINT={MASTER_ENDPOINT}" -e "TASK_NAME={TASK_NAME}" -e "TRAINER_COUNT={TRAINER_COUNT}" -e "TRAINER_INDEX={TRAINER_INDEX}" -e "TRAINING_ROLE=TRAINER" -e "PSERVER_HOSTS={PSERVER_HOSTS}" {DOCKER_IMAGE} {COMMAND} \ No newline at end of file +nvidia-docker run -i -e "MASTER_ENDPOINT={MASTER_ENDPOINT}" -e "TASK_NAME={TASK_NAME}" -e "TRAINER_COUNT={TRAINER_COUNT}" -e "TRAINERS={TRAINER_COUNT}" -e "TRAINER_INDEX={TRAINER_INDEX}" -e "TRAINING_ROLE=TRAINER" -e "PSERVER_HOSTS={PSERVER_HOSTS}" {DOCKER_IMAGE} {COMMAND} --device GPU \ No newline at end of file