From 45d87ade441b786c8d6cac20a6234816fe5a2019 Mon Sep 17 00:00:00 2001 From: Xi Chen Date: Thu, 12 Apr 2018 20:55:01 -0700 Subject: [PATCH] minor tweaks --- .../client/cluster_launcher.py | 21 ++++++++-- .../aws_benchmarking/server/cluster_master.py | 41 ++++++++++++------- 2 files changed, 44 insertions(+), 18 deletions(-) diff --git a/tools/aws_benchmarking/client/cluster_launcher.py b/tools/aws_benchmarking/client/cluster_launcher.py index bbabd982465..3a6cc57b3ab 100644 --- a/tools/aws_benchmarking/client/cluster_launcher.py +++ b/tools/aws_benchmarking/client/cluster_launcher.py @@ -49,8 +49,8 @@ parser.add_argument( parser.add_argument( '--pserver_instance_type', type=str, - default="p2.8xlarge", - help="your pserver instance type, p2.8xlarge by default") + default="c5.2xlarge", + help="your pserver instance type, c5.2xlarge by default") parser.add_argument( '--trainer_instance_type', type=str, @@ -68,6 +68,10 @@ parser.add_argument( default="ami-da2c1cbf", help="ami id for system image, default one has nvidia-docker ready, \ use ami-1ae93962 for us-east-2") + +parser.add_argument( + '--pserver_command', type=str, default="", help="pserver start command") + parser.add_argument( '--trainer_image_id', type=str, @@ -75,6 +79,9 @@ parser.add_argument( help="ami id for system image, default one has nvidia-docker ready, \ use ami-1ae93962 for us-west-2") +parser.add_argument( + '--trainer_command', type=str, default="", help="trainer start command") + parser.add_argument( '--availability_zone', type=str, @@ -104,6 +111,12 @@ parser.add_argument( parser.add_argument( '--master_server_public_ip', type=str, help="master server public ip") +parser.add_argument( + '--master_docker_image', + type=str, + default="putcn/paddle_aws_master:latest", + help="master docker image id") + args = parser.parse_args() logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s') @@ -322,14 +335,16 @@ def create(): # set arguments and start docker kick_off_cmd = "docker run -d -v /home/ubuntu/.aws:/root/.aws/" kick_off_cmd += " -v /home/ubuntu/" + args.key_name + ".pem:/root/" + args.key_name + ".pem" + kick_off_cmd += " -v /home/ubuntu/logs/:/root/logs/" kick_off_cmd += " -p " + str(args.master_server_port) + ":" + str( args.master_server_port) - kick_off_cmd += " putcn/paddle_aws_master" + kick_off_cmd += " " + args.master_docker_image args_to_pass = copy.copy(args) args_to_pass.action = "serve" del args_to_pass.pem_path del args_to_pass.security_group_ids + del args_to_pass.master_docker_image del args_to_pass.master_server_public_ip for arg, value in sorted(vars(args_to_pass).iteritems()): kick_off_cmd += ' --%s %s' % (arg, value) diff --git a/tools/aws_benchmarking/server/cluster_master.py b/tools/aws_benchmarking/server/cluster_master.py index 38d09dc8694..5e63b5a8b4c 100644 --- a/tools/aws_benchmarking/server/cluster_master.py +++ b/tools/aws_benchmarking/server/cluster_master.py @@ -53,8 +53,8 @@ parser.add_argument( parser.add_argument( '--pserver_instance_type', type=str, - default="p2.8xlarge", - help="your pserver instance type, p2.8xlarge by default") + default="c5.2xlarge", + help="your pserver instance type, c5.2xlarge by default") parser.add_argument( '--trainer_instance_type', type=str, @@ -97,12 +97,18 @@ parser.add_argument( default=os.path.join(os.path.dirname(__file__), "pserver.sh.template"), help="pserver bash file path") +parser.add_argument( + '--pserver_command', type=str, default="", help="pserver start command") + parser.add_argument( '--trainer_bash_file', type=str, default=os.path.join(os.path.dirname(__file__), "trainer.sh.template"), help="trainer bash file path") +parser.add_argument( + '--trainer_command', type=str, default="", help="trainer start command") + parser.add_argument( '--action', type=str, default="serve", help="create|cleanup|serve") @@ -124,8 +130,12 @@ args = parser.parse_args() ec2client = boto3.client('ec2') +args.log_path = os.path.join(os.path.dirname(__file__), "logs/") + logging.basicConfig( - filename='master.log', level=logging.INFO, format='%(asctime)s %(message)s') + filename=args.log_path + 'master.log', + level=logging.INFO, + format='%(asctime)s %(message)s') log_files = ["master.log"] @@ -304,7 +314,7 @@ def create_pservers(): def log_to_file(source, filename): if not filename in log_files: log_files.append(filename) - with open(filename, "a") as log_file: + with open(args.log_path + filename, "a") as log_file: for line in iter(source.readline, ""): log_file.write(line) @@ -335,6 +345,8 @@ def create_trainers(kickoff_cmd, pserver_endpoints_str): DOCKER_IMAGE=args.docker_image, TRAINER_INDEX=str(trainer_index), TASK_NAME=args.task_name, + TRAINER_COUNT=args.trainer_count, + COMMAND=args.trainer_command, MASTER_ENDPOINT=args.master_server_ip + ":" + str(args.master_server_port)) logging.info(cmd) @@ -446,6 +458,9 @@ def kickoff_pserver(host, pserver_endpoints_str): DOCKER_IMAGE=args.docker_image, PSERVER_PORT=args.pserver_port, TASK_NAME=args.task_name, + COMMAND=args.pserver_command, + TRAINER_COUNT=args.trainer_count, + SERVER_ENDPOINT=host + ":" + str(args.pserver_port), MASTER_ENDPOINT=args.master_server_ip + ":" + str(args.master_server_port)) logging.info(cmd) @@ -553,14 +568,17 @@ def start_server(args): if request_path == "/status" or request_path == "/master_logs": self._set_headers() logging.info("Received request to return status") - with open("master.log", "r") as logfile: + with open(args.log_path + "master.log", "r") as logfile: self.wfile.write(logfile.read().strip()) elif request_path == "/list_logs": self._set_headers() self.wfile.write("\n".join(log_files)) elif "/log/" in request_path: - log_file_path = request_path.replace("/log/") - with open(log_file_path, "r") as logfile: + self._set_headers() + log_file_path = request_path.replace("/log/", "") + logging.info("requesting log file path is" + args.log_path + + log_file_path) + with open(args.log_path + log_file_path, "r") as logfile: self.wfile.write(logfile.read().strip()) else: self.do_404() @@ -631,11 +649,4 @@ if __name__ == "__main__": create_cluster() server_thread.join() elif args.action == "test": - init_args() - if not args.subnet_id: - logging.info("creating subnet for this task") - args.subnet_id = create_subnet() - logging.info("subnet %s created" % (args.subnet_id)) - create_trainers( - kickoff_cmd=script_to_str(args.trainer_bash_file), - pserver_endpoints_str="11.22.33.44:5476") + start_server(args) -- GitLab