diff --git a/go/glide.lock b/go/glide.lock deleted file mode 100644 index d15fc934dbe511389cc92ce95cededa41ba32b4d..0000000000000000000000000000000000000000 --- a/go/glide.lock +++ /dev/null @@ -1,233 +0,0 @@ -hash: 107c058cf5c9163a75d40eef2273a793c36112683c25d72aa8288827fdde3a19 -updated: 2017-10-30T03:46:19.137696069Z -imports: -- name: github.com/alecthomas/gometalinter - version: bae2f1293d092fd8167939d5108d1b025eaef9de -- name: github.com/beorn7/perks - version: 4c0e84591b9aa9e6dcfdf3e020114cd81f89d5f9 - subpackages: - - quantile -- name: github.com/boltdb/bolt - version: 583e8937c61f1af6513608ccc75c97b6abdf4ff9 -- name: github.com/cockroachdb/cmux - version: 112f0506e7743d64a6eb8fedbcff13d9979bbf92 -- name: github.com/coreos/etcd - version: f1d7dd87da3e8feab4aaf675b8e29c6a5ed5f58b - subpackages: - - alarm - - auth - - auth/authpb - - client - - clientv3 - - clientv3/concurrency - - compactor - - discovery - - embed - - error - - etcdserver - - etcdserver/api - - etcdserver/api/etcdhttp - - etcdserver/api/v2http - - etcdserver/api/v2http/httptypes - - etcdserver/api/v3client - - etcdserver/api/v3election - - etcdserver/api/v3election/v3electionpb - - etcdserver/api/v3election/v3electionpb/gw - - etcdserver/api/v3lock - - etcdserver/api/v3lock/v3lockpb - - etcdserver/api/v3lock/v3lockpb/gw - - etcdserver/api/v3rpc - - etcdserver/api/v3rpc/rpctypes - - etcdserver/auth - - etcdserver/etcdserverpb - - etcdserver/etcdserverpb/gw - - etcdserver/membership - - etcdserver/stats - - lease - - lease/leasehttp - - lease/leasepb - - mvcc - - mvcc/backend - - mvcc/mvccpb - - pkg/adt - - pkg/contention - - pkg/cors - - pkg/cpuutil - - pkg/crc - - pkg/debugutil - - pkg/fileutil - - pkg/httputil - - pkg/idutil - - pkg/ioutil - - pkg/logutil - - pkg/monotime - - pkg/netutil - - pkg/pathutil - - pkg/pbutil - - pkg/runtime - - pkg/schedule - - pkg/srv - - pkg/tlsutil - - pkg/transport - - pkg/types - - pkg/wait - - proxy/grpcproxy/adapter - - raft - - raft/raftpb - - rafthttp - - snap - - snap/snappb - - store - - version - - wal - - wal/walpb -- name: github.com/coreos/go-semver - version: 8ab6407b697782a06568d4b7f1db25550ec2e4c6 - subpackages: - - semver -- name: github.com/coreos/go-systemd - version: 48702e0da86bd25e76cfef347e2adeb434a0d0a6 - subpackages: - - daemon - - journal - - util -- name: github.com/coreos/pkg - version: 3ac0863d7acf3bc44daf49afef8919af12f704ef - subpackages: - - capnslog -- name: github.com/dgrijalva/jwt-go - version: d2709f9f1f31ebcda9651b03077758c1f3a0018c -- name: github.com/ghodss/yaml - version: 0ca9ea5df5451ffdf184b4428c902747c2c11cd7 -- name: github.com/go-stack/stack - version: 817915b46b97fd7bb80e8ab6b69f01a53ac3eebf -- name: github.com/gogo/protobuf - version: 909568be09de550ed094403c2bf8a261b5bb730a - subpackages: - - proto -- name: github.com/golang/protobuf - version: 4bd1920723d7b7c925de087aa32e2187708897f7 - subpackages: - - jsonpb - - proto -- name: github.com/golang/snappy - version: 553a641470496b2327abcac10b36396bd98e45c9 -- name: github.com/google/btree - version: 925471ac9e2131377a91e1595defec898166fe49 -- name: github.com/grpc-ecosystem/go-grpc-prometheus - version: 6b7015e65d366bf3f19b2b2a000a831940f0f7e0 -- name: github.com/grpc-ecosystem/grpc-gateway - version: 18d159699f2e83fc5bb9ef2f79465ca3f3122676 - subpackages: - - runtime - - runtime/internal - - utilities -- name: github.com/inconshreveable/log15 - version: 0decfc6c20d9ca0ad143b0e89dcaa20f810b4fb3 -- name: github.com/jonboulle/clockwork - version: 2eee05ed794112d45db504eb05aa693efd2b8b09 -- name: github.com/mattn/go-colorable - version: 5411d3eea5978e6cdc258b30de592b60df6aba96 -- name: github.com/mattn/go-isatty - version: 57fdcb988a5c543893cc61bce354a6e24ab70022 -- name: github.com/matttproud/golang_protobuf_extensions - version: c12348ce28de40eed0136aa2b644d0ee0650e56c - subpackages: - - pbutil -- name: github.com/namsral/flag - version: 71ceffbeb0ba60fccc853971bb3ed4d7d90bfd04 -- name: github.com/PaddlePaddle/recordio - version: 0432dee9fd4b24fb6840fb20a8c055b0c933fb81 -- name: github.com/prometheus/client_golang - version: c5b7fccd204277076155f10851dad72b76a49317 - subpackages: - - prometheus -- name: github.com/prometheus/client_model - version: 6f3806018612930941127f2a7c6c453ba2c527d2 - subpackages: - - go -- name: github.com/prometheus/common - version: 49fee292b27bfff7f354ee0f64e1bc4850462edf - subpackages: - - expfmt - - internal/bitbucket.org/ww/goautoneg - - model -- name: github.com/prometheus/procfs - version: a1dba9ce8baed984a2495b658c82687f8157b98f - subpackages: - - xfs -- name: github.com/satori/go.uuid - version: 879c5887cd475cd7864858769793b2ceb0d44feb -- name: github.com/sirupsen/logrus - version: f006c2ac4710855cf0f916dd6b77acf6b048dc6e -- name: github.com/topicai/candy - version: 1b9030d056fa9f8c4b1f9c91b52fe4b8ab4cd8cc -- name: github.com/ugorji/go - version: ded73eae5db7e7a0ef6f55aace87a2873c5d2b74 - subpackages: - - codec -- name: github.com/xiang90/probing - version: 07dd2e8dfe18522e9c447ba95f2fe95262f63bb2 -- name: golang.org/x/crypto - version: 9419663f5a44be8b34ca85f08abc5fe1be11f8a3 - repo: https://github.com/golang/crypto.git - vcs: git - subpackages: - - bcrypt - - blowfish - - ssh/terminal -- name: golang.org/x/net - version: c8c74377599bd978aee1cf3b9b63a8634051cec2 - subpackages: - - context - - http2 - - http2/hpack - - idna - - internal/timeseries - - lex/httplex - - trace -- name: golang.org/x/sys - version: e48874b42435b4347fc52bdee0424a52abc974d7 - repo: https://github.com/golang/sys.git - vcs: git - subpackages: - - unix - - windows -- name: golang.org/x/text - version: 836efe42bb4aa16aaa17b9c155d8813d336ed720 - repo: https://github.com/golang/text.git - vcs: git - subpackages: - - secure/bidirule - - transform - - unicode/bidi - - unicode/norm -- name: google.golang.org/grpc - version: 8050b9cbc271307e5a716a9d782803d09b0d6f2d - subpackages: - - codes - - credentials - - grpclog - - internal - - keepalive - - metadata - - naming - - peer - - stats - - tap - - transport -- name: gopkg.in/yaml.v2 - version: cd8b52f8269e0feb286dfeef29f8fe4d5b397e0b -testImports: -- name: github.com/davecgh/go-spew - version: 04cdfd42973bb9c8589fd6a731800cf222fde1a9 - subpackages: - - spew -- name: github.com/pmezard/go-difflib - version: d8ed2627bdf02c080bf22230dbb337003b7aba2d - subpackages: - - difflib -- name: github.com/stretchr/testify - version: 05e8a0eda380579888eb53c394909df027f06991 - subpackages: - - assert diff --git a/go/glide.yaml b/go/glide.yaml deleted file mode 100644 index c5d66694acd0f45de5002391a7953b7491eaf2bc..0000000000000000000000000000000000000000 --- a/go/glide.yaml +++ /dev/null @@ -1,33 +0,0 @@ -package: github.com/PaddlePaddle/Paddle/go -import: -- package: github.com/PaddlePaddle/recordio -- package: github.com/coreos/etcd - version: ^3.2.1 - subpackages: - - clientv3 - - clientv3/concurrency - - embed - - etcdserver -- package: github.com/namsral/flag - version: ^1.7.4-pre -- package: github.com/sirupsen/logrus - version: ^1.0.0 -- package: github.com/topicai/candy -- package: golang.org/x/crypto - repo: https://github.com/golang/crypto.git - vcs: git -- package: golang.org/x/sys - repo: https://github.com/golang/sys.git - vcs: git -- package: golang.org/x/text - repo: https://github.com/golang/text.git - vcs: git -- package: github.com/satori/go.uuid - version: v1.1.0 -- package: github.com/alecthomas/gometalinter - version: v1.2.1 -- package: github.com/inconshreveable/log15 - version: v2.13 -- package: github.com/go-stack/stack - version: v1.6.0 -- package: github.com/golang/protobuf diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index e65cf3124398a4afb6b7bbd77fdaa45e94422f6b..7f6793a2494a797705bda480b70bd0e2baea209d 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -265,9 +265,6 @@ function check_style() { # set up go environment for running gometalinter mkdir -p $GOPATH/src/github.com/PaddlePaddle/ ln -sf ${PADDLE_ROOT} $GOPATH/src/github.com/PaddlePaddle/Paddle - mkdir -p ./build/go - cp go/glide.* build/go - cd build/go; glide install; cd - export PATH=/usr/bin:$PATH pre-commit install diff --git a/tools/aws_benchmarking/README.md b/tools/aws_benchmarking/README.md deleted file mode 100644 index 4fdd4b0de44e779378091566d9d6056a6f9ee4b6..0000000000000000000000000000000000000000 --- a/tools/aws_benchmarking/README.md +++ /dev/null @@ -1,184 +0,0 @@ -# AWS benchmark testing tool -This is an automation tool for deploying paddlepaddle benchmark testing to AWS. - -## Features - - - subnet creation to fit just the amount of ec2 instances required. - - pserver and trainer ec2 instances allocation, and instance state verification - - nvidia-docker ready for GPU training - - Instances and network element garbage collection when a task is accomplished or an error occurred - - Test log is collected in realtime - - Web service for checking log or tearing down the testing setup - - No testing code change needed - - Lots of optional configuration options - - ## Usages - - ### Prerequisites - - - You have a working AWS account - - You have [AWS Command Line Interface](https://aws.amazon.com/cli/) installed - - Your AWS cli is bind with a account which has `AmazonEC2FullAccess` permission, and it's set as default credential. - - You have key pair created and pem file downloaded. - - You have a default VPC in the region you want to run the test. - - You have a Security Group created for the VPC mentioned above, which allows port 22 and the port you want to expose your control web service (5436 by default) - - If your test is supposed to run in a GPU machine, especially a multi card GPU machine (p2, p3 series), you might need to contact amazon to raise the limit which allows no more than 1 GPU instance at a time. - - ### Start a benchmark test - -#### Create training image - -*What to expect in this step:* - -*You will have your training logic packed with paddle runtime in a docker image, and be able to be picked up by AWS instance for training.* - -Training python script and PaddlePaddle runtime are supposed to be packed into one docker image. Use PaddlePaddle production images as base image and create the training images with the docker file as follows: - -```Dockerfile -FROM paddlepaddle/paddle:latest-gpu - -ENV HOME /root -COPY ./ /root/ -WORKDIR /root -RUN pip install -r /root/requirements.txt -ENTRYPOINT ["python", "my_training.py"] -``` - -***Please Note*** -Training nodes will run your `ENTRYPOINT` script with the following environment variables: - - - `TASK_NAME`: unique name to identify this training process. - - `TRAINING_ROLE`: current node's role in this training process, either "PSERVER" or "TRAINER" - - `PSERVER_HOSTS`: comma separated value of pserver end points, I.E. "192.168.1.2:5436,192.168.1.3:5436" - - `PSERVERS`: same as above - - `TRAINERS`: trainer count - - `SERVER_ENDPOINT`: current server end point if the node role is a pserver - - `TRAINER_INDEX`: an integer to identify the index of current trainer if the node role is a trainer. - - `PADDLE_INIT_TRAINER_ID`: same as above - - Now we have a working distributed training script which takes advantage of node environment variables and docker file to generate the training image. Run the following command: - - ```bash - docker build -t myreponname/paddle_benchmark . - ``` - - Now you have the image built and tagged with `myreponame/paddle_benchmark`, let's push it to dockerhub so that it can be picked up by out AWS instance. - - ```bash - docker push myreponame/paddle_benchmark - ``` - -#### Create instances and start training - -*What to expect in this step* - -*you will be asked to provide some basic settings to config your training, and this tool will have your training started and monitored* - -Now let's start the training process: - -```bash -docker run -i -v $HOME/.aws:/root/.aws -v :/root/.pem \ -putcn/paddle_aws_client \ ---action create \ ---key_name \ ---security_group_id \ ---docker_image myreponame/paddle_benchmark \ ---pserver_count 2 \ ---trainer_count 2 \ ---trainer_command batch_size:20,local:no,device:CPU -``` - -Now just wait until you see this: -``` -master server finished init process, visit http://XXX:XXX/status to check master log -``` -That means you can turn off your laptop and your cluster is creating instances, starting training process, collecting logs and eventually shut all pservers and trainers down when training is finished. - -#### Post creation operations - -To access the master log: - -```bash -docker run -i -v $HOME/.aws:/root/.aws \ -putcn/paddle_aws_client \ ---action status \ ---master_server_public_ip \ ---master_server_port -``` - -To tear down the training setup: - -```bash -docker run -i -v $HOME/.aws:/root/.aws \ -putcn/paddle_aws_client \ ---action cleanup \ ---master_server_public_ip \ ---master_server_port -``` - -To retrieve training logs -TBD - -### Tech details - -*What to expect in this step* - -*You will understand what is happening behind the scene, and how to check the training log, how to tear down the training on the fly, etc.* - -Let's understand what is happening under the hood when you run above command in your laptop - -![alt](diagram.png) - -There are 4 roles in the figure above: - - client: your laptop - - master: who tasks to aws api server to create/tear down instances, and monitor training process - - AWS api server: the one who actually creates and manages instances - - pservers and trainers: training instances - -When you run the `docker run` command above, what it actually does is to ask aws api service to create a subnet (step 1) and a master instance (step 2), and pass all the parameters the client collected or generated (step 3). The master is kept as minimum hardware config to keep the running cost low. - -Then when the master is up and running, it will ask the aws api server to create the heavy lifting training instances who are expensive to run (step 4). And the master will start training process as soon as they are done initializing (step 5). - -Meanwhile, the master will expose a web service for client to check training log or even tear the training setup down by a web service call. - -if you are creating the training with client docker container, and also monitoring your aws dashboard, you will initially see a instance tagged with `ROLE=MASTER` and `TASK_NAME=_master` starts, then you will see several instances tagged with `ROLE=PSERVER` and `ROLE=TRAINER` starts. -When the training is finished, pservers and trainers will be terminated. All their logs are kept in master node's docker env. - -Master exposes 4 major services: - - - GET `/status`: return master log - - GET `/logs`: return list of log file names - - GET `/log/`: return a particular log by log file name - - POST `/cleanup`: teardown the whole setup - - -### Parameters - - - key_name: required, aws key pair name - - security_group_id: required, the security group id associated with your VPC - - vpc_id: The VPC in which you wish to run test, if not provided, this tool will use your default VPC. - - subnet_id: The Subnet_id in which you wish to run test, if not provided, this tool will create a new sub net to run test. - - pserver_instance_type: your pserver instance type, c5.2xlarge by default, which is a memory optimized machine. - - trainer_instance_type: your trainer instance type, p2.8xlarge by default, which is a GPU machine with 8 cards. - - task_name: the name you want to identify your job, if not provided, this tool will generate one for you. - - pserver_image_id: ami id for system image. Please note, although the default one has nvidia-docker installed, pserver is always launched with `docker` instead of `nvidia-docker`, please DO NOT init your training program with GPU place. - - pserver_command: pserver start command, format example: python,vgg.py,batch_size:128,is_local:no, which will be translated as `python vgg.py --batch_size 128 --is_local no` when trying to start the training in pserver. "--device CPU" is passed as default. - - trainer_image_id: ami id for system image, default one has nvidia-docker ready. - - trainer_command: trainer start command. Format is the same as pserver's, "--device GPU" is passed as default. - - availability_zone: aws zone id to place ec2 instances, us-east-2a by default. - - trainer_count: Trainer count, 1 by default. - - pserver_count: Pserver count, 1 by default. - - action: create|cleanup|status, "create" by default. - - pserver_port: the port for pserver to open service, 5436 by default. - - docker_image: the training docker image id. - - master_service_port: the port for master to open service, 5436 by default. - - master_server_public_ip: the master service ip, this is required when action is not "create" - - master_docker_image: master's docker image id, "putcn/paddle_aws_master:latest" by default - - no_clean_up: no instance termination when training is finished or failed when this value is set "yes". This is for debug purpose, so that you can inspect into the instances when the process is finished. - - -### Trouble shooting - - 1. How to check logs - - Master log is served at `http://:/status`, and you can list all the log files from `http://:/logs`, and access either one of them by `http://:/log/` diff --git a/tools/aws_benchmarking/client/Dockerfile b/tools/aws_benchmarking/client/Dockerfile deleted file mode 100644 index 812c5d4bce0adff404577ce6b5fd3f0f4a91118c..0000000000000000000000000000000000000000 --- a/tools/aws_benchmarking/client/Dockerfile +++ /dev/null @@ -1,7 +0,0 @@ -FROM python:2.7.14-stretch - -ENV HOME /root -COPY ./ /root/ -WORKDIR /root -RUN pip install -r /root/requirements.txt -ENTRYPOINT ["python", "cluster_launcher.py"] \ No newline at end of file diff --git a/tools/aws_benchmarking/client/cluster_launcher.py b/tools/aws_benchmarking/client/cluster_launcher.py deleted file mode 100644 index 12333202b9f003ae5109c7e9b825035ba8eb7d99..0000000000000000000000000000000000000000 --- a/tools/aws_benchmarking/client/cluster_launcher.py +++ /dev/null @@ -1,415 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os -import time -import math -import logging -import copy - -import netaddr -import boto3 -import namesgenerator -import paramiko -from scp import SCPClient -import requests - - -def str2bool(v): - if v.lower() in ('yes', 'true', 't', 'y', '1'): - return True - elif v.lower() in ('no', 'false', 'f', 'n', '0'): - return False - else: - raise argparse.ArgumentTypeError('Boolean value expected.') - - -parser = argparse.ArgumentParser(description=__doc__) -parser.add_argument( - '--key_name', type=str, default="", help="required, key pair name") -parser.add_argument( - '--security_group_id', - type=str, - default="", - help="required, the security group id associated with your VPC") - -parser.add_argument( - '--vpc_id', - type=str, - default="", - help="The VPC in which you wish to run test") -parser.add_argument( - '--subnet_id', - type=str, - default="", - help="The Subnet_id in which you wish to run test") - -parser.add_argument( - '--pserver_instance_type', - type=str, - default="c5.2xlarge", - help="your pserver instance type, c5.2xlarge by default") -parser.add_argument( - '--trainer_instance_type', - type=str, - default="p2.8xlarge", - help="your trainer instance type, p2.8xlarge by default") - -parser.add_argument( - '--task_name', - type=str, - default="", - help="the name you want to identify your job") -parser.add_argument( - '--pserver_image_id', - type=str, - default="ami-da2c1cbf", - help="ami id for system image, default one has nvidia-docker ready, \ - use ami-1ae93962 for us-east-2") - -parser.add_argument( - '--pserver_command', - type=str, - default="", - help="pserver start command, format example: python,vgg.py,batch_size:128,is_local:yes" -) - -parser.add_argument( - '--trainer_image_id', - type=str, - default="ami-da2c1cbf", - help="ami id for system image, default one has nvidia-docker ready, \ - use ami-1ae93962 for us-west-2") - -parser.add_argument( - '--trainer_command', - type=str, - default="", - help="trainer start command, format example: python,vgg.py,batch_size:128,is_local:yes" -) - -parser.add_argument( - '--availability_zone', - type=str, - default="us-east-2a", - help="aws zone id to place ec2 instances") - -parser.add_argument( - '--trainer_count', type=int, default=1, help="Trainer count") - -parser.add_argument( - '--pserver_count', type=int, default=1, help="Pserver count") - -parser.add_argument( - '--action', type=str, default="create", help="create|cleanup|status") - -parser.add_argument('--pem_path', type=str, help="private key file") - -parser.add_argument( - '--pserver_port', type=str, default="5436", help="pserver port") - -parser.add_argument( - '--docker_image', type=str, default="busybox", help="training docker image") - -parser.add_argument( - '--master_server_port', type=int, default=5436, help="master server port") - -parser.add_argument( - '--master_server_public_ip', type=str, help="master server public ip") - -parser.add_argument( - '--master_docker_image', - type=str, - default="putcn/paddle_aws_master:latest", - help="master docker image id") - -parser.add_argument( - '--no_clean_up', - type=str2bool, - default=False, - help="whether to clean up after training") - -args = parser.parse_args() - -logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s') - -ec2client = boto3.client('ec2') - - -def print_arguments(): - print('----------- Configuration Arguments -----------') - for arg, value in sorted(vars(args).iteritems()): - print('%s: %s' % (arg, value)) - print('------------------------------------------------') - - -def create_subnet(): - # if no vpc id provided, list vpcs - logging.info("start creating subnet") - if not args.vpc_id: - logging.info("no vpc provided, trying to find the default one") - vpcs_desc = ec2client.describe_vpcs( - Filters=[{ - "Name": "isDefault", - "Values": ["true", ] - }], ) - if len(vpcs_desc["Vpcs"]) == 0: - raise ValueError('No default VPC') - args.vpc_id = vpcs_desc["Vpcs"][0]["VpcId"] - vpc_cidrBlock = vpcs_desc["Vpcs"][0]["CidrBlock"] - - logging.info("default vpc fount with id %s and CidrBlock %s" % - (args.vpc_id, vpc_cidrBlock)) - - if not vpc_cidrBlock: - logging.info("trying to find cidrblock for vpc") - vpcs_desc = ec2client.describe_vpcs( - Filters=[{ - "Name": "vpc-id", - "Values": [args.vpc_id, ], - }], ) - if len(vpcs_desc["Vpcs"]) == 0: - raise ValueError('No VPC found') - vpc_cidrBlock = vpcs_desc["Vpcs"][0]["CidrBlock"] - logging.info("cidrblock for vpc is %s" % vpc_cidrBlock) - - # list subnets in vpc in order to create a new one - - logging.info("trying to find ip blocks for new subnet") - subnets_desc = ec2client.describe_subnets( - Filters=[{ - "Name": "vpc-id", - "Values": [args.vpc_id, ], - }], ) - - ips_taken = [] - for subnet_dec in subnets_desc["Subnets"]: - ips_taken.append(subnet_dec["CidrBlock"]) - - ip_blocks_avaliable = netaddr.IPSet( - [vpc_cidrBlock]) ^ netaddr.IPSet(ips_taken) - # adding 10 addresses as buffer - cidr_prefix = 32 - math.ceil( - math.log(args.pserver_count + args.trainer_count + 10, 2)) - if cidr_prefix <= 16: - raise ValueError('Too many nodes to fit in current VPC') - - for ipnetwork in ip_blocks_avaliable.iter_cidrs(): - try: - subnet_cidr = ipnetwork.subnet(int(cidr_prefix)).next() - logging.info("subnet ip block found %s" % (subnet_cidr)) - break - except Exception: - pass - - if not subnet_cidr: - raise ValueError( - 'No avaliable subnet to fit required nodes in current VPC') - - logging.info("trying to create subnet") - subnet_desc = ec2client.create_subnet( - CidrBlock=str(subnet_cidr), - VpcId=args.vpc_id, - AvailabilityZone=args.availability_zone) - - subnet_id = subnet_desc["Subnet"]["SubnetId"] - - subnet_waiter = ec2client.get_waiter('subnet_available') - # sleep for 1s before checking its state - time.sleep(1) - subnet_waiter.wait(SubnetIds=[subnet_id, ]) - - logging.info("subnet created") - - logging.info("adding tags to newly created subnet") - ec2client.create_tags( - Resources=[subnet_id, ], - Tags=[{ - "Key": "Task_name", - 'Value': args.task_name - }]) - return subnet_id - - -def run_instances(image_id, instance_type, count=1, role="MASTER", cmd=""): - response = ec2client.run_instances( - ImageId=image_id, - InstanceType=instance_type, - MaxCount=count, - MinCount=count, - UserData=cmd, - DryRun=False, - InstanceInitiatedShutdownBehavior="stop", - KeyName=args.key_name, - Placement={'AvailabilityZone': args.availability_zone}, - NetworkInterfaces=[{ - 'DeviceIndex': 0, - 'SubnetId': args.subnet_id, - "AssociatePublicIpAddress": True, - 'Groups': args.security_group_ids - }], - TagSpecifications=[{ - 'ResourceType': "instance", - 'Tags': [{ - "Key": 'Task_name', - "Value": args.task_name + "_master" - }, { - "Key": 'Role', - "Value": role - }] - }]) - - instance_ids = [] - for instance in response["Instances"]: - instance_ids.append(instance["InstanceId"]) - - if len(instance_ids) > 0: - logging.info(str(len(instance_ids)) + " instance(s) created") - else: - logging.info("no instance created") - #create waiter to make sure it's running - - logging.info("waiting for instance to become accessible") - waiter = ec2client.get_waiter('instance_status_ok') - waiter.wait( - Filters=[{ - "Name": "instance-status.status", - "Values": ["ok"] - }, { - "Name": "instance-status.reachability", - "Values": ["passed"] - }, { - "Name": "instance-state-name", - "Values": ["running"] - }], - InstanceIds=instance_ids) - - instances_response = ec2client.describe_instances(InstanceIds=instance_ids) - - return instances_response["Reservations"][0]["Instances"] - - -def generate_task_name(): - return namesgenerator.get_random_name() - - -def init_args(): - - if not args.task_name: - args.task_name = generate_task_name() - logging.info("task name generated %s" % (args.task_name)) - - if not args.pem_path: - args.pem_path = os.path.expanduser("~") + "/" + args.key_name + ".pem" - if args.security_group_id: - args.security_group_ids = (args.security_group_id, ) - - -def create(): - - init_args() - - # create subnet - if not args.subnet_id: - args.subnet_id = create_subnet() - - # create master node - - master_instance_response = run_instances( - image_id="ami-7a05351f", instance_type="t2.nano") - - logging.info("master server started") - - args.master_server_public_ip = master_instance_response[0][ - "PublicIpAddress"] - args.master_server_ip = master_instance_response[0]["PrivateIpAddress"] - - logging.info("master server started, master_ip=%s, task_name=%s" % - (args.master_server_public_ip, args.task_name)) - - # cp config file and pems to master node - - ssh_key = paramiko.RSAKey.from_private_key_file(args.pem_path) - ssh_client = paramiko.SSHClient() - ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) - ssh_client.connect( - hostname=args.master_server_public_ip, username="ubuntu", pkey=ssh_key) - - with SCPClient(ssh_client.get_transport()) as scp: - scp.put(os.path.expanduser("~") + "/" + ".aws", - recursive=True, - remote_path='/home/ubuntu/') - scp.put(args.pem_path, - remote_path='/home/ubuntu/' + args.key_name + ".pem") - - logging.info("credentials and pem copied to master") - - # set arguments and start docker - kick_off_cmd = "docker run -d -v /home/ubuntu/.aws:/root/.aws/" - kick_off_cmd += " -v /home/ubuntu/" + args.key_name + ".pem:/root/" + args.key_name + ".pem" - kick_off_cmd += " -v /home/ubuntu/logs/:/root/logs/" - kick_off_cmd += " -p " + str(args.master_server_port) + ":" + str( - args.master_server_port) - kick_off_cmd += " " + args.master_docker_image - - args_to_pass = copy.copy(args) - args_to_pass.action = "serve" - del args_to_pass.pem_path - del args_to_pass.security_group_ids - del args_to_pass.master_docker_image - del args_to_pass.master_server_public_ip - for arg, value in sorted(vars(args_to_pass).iteritems()): - if value: - kick_off_cmd += ' --%s %s' % (arg, value) - - logging.info(kick_off_cmd) - stdin, stdout, stderr = ssh_client.exec_command(command=kick_off_cmd) - return_code = stdout.channel.recv_exit_status() - logging.info(return_code) - if return_code != 0: - raise Exception("Error while kicking off master") - - logging.info( - "master server finished init process, visit %s to check master log" % - (get_master_web_url("/status"))) - - -def cleanup(): - print requests.post(get_master_web_url("/cleanup")).text - - -def status(): - print requests.post(get_master_web_url("/status")).text - - -def get_master_web_url(path): - return "http://" + args.master_server_public_ip + ":" + str( - args.master_server_port) + path - - -if __name__ == "__main__": - print_arguments() - if args.action == "create": - if not args.key_name or not args.security_group_id: - raise ValueError("key_name and security_group_id are required") - create() - elif args.action == "cleanup": - if not args.master_server_public_ip: - raise ValueError("master_server_public_ip is required") - cleanup() - elif args.action == "status": - if not args.master_server_public_ip: - raise ValueError("master_server_public_ip is required") - status() diff --git a/tools/aws_benchmarking/client/requirements.txt b/tools/aws_benchmarking/client/requirements.txt deleted file mode 100644 index 9454801f2025671cfd1a2c3b71cf4c2ac07cb8fb..0000000000000000000000000000000000000000 --- a/tools/aws_benchmarking/client/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -netaddr==0.7.19 -boto3==1.6.21 -namesgenerator==0.3 -paramiko==2.4.1 -scp -requests diff --git a/tools/aws_benchmarking/diagram.png b/tools/aws_benchmarking/diagram.png deleted file mode 100644 index b97909c5fe78b59d0e636ff73c2ed3e63a0be722..0000000000000000000000000000000000000000 Binary files a/tools/aws_benchmarking/diagram.png and /dev/null differ diff --git a/tools/aws_benchmarking/server/Dockerfile b/tools/aws_benchmarking/server/Dockerfile deleted file mode 100644 index 333523abcdb6fbe7dc01bbaf7d32ce1d8e866028..0000000000000000000000000000000000000000 --- a/tools/aws_benchmarking/server/Dockerfile +++ /dev/null @@ -1,7 +0,0 @@ -FROM python:2.7.14-stretch - -ENV HOME /root -COPY ./ /root/ -WORKDIR /root -RUN pip install -r /root/requirements.txt -ENTRYPOINT ["python", "cluster_master.py"] \ No newline at end of file diff --git a/tools/aws_benchmarking/server/cluster_master.py b/tools/aws_benchmarking/server/cluster_master.py deleted file mode 100644 index a9b24846544d8aca5e4c7bd5709e70564c088431..0000000000000000000000000000000000000000 --- a/tools/aws_benchmarking/server/cluster_master.py +++ /dev/null @@ -1,735 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os -import json -import math -import time -import threading -import logging -import copy -import csv - -import netaddr -import boto3 -import namesgenerator -import paramiko - -from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer - - -# You must have aws_access_key_id, aws_secret_access_key, region set in -# ~/.aws/credentials and ~/.aws/config -def str2bool(v): - if v.lower() in ('yes', 'true', 't', 'y', '1'): - return True - elif v.lower() in ('no', 'false', 'f', 'n', '0'): - return False - else: - raise argparse.ArgumentTypeError('Boolean value expected.') - - -parser = argparse.ArgumentParser(description=__doc__) -parser.add_argument( - '--key_name', type=str, default="", help="required, key pair name") -parser.add_argument( - '--security_group_id', - type=str, - default="", - help="required, the security group id associated with your VPC") - -parser.add_argument( - '--vpc_id', - type=str, - default="", - help="The VPC in which you wish to run test") -parser.add_argument( - '--subnet_id', - type=str, - default="", - help="The Subnet_id in which you wish to run test") - -parser.add_argument( - '--pserver_instance_type', - type=str, - default="c5.2xlarge", - help="your pserver instance type, c5.2xlarge by default") -parser.add_argument( - '--trainer_instance_type', - type=str, - default="p2.8xlarge", - help="your trainer instance type, p2.8xlarge by default") - -parser.add_argument( - '--task_name', - type=str, - default="", - help="the name you want to identify your job") -parser.add_argument( - '--pserver_image_id', - type=str, - default="ami-da2c1cbf", - help="ami id for system image, default one has nvidia-docker ready, use ami-1ae93962 for us-east-2" -) -parser.add_argument( - '--trainer_image_id', - type=str, - default="ami-da2c1cbf", - help="ami id for system image, default one has nvidia-docker ready, use ami-1ae93962 for us-west-2" -) - -parser.add_argument( - '--availability_zone', - type=str, - default="us-east-2a", - help="aws zone id to place ec2 instances") - -parser.add_argument( - '--trainer_count', type=int, default=1, help="Trainer count") - -parser.add_argument( - '--pserver_count', type=int, default=1, help="Pserver count") - -parser.add_argument( - '--pserver_bash_file', - type=str, - default=os.path.join(os.path.dirname(__file__), "pserver.sh.template"), - help="pserver bash file path") - -parser.add_argument( - '--pserver_command', type=str, default="", help="pserver start command") - -parser.add_argument( - '--trainer_bash_file', - type=str, - default=os.path.join(os.path.dirname(__file__), "trainer.sh.template"), - help="trainer bash file path") - -parser.add_argument( - '--trainer_command', type=str, default="", help="trainer start command") - -parser.add_argument( - '--action', type=str, default="serve", help="create|cleanup|serve") - -parser.add_argument('--pem_path', type=str, help="private key file") - -parser.add_argument( - '--pserver_port', type=str, default="5436", help="pserver port") - -parser.add_argument( - '--docker_image', type=str, default="busybox", help="training docker image") - -parser.add_argument( - '--master_server_port', type=int, default=5436, help="master server port") - -parser.add_argument( - '--master_server_ip', type=str, default="", help="master server private ip") - -parser.add_argument( - '--metric_data_identifier', - type=str, - default="**metrics_data: ", - help="key string to identify metrics data") - -parser.add_argument( - '--no_clean_up', - type=str2bool, - default=False, - help="whether to clean up after training") - -args = parser.parse_args() - -ec2client = boto3.client('ec2') - -args.log_path = os.path.join(os.path.dirname(__file__), "logs/") - -logging.basicConfig( - filename=args.log_path + 'master.log', - level=logging.INFO, - format='%(asctime)s %(message)s') - -log_files = ["master.log"] - -metrics = {} - -metrics_csv_file_name = "metrics.csv" -is_metrics_file_created = False - - -def create_subnet(): - # if no vpc id provided, list vpcs - logging.info("start creating subnet") - if not args.vpc_id: - logging.info("no vpc provided, trying to find the default one") - vpcs_desc = ec2client.describe_vpcs( - Filters=[{ - "Name": "isDefault", - "Values": ["true", ] - }], ) - if len(vpcs_desc["Vpcs"]) == 0: - raise ValueError('No default VPC') - args.vpc_id = vpcs_desc["Vpcs"][0]["VpcId"] - vpc_cidrBlock = vpcs_desc["Vpcs"][0]["CidrBlock"] - - logging.info("default vpc fount with id %s and CidrBlock %s" % - (args.vpc_id, vpc_cidrBlock)) - - if not vpc_cidrBlock: - logging.info("trying to find cidrblock for vpc") - vpcs_desc = ec2client.describe_vpcs( - Filters=[{ - "Name": "vpc-id", - "Values": [args.vpc_id, ], - }], ) - if len(vpcs_desc["Vpcs"]) == 0: - raise ValueError('No VPC found') - vpc_cidrBlock = vpcs_desc["Vpcs"][0]["CidrBlock"] - logging.info("cidrblock for vpc is %s" % vpc_cidrBlock) - - # list subnets in vpc in order to create a new one - - logging.info("trying to find ip blocks for new subnet") - subnets_desc = ec2client.describe_subnets( - Filters=[{ - "Name": "vpc-id", - "Values": [args.vpc_id, ], - }], ) - - ips_taken = [] - for subnet_dec in subnets_desc["Subnets"]: - ips_taken.append(subnet_dec["CidrBlock"]) - - ip_blocks_avaliable = netaddr.IPSet( - [vpc_cidrBlock]) ^ netaddr.IPSet(ips_taken) - # adding 10 addresses as buffer - cidr_prefix = 32 - math.ceil( - math.log(args.pserver_count + args.trainer_count + 10, 2)) - if cidr_prefix <= 16: - raise ValueError('Too many nodes to fit in current VPC') - - for ipnetwork in ip_blocks_avaliable.iter_cidrs(): - try: - subnet_cidr = ipnetwork.subnet(int(cidr_prefix)).next() - logging.info("subnet ip block found %s" % (subnet_cidr)) - break - except Exception: - pass - - if not subnet_cidr: - raise ValueError( - 'No avaliable subnet to fit required nodes in current VPC') - - logging.info("trying to create subnet") - subnet_desc = ec2client.create_subnet( - CidrBlock=str(subnet_cidr), - VpcId=args.vpc_id, - AvailabilityZone=args.availability_zone) - - subnet_id = subnet_desc["Subnet"]["SubnetId"] - - subnet_waiter = ec2client.get_waiter('subnet_available') - # sleep for 1s before checking its state - time.sleep(1) - subnet_waiter.wait(SubnetIds=[subnet_id, ]) - - logging.info("subnet created") - - logging.info("adding tags to newly created subnet") - ec2client.create_tags( - Resources=[subnet_id, ], - Tags=[{ - "Key": "Task_name", - 'Value': args.task_name - }]) - return subnet_id - - -def generate_task_name(): - return namesgenerator.get_random_name() - - -def script_to_str(file_path): - if not file_path: - return "echo $PSERVER_HOSTS" - file = open(file_path, 'r') - text = file.read().strip() - file.close() - return text - - -def run_instances(image_id, instance_type, count, role, cmd=""): - if count == 0: - return [] - response = ec2client.run_instances( - ImageId=image_id, - InstanceType=instance_type, - MaxCount=count, - MinCount=count, - UserData=cmd, - DryRun=False, - InstanceInitiatedShutdownBehavior="stop", - KeyName=args.key_name, - Placement={'AvailabilityZone': args.availability_zone}, - NetworkInterfaces=[{ - 'DeviceIndex': 0, - 'SubnetId': args.subnet_id, - "AssociatePublicIpAddress": True, - 'Groups': args.security_group_ids - }], - TagSpecifications=[{ - 'ResourceType': "instance", - 'Tags': [{ - "Key": 'Task_name', - "Value": args.task_name - }, { - "Key": 'Role', - "Value": role - }] - }]) - - instance_ids = [] - for instance in response["Instances"]: - instance_ids.append(instance["InstanceId"]) - - if len(instance_ids) > 0: - logging.info(str(len(instance_ids)) + " instance(s) created") - else: - logging.info("no instance created") - #create waiter to make sure it's running - - logging.info("waiting for instance to become accessible") - waiter = ec2client.get_waiter('instance_status_ok') - waiter.wait( - Filters=[{ - "Name": "instance-status.status", - "Values": ["ok"] - }, { - "Name": "instance-status.reachability", - "Values": ["passed"] - }, { - "Name": "instance-state-name", - "Values": ["running"] - }], - InstanceIds=instance_ids) - - instances_response = ec2client.describe_instances(InstanceIds=instance_ids) - - return instances_response["Reservations"][0]["Instances"] - - -def create_pservers(): - try: - return run_instances( - image_id=args.pserver_image_id, - instance_type=args.pserver_instance_type, - count=args.pserver_count, - role="PSERVER", ) - except Exception: - logging.exception("error while trying to create pservers") - cleanup(args.task_name) - - -def save_metrics_data(str_msg): - #parse msg - logging.info("found metrics data, saving it to csv file") - global is_metrics_file_created - metrics_raw = str_msg.split(",") - with open(args.log_path + metrics_csv_file_name, 'a') as csvfile: - csv_fieldnames = [] - csv_write_data = {} - for metric in metrics_raw: - metric_data = metric.split("=") - metric_key = metric_data[0].strip() - metric_val = float(metric_data[1].strip()) - if not metric_key in metrics: - metrics[metric_key] = [] - metric_repo = metrics[metric_key] - metric_repo.append(metric_val) - csv_fieldnames.append(metric_key) - csv_write_data[metric_key] = metric_val - writer = csv.DictWriter(csvfile, fieldnames=csv_fieldnames) - if not is_metrics_file_created: - writer.writeheader() - is_metrics_file_created = True - writer.writerow(csv_write_data) - logging.info("csv file appended") - - -def log_to_file(source, filename): - if not filename in log_files: - log_files.append(filename) - with open(args.log_path + filename, "a") as log_file: - for line in iter(source.readline, ""): - log_file.write(line) - if (line.startswith(args.metric_data_identifier)): - #found key data, trying to add to csv - line = line.replace(args.metric_data_identifier, "") - save_metrics_data(line) - - -def parse_command(command_raw, defaults={}): - if not command_raw: - command_raw = "" - commands_processed = [] - parameter_map = copy.copy(defaults) - for seg in command_raw.split(","): - if ":" in seg: - parameters = seg.split(":") - parameter_map[parameters[0]] = parameters[1] - else: - commands_processed.append(seg) - for key, val in parameter_map.iteritems(): - commands_processed.append("--" + key + " " + str(val)) - return " ".join(commands_processed) - - -def create_trainers(kickoff_cmd, pserver_endpoints_str): - def create_and_start_trainer(trainer_index): - logging.info("trainer " + str(trainer_index) + " is starting") - - instance_response = run_instances( - image_id=args.trainer_image_id, - instance_type=args.trainer_instance_type, - count=1, - role="TRAINER", )[0] - trainer_ip = instance_response["PrivateIpAddress"] - - logging.info("trainer " + str(trainer_index) + " started") - - ssh_key = paramiko.RSAKey.from_private_key_file(args.pem_path) - ssh_client = paramiko.SSHClient() - ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) - ssh_client.connect(hostname=trainer_ip, username="ubuntu", pkey=ssh_key) - - logging.info("trainer " + str(trainer_index) + - " terminal connected via ssh") - - cmd = kickoff_cmd.format( - PSERVER_HOSTS=pserver_endpoints_str, - DOCKER_IMAGE=args.docker_image, - TRAINER_INDEX=str(trainer_index), - TASK_NAME=args.task_name, - TRAINER_COUNT=args.trainer_count, - COMMAND=parse_command(args.trainer_command, {"device": "GPU"}), - MASTER_ENDPOINT=args.master_server_ip + ":" + - str(args.master_server_port)) - logging.info(cmd) - - stdin, stdout, stderr = ssh_client.exec_command(command=cmd) - - # read and save output log - - logging.info("trainer " + str(trainer_index) + - " command executed, keep fetching log") - - stdout_thread = threading.Thread( - target=log_to_file, - args=( - stdout, - "trainer_" + str(trainer_index) + ".log", )) - stderr_thread = threading.Thread( - target=log_to_file, - args=( - stderr, - "trainer_" + str(trainer_index) + "_err.log", )) - stdout_thread.start() - stderr_thread.start() - - stdout_thread.join() - stderr_thread.join() - - return_code = stdout.channel.recv_exit_status() - if return_code != 0: - trainer_create_results[trainer_index] = {'has_error': True} - raise ValueError("trainer didn't finish with exit code 0") - - ssh_client.close() - - # multi thread starting trainer instance and run kickoff command - - trainer_threads = [] - trainer_create_results = {} - try: - for i in xrange(args.trainer_count): - logging.info("starting tread for trainer " + str(i)) - trainer_thread = threading.Thread( - target=create_and_start_trainer, args=(i, )) - trainer_thread.start() - trainer_threads.append(trainer_thread) - - for trainer_thread in trainer_threads: - trainer_thread.join() - - for result in trainer_create_results: - if result["has_error"]: - logging.error( - "error during trainer starting or training, destorying the while cluster " - ) - cleanup(args.task_name) - break - - logging.info("all trainers stopped") - except Exception, e: - logging.info( - "Training exception, clean up resources, please check log for more info" - ) - finally: - cleanup(args.task_name) - - -def cleanup(task_name): - if args.no_clean_up: - logging.info("no clean up option set, going to leave the setup running") - return - #shutdown all ec2 instances - print("going to clean up " + task_name + " instances") - instances_response = ec2client.describe_instances(Filters=[{ - "Name": "tag:Task_name", - "Values": [task_name] - }]) - - instance_ids = [] - if len(instances_response["Reservations"]) > 0: - for reservation in instances_response["Reservations"]: - for instance in reservation["Instances"]: - instance_ids.append(instance["InstanceId"]) - - ec2client.terminate_instances(InstanceIds=instance_ids) - - instance_termination_waiter = ec2client.get_waiter( - 'instance_terminated') - instance_termination_waiter.wait(InstanceIds=instance_ids) - - #delete the subnet created - - subnet = ec2client.describe_subnets(Filters=[{ - "Name": "tag:Task_name", - "Values": [task_name] - }]) - - if len(subnet["Subnets"]) > 0: - ec2client.delete_subnet(SubnetId=subnet["Subnets"][0]["SubnetId"]) - # no subnet delete waiter, just leave it. - logging.info("Clearnup done") - return - - -def kickoff_pserver(host, pserver_endpoints_str): - try: - ssh_key = paramiko.RSAKey.from_private_key_file(args.pem_path) - ssh_client = paramiko.SSHClient() - ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) - ssh_client.connect(hostname=host, username="ubuntu", pkey=ssh_key) - cmd = (script_to_str(args.pserver_bash_file)).format( - PSERVER_HOSTS=pserver_endpoints_str, - DOCKER_IMAGE=args.docker_image, - PSERVER_PORT=args.pserver_port, - TASK_NAME=args.task_name, - COMMAND=parse_command(args.pserver_command, {"device": "CPU"}), - TRAINER_COUNT=args.trainer_count, - TRAINER_INDEX=0, - # there is no way to use 0.0.0.0:port to start pserver - # has to docker --network="host" with host ip to make this work - SERVER_ENDPOINT=host + ":" + str(args.pserver_port), - MASTER_ENDPOINT=args.master_server_ip + ":" + - str(args.master_server_port)) - logging.info(cmd) - stdin, stdout, stderr = ssh_client.exec_command(command=cmd) - - stdout_thread = threading.Thread( - target=log_to_file, args=( - stdout, - "pserver_" + host + ".log", )) - stderr_thread = threading.Thread( - target=log_to_file, args=( - stderr, - "pserver_" + host + "_err.log", )) - stdout_thread.start() - stderr_thread.start() - - stdout_thread.join() - stderr_thread.join() - - return_code = stdout.channel.recv_exit_status() - logging.info(return_code) - if return_code != 0: - raise Exception("Error while kicking off pserver training process") - except Exception: - logging.exception("Error while kicking off pserver training process") - cleanup(args.task_name) - finally: - ssh_client.close() - - -def init_args(): - - if not args.task_name: - args.task_name = generate_task_name() - logging.info("task name generated %s" % (args.task_name)) - - if not args.pem_path: - args.pem_path = os.path.expanduser("~") + "/" + args.key_name + ".pem" - if args.security_group_id: - args.security_group_ids = (args.security_group_id, ) - - args.trainers_job_done_count = 0 - - -def create_cluster(): - - if not args.subnet_id: - logging.info("creating subnet for this task") - args.subnet_id = create_subnet() - logging.info("subnet %s created" % (args.subnet_id)) - - logging.info("creating pservers") - pserver_create_response = create_pservers() - logging.info("pserver created, collecting pserver ips") - - pserver_endpoints = [] - for pserver in pserver_create_response: - pserver_endpoints.append(pserver["NetworkInterfaces"][0][ - "PrivateIpAddress"] + ":" + args.pserver_port) - - pserver_endpoints_str = ",".join(pserver_endpoints) - - logging.info("kicking off pserver training process") - pserver_threads = [] - for pserver in pserver_create_response: - pserver_thread = threading.Thread( - target=kickoff_pserver, - args=(pserver["PrivateIpAddress"], pserver_endpoints_str)) - pserver_thread.start() - pserver_threads.append(pserver_thread) - - logging.info("all pserver training process started") - - logging.info("creating trainers and kicking off trainer training process") - create_trainers( - kickoff_cmd=script_to_str(args.trainer_bash_file), - pserver_endpoints_str=pserver_endpoints_str) - - for pserver_thread in pserver_threads: - pserver_thread.join() - - logging.info("all process ended") - - -def start_server(args): - class S(BaseHTTPRequestHandler): - def _set_headers(self): - self.send_response(200) - self.send_header('Content-type', 'text/text') - self.end_headers() - - def do_HEAD(self): - self._set_headers() - - def do_404(self): - self.send_response(404) - self.send_header('Content-type', 'text/text') - self.end_headers() - logging.info("Received invalid GET request" + self.path) - self.wfile.write("NO ACTION FOUND") - - def do_GET(self): - - request_path = self.path - if request_path == "/status" or request_path == "/master_logs": - self._set_headers() - logging.info("Received request to return status") - with open(args.log_path + "master.log", "r") as logfile: - self.wfile.write(logfile.read().strip()) - elif request_path == "/list_logs" or request_path == "/logs": - self._set_headers() - self.wfile.write("\n".join(log_files)) - elif "/log/" in request_path: - self._set_headers() - log_file_path = request_path.replace("/log/", "") - logging.info("requesting log file path is" + args.log_path + - log_file_path) - with open(args.log_path + log_file_path, "r") as logfile: - self.wfile.write(logfile.read().strip()) - else: - self.do_404() - - def do_POST(self): - - request_path = self.path - - if request_path == "/save_data": - self._set_headers() - logging.info("Received request to save data") - self.wfile.write("DATA SAVED!") - content_length = int(self.headers['Content-Length']) - post_data = self.rfile.read(content_length) - if args.task_name: - with open(args.task_name + ".txt", "a") as text_file: - text_file.write(post_data + "\n") - - elif request_path == "/cleanup": - self._set_headers() - logging.info("Received request to cleanup cluster") - args.no_clean_up = False - cleanup(args.task_name) - self.wfile.write("cleanup in progress") - - else: - self.do_404() - - server_address = ('', args.master_server_port) - httpd = HTTPServer(server_address, S) - logging.info("HTTP server is starting") - httpd.serve_forever() - - -def print_arguments(): - logging.info('----------- Configuration Arguments -----------') - for arg, value in sorted(vars(args).iteritems()): - logging.info('%s: %s' % (arg, value)) - logging.info('------------------------------------------------') - - -if __name__ == "__main__": - print_arguments() - if args.action == "create": - logging.info("going to create cluster") - if not args.key_name or not args.security_group_id: - raise ValueError("key_name and security_group_id are required") - init_args() - create_cluster() - elif args.action == "cleanup": - logging.info("going to cleanup cluster") - if not args.task_name: - raise ValueError("task_name is required") - cleanup(args.task_name) - elif args.action == "serve": - # serve mode - if not args.master_server_ip: - raise ValueError( - "No master server ip set, please run with --action create") - - logging.info("going to start serve and create cluster") - - init_args() - - logging.info("starting server in another thread") - server_thread = threading.Thread(target=start_server, args=(args, )) - server_thread.start() - - create_cluster() - server_thread.join() - elif args.action == "test": - start_server(args) diff --git a/tools/aws_benchmarking/server/logs/master.log b/tools/aws_benchmarking/server/logs/master.log deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/tools/aws_benchmarking/server/pserver.sh.template b/tools/aws_benchmarking/server/pserver.sh.template deleted file mode 100644 index 8d7f9e84c768b096537c92a448a117d91903f25b..0000000000000000000000000000000000000000 --- a/tools/aws_benchmarking/server/pserver.sh.template +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/bash -docker run --network="host" -i -e "SERVER_ENDPOINT={SERVER_ENDPOINT}" -e "MASTER_ENDPOINT={MASTER_ENDPOINT}" -e "TASK_NAME={TASK_NAME}" -e "TRAINER_INDEX={TRAINER_INDEX}" -e "TRAINING_ROLE=PSERVER" -e "TRAINER_COUNT={TRAINER_COUNT}" -e "TRAINERS={TRAINER_COUNT}" -e "PSERVER_HOSTS={PSERVER_HOSTS}" -e "PSERVERS={PSERVER_HOSTS}" {DOCKER_IMAGE} {COMMAND} \ No newline at end of file diff --git a/tools/aws_benchmarking/server/requirements.txt b/tools/aws_benchmarking/server/requirements.txt deleted file mode 100644 index 5c523854f28b0a6f024fba2b2f344b53ba967a2f..0000000000000000000000000000000000000000 --- a/tools/aws_benchmarking/server/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -netaddr==0.7.19 -boto3==1.6.21 -namesgenerator==0.3 -paramiko==2.4.1 diff --git a/tools/aws_benchmarking/server/trainer.sh.template b/tools/aws_benchmarking/server/trainer.sh.template deleted file mode 100644 index 9b0aae9f7a7a879f164b380f719065302e0eb7e2..0000000000000000000000000000000000000000 --- a/tools/aws_benchmarking/server/trainer.sh.template +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/bash -nvidia-docker run --network="host" -i -e "MASTER_ENDPOINT={MASTER_ENDPOINT}" -e "TASK_NAME={TASK_NAME}" -e "TRAINER_COUNT={TRAINER_COUNT}" -e "TRAINERS={TRAINER_COUNT}" -e "TRAINER_INDEX={TRAINER_INDEX}" -e "PADDLE_INIT_TRAINER_ID={TRAINER_INDEX}" -e "TRAINING_ROLE=TRAINER" -e "PSERVER_HOSTS={PSERVER_HOSTS}" -e "PSERVERS={PSERVER_HOSTS}" {DOCKER_IMAGE} {COMMAND} \ No newline at end of file