未验证 提交 4a959883 编写于 作者: T Tao Luo 提交者: GitHub

remove unused aws_benchmarking and go directory (#19103)

test=develop
上级 9150cf50
hash: 107c058cf5c9163a75d40eef2273a793c36112683c25d72aa8288827fdde3a19
updated: 2017-10-30T03:46:19.137696069Z
imports:
- name: github.com/alecthomas/gometalinter
version: bae2f1293d092fd8167939d5108d1b025eaef9de
- name: github.com/beorn7/perks
version: 4c0e84591b9aa9e6dcfdf3e020114cd81f89d5f9
subpackages:
- quantile
- name: github.com/boltdb/bolt
version: 583e8937c61f1af6513608ccc75c97b6abdf4ff9
- name: github.com/cockroachdb/cmux
version: 112f0506e7743d64a6eb8fedbcff13d9979bbf92
- name: github.com/coreos/etcd
version: f1d7dd87da3e8feab4aaf675b8e29c6a5ed5f58b
subpackages:
- alarm
- auth
- auth/authpb
- client
- clientv3
- clientv3/concurrency
- compactor
- discovery
- embed
- error
- etcdserver
- etcdserver/api
- etcdserver/api/etcdhttp
- etcdserver/api/v2http
- etcdserver/api/v2http/httptypes
- etcdserver/api/v3client
- etcdserver/api/v3election
- etcdserver/api/v3election/v3electionpb
- etcdserver/api/v3election/v3electionpb/gw
- etcdserver/api/v3lock
- etcdserver/api/v3lock/v3lockpb
- etcdserver/api/v3lock/v3lockpb/gw
- etcdserver/api/v3rpc
- etcdserver/api/v3rpc/rpctypes
- etcdserver/auth
- etcdserver/etcdserverpb
- etcdserver/etcdserverpb/gw
- etcdserver/membership
- etcdserver/stats
- lease
- lease/leasehttp
- lease/leasepb
- mvcc
- mvcc/backend
- mvcc/mvccpb
- pkg/adt
- pkg/contention
- pkg/cors
- pkg/cpuutil
- pkg/crc
- pkg/debugutil
- pkg/fileutil
- pkg/httputil
- pkg/idutil
- pkg/ioutil
- pkg/logutil
- pkg/monotime
- pkg/netutil
- pkg/pathutil
- pkg/pbutil
- pkg/runtime
- pkg/schedule
- pkg/srv
- pkg/tlsutil
- pkg/transport
- pkg/types
- pkg/wait
- proxy/grpcproxy/adapter
- raft
- raft/raftpb
- rafthttp
- snap
- snap/snappb
- store
- version
- wal
- wal/walpb
- name: github.com/coreos/go-semver
version: 8ab6407b697782a06568d4b7f1db25550ec2e4c6
subpackages:
- semver
- name: github.com/coreos/go-systemd
version: 48702e0da86bd25e76cfef347e2adeb434a0d0a6
subpackages:
- daemon
- journal
- util
- name: github.com/coreos/pkg
version: 3ac0863d7acf3bc44daf49afef8919af12f704ef
subpackages:
- capnslog
- name: github.com/dgrijalva/jwt-go
version: d2709f9f1f31ebcda9651b03077758c1f3a0018c
- name: github.com/ghodss/yaml
version: 0ca9ea5df5451ffdf184b4428c902747c2c11cd7
- name: github.com/go-stack/stack
version: 817915b46b97fd7bb80e8ab6b69f01a53ac3eebf
- name: github.com/gogo/protobuf
version: 909568be09de550ed094403c2bf8a261b5bb730a
subpackages:
- proto
- name: github.com/golang/protobuf
version: 4bd1920723d7b7c925de087aa32e2187708897f7
subpackages:
- jsonpb
- proto
- name: github.com/golang/snappy
version: 553a641470496b2327abcac10b36396bd98e45c9
- name: github.com/google/btree
version: 925471ac9e2131377a91e1595defec898166fe49
- name: github.com/grpc-ecosystem/go-grpc-prometheus
version: 6b7015e65d366bf3f19b2b2a000a831940f0f7e0
- name: github.com/grpc-ecosystem/grpc-gateway
version: 18d159699f2e83fc5bb9ef2f79465ca3f3122676
subpackages:
- runtime
- runtime/internal
- utilities
- name: github.com/inconshreveable/log15
version: 0decfc6c20d9ca0ad143b0e89dcaa20f810b4fb3
- name: github.com/jonboulle/clockwork
version: 2eee05ed794112d45db504eb05aa693efd2b8b09
- name: github.com/mattn/go-colorable
version: 5411d3eea5978e6cdc258b30de592b60df6aba96
- name: github.com/mattn/go-isatty
version: 57fdcb988a5c543893cc61bce354a6e24ab70022
- name: github.com/matttproud/golang_protobuf_extensions
version: c12348ce28de40eed0136aa2b644d0ee0650e56c
subpackages:
- pbutil
- name: github.com/namsral/flag
version: 71ceffbeb0ba60fccc853971bb3ed4d7d90bfd04
- name: github.com/PaddlePaddle/recordio
version: 0432dee9fd4b24fb6840fb20a8c055b0c933fb81
- name: github.com/prometheus/client_golang
version: c5b7fccd204277076155f10851dad72b76a49317
subpackages:
- prometheus
- name: github.com/prometheus/client_model
version: 6f3806018612930941127f2a7c6c453ba2c527d2
subpackages:
- go
- name: github.com/prometheus/common
version: 49fee292b27bfff7f354ee0f64e1bc4850462edf
subpackages:
- expfmt
- internal/bitbucket.org/ww/goautoneg
- model
- name: github.com/prometheus/procfs
version: a1dba9ce8baed984a2495b658c82687f8157b98f
subpackages:
- xfs
- name: github.com/satori/go.uuid
version: 879c5887cd475cd7864858769793b2ceb0d44feb
- name: github.com/sirupsen/logrus
version: f006c2ac4710855cf0f916dd6b77acf6b048dc6e
- name: github.com/topicai/candy
version: 1b9030d056fa9f8c4b1f9c91b52fe4b8ab4cd8cc
- name: github.com/ugorji/go
version: ded73eae5db7e7a0ef6f55aace87a2873c5d2b74
subpackages:
- codec
- name: github.com/xiang90/probing
version: 07dd2e8dfe18522e9c447ba95f2fe95262f63bb2
- name: golang.org/x/crypto
version: 9419663f5a44be8b34ca85f08abc5fe1be11f8a3
repo: https://github.com/golang/crypto.git
vcs: git
subpackages:
- bcrypt
- blowfish
- ssh/terminal
- name: golang.org/x/net
version: c8c74377599bd978aee1cf3b9b63a8634051cec2
subpackages:
- context
- http2
- http2/hpack
- idna
- internal/timeseries
- lex/httplex
- trace
- name: golang.org/x/sys
version: e48874b42435b4347fc52bdee0424a52abc974d7
repo: https://github.com/golang/sys.git
vcs: git
subpackages:
- unix
- windows
- name: golang.org/x/text
version: 836efe42bb4aa16aaa17b9c155d8813d336ed720
repo: https://github.com/golang/text.git
vcs: git
subpackages:
- secure/bidirule
- transform
- unicode/bidi
- unicode/norm
- name: google.golang.org/grpc
version: 8050b9cbc271307e5a716a9d782803d09b0d6f2d
subpackages:
- codes
- credentials
- grpclog
- internal
- keepalive
- metadata
- naming
- peer
- stats
- tap
- transport
- name: gopkg.in/yaml.v2
version: cd8b52f8269e0feb286dfeef29f8fe4d5b397e0b
testImports:
- name: github.com/davecgh/go-spew
version: 04cdfd42973bb9c8589fd6a731800cf222fde1a9
subpackages:
- spew
- name: github.com/pmezard/go-difflib
version: d8ed2627bdf02c080bf22230dbb337003b7aba2d
subpackages:
- difflib
- name: github.com/stretchr/testify
version: 05e8a0eda380579888eb53c394909df027f06991
subpackages:
- assert
package: github.com/PaddlePaddle/Paddle/go
import:
- package: github.com/PaddlePaddle/recordio
- package: github.com/coreos/etcd
version: ^3.2.1
subpackages:
- clientv3
- clientv3/concurrency
- embed
- etcdserver
- package: github.com/namsral/flag
version: ^1.7.4-pre
- package: github.com/sirupsen/logrus
version: ^1.0.0
- package: github.com/topicai/candy
- package: golang.org/x/crypto
repo: https://github.com/golang/crypto.git
vcs: git
- package: golang.org/x/sys
repo: https://github.com/golang/sys.git
vcs: git
- package: golang.org/x/text
repo: https://github.com/golang/text.git
vcs: git
- package: github.com/satori/go.uuid
version: v1.1.0
- package: github.com/alecthomas/gometalinter
version: v1.2.1
- package: github.com/inconshreveable/log15
version: v2.13
- package: github.com/go-stack/stack
version: v1.6.0
- package: github.com/golang/protobuf
......@@ -265,9 +265,6 @@ function check_style() {
# set up go environment for running gometalinter
mkdir -p $GOPATH/src/github.com/PaddlePaddle/
ln -sf ${PADDLE_ROOT} $GOPATH/src/github.com/PaddlePaddle/Paddle
mkdir -p ./build/go
cp go/glide.* build/go
cd build/go; glide install; cd -
export PATH=/usr/bin:$PATH
pre-commit install
......
# AWS benchmark testing tool
This is an automation tool for deploying paddlepaddle benchmark testing to AWS.
## Features
- subnet creation to fit just the amount of ec2 instances required.
- pserver and trainer ec2 instances allocation, and instance state verification
- nvidia-docker ready for GPU training
- Instances and network element garbage collection when a task is accomplished or an error occurred
- Test log is collected in realtime
- Web service for checking log or tearing down the testing setup
- No testing code change needed
- Lots of optional configuration options
## Usages
### Prerequisites
- You have a working AWS account
- You have [AWS Command Line Interface](https://aws.amazon.com/cli/) installed
- Your AWS cli is bind with a account which has `AmazonEC2FullAccess` permission, and it's set as default credential.
- You have key pair created and pem file downloaded.
- You have a default VPC in the region you want to run the test.
- You have a Security Group created for the VPC mentioned above, which allows port 22 and the port you want to expose your control web service (5436 by default)
- If your test is supposed to run in a GPU machine, especially a multi card GPU machine (p2, p3 series), you might need to contact amazon to raise the limit which allows no more than 1 GPU instance at a time.
### Start a benchmark test
#### Create training image
*What to expect in this step:*
*You will have your training logic packed with paddle runtime in a docker image, and be able to be picked up by AWS instance for training.*
Training python script and PaddlePaddle runtime are supposed to be packed into one docker image. Use PaddlePaddle production images as base image and create the training images with the docker file as follows:
```Dockerfile
FROM paddlepaddle/paddle:latest-gpu
ENV HOME /root
COPY ./ /root/
WORKDIR /root
RUN pip install -r /root/requirements.txt
ENTRYPOINT ["python", "my_training.py"]
```
***Please Note***
Training nodes will run your `ENTRYPOINT` script with the following environment variables:
- `TASK_NAME`: unique name to identify this training process.
- `TRAINING_ROLE`: current node's role in this training process, either "PSERVER" or "TRAINER"
- `PSERVER_HOSTS`: comma separated value of pserver end points, I.E. "192.168.1.2:5436,192.168.1.3:5436"
- `PSERVERS`: same as above
- `TRAINERS`: trainer count
- `SERVER_ENDPOINT`: current server end point if the node role is a pserver
- `TRAINER_INDEX`: an integer to identify the index of current trainer if the node role is a trainer.
- `PADDLE_INIT_TRAINER_ID`: same as above
Now we have a working distributed training script which takes advantage of node environment variables and docker file to generate the training image. Run the following command:
```bash
docker build -t myreponname/paddle_benchmark .
```
Now you have the image built and tagged with `myreponame/paddle_benchmark`, let's push it to dockerhub so that it can be picked up by out AWS instance.
```bash
docker push myreponame/paddle_benchmark
```
#### Create instances and start training
*What to expect in this step*
*you will be asked to provide some basic settings to config your training, and this tool will have your training started and monitored*
Now let's start the training process:
```bash
docker run -i -v $HOME/.aws:/root/.aws -v <full path to your pem file>:/root/<key pair name>.pem \
putcn/paddle_aws_client \
--action create \
--key_name <your key pair name> \
--security_group_id <your security group id> \
--docker_image myreponame/paddle_benchmark \
--pserver_count 2 \
--trainer_count 2 \
--trainer_command batch_size:20,local:no,device:CPU
```
Now just wait until you see this:
```
master server finished init process, visit http://XXX:XXX/status to check master log
```
That means you can turn off your laptop and your cluster is creating instances, starting training process, collecting logs and eventually shut all pservers and trainers down when training is finished.
#### Post creation operations
To access the master log:
```bash
docker run -i -v $HOME/.aws:/root/.aws \
putcn/paddle_aws_client \
--action status \
--master_server_public_ip <master ip> \
--master_server_port <master port>
```
To tear down the training setup:
```bash
docker run -i -v $HOME/.aws:/root/.aws \
putcn/paddle_aws_client \
--action cleanup \
--master_server_public_ip <master ip> \
--master_server_port <master port>
```
To retrieve training logs
TBD
### Tech details
*What to expect in this step*
*You will understand what is happening behind the scene, and how to check the training log, how to tear down the training on the fly, etc.*
Let's understand what is happening under the hood when you run above command in your laptop
![alt](diagram.png)
There are 4 roles in the figure above:
- client: your laptop
- master: who tasks to aws api server to create/tear down instances, and monitor training process
- AWS api server: the one who actually creates and manages instances
- pservers and trainers: training instances
When you run the `docker run` command above, what it actually does is to ask aws api service to create a subnet (step 1) and a master instance (step 2), and pass all the parameters the client collected or generated (step 3). The master is kept as minimum hardware config to keep the running cost low.
Then when the master is up and running, it will ask the aws api server to create the heavy lifting training instances who are expensive to run (step 4). And the master will start training process as soon as they are done initializing (step 5).
Meanwhile, the master will expose a web service for client to check training log or even tear the training setup down by a web service call.
if you are creating the training with client docker container, and also monitoring your aws dashboard, you will initially see a instance tagged with `ROLE=MASTER` and `TASK_NAME=<yourtask name>_master` starts, then you will see several instances tagged with `ROLE=PSERVER` and `ROLE=TRAINER` starts.
When the training is finished, pservers and trainers will be terminated. All their logs are kept in master node's docker env.
Master exposes 4 major services:
- GET `/status`: return master log
- GET `/logs`: return list of log file names
- GET `/log/<logfile name>`: return a particular log by log file name
- POST `/cleanup`: teardown the whole setup
### Parameters
- key_name: required, aws key pair name
- security_group_id: required, the security group id associated with your VPC
- vpc_id: The VPC in which you wish to run test, if not provided, this tool will use your default VPC.
- subnet_id: The Subnet_id in which you wish to run test, if not provided, this tool will create a new sub net to run test.
- pserver_instance_type: your pserver instance type, c5.2xlarge by default, which is a memory optimized machine.
- trainer_instance_type: your trainer instance type, p2.8xlarge by default, which is a GPU machine with 8 cards.
- task_name: the name you want to identify your job, if not provided, this tool will generate one for you.
- pserver_image_id: ami id for system image. Please note, although the default one has nvidia-docker installed, pserver is always launched with `docker` instead of `nvidia-docker`, please DO NOT init your training program with GPU place.
- pserver_command: pserver start command, format example: python,vgg.py,batch_size:128,is_local:no, which will be translated as `python vgg.py --batch_size 128 --is_local no` when trying to start the training in pserver. "--device CPU" is passed as default.
- trainer_image_id: ami id for system image, default one has nvidia-docker ready.
- trainer_command: trainer start command. Format is the same as pserver's, "--device GPU" is passed as default.
- availability_zone: aws zone id to place ec2 instances, us-east-2a by default.
- trainer_count: Trainer count, 1 by default.
- pserver_count: Pserver count, 1 by default.
- action: create|cleanup|status, "create" by default.
- pserver_port: the port for pserver to open service, 5436 by default.
- docker_image: the training docker image id.
- master_service_port: the port for master to open service, 5436 by default.
- master_server_public_ip: the master service ip, this is required when action is not "create"
- master_docker_image: master's docker image id, "putcn/paddle_aws_master:latest" by default
- no_clean_up: no instance termination when training is finished or failed when this value is set "yes". This is for debug purpose, so that you can inspect into the instances when the process is finished.
### Trouble shooting
1. How to check logs
Master log is served at `http://<masterip>:<masterport>/status`, and you can list all the log files from `http://<masterip>:<masterport>/logs`, and access either one of them by `http://<masterip>:<masterport>/log/<logfilename>`
FROM python:2.7.14-stretch
ENV HOME /root
COPY ./ /root/
WORKDIR /root
RUN pip install -r /root/requirements.txt
ENTRYPOINT ["python", "cluster_launcher.py"]
\ No newline at end of file
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import time
import math
import logging
import copy
import netaddr
import boto3
import namesgenerator
import paramiko
from scp import SCPClient
import requests
def str2bool(v):
if v.lower() in ('yes', 'true', 't', 'y', '1'):
return True
elif v.lower() in ('no', 'false', 'f', 'n', '0'):
return False
else:
raise argparse.ArgumentTypeError('Boolean value expected.')
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
'--key_name', type=str, default="", help="required, key pair name")
parser.add_argument(
'--security_group_id',
type=str,
default="",
help="required, the security group id associated with your VPC")
parser.add_argument(
'--vpc_id',
type=str,
default="",
help="The VPC in which you wish to run test")
parser.add_argument(
'--subnet_id',
type=str,
default="",
help="The Subnet_id in which you wish to run test")
parser.add_argument(
'--pserver_instance_type',
type=str,
default="c5.2xlarge",
help="your pserver instance type, c5.2xlarge by default")
parser.add_argument(
'--trainer_instance_type',
type=str,
default="p2.8xlarge",
help="your trainer instance type, p2.8xlarge by default")
parser.add_argument(
'--task_name',
type=str,
default="",
help="the name you want to identify your job")
parser.add_argument(
'--pserver_image_id',
type=str,
default="ami-da2c1cbf",
help="ami id for system image, default one has nvidia-docker ready, \
use ami-1ae93962 for us-east-2")
parser.add_argument(
'--pserver_command',
type=str,
default="",
help="pserver start command, format example: python,vgg.py,batch_size:128,is_local:yes"
)
parser.add_argument(
'--trainer_image_id',
type=str,
default="ami-da2c1cbf",
help="ami id for system image, default one has nvidia-docker ready, \
use ami-1ae93962 for us-west-2")
parser.add_argument(
'--trainer_command',
type=str,
default="",
help="trainer start command, format example: python,vgg.py,batch_size:128,is_local:yes"
)
parser.add_argument(
'--availability_zone',
type=str,
default="us-east-2a",
help="aws zone id to place ec2 instances")
parser.add_argument(
'--trainer_count', type=int, default=1, help="Trainer count")
parser.add_argument(
'--pserver_count', type=int, default=1, help="Pserver count")
parser.add_argument(
'--action', type=str, default="create", help="create|cleanup|status")
parser.add_argument('--pem_path', type=str, help="private key file")
parser.add_argument(
'--pserver_port', type=str, default="5436", help="pserver port")
parser.add_argument(
'--docker_image', type=str, default="busybox", help="training docker image")
parser.add_argument(
'--master_server_port', type=int, default=5436, help="master server port")
parser.add_argument(
'--master_server_public_ip', type=str, help="master server public ip")
parser.add_argument(
'--master_docker_image',
type=str,
default="putcn/paddle_aws_master:latest",
help="master docker image id")
parser.add_argument(
'--no_clean_up',
type=str2bool,
default=False,
help="whether to clean up after training")
args = parser.parse_args()
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
ec2client = boto3.client('ec2')
def print_arguments():
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
def create_subnet():
# if no vpc id provided, list vpcs
logging.info("start creating subnet")
if not args.vpc_id:
logging.info("no vpc provided, trying to find the default one")
vpcs_desc = ec2client.describe_vpcs(
Filters=[{
"Name": "isDefault",
"Values": ["true", ]
}], )
if len(vpcs_desc["Vpcs"]) == 0:
raise ValueError('No default VPC')
args.vpc_id = vpcs_desc["Vpcs"][0]["VpcId"]
vpc_cidrBlock = vpcs_desc["Vpcs"][0]["CidrBlock"]
logging.info("default vpc fount with id %s and CidrBlock %s" %
(args.vpc_id, vpc_cidrBlock))
if not vpc_cidrBlock:
logging.info("trying to find cidrblock for vpc")
vpcs_desc = ec2client.describe_vpcs(
Filters=[{
"Name": "vpc-id",
"Values": [args.vpc_id, ],
}], )
if len(vpcs_desc["Vpcs"]) == 0:
raise ValueError('No VPC found')
vpc_cidrBlock = vpcs_desc["Vpcs"][0]["CidrBlock"]
logging.info("cidrblock for vpc is %s" % vpc_cidrBlock)
# list subnets in vpc in order to create a new one
logging.info("trying to find ip blocks for new subnet")
subnets_desc = ec2client.describe_subnets(
Filters=[{
"Name": "vpc-id",
"Values": [args.vpc_id, ],
}], )
ips_taken = []
for subnet_dec in subnets_desc["Subnets"]:
ips_taken.append(subnet_dec["CidrBlock"])
ip_blocks_avaliable = netaddr.IPSet(
[vpc_cidrBlock]) ^ netaddr.IPSet(ips_taken)
# adding 10 addresses as buffer
cidr_prefix = 32 - math.ceil(
math.log(args.pserver_count + args.trainer_count + 10, 2))
if cidr_prefix <= 16:
raise ValueError('Too many nodes to fit in current VPC')
for ipnetwork in ip_blocks_avaliable.iter_cidrs():
try:
subnet_cidr = ipnetwork.subnet(int(cidr_prefix)).next()
logging.info("subnet ip block found %s" % (subnet_cidr))
break
except Exception:
pass
if not subnet_cidr:
raise ValueError(
'No avaliable subnet to fit required nodes in current VPC')
logging.info("trying to create subnet")
subnet_desc = ec2client.create_subnet(
CidrBlock=str(subnet_cidr),
VpcId=args.vpc_id,
AvailabilityZone=args.availability_zone)
subnet_id = subnet_desc["Subnet"]["SubnetId"]
subnet_waiter = ec2client.get_waiter('subnet_available')
# sleep for 1s before checking its state
time.sleep(1)
subnet_waiter.wait(SubnetIds=[subnet_id, ])
logging.info("subnet created")
logging.info("adding tags to newly created subnet")
ec2client.create_tags(
Resources=[subnet_id, ],
Tags=[{
"Key": "Task_name",
'Value': args.task_name
}])
return subnet_id
def run_instances(image_id, instance_type, count=1, role="MASTER", cmd=""):
response = ec2client.run_instances(
ImageId=image_id,
InstanceType=instance_type,
MaxCount=count,
MinCount=count,
UserData=cmd,
DryRun=False,
InstanceInitiatedShutdownBehavior="stop",
KeyName=args.key_name,
Placement={'AvailabilityZone': args.availability_zone},
NetworkInterfaces=[{
'DeviceIndex': 0,
'SubnetId': args.subnet_id,
"AssociatePublicIpAddress": True,
'Groups': args.security_group_ids
}],
TagSpecifications=[{
'ResourceType': "instance",
'Tags': [{
"Key": 'Task_name',
"Value": args.task_name + "_master"
}, {
"Key": 'Role',
"Value": role
}]
}])
instance_ids = []
for instance in response["Instances"]:
instance_ids.append(instance["InstanceId"])
if len(instance_ids) > 0:
logging.info(str(len(instance_ids)) + " instance(s) created")
else:
logging.info("no instance created")
#create waiter to make sure it's running
logging.info("waiting for instance to become accessible")
waiter = ec2client.get_waiter('instance_status_ok')
waiter.wait(
Filters=[{
"Name": "instance-status.status",
"Values": ["ok"]
}, {
"Name": "instance-status.reachability",
"Values": ["passed"]
}, {
"Name": "instance-state-name",
"Values": ["running"]
}],
InstanceIds=instance_ids)
instances_response = ec2client.describe_instances(InstanceIds=instance_ids)
return instances_response["Reservations"][0]["Instances"]
def generate_task_name():
return namesgenerator.get_random_name()
def init_args():
if not args.task_name:
args.task_name = generate_task_name()
logging.info("task name generated %s" % (args.task_name))
if not args.pem_path:
args.pem_path = os.path.expanduser("~") + "/" + args.key_name + ".pem"
if args.security_group_id:
args.security_group_ids = (args.security_group_id, )
def create():
init_args()
# create subnet
if not args.subnet_id:
args.subnet_id = create_subnet()
# create master node
master_instance_response = run_instances(
image_id="ami-7a05351f", instance_type="t2.nano")
logging.info("master server started")
args.master_server_public_ip = master_instance_response[0][
"PublicIpAddress"]
args.master_server_ip = master_instance_response[0]["PrivateIpAddress"]
logging.info("master server started, master_ip=%s, task_name=%s" %
(args.master_server_public_ip, args.task_name))
# cp config file and pems to master node
ssh_key = paramiko.RSAKey.from_private_key_file(args.pem_path)
ssh_client = paramiko.SSHClient()
ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
ssh_client.connect(
hostname=args.master_server_public_ip, username="ubuntu", pkey=ssh_key)
with SCPClient(ssh_client.get_transport()) as scp:
scp.put(os.path.expanduser("~") + "/" + ".aws",
recursive=True,
remote_path='/home/ubuntu/')
scp.put(args.pem_path,
remote_path='/home/ubuntu/' + args.key_name + ".pem")
logging.info("credentials and pem copied to master")
# set arguments and start docker
kick_off_cmd = "docker run -d -v /home/ubuntu/.aws:/root/.aws/"
kick_off_cmd += " -v /home/ubuntu/" + args.key_name + ".pem:/root/" + args.key_name + ".pem"
kick_off_cmd += " -v /home/ubuntu/logs/:/root/logs/"
kick_off_cmd += " -p " + str(args.master_server_port) + ":" + str(
args.master_server_port)
kick_off_cmd += " " + args.master_docker_image
args_to_pass = copy.copy(args)
args_to_pass.action = "serve"
del args_to_pass.pem_path
del args_to_pass.security_group_ids
del args_to_pass.master_docker_image
del args_to_pass.master_server_public_ip
for arg, value in sorted(vars(args_to_pass).iteritems()):
if value:
kick_off_cmd += ' --%s %s' % (arg, value)
logging.info(kick_off_cmd)
stdin, stdout, stderr = ssh_client.exec_command(command=kick_off_cmd)
return_code = stdout.channel.recv_exit_status()
logging.info(return_code)
if return_code != 0:
raise Exception("Error while kicking off master")
logging.info(
"master server finished init process, visit %s to check master log" %
(get_master_web_url("/status")))
def cleanup():
print requests.post(get_master_web_url("/cleanup")).text
def status():
print requests.post(get_master_web_url("/status")).text
def get_master_web_url(path):
return "http://" + args.master_server_public_ip + ":" + str(
args.master_server_port) + path
if __name__ == "__main__":
print_arguments()
if args.action == "create":
if not args.key_name or not args.security_group_id:
raise ValueError("key_name and security_group_id are required")
create()
elif args.action == "cleanup":
if not args.master_server_public_ip:
raise ValueError("master_server_public_ip is required")
cleanup()
elif args.action == "status":
if not args.master_server_public_ip:
raise ValueError("master_server_public_ip is required")
status()
netaddr==0.7.19
boto3==1.6.21
namesgenerator==0.3
paramiko==2.4.1
scp
requests
FROM python:2.7.14-stretch
ENV HOME /root
COPY ./ /root/
WORKDIR /root
RUN pip install -r /root/requirements.txt
ENTRYPOINT ["python", "cluster_master.py"]
\ No newline at end of file
#!/bin/bash
docker run --network="host" -i -e "SERVER_ENDPOINT={SERVER_ENDPOINT}" -e "MASTER_ENDPOINT={MASTER_ENDPOINT}" -e "TASK_NAME={TASK_NAME}" -e "TRAINER_INDEX={TRAINER_INDEX}" -e "TRAINING_ROLE=PSERVER" -e "TRAINER_COUNT={TRAINER_COUNT}" -e "TRAINERS={TRAINER_COUNT}" -e "PSERVER_HOSTS={PSERVER_HOSTS}" -e "PSERVERS={PSERVER_HOSTS}" {DOCKER_IMAGE} {COMMAND}
\ No newline at end of file
netaddr==0.7.19
boto3==1.6.21
namesgenerator==0.3
paramiko==2.4.1
#!/bin/bash
nvidia-docker run --network="host" -i -e "MASTER_ENDPOINT={MASTER_ENDPOINT}" -e "TASK_NAME={TASK_NAME}" -e "TRAINER_COUNT={TRAINER_COUNT}" -e "TRAINERS={TRAINER_COUNT}" -e "TRAINER_INDEX={TRAINER_INDEX}" -e "PADDLE_INIT_TRAINER_ID={TRAINER_INDEX}" -e "TRAINING_ROLE=TRAINER" -e "PSERVER_HOSTS={PSERVER_HOSTS}" -e "PSERVERS={PSERVER_HOSTS}" {DOCKER_IMAGE} {COMMAND}
\ No newline at end of file
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册