Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
a2a5ffae
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
a2a5ffae
编写于
4月 03, 2018
作者:
X
Xi Chen
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
init checkin for aws benchmarking tool
上级
d40096ca
变更
4
显示空白变更内容
内联
并排
Showing
4 changed file
with
399 addition
and
0 deletion
+399
-0
tools/aws_benchmarking/paddle_banchmarking_aws.py
tools/aws_benchmarking/paddle_banchmarking_aws.py
+393
-0
tools/aws_benchmarking/pserver.sh.template
tools/aws_benchmarking/pserver.sh.template
+1
-0
tools/aws_benchmarking/requirements.txt
tools/aws_benchmarking/requirements.txt
+4
-0
tools/aws_benchmarking/trainer.sh.template
tools/aws_benchmarking/trainer.sh.template
+1
-0
未找到文件。
tools/aws_benchmarking/paddle_banchmarking_aws.py
0 → 100644
浏览文件 @
a2a5ffae
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
argparse
import
os
import
json
import
math
import
time
import
base64
import
netaddr
import
boto3
import
namesgenerator
import
paramiko
# You must have aws_access_key_id, aws_secret_access_key, region set in
# ~/.aws/credentials and ~/.aws/config
parser
=
argparse
.
ArgumentParser
(
description
=
__doc__
)
parser
.
add_argument
(
'--vpc_id'
,
type
=
str
,
default
=
""
,
help
=
"The VPC in which you wish to run test"
)
parser
.
add_argument
(
'--subnet_id'
,
type
=
str
,
default
=
""
,
help
=
"The Subnet_id in which you wish to run test"
)
parser
.
add_argument
(
'--security_group_id'
,
type
=
str
,
default
=
""
,
required
=
True
,
help
=
"required, the security group id associated with your VPC"
)
parser
.
add_argument
(
'--pserver_instance_type'
,
type
=
str
,
default
=
"p2.xlarge"
,
help
=
"your pserver instance type"
)
parser
.
add_argument
(
'--trainer_instance_type'
,
type
=
str
,
default
=
"p2.xlarge"
,
help
=
"your trainer instance type"
)
parser
.
add_argument
(
'--key_name'
,
type
=
str
,
default
=
""
,
required
=
True
,
help
=
"required, key pair name"
)
parser
.
add_argument
(
'--task_name'
,
type
=
str
,
default
=
""
,
help
=
"the name you want to identify your job"
)
parser
.
add_argument
(
'--pserver_image_id'
,
type
=
str
,
default
=
"ami-1ae93962"
,
help
=
"ami id for system image, default one has nvidia-docker ready"
)
parser
.
add_argument
(
'--trainer_image_id'
,
type
=
str
,
default
=
"ami-1ae93962"
,
help
=
"ami id for system image, default one has nvidia-docker ready"
)
parser
.
add_argument
(
'--trainer_count'
,
type
=
int
,
default
=
1
,
help
=
"Trainer count"
)
parser
.
add_argument
(
'--pserver_count'
,
type
=
int
,
default
=
1
,
help
=
"Pserver count"
)
parser
.
add_argument
(
'--pserver_bash_file'
,
type
=
str
,
required
=
False
,
default
=
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
"pserver.sh.template"
),
help
=
"pserver bash file path"
)
parser
.
add_argument
(
'--trainer_bash_file'
,
type
=
str
,
required
=
False
,
default
=
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
"trainer.sh.template"
),
help
=
"trainer bash file path"
)
parser
.
add_argument
(
'--pem_path'
,
type
=
str
,
help
=
"private key file"
)
parser
.
add_argument
(
'--pserver_port'
,
type
=
str
,
default
=
"5436"
,
help
=
"pserver port"
)
parser
.
add_argument
(
'--docker_image'
,
type
=
str
,
default
=
"busybox"
,
help
=
"training docker image"
)
args
=
parser
.
parse_args
()
ec2client
=
boto3
.
client
(
'ec2'
)
def
create_subnet
():
# if no vpc id provided, list vpcs
if
not
args
.
vpc_id
:
print
(
"no vpc provided, trying to find the default one"
)
vpcs_desc
=
ec2client
.
describe_vpcs
(
Filters
=
[{
"Name"
:
"isDefault"
,
"Values"
:
[
"true"
,
]
}],
)
if
len
(
vpcs_desc
[
"Vpcs"
])
==
0
:
raise
ValueError
(
'No default VPC'
)
args
.
vpc_id
=
vpcs_desc
[
"Vpcs"
][
0
][
"VpcId"
]
vpc_cidrBlock
=
vpcs_desc
[
"Vpcs"
][
0
][
"CidrBlock"
]
print
(
"default vpc fount with id %s and CidrBlock %s"
%
(
args
.
vpc_id
,
vpc_cidrBlock
))
if
not
vpc_cidrBlock
:
print
(
"trying to find cidrblock for vpc"
)
vpcs_desc
=
ec2client
.
describe_vpcs
(
Filters
=
[{
"Name"
:
"vpc-id"
,
"Values"
:
[
args
.
vpc_id
,
],
}],
)
if
len
(
vpcs_desc
[
"Vpcs"
])
==
0
:
raise
ValueError
(
'No VPC found'
)
vpc_cidrBlock
=
vpcs_desc
[
"Vpcs"
][
0
][
"CidrBlock"
]
print
(
"cidrblock for vpc is %s"
%
vpc_cidrBlock
)
# list subnets in vpc in order to create a new one
print
(
"trying to find ip blocks for new subnet"
)
subnets_desc
=
ec2client
.
describe_subnets
(
Filters
=
[{
"Name"
:
"vpc-id"
,
"Values"
:
[
args
.
vpc_id
,
],
}],
)
ips_taken
=
[]
for
subnet_dec
in
subnets_desc
[
"Subnets"
]:
ips_taken
.
append
(
subnet_dec
[
"CidrBlock"
])
ip_blocks_avaliable
=
netaddr
.
IPSet
(
[
vpc_cidrBlock
])
^
netaddr
.
IPSet
(
ips_taken
)
# adding 10 addresses as buffer
cidr_prefix
=
32
-
math
.
ceil
(
math
.
log
(
args
.
pserver_count
+
args
.
trainer_count
+
10
,
2
))
if
cidr_prefix
<=
16
:
raise
ValueError
(
'Too many nodes to fit in current VPC'
)
for
ipnetwork
in
ip_blocks_avaliable
.
iter_cidrs
():
try
:
subnet_cidr
=
ipnetwork
.
subnet
(
int
(
cidr_prefix
)).
next
()
print
(
"subnet ip block found %s"
%
(
subnet_cidr
))
break
except
Exception
:
pass
if
not
subnet_cidr
:
raise
ValueError
(
'No avaliable subnet to fit required nodes in current VPC'
)
print
(
"trying to create subnet"
)
subnet_desc
=
ec2client
.
create_subnet
(
CidrBlock
=
str
(
subnet_cidr
),
VpcId
=
args
.
vpc_id
)
subnet_id
=
subnet_desc
[
"Subnet"
][
"SubnetId"
]
subnet_waiter
=
ec2client
.
get_waiter
(
'subnet_available'
)
# sleep for 1s before checking its state
time
.
sleep
(
1
)
subnet_waiter
.
wait
(
SubnetIds
=
[
subnet_id
,
])
print
(
"subnet created"
)
print
(
"adding tags to newly created subnet"
)
ec2client
.
create_tags
(
Resources
=
[
subnet_id
,
],
Tags
=
[{
"Key"
:
"Task_name"
,
'Value'
:
args
.
task_name
}])
return
subnet_id
def
generate_task_name
():
return
namesgenerator
.
get_random_name
()
def
script_to_str
(
file_path
):
if
not
file_path
:
return
"echo $PSERVER_HOSTS"
file
=
open
(
file_path
,
'r'
)
text
=
file
.
read
().
strip
()
file
.
close
()
return
text
def
run_instances
(
image_id
,
instance_type
,
count
,
role
,
cmd
=
""
):
if
cmd
:
cmd
=
base64
.
b64encode
(
cmd
)
response
=
ec2client
.
run_instances
(
ImageId
=
image_id
,
InstanceType
=
instance_type
,
MaxCount
=
count
,
MinCount
=
count
,
UserData
=
cmd
,
DryRun
=
False
,
InstanceInitiatedShutdownBehavior
=
"stop"
,
KeyName
=
args
.
key_name
,
NetworkInterfaces
=
[{
'DeviceIndex'
:
0
,
'SubnetId'
:
args
.
subnet_id
,
"AssociatePublicIpAddress"
:
True
,
'Groups'
:
args
.
security_group_ids
}],
TagSpecifications
=
[{
'ResourceType'
:
"instance"
,
'Tags'
:
[{
"Key"
:
'Task_name'
,
"Value"
:
args
.
task_name
},
{
"Key"
:
'Role'
,
"Value"
:
role
}]
}])
instance_ids
=
[]
for
instance
in
response
[
"Instances"
]:
instance_ids
.
append
(
instance
[
"InstanceId"
])
if
len
(
instance_ids
)
>
0
:
print
(
str
(
len
(
instance_ids
))
+
" instance(s) created"
)
else
:
print
(
"no instance created"
)
#create waiter to make sure it's running
print
(
"waiting for instance to become accessible"
)
waiter
=
ec2client
.
get_waiter
(
'instance_status_ok'
)
waiter
.
wait
(
Filters
=
[{
"Name"
:
"instance-status.status"
,
"Values"
:
[
"ok"
]
},
{
"Name"
:
"instance-status.reachability"
,
"Values"
:
[
"passed"
]
},
{
"Name"
:
"instance-state-name"
,
"Values"
:
[
"running"
]
}],
InstanceIds
=
instance_ids
)
instances_response
=
ec2client
.
describe_instances
(
InstanceIds
=
instance_ids
)
return
instances_response
[
"Reservations"
][
0
][
"Instances"
]
def
create_pservers
():
return
run_instances
(
image_id
=
args
.
pserver_image_id
,
instance_type
=
args
.
pserver_instance_type
,
count
=
args
.
pserver_count
,
role
=
"PSERVER"
,
)
def
create_trainers
(
kickoff_cmd
,
pserver_endpoints_str
):
responses
=
[]
for
i
in
xrange
(
args
.
trainer_count
):
cmd
=
kickoff_cmd
.
format
(
PSERVER_HOSTS
=
pserver_endpoints_str
,
DOCKER_IMAGE
=
args
.
docker_image
,
TRAINER_INDEX
=
str
(
i
))
print
(
cmd
)
responses
.
append
(
run_instances
(
image_id
=
args
.
trainer_image_id
,
instance_type
=
args
.
trainer_instance_type
,
count
=
1
,
role
=
"TRAINER"
,
cmd
=
cmd
,
)[
0
])
return
responses
def
cleanup
(
task_name
):
#shutdown all ec2 instances
instances
=
ec2client
.
describe_instances
(
Filters
=
[{
"Name"
:
"tag"
,
"Value"
:
"Task_name="
+
task_name
}])
instance_ids
=
[]
for
instance
in
instances
[
"Reservations"
][
0
][
"Instances"
]:
instance_ids
.
append
(
instance
[
"InstanceId"
])
ec2client
.
stop_instances
(
InstanceIds
=
instance_ids
)
instance_stop_waiter
=
ec2client
.
get_waiter
(
'instance_stopped'
)
instance_stop_waiter
.
wait
(
InstanceIds
=
instance_ids
)
#delete the subnet created
subnet
=
ec2client
.
describe_subnets
(
Filters
=
[{
"Name"
:
"tag"
,
"Value"
:
"Task_name="
+
task_name
}])
ec2client
.
delete_subnet
(
SubnetId
=
subnet
[
"Subnets"
][
0
][
"SubnetId"
])
# no subnet delete waiter, just leave it.
return
def
main
():
if
not
args
.
task_name
:
args
.
task_name
=
generate_task_name
()
print
(
"task name generated"
,
args
.
task_name
)
if
not
args
.
subnet_id
:
print
(
"creating subnet for this task"
)
args
.
subnet_id
=
create_subnet
()
print
(
"subnet %s created"
%
(
args
.
subnet_id
))
if
not
args
.
pem_path
:
args
.
pem_path
=
os
.
path
.
expanduser
(
"~"
)
+
"/"
+
args
.
key_name
+
".pem"
if
args
.
security_group_id
:
args
.
security_group_ids
=
(
args
.
security_group_id
,
)
print
(
"creating pservers"
)
pserver_create_response
=
create_pservers
()
print
(
"pserver created, collecting pserver ips"
)
pserver_endpoints
=
[]
for
pserver
in
pserver_create_response
:
pserver_endpoints
.
append
(
pserver
[
"NetworkInterfaces"
][
0
][
"PrivateIpAddress"
]
+
":"
+
args
.
pserver_port
)
pserver_endpoints_str
=
","
.
join
(
pserver_endpoints
)
# ssh to pservers to start training
ssh_key
=
paramiko
.
RSAKey
.
from_private_key_file
(
args
.
pem_path
)
ssh_client
=
paramiko
.
SSHClient
()
ssh_client
.
set_missing_host_key_policy
(
paramiko
.
AutoAddPolicy
())
print
(
"kicking off pserver training process"
)
for
pserver
in
pserver_create_response
:
try
:
ssh_client
.
connect
(
hostname
=
pserver
[
"PublicIpAddress"
],
username
=
"ubuntu"
,
pkey
=
ssh_key
)
cmd
=
(
script_to_str
(
args
.
pserver_bash_file
)).
format
(
PSERVER_HOSTS
=
pserver_endpoints_str
,
DOCKER_IMAGE
=
args
.
docker_image
)
print
(
cmd
)
stdin
,
stdout
,
stderr
=
ssh_client
.
exec_command
(
command
=
cmd
)
if
stderr
.
read
():
raise
Exception
(
"Error while kicking off pserver training process"
)
#print(stdout.read())
except
Exception
,
e
:
print
e
cleanup
(
args
.
task_name
)
finally
:
ssh_client
.
close
()
print
(
"creating trainers and kicking off trainer training process"
)
create_trainers
(
kickoff_cmd
=
script_to_str
(
args
.
trainer_bash_file
),
pserver_endpoints_str
=
pserver_endpoints_str
)
def
print_arguments
():
print
(
'----------- Configuration Arguments -----------'
)
for
arg
,
value
in
sorted
(
vars
(
args
).
iteritems
()):
print
(
'%s: %s'
%
(
arg
,
value
))
print
(
'------------------------------------------------'
)
if
__name__
==
"__main__"
:
print_arguments
()
main
()
tools/aws_benchmarking/pserver.sh.template
0 → 100644
浏览文件 @
a2a5ffae
nvidia-docker run -i -e "TRAINING_ROLE=PSERVER" -e "PSERVER_HOSTS={PSERVER_HOSTS}" {DOCKER_IMAGE}
\ No newline at end of file
tools/aws_benchmarking/requirements.txt
0 → 100644
浏览文件 @
a2a5ffae
netaddr==0.7.19
boto3==1.6.21
namesgenerator==0.3
paramiko==2.4.1
tools/aws_benchmarking/trainer.sh.template
0 → 100644
浏览文件 @
a2a5ffae
nvidia-docker run -i -e "TRAINER_INDEX={TRAINER_INDEX}" -e "TRAINING_ROLE=TRAINER" -e "PSERVER_HOSTS={PSERVER_HOSTS}" {DOCKER_IMAGE}
\ No newline at end of file
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录