Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
45d87ade
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
45d87ade
编写于
4月 12, 2018
作者:
X
Xi Chen
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
minor tweaks
上级
94ad30e5
变更
2
显示空白变更内容
内联
并排
Showing
2 changed file
with
44 addition
and
18 deletion
+44
-18
tools/aws_benchmarking/client/cluster_launcher.py
tools/aws_benchmarking/client/cluster_launcher.py
+18
-3
tools/aws_benchmarking/server/cluster_master.py
tools/aws_benchmarking/server/cluster_master.py
+26
-15
未找到文件。
tools/aws_benchmarking/client/cluster_launcher.py
浏览文件 @
45d87ade
...
...
@@ -49,8 +49,8 @@ parser.add_argument(
parser
.
add_argument
(
'--pserver_instance_type'
,
type
=
str
,
default
=
"
p2.8
xlarge"
,
help
=
"your pserver instance type,
p2.8
xlarge by default"
)
default
=
"
c5.2
xlarge"
,
help
=
"your pserver instance type,
c5.2
xlarge by default"
)
parser
.
add_argument
(
'--trainer_instance_type'
,
type
=
str
,
...
...
@@ -68,6 +68,10 @@ parser.add_argument(
default
=
"ami-da2c1cbf"
,
help
=
"ami id for system image, default one has nvidia-docker ready,
\
use ami-1ae93962 for us-east-2"
)
parser
.
add_argument
(
'--pserver_command'
,
type
=
str
,
default
=
""
,
help
=
"pserver start command"
)
parser
.
add_argument
(
'--trainer_image_id'
,
type
=
str
,
...
...
@@ -75,6 +79,9 @@ parser.add_argument(
help
=
"ami id for system image, default one has nvidia-docker ready,
\
use ami-1ae93962 for us-west-2"
)
parser
.
add_argument
(
'--trainer_command'
,
type
=
str
,
default
=
""
,
help
=
"trainer start command"
)
parser
.
add_argument
(
'--availability_zone'
,
type
=
str
,
...
...
@@ -104,6 +111,12 @@ parser.add_argument(
parser
.
add_argument
(
'--master_server_public_ip'
,
type
=
str
,
help
=
"master server public ip"
)
parser
.
add_argument
(
'--master_docker_image'
,
type
=
str
,
default
=
"putcn/paddle_aws_master:latest"
,
help
=
"master docker image id"
)
args
=
parser
.
parse_args
()
logging
.
basicConfig
(
level
=
logging
.
INFO
,
format
=
'%(asctime)s %(message)s'
)
...
...
@@ -322,14 +335,16 @@ def create():
# set arguments and start docker
kick_off_cmd
=
"docker run -d -v /home/ubuntu/.aws:/root/.aws/"
kick_off_cmd
+=
" -v /home/ubuntu/"
+
args
.
key_name
+
".pem:/root/"
+
args
.
key_name
+
".pem"
kick_off_cmd
+=
" -v /home/ubuntu/logs/:/root/logs/"
kick_off_cmd
+=
" -p "
+
str
(
args
.
master_server_port
)
+
":"
+
str
(
args
.
master_server_port
)
kick_off_cmd
+=
"
putcn/paddle_aws_master"
kick_off_cmd
+=
"
"
+
args
.
master_docker_image
args_to_pass
=
copy
.
copy
(
args
)
args_to_pass
.
action
=
"serve"
del
args_to_pass
.
pem_path
del
args_to_pass
.
security_group_ids
del
args_to_pass
.
master_docker_image
del
args_to_pass
.
master_server_public_ip
for
arg
,
value
in
sorted
(
vars
(
args_to_pass
).
iteritems
()):
kick_off_cmd
+=
' --%s %s'
%
(
arg
,
value
)
...
...
tools/aws_benchmarking/server/cluster_master.py
浏览文件 @
45d87ade
...
...
@@ -53,8 +53,8 @@ parser.add_argument(
parser
.
add_argument
(
'--pserver_instance_type'
,
type
=
str
,
default
=
"
p2.8
xlarge"
,
help
=
"your pserver instance type,
p2.8
xlarge by default"
)
default
=
"
c5.2
xlarge"
,
help
=
"your pserver instance type,
c5.2
xlarge by default"
)
parser
.
add_argument
(
'--trainer_instance_type'
,
type
=
str
,
...
...
@@ -97,12 +97,18 @@ parser.add_argument(
default
=
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
"pserver.sh.template"
),
help
=
"pserver bash file path"
)
parser
.
add_argument
(
'--pserver_command'
,
type
=
str
,
default
=
""
,
help
=
"pserver start command"
)
parser
.
add_argument
(
'--trainer_bash_file'
,
type
=
str
,
default
=
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
"trainer.sh.template"
),
help
=
"trainer bash file path"
)
parser
.
add_argument
(
'--trainer_command'
,
type
=
str
,
default
=
""
,
help
=
"trainer start command"
)
parser
.
add_argument
(
'--action'
,
type
=
str
,
default
=
"serve"
,
help
=
"create|cleanup|serve"
)
...
...
@@ -124,8 +130,12 @@ args = parser.parse_args()
ec2client
=
boto3
.
client
(
'ec2'
)
args
.
log_path
=
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
"logs/"
)
logging
.
basicConfig
(
filename
=
'master.log'
,
level
=
logging
.
INFO
,
format
=
'%(asctime)s %(message)s'
)
filename
=
args
.
log_path
+
'master.log'
,
level
=
logging
.
INFO
,
format
=
'%(asctime)s %(message)s'
)
log_files
=
[
"master.log"
]
...
...
@@ -304,7 +314,7 @@ def create_pservers():
def
log_to_file
(
source
,
filename
):
if
not
filename
in
log_files
:
log_files
.
append
(
filename
)
with
open
(
filename
,
"a"
)
as
log_file
:
with
open
(
args
.
log_path
+
filename
,
"a"
)
as
log_file
:
for
line
in
iter
(
source
.
readline
,
""
):
log_file
.
write
(
line
)
...
...
@@ -335,6 +345,8 @@ def create_trainers(kickoff_cmd, pserver_endpoints_str):
DOCKER_IMAGE
=
args
.
docker_image
,
TRAINER_INDEX
=
str
(
trainer_index
),
TASK_NAME
=
args
.
task_name
,
TRAINER_COUNT
=
args
.
trainer_count
,
COMMAND
=
args
.
trainer_command
,
MASTER_ENDPOINT
=
args
.
master_server_ip
+
":"
+
str
(
args
.
master_server_port
))
logging
.
info
(
cmd
)
...
...
@@ -446,6 +458,9 @@ def kickoff_pserver(host, pserver_endpoints_str):
DOCKER_IMAGE
=
args
.
docker_image
,
PSERVER_PORT
=
args
.
pserver_port
,
TASK_NAME
=
args
.
task_name
,
COMMAND
=
args
.
pserver_command
,
TRAINER_COUNT
=
args
.
trainer_count
,
SERVER_ENDPOINT
=
host
+
":"
+
str
(
args
.
pserver_port
),
MASTER_ENDPOINT
=
args
.
master_server_ip
+
":"
+
str
(
args
.
master_server_port
))
logging
.
info
(
cmd
)
...
...
@@ -553,14 +568,17 @@ def start_server(args):
if
request_path
==
"/status"
or
request_path
==
"/master_logs"
:
self
.
_set_headers
()
logging
.
info
(
"Received request to return status"
)
with
open
(
"master.log"
,
"r"
)
as
logfile
:
with
open
(
args
.
log_path
+
"master.log"
,
"r"
)
as
logfile
:
self
.
wfile
.
write
(
logfile
.
read
().
strip
())
elif
request_path
==
"/list_logs"
:
self
.
_set_headers
()
self
.
wfile
.
write
(
"
\n
"
.
join
(
log_files
))
elif
"/log/"
in
request_path
:
log_file_path
=
request_path
.
replace
(
"/log/"
)
with
open
(
log_file_path
,
"r"
)
as
logfile
:
self
.
_set_headers
()
log_file_path
=
request_path
.
replace
(
"/log/"
,
""
)
logging
.
info
(
"requesting log file path is"
+
args
.
log_path
+
log_file_path
)
with
open
(
args
.
log_path
+
log_file_path
,
"r"
)
as
logfile
:
self
.
wfile
.
write
(
logfile
.
read
().
strip
())
else
:
self
.
do_404
()
...
...
@@ -631,11 +649,4 @@ if __name__ == "__main__":
create_cluster
()
server_thread
.
join
()
elif
args
.
action
==
"test"
:
init_args
()
if
not
args
.
subnet_id
:
logging
.
info
(
"creating subnet for this task"
)
args
.
subnet_id
=
create_subnet
()
logging
.
info
(
"subnet %s created"
%
(
args
.
subnet_id
))
create_trainers
(
kickoff_cmd
=
script_to_str
(
args
.
trainer_bash_file
),
pserver_endpoints_str
=
"11.22.33.44:5476"
)
start_server
(
args
)
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录