Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
f9db5629
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
f9db5629
编写于
1月 31, 2018
作者:
T
typhoonzero
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
update results
上级
bd64719a
变更
8
隐藏空白更改
内联
并排
Showing
8 changed file
with
11 addition
and
322 deletion
+11
-322
benchmark/cluster/vgg16/Dockerfile
benchmark/cluster/vgg16/Dockerfile
+7
-9
benchmark/cluster/vgg16/README.md
benchmark/cluster/vgg16/README.md
+1
-1
benchmark/cluster/vgg16/fluid_trainer.yaml
benchmark/cluster/vgg16/fluid_trainer.yaml
+1
-1
benchmark/cluster/vgg16/k8s_tools.py
benchmark/cluster/vgg16/k8s_tools.py
+0
-94
benchmark/cluster/vgg16/paddle_k8s
benchmark/cluster/vgg16/paddle_k8s
+0
-199
benchmark/cluster/vgg16/reader.py
benchmark/cluster/vgg16/reader.py
+0
-16
benchmark/cluster/vgg16/v2_trainer.yaml
benchmark/cluster/vgg16/v2_trainer.yaml
+1
-1
benchmark/cluster/vgg16/vgg16_v2.py
benchmark/cluster/vgg16/vgg16_v2.py
+1
-1
未找到文件。
benchmark/cluster/vgg16/Dockerfile
浏览文件 @
f9db5629
#FROM paddlepaddle/paddlecloud-job
#RUN mkdir -p /workspace
#ADD reader.py /workspace/
#RUN python /workspace/reader.py
FROM
python:2.7.14
ADD
paddle_k8s /usr/bin
ADD
k8s_tools.py /root
RUN
pip
install
-U
kubernetes opencv-python
&&
apt-get update
-y
&&
apt-get
install
-y
iputils-ping libgtk2.0-dev
ADD
https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
ADD
https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
RUN
pip
install
-U
kubernetes opencv-python
&&
apt-get update
-y
&&
apt-get
install
-y
iputils-ping libgtk2.0-dev
&&
\
chmod
+x /usr/bin/paddle_k8s
# NOTE: By default CI built wheel packages turn WITH_DISTRIBUTE=OFF,
# so we must build one with distribute support to install in this image.
ADD
*.whl /
RUN
pip
install
/
*
.whl
&&
rm
-f
/
*
.whl
ENV
LD_LIBRARY_PATH=/usr/local/lib
ADD
reader.py /workspace/
RUN
python /workspace/reader.py
RUN
sh
-c
'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python'
ADD
vgg16_fluid.py vgg16_v2.py /workspace/
benchmark/cluster/vgg16/README.md
浏览文件 @
f9db5629
...
...
@@ -43,7 +43,7 @@
| Trainer Counter | 20 | 40 | 80 | 100 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid | 291.06 | 518.80 | 836.26 | 1019.29 |
| PaddlePaddle v2
| 356.28 | - | -
| 1041.99 |
| PaddlePaddle v2
(need more tests) | 356.28 | 785.39 | 853.30
| 1041.99 |
| TensorFlow | - | - | - | - |
### different pserver number
...
...
benchmark/cluster/vgg16/fluid_trainer.yaml
浏览文件 @
f9db5629
...
...
@@ -30,7 +30,7 @@ spec:
-
name
:
TOPOLOGY
value
:
"
"
-
name
:
ENTRY
value
:
"
MKL_NUM_THREADS=1
python
/workspace/vgg16_fluid.py
--local
0
--batch_size
256
"
value
:
"
MKL_NUM_THREADS=1
python
/workspace/vgg16_fluid.py
--local
0
--batch_size
128
"
-
name
:
TRAINER_PACKAGE
value
:
"
/workspace"
-
name
:
PADDLE_INIT_PORT
...
...
benchmark/cluster/vgg16/k8s_tools.py
已删除
100644 → 0
浏览文件 @
bd64719a
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#!/bin/env python
import
os
import
sys
import
time
import
socket
from
kubernetes
import
client
,
config
PADDLE_JOB_NAME
=
os
.
getenv
(
"PADDLE_JOB_NAME"
)
NAMESPACE
=
os
.
getenv
(
"NAMESPACE"
)
PORT
=
os
.
getenv
(
"PSERVER_PORT"
)
if
os
.
getenv
(
"KUBERNETES_SERVICE_HOST"
,
None
):
config
.
load_incluster_config
()
else
:
config
.
load_kube_config
()
v1
=
client
.
CoreV1Api
()
def
fetch_pods_info
(
label_selector
):
api_response
=
v1
.
list_namespaced_pod
(
namespace
=
NAMESPACE
,
pretty
=
True
,
label_selector
=
label_selector
)
pod_list
=
[]
for
item
in
api_response
.
items
:
pod_list
.
append
((
item
.
status
.
phase
,
item
.
status
.
pod_ip
))
return
pod_list
def
wait_pods_running
(
label_selector
,
desired
):
print
"label selector: %s, desired: %s"
%
(
label_selector
,
desired
)
while
True
:
count
=
count_pods_by_phase
(
label_selector
,
'Running'
)
# NOTE: pods may be scaled.
if
count
>=
int
(
desired
):
break
print
'current cnt: %d sleep for 5 seconds...'
%
count
time
.
sleep
(
5
)
def
count_pods_by_phase
(
label_selector
,
phase
):
pod_list
=
fetch_pods_info
(
label_selector
)
filtered_pod_list
=
filter
(
lambda
x
:
x
[
0
]
==
phase
,
pod_list
)
return
len
(
filtered_pod_list
)
def
fetch_pserver_ips
():
label_selector
=
"paddle-job-pserver=%s"
%
PADDLE_JOB_NAME
pod_list
=
fetch_pods_info
(
label_selector
)
pserver_ips
=
[
item
[
1
]
for
item
in
pod_list
]
return
","
.
join
(
pserver_ips
)
def
fetch_master_ip
():
label_selector
=
"paddle-job-master=%s"
%
PADDLE_JOB_NAME
pod_list
=
fetch_pods_info
(
label_selector
)
master_ips
=
[
item
[
1
]
for
item
in
pod_list
]
return
master_ips
[
0
]
def
fetch_trainer_id
():
label_selector
=
"paddle-job=%s"
%
PADDLE_JOB_NAME
pod_list
=
fetch_pods_info
(
label_selector
)
trainer_ips
=
[
item
[
1
]
for
item
in
pod_list
]
trainer_ips
.
sort
()
local_ip
=
socket
.
gethostbyname
(
socket
.
gethostname
())
for
i
in
xrange
(
len
(
trainer_ips
)):
if
trainer_ips
[
i
]
==
local_ip
:
return
i
return
None
if
__name__
==
"__main__"
:
command
=
sys
.
argv
[
1
]
if
command
==
"fetch_pserver_ips"
:
print
fetch_pserver_ips
()
elif
command
==
"fetch_trainer_id"
:
print
fetch_trainer_id
()
elif
command
==
"fetch_master_ip"
:
print
fetch_master_ip
()
elif
command
==
"count_pods_by_phase"
:
print
count_pods_by_phase
(
sys
.
argv
[
2
],
sys
.
argv
[
3
])
elif
command
==
"wait_pods_running"
:
wait_pods_running
(
sys
.
argv
[
2
],
sys
.
argv
[
3
])
benchmark/cluster/vgg16/paddle_k8s
已删除
100755 → 0
浏览文件 @
bd64719a
#!/bin/bash
start_pserver
()
{
stdbuf
-oL
paddle pserver
\
--use_gpu
=
0
\
--port
=
$PADDLE_INIT_PORT
\
--ports_num
=
$PADDLE_INIT_PORTS_NUM
\
--ports_num_for_sparse
=
$PADDLE_INIT_PORTS_NUM_FOR_SPARSE
\
--nics
=
$PADDLE_INIT_NICS
\
--comment
=
paddle_process_k8s
\
--num_gradient_servers
=
$PADDLE_INIT_NUM_GRADIENT_SERVERS
}
start_new_pserver
()
{
stdbuf
-oL
python /root/k8s_tools.py wait_pods_running paddle-job-master
=
${
PADDLE_JOB_NAME
}
1
export
MASTER_IP
=
$(
python /root/k8s_tools.py fetch_master_ip
)
stdbuf
-oL
/usr/bin/pserver
\
-port
=
$PADDLE_INIT_PORT
\
-num-pservers
=
$PSERVERS
\
-log-level
=
debug
\
-etcd-endpoint
=
http://
$MASTER_IP
:2379
}
start_master
()
{
stdbuf
-oL
/usr/bin/master
\
-port
=
8080
\
-chunk-per-task
=
1
\
-task-timout-dur
=
16s
\
-endpoints
=
http://127.0.0.1:2379
}
check_failed_cnt
()
{
max_failed
=
$1
failed_count
=
$(
python /root/k8s_tools.py count_pods_by_phase paddle-job
=
${
PADDLE_JOB_NAME
}
Failed
)
if
[
$failed_count
-gt
$max_failed
]
;
then
stdbuf
-oL
echo
"Failed trainer count beyond the threadhold: "
$max_failed
echo
"Failed trainer count beyond the threshold: "
$max_failed
>
/dev/termination-log
exit
0
fi
}
check_trainer_ret
()
{
ret
=
$1
stdbuf
-oL
echo
"job returned
$ret
...setting pod return message..."
stdbuf
-oL
echo
"==============================="
if
[
$ret
-eq
136
]
;
then
echo
"Error Arithmetic Operation(Floating Point Exception)"
>
/dev/termination-log
elif
[
$ret
-eq
139
]
;
then
echo
"Segmentation Fault"
>
/dev/termination-log
elif
[
$ret
-eq
1
]
;
then
echo
"General Error"
>
/dev/termination-log
elif
[
$ret
-eq
134
]
;
then
echo
"Program Abort"
>
/dev/termination-log
fi
stdbuf
-oL
echo
"termination log wroted..."
exit
$ret
}
start_fluid_process
()
{
stdbuf
-oL
python /root/k8s_tools.py wait_pods_running paddle-job-pserver
=
${
PADDLE_JOB_NAME
}
${
PSERVERS
}
if
[
"
${
TRAINING_ROLE
}
"
==
"TRAINER"
]
;
then
check_failed_cnt
${
TRAINERS
}
sleep
5
export
PADDLE_INIT_TRAINER_ID
=
$(
python /root/k8s_tools.py fetch_trainer_id
)
fi
export
PADDLE_INIT_PSERVERS
=
$(
python /root/k8s_tools.py fetch_pserver_ips
)
stdbuf
-oL
sh
-c
"
${
ENTRY
}
"
check_trainer_ret
$?
}
start_new_trainer
()
{
# FIXME(Yancey1989): use command-line interface to configure the max failed count
check_failed_cnt
${
TRAINERS
}
stdbuf
-oL
python /root/k8s_tools.py wait_pods_running paddle-job-pserver
=
${
PADDLE_JOB_NAME
}
${
PSERVERS
}
sleep
5
stdbuf
-oL
python /root/k8s_tools.py wait_pods_running paddle-job-master
=
${
PADDLE_JOB_NAME
}
1
export
MASTER_IP
=
$(
python /root/k8s_tools.py fetch_master_ip
)
export
ETCD_IP
=
"
$MASTER_IP
"
# NOTE: $TRAINER_PACKAGE may be large, do not copy
export
PYTHONPATH
=
$TRAINER_PACKAGE
:
$PYTHONPATH
cd
$TRAINER_PACKAGE
stdbuf
-oL
echo
"Starting training job: "
$TRAINER_PACKAGE
,
"num_gradient_servers:"
\
$PADDLE_INIT_NUM_GRADIENT_SERVERS
,
"version: "
$1
stdbuf
-oL
sh
-c
"
${
ENTRY
}
"
check_trainer_ret
$?
}
start_trainer
()
{
# paddle v1 and V2 distributed training does not allow any trainer failed.
check_failed_cnt 0
stdbuf
-oL
python /root/k8s_tools.py wait_pods_running paddle-job-pserver
=
${
PADDLE_JOB_NAME
}
${
PSERVERS
}
stdbuf
-oL
python /root/k8s_tools.py wait_pods_running paddle-job
=
${
PADDLE_JOB_NAME
}
${
TRAINERS
}
export
PADDLE_INIT_PSERVERS
=
$(
python /root/k8s_tools.py fetch_pserver_ips
)
export
PADDLE_INIT_TRAINER_ID
=
$(
python /root/k8s_tools.py fetch_trainer_id
)
stdbuf
-oL
echo
$PADDLE_INIT_TRAINER_ID
>
/trainer_id
# FIXME: /trainer_count = PADDLE_INIT_NUM_GRADIENT_SERVERS
stdbuf
-oL
echo
$PADDLE_INIT_NUM_GRADIENT_SERVERS
>
/trainer_count
# NOTE: $TRAINER_PACKAGE may be large, do not copy
export
PYTHONPATH
=
$TRAINER_PACKAGE
:
$PYTHONPATH
cd
$TRAINER_PACKAGE
stdbuf
-oL
echo
"Starting training job: "
$TRAINER_PACKAGE
,
"num_gradient_servers:"
\
$PADDLE_INIT_NUM_GRADIENT_SERVERS
,
"trainer_id: "
$PADDLE_INIT_TRAINER_ID
,
\
"version: "
$1
# FIXME: If we use the new PServer by Golang, add Kubernetes healthz
# to wait PServer process get ready.Now only sleep 20 seconds.
sleep
20
case
"
$1
"
in
"v1"
)
FILE_COUNT
=
$(
wc
-l
$TRAIN_LIST
|
awk
'{print $1}'
)
if
[
$FILE_COUNT
-le
$PADDLE_INIT_NUM_GRADIENT_SERVERS
]
;
then
echo
"file count less than trainers"
check_trainer_ret 0
fi
let
lines_per_node
=
"
$FILE_COUNT
/ (
$PADDLE_INIT_NUM_GRADIENT_SERVERS
+ 1)"
echo
"spliting file to"
$lines_per_node
cp
$TRAIN_LIST
/
cd
/
split
-l
$lines_per_node
-d
-a
3
$TRAIN_LIST
train.list
CURRENT_LIST
=
$(
printf
"train.list%03d"
$PADDLE_INIT_TRAINER_ID
)
# always use /train.list for paddle v1 for each node.
echo
"File for current node
${
CURRENT_LIST
}
"
sleep
10
cp
$CURRENT_LIST
train.list
cd
$TRAINER_PACKAGE
stdbuf
-oL
paddle train
\
--port
=
$PADDLE_INIT_PORT
\
--nics
=
$PADDLE_INIT_NICS
\
--ports_num
=
$PADDLE_INIT_PORTS_NUM
\
--ports_num_for_sparse
=
$PADDLE_INIT_PORTS_NUM_FOR_SPARSE
\
--num_passes
=
$PADDLE_INIT_NUM_PASSES
\
--trainer_count
=
$PADDLE_INIT_TRAINER_COUNT
\
--saving_period
=
1
\
--log_period
=
20
\
--local
=
0
\
--rdma_tcp
=
tcp
\
--config
=
$TOPOLOGY
\
--use_gpu
=
$PADDLE_INIT_USE_GPU
\
--trainer_id
=
$PADDLE_INIT_TRAINER_ID
\
--save_dir
=
$OUTPUT
\
--pservers
=
$PADDLE_INIT_PSERVERS
\
--num_gradient_servers
=
$PADDLE_INIT_NUM_GRADIENT_SERVERS
# paddle v1 API does not allow any trainer failed.
check_trainer_ret
$?
;;
"v2"
)
stdbuf
-oL
sh
-c
"
${
ENTRY
}
"
# paddle v2 API does not allow any trainer failed.
check_trainer_ret
$?
;;
*
)
;;
esac
}
usage
()
{
echo
"usage: paddle_k8s [<args>]:"
echo
" start_trainer [v1|v2] Start a trainer process with v1 or v2 API"
echo
" start_pserver Start a pserver process"
echo
" start_new_pserver Start a new pserver process"
echo
" start_new_trainer Start a new triner process"
}
case
"
$1
"
in
start_pserver
)
start_pserver
;;
start_trainer
)
start_trainer
$2
;;
start_new_trainer
)
start_new_trainer
;;
start_new_pserver
)
start_new_pserver
;;
start_master
)
start_master
;;
start_fluid
)
start_fluid_process
;;
--help
)
usage
;;
*
)
usage
;;
esac
benchmark/cluster/vgg16/reader.py
已删除
100644 → 0
浏览文件 @
bd64719a
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
paddle.v2
as
paddle
paddle
.
dataset
.
cifar
.
train10
()
benchmark/cluster/vgg16/v2_trainer.yaml
浏览文件 @
f9db5629
...
...
@@ -38,7 +38,7 @@ spec:
-
name
:
PADDLE_INIT_NICS
value
:
"
xgbe0"
-
name
:
PADDLE_INIT_TRAINER_COUNT
value
:
"
2
"
value
:
"
1
"
-
name
:
PADDLE_INIT_PORTS_NUM
value
:
"
1"
-
name
:
PADDLE_INIT_PORTS_NUM_FOR_SPARSE
...
...
benchmark/cluster/vgg16/vgg16_v2.py
浏览文件 @
f9db5629
...
...
@@ -51,7 +51,7 @@ def vgg(input, nums, class_dim):
conv4
=
conv_block
(
conv3
,
512
,
nums
[
3
])
conv5
=
conv_block
(
conv4
,
512
,
nums
[
4
])
fc_dim
=
4096
fc_dim
=
512
fc1
=
paddle
.
layer
.
fc
(
input
=
conv5
,
size
=
fc_dim
,
act
=
paddle
.
activation
.
Relu
(),
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录