Commit f9db5629
Authored on Jan 31, 2018 by typhoonzero

update results

Parent: bd64719a

Showing 8 changed files with 11 additions and 322 deletions (+11 -322)
benchmark/cluster/vgg16/Dockerfile          +7   -9
benchmark/cluster/vgg16/README.md           +1   -1
benchmark/cluster/vgg16/fluid_trainer.yaml  +1   -1
benchmark/cluster/vgg16/k8s_tools.py        +0   -94
benchmark/cluster/vgg16/paddle_k8s          +0   -199
benchmark/cluster/vgg16/reader.py           +0   -16
benchmark/cluster/vgg16/v2_trainer.yaml     +1   -1
benchmark/cluster/vgg16/vgg16_v2.py         +1   -1
benchmark/cluster/vgg16/Dockerfile (view file @ f9db5629)

-#FROM paddlepaddle/paddlecloud-job
-#RUN mkdir -p /workspace
-#ADD reader.py /workspace/
-#RUN python /workspace/reader.py
 FROM python:2.7.14
-ADD paddle_k8s /usr/bin
-ADD k8s_tools.py /root
-RUN pip install -U kubernetes opencv-python && apt-get update -y && apt-get install -y iputils-ping libgtk2.0-dev
+ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
+ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
+RUN pip install -U kubernetes opencv-python && apt-get update -y && apt-get install -y iputils-ping libgtk2.0-dev && \
+    chmod +x /usr/bin/paddle_k8s
+# NOTE: By default CI built wheel packages turn WITH_DISTRIBUTE=OFF,
+# so we must build one with distribute support to install in this image.
 ADD *.whl /
 RUN pip install /*.whl && rm -f /*.whl
 ENV LD_LIBRARY_PATH=/usr/local/lib
-ADD reader.py /workspace/
-RUN python /workspace/reader.py
+RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python'
 ADD vgg16_fluid.py vgg16_v2.py /workspace/
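Because the image installs a locally built wheel (ADD *.whl /), a wheel compiled with WITH_DISTRIBUTE=ON has to sit next to the Dockerfile before building. A minimal sketch of the build flow, where the wheel output path and the image tag are assumptions, not part of this commit:

# copy a distribute-enabled PaddlePaddle wheel next to the Dockerfile
# (the build-output path below is an assumption)
cp /paddle/build/python/dist/paddlepaddle-*.whl .
# build and push the benchmark image; the registry/tag is a placeholder
docker build -t <your-registry>/vgg16-dist-benchmark:latest .
docker push <your-registry>/vgg16-dist-benchmark:latest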
benchmark/cluster/vgg16/README.md (view file @ f9db5629)

@@ -43,7 +43,7 @@
 | Trainer Counter | 20 | 40 | 80 | 100 |
 | -- | -- | -- | -- | -- |
 | PaddlePaddle Fluid | 291.06 | 518.80 | 836.26 | 1019.29 |
-| PaddlePaddle v2 | 356.28 | - | - | 1041.99 |
+| PaddlePaddle v2 (need more tests) | 356.28 | 785.39 | 853.30 | 1041.99 |
 | TensorFlow | - | - | - | - |

 ### different pserver number
benchmark/cluster/vgg16/fluid_trainer.yaml (view file @ f9db5629)

@@ -30,7 +30,7 @@ spec:
         - name: TOPOLOGY
           value: ""
         - name: ENTRY
-          value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 256"
+          value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 128"
         - name: TRAINER_PACKAGE
           value: "/workspace"
         - name: PADDLE_INIT_PORT
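The ENTRY string is what paddle_k8s ultimately executes via sh -c "${ENTRY}" inside each trainer pod. A minimal sketch of submitting the Fluid job, assuming a configured kubectl context; the pserver manifest file name and the job label value are assumptions:

kubectl create -f fluid_pserver.yaml      # pserver manifest name is an assumption
kubectl create -f fluid_trainer.yaml
kubectl get pods -l paddle-job=vgg16job   # label value follows the job name set in the manifests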
benchmark/cluster/vgg16/k8s_tools.py (deleted, mode 100644 → 0, view file @ bd64719a)

# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#!/bin/env python
import os
import sys
import time
import socket
from kubernetes import client, config

PADDLE_JOB_NAME = os.getenv("PADDLE_JOB_NAME")
NAMESPACE = os.getenv("NAMESPACE")
PORT = os.getenv("PSERVER_PORT")
if os.getenv("KUBERNETES_SERVICE_HOST", None):
    config.load_incluster_config()
else:
    config.load_kube_config()
v1 = client.CoreV1Api()


def fetch_pods_info(label_selector):
    api_response = v1.list_namespaced_pod(
        namespace=NAMESPACE, pretty=True, label_selector=label_selector)
    pod_list = []
    for item in api_response.items:
        pod_list.append((item.status.phase, item.status.pod_ip))
    return pod_list


def wait_pods_running(label_selector, desired):
    print "label selector: %s, desired: %s" % (label_selector, desired)
    while True:
        count = count_pods_by_phase(label_selector, 'Running')
        # NOTE: pods may be scaled.
        if count >= int(desired):
            break
        print 'current cnt: %d sleep for 5 seconds...' % count
        time.sleep(5)


def count_pods_by_phase(label_selector, phase):
    pod_list = fetch_pods_info(label_selector)
    filtered_pod_list = filter(lambda x: x[0] == phase, pod_list)
    return len(filtered_pod_list)


def fetch_pserver_ips():
    label_selector = "paddle-job-pserver=%s" % PADDLE_JOB_NAME
    pod_list = fetch_pods_info(label_selector)
    pserver_ips = [item[1] for item in pod_list]
    return ",".join(pserver_ips)


def fetch_master_ip():
    label_selector = "paddle-job-master=%s" % PADDLE_JOB_NAME
    pod_list = fetch_pods_info(label_selector)
    master_ips = [item[1] for item in pod_list]
    return master_ips[0]


def fetch_trainer_id():
    label_selector = "paddle-job=%s" % PADDLE_JOB_NAME
    pod_list = fetch_pods_info(label_selector)
    trainer_ips = [item[1] for item in pod_list]
    trainer_ips.sort()
    local_ip = socket.gethostbyname(socket.gethostname())
    for i in xrange(len(trainer_ips)):
        if trainer_ips[i] == local_ip:
            return i
    return None


if __name__ == "__main__":
    command = sys.argv[1]
    if command == "fetch_pserver_ips":
        print fetch_pserver_ips()
    elif command == "fetch_trainer_id":
        print fetch_trainer_id()
    elif command == "fetch_master_ip":
        print fetch_master_ip()
    elif command == "count_pods_by_phase":
        print count_pods_by_phase(sys.argv[2], sys.argv[3])
    elif command == "wait_pods_running":
        wait_pods_running(sys.argv[2], sys.argv[3])
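The module doubles as a small one-shot CLI; the paddle_k8s startup script below invokes it exactly this way, with the environment variables injected by the job spec:

# wait until all pservers of this job report phase Running
python /root/k8s_tools.py wait_pods_running paddle-job-pserver=${PADDLE_JOB_NAME} ${PSERVERS}
# export the comma-separated pserver IP list for the trainers
export PADDLE_INIT_PSERVERS=$(python /root/k8s_tools.py fetch_pserver_ips)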
benchmark/cluster/vgg16/paddle_k8s (deleted, mode 100755 → 0, view file @ bd64719a)

#!/bin/bash
start_pserver() {
  stdbuf -oL paddle pserver \
    --use_gpu=0 \
    --port=$PADDLE_INIT_PORT \
    --ports_num=$PADDLE_INIT_PORTS_NUM \
    --ports_num_for_sparse=$PADDLE_INIT_PORTS_NUM_FOR_SPARSE \
    --nics=$PADDLE_INIT_NICS \
    --comment=paddle_process_k8s \
    --num_gradient_servers=$PADDLE_INIT_NUM_GRADIENT_SERVERS
}

start_new_pserver() {
  stdbuf -oL python /root/k8s_tools.py wait_pods_running paddle-job-master=${PADDLE_JOB_NAME} 1
  export MASTER_IP=$(python /root/k8s_tools.py fetch_master_ip)
  stdbuf -oL /usr/bin/pserver \
    -port=$PADDLE_INIT_PORT \
    -num-pservers=$PSERVERS \
    -log-level=debug \
    -etcd-endpoint=http://$MASTER_IP:2379
}

start_master() {
  stdbuf -oL /usr/bin/master \
    -port=8080 \
    -chunk-per-task=1 \
    -task-timout-dur=16s \
    -endpoints=http://127.0.0.1:2379
}

check_failed_cnt() {
  max_failed=$1
  failed_count=$(python /root/k8s_tools.py count_pods_by_phase paddle-job=${PADDLE_JOB_NAME} Failed)
  if [ $failed_count -gt $max_failed ]; then
    stdbuf -oL echo "Failed trainer count beyond the threshold: " $max_failed
    echo "Failed trainer count beyond the threshold: " $max_failed > /dev/termination-log
    exit 0
  fi
}

check_trainer_ret() {
  ret=$1
  stdbuf -oL echo "job returned $ret...setting pod return message..."
  stdbuf -oL echo "==============================="
  if [ $ret -eq 136 ]; then
    echo "Error Arithmetic Operation(Floating Point Exception)" > /dev/termination-log
  elif [ $ret -eq 139 ]; then
    echo "Segmentation Fault" > /dev/termination-log
  elif [ $ret -eq 1 ]; then
    echo "General Error" > /dev/termination-log
  elif [ $ret -eq 134 ]; then
    echo "Program Abort" > /dev/termination-log
  fi
  stdbuf -oL echo "termination log written..."
  exit $ret
}

start_fluid_process() {
  stdbuf -oL python /root/k8s_tools.py wait_pods_running paddle-job-pserver=${PADDLE_JOB_NAME} ${PSERVERS}
  if [ "${TRAINING_ROLE}" == "TRAINER" ]; then
    check_failed_cnt ${TRAINERS}
    sleep 5
    export PADDLE_INIT_TRAINER_ID=$(python /root/k8s_tools.py fetch_trainer_id)
  fi
  export PADDLE_INIT_PSERVERS=$(python /root/k8s_tools.py fetch_pserver_ips)
  stdbuf -oL sh -c "${ENTRY}"
  check_trainer_ret $?
}

start_new_trainer() {
  # FIXME(Yancey1989): use command-line interface to configure the max failed count
  check_failed_cnt ${TRAINERS}
  stdbuf -oL python /root/k8s_tools.py wait_pods_running paddle-job-pserver=${PADDLE_JOB_NAME} ${PSERVERS}
  sleep 5
  stdbuf -oL python /root/k8s_tools.py wait_pods_running paddle-job-master=${PADDLE_JOB_NAME} 1
  export MASTER_IP=$(python /root/k8s_tools.py fetch_master_ip)
  export ETCD_IP="$MASTER_IP"

  # NOTE: $TRAINER_PACKAGE may be large, do not copy
  export PYTHONPATH=$TRAINER_PACKAGE:$PYTHONPATH
  cd $TRAINER_PACKAGE

  stdbuf -oL echo "Starting training job: " $TRAINER_PACKAGE, "num_gradient_servers:" \
    $PADDLE_INIT_NUM_GRADIENT_SERVERS, "version: " $1
  stdbuf -oL sh -c "${ENTRY}"
  check_trainer_ret $?
}

start_trainer() {
  # paddle v1 and v2 distributed training do not allow any failed trainer.
  check_failed_cnt 0
  stdbuf -oL python /root/k8s_tools.py wait_pods_running paddle-job-pserver=${PADDLE_JOB_NAME} ${PSERVERS}
  stdbuf -oL python /root/k8s_tools.py wait_pods_running paddle-job=${PADDLE_JOB_NAME} ${TRAINERS}

  export PADDLE_INIT_PSERVERS=$(python /root/k8s_tools.py fetch_pserver_ips)
  export PADDLE_INIT_TRAINER_ID=$(python /root/k8s_tools.py fetch_trainer_id)
  stdbuf -oL echo $PADDLE_INIT_TRAINER_ID > /trainer_id
  # FIXME: /trainer_count = PADDLE_INIT_NUM_GRADIENT_SERVERS
  stdbuf -oL echo $PADDLE_INIT_NUM_GRADIENT_SERVERS > /trainer_count

  # NOTE: $TRAINER_PACKAGE may be large, do not copy
  export PYTHONPATH=$TRAINER_PACKAGE:$PYTHONPATH
  cd $TRAINER_PACKAGE

  stdbuf -oL echo "Starting training job: " $TRAINER_PACKAGE, "num_gradient_servers:" \
    $PADDLE_INIT_NUM_GRADIENT_SERVERS, "trainer_id: " $PADDLE_INIT_TRAINER_ID, \
    "version: " $1

  # FIXME: If we use the new PServer by Golang, add a Kubernetes healthz probe
  # to wait for the PServer process to get ready. For now we only sleep 20 seconds.
  sleep 20

  case "$1" in
    "v1")
      FILE_COUNT=$(wc -l $TRAIN_LIST | awk '{print $1}')
      if [ $FILE_COUNT -le $PADDLE_INIT_NUM_GRADIENT_SERVERS ]; then
        echo "file count less than trainers"
        check_trainer_ret 0
      fi
      let lines_per_node="$FILE_COUNT / ($PADDLE_INIT_NUM_GRADIENT_SERVERS + 1)"
      echo "splitting file to" $lines_per_node
      cp $TRAIN_LIST /
      cd /
      split -l $lines_per_node -d -a 3 $TRAIN_LIST train.list
      CURRENT_LIST=$(printf "train.list%03d" $PADDLE_INIT_TRAINER_ID)
      # always use /train.list for paddle v1 for each node.
      echo "File for current node ${CURRENT_LIST}"
      sleep 10
      cp $CURRENT_LIST train.list

      cd $TRAINER_PACKAGE

      stdbuf -oL paddle train \
        --port=$PADDLE_INIT_PORT \
        --nics=$PADDLE_INIT_NICS \
        --ports_num=$PADDLE_INIT_PORTS_NUM \
        --ports_num_for_sparse=$PADDLE_INIT_PORTS_NUM_FOR_SPARSE \
        --num_passes=$PADDLE_INIT_NUM_PASSES \
        --trainer_count=$PADDLE_INIT_TRAINER_COUNT \
        --saving_period=1 \
        --log_period=20 \
        --local=0 \
        --rdma_tcp=tcp \
        --config=$TOPOLOGY \
        --use_gpu=$PADDLE_INIT_USE_GPU \
        --trainer_id=$PADDLE_INIT_TRAINER_ID \
        --save_dir=$OUTPUT \
        --pservers=$PADDLE_INIT_PSERVERS \
        --num_gradient_servers=$PADDLE_INIT_NUM_GRADIENT_SERVERS
      # paddle v1 API does not allow any failed trainer.
      check_trainer_ret $?
      ;;
    "v2")
      stdbuf -oL sh -c "${ENTRY}"
      # paddle v2 API does not allow any failed trainer.
      check_trainer_ret $?
      ;;
    *)
      ;;
  esac
}

usage() {
  echo "usage: paddle_k8s [<args>]:"
  echo "  start_trainer [v1|v2]    Start a trainer process with v1 or v2 API"
  echo "  start_pserver            Start a pserver process"
  echo "  start_new_pserver        Start a new pserver process"
  echo "  start_new_trainer        Start a new trainer process"
}

case "$1" in
  start_pserver)
    start_pserver
    ;;
  start_trainer)
    start_trainer $2
    ;;
  start_new_trainer)
    start_new_trainer
    ;;
  start_new_pserver)
    start_new_pserver
    ;;
  start_master)
    start_master
    ;;
  start_fluid)
    start_fluid_process
    ;;
  --help)
    usage
    ;;
  *)
    usage
    ;;
esac
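The job manifests are expected to use this script as the container entry point; the sub-commands mirror the case statement above:

paddle_k8s start_pserver        # launch a v1/v2 parameter server
paddle_k8s start_trainer v2     # launch a trainer with the v2 API (v1 also supported)
paddle_k8s start_fluid          # launch a Fluid process, which runs ${ENTRY}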
benchmark/cluster/vgg16/reader.py (deleted, mode 100644 → 0, view file @ bd64719a)

# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.v2 as paddle

# Calling train10() downloads and caches the CIFAR-10 training set,
# which is why the Dockerfile ran this file at image build time.
paddle.dataset.cifar.train10()
benchmark/cluster/vgg16/v2_trainer.yaml (view file @ f9db5629)

@@ -38,7 +38,7 @@ spec:
         - name: PADDLE_INIT_NICS
           value: "xgbe0"
         - name: PADDLE_INIT_TRAINER_COUNT
-          value: "2"
+          value: "1"
         - name: PADDLE_INIT_PORTS_NUM
           value: "1"
         - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
benchmark/cluster/vgg16/vgg16_v2.py (view file @ f9db5629)

@@ -51,7 +51,7 @@ def vgg(input, nums, class_dim):
     conv4 = conv_block(conv3, 512, nums[3])
     conv5 = conv_block(conv4, 512, nums[4])

-    fc_dim = 4096
+    fc_dim = 512
     fc1 = paddle.layer.fc(input=conv5,
                           size=fc_dim,
                           act=paddle.activation.Relu(),