Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
5316c647
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
5316c647
编写于
1月 18, 2019
作者:
T
Tao Luo
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
remove legacy cluster_train code
上级
eec133ca
变更
14
隐藏空白更改
内联
并排
Showing
14 changed file
with
0 addition
and
388 deletion
+0
-388
paddle/scripts/cluster_train/conf.py
paddle/scripts/cluster_train/conf.py
+0
-37
paddle/scripts/cluster_train/paddle.py
paddle/scripts/cluster_train/paddle.py
+0
-82
paddle/scripts/cluster_train/run.sh
paddle/scripts/cluster_train/run.sh
+0
-27
paddle/scripts/cluster_train_v2/fabric/conf.py
paddle/scripts/cluster_train_v2/fabric/conf.py
+0
-39
paddle/scripts/cluster_train_v2/fabric/docker_cluster/Dockerfile
...scripts/cluster_train_v2/fabric/docker_cluster/Dockerfile
+0
-11
paddle/scripts/cluster_train_v2/fabric/docker_cluster/ssh_servers.yaml
...s/cluster_train_v2/fabric/docker_cluster/ssh_servers.yaml
+0
-23
paddle/scripts/cluster_train_v2/fabric/run.sh
paddle/scripts/cluster_train_v2/fabric/run.sh
+0
-14
paddle/scripts/cluster_train_v2/openmpi/docker_cluster/Dockerfile
...cripts/cluster_train_v2/openmpi/docker_cluster/Dockerfile
+0
-43
paddle/scripts/cluster_train_v2/openmpi/docker_cluster/head.yaml
...scripts/cluster_train_v2/openmpi/docker_cluster/head.yaml
+0
-25
paddle/scripts/cluster_train_v2/openmpi/docker_cluster/mpi-nodes.yaml
...ts/cluster_train_v2/openmpi/docker_cluster/mpi-nodes.yaml
+0
-26
paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/config
...cripts/cluster_train_v2/openmpi/docker_cluster/ssh/config
+0
-1
paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi
...ts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi
+0
-27
paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi.pub
...luster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi.pub
+0
-1
paddle/scripts/cluster_train_v2/openmpi/start_mpi_train.sh
paddle/scripts/cluster_train_v2/openmpi/start_mpi_train.sh
+0
-32
未找到文件。
paddle/scripts/cluster_train/conf.py
已删除
100644 → 0
浏览文件 @
eec133ca
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Cluster hosts, written as user@address entries.
HOSTS = [
    "root@192.168.100.17",
    "root@192.168.100.18",
]

# ---- workspace configuration ----
# Root dir for the workspace; can be set to any directory owned by a real
# user account.
ROOT_DIR = "/home/paddle"

# ---- network configuration ----
# pserver NICs
PADDLE_NIC = "eth0"
# pserver port
PADDLE_PORT = 7164
# number of pserver ports
PADDLE_PORTS_NUM = 2
# number of pserver sparse ports
PADDLE_PORTS_NUM_FOR_SPARSE = 2

# Environment setting for all processes in the cluster job.
LD_LIBRARY_PATH = "/usr/local/cuda/lib64:/usr/lib64"
paddle/scripts/cluster_train/paddle.py
已删除
100644 → 0
浏览文件 @
eec133ca
#!/usr/bin/python
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" module for launching cluster job """
import
os
import
argparse
import
socket
import
copy
import
time
import
signal
from
fabric.api
import
run
,
put
,
settings
,
env
,
prefix
from
fabric.tasks
import
execute
#configuration for cluster
import
conf
def refine_unknown_args(cmd_args):
    """Normalize pass-through ``--key=value`` and ``--flag`` arguments.

    Args:
        cmd_args: iterable of raw command-line argument strings.

    Returns:
        A new list in which
        * each ``--key=value`` item contributes two entries: the key with its
          leading dashes removed, then the value;
        * each ``--flag`` item (no ``=``) contributes the flag with its
          leading dashes removed;
        * every other item is kept unchanged.
    """
    new_args = []
    for arg in cmd_args:
        if not arg.startswith("--"):
            new_args.append(arg)
            continue
        # Split on the first "=" only, so a value that itself contains "="
        # or spaces stays in one piece instead of being broken up.
        key, sep, value = arg.partition("=")
        new_args.append(key.lstrip("-"))
        if sep:
            new_args.append(value)
    return new_args
def kill_process():
    '''
    Kill the processes tagged with "paddle_process_by_paddle".

    Hands one shell pipeline to ``run`` (imported from ``fabric.api``):
    list processes with ``ps aux``, keep the lines containing the tag, drop
    the lines containing "grep", take the second column (the PID column of
    ``ps aux``) and pass it to ``kill``. Output of the final ``xargs kill``
    stage is discarded.
    '''
    run("ps aux "
        "| grep paddle_process_by_paddle "
        "| grep -v grep "
        "| awk '{print $2}' "
        "| xargs kill > /dev/null 2>&1")
def
job_prepare
(
jobdir
,
data
=
None
):
'''
prepare job related workspace data
Assuming you already installed PaddlePaddle in all nodes which means
PaddlePaddle related bins and dependencies libraries.
Assuming the train/test data have already been installed.
This function just prepare all related model and other resources
needed at runtime.
'''
def
job_create_workspace
(
jobdir
,
data
=
None
):
'''
prepare job workspace, common file, etc.
'''
log
=
os
.
path
.
join
(
jobdir
,
"log"
)
if
data
is
not
None
:
#create job dir
run
(
'rm '
+
jobdir
+
' -fr && '
+
'mkdir -p '
+
jobdir
)
#push data and paddle bin
paddle/scripts/cluster_train/run.sh
已删除
100644 → 0
浏览文件 @
eec133ca
#!/bin/sh
#python paddle.py \
# --job_workspace="${PATH_TO_REMOTE_EXISTED_WORKSPACE}" \
# --dot_period=10 \
# --ports_num_for_sparse=2 \
# --log_period=50 \
# --num_passes=10 \
# --trainer_count=4 \
# --saving_period=1 \
# --local=0 \
# --config=./trainer_config.py \
# --save_dir=./output \
# --use_gpu=0
# Invoke paddle.py with the job options below.
# PATH_TO_LOCAL_WORKSPACE is not set in this script; the caller must provide it.
python paddle.py \
  --job_dispatch_package="${PATH_TO_LOCAL_WORKSPACE}" \
  --dot_period=10 \
  --ports_num_for_sparse=2 \
  --log_period=50 \
  --num_passes=10 \
  --trainer_count=4 \
  --saving_period=1 \
  --local=0 \
  --config=./trainer_config.py \
  --save_dir=./output \
  --use_gpu=0
paddle/scripts/cluster_train_v2/fabric/conf.py
已删除
100644 → 0
浏览文件 @
eec133ca
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Cluster hosts, written as user@address entries.
HOSTS = [
    "root@10.1.9.7",
    "root@10.1.18.7",
    "root@10.1.32.9",
]

# ---- workspace configuration ----
# Root dir for the workspace; can be set to any directory owned by a real
# user account.
ROOT_DIR = "/root"

# ---- network configuration ----
# pserver NICs
PADDLE_NIC = "eth0"
# pserver port
PADDLE_PORT = 7164
# number of pserver ports
PADDLE_PORTS_NUM = 1
# number of pserver sparse ports
PADDLE_PORTS_NUM_FOR_SPARSE = 1

# Whether the trainer uses GPU. Note this is the string "False", not a bool.
PADDLE_USE_GPU = "False"

# Environment setting for all processes in the cluster job.
LD_LIBRARY_PATH = "/usr/local/cuda/lib64:/usr/lib64"
paddle/scripts/cluster_train_v2/fabric/docker_cluster/Dockerfile
已删除
100644 → 0
浏览文件 @
eec133ca
# Paddle base image with an SSH server added on top.
FROM docker.paddlepaddlehub.com/paddle:0.10.0rc2
RUN apt-get update && apt-get install -y openssh-server
RUN mkdir /var/run/sshd
# Sets the root password to "root".
RUN echo 'root:root' |chpasswd
# Allow root login over SSH and comment out UsePAM in sshd_config.
RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config

EXPOSE 22

# Run sshd in the foreground as the container's main process.
CMD ["/usr/sbin/sshd", "-D"]
paddle/scripts/cluster_train_v2/fabric/docker_cluster/ssh_servers.yaml
已删除
100644 → 0
浏览文件 @
eec133ca
# Deployment of three ssh-servers pods, each exposing container port 22.
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  name: ssh-servers
spec:
  replicas: 3
  template:
    metadata:
      labels:
        app: ssh-servers
    spec:
      containers:
      - name: ssh-servers
        image: docker.paddlepaddlehub.com/paddlessh
        resources:
          limits:
            cpu: 500m
            memory: 1Gi
          requests:
            cpu: 500m
            memory: 1Gi
        ports:
        - containerPort: 22
paddle/scripts/cluster_train_v2/fabric/run.sh
已删除
100644 → 0
浏览文件 @
eec133ca
#!/bin/bash
# Invoke paddle.py with the job options below.
python paddle.py \
  --job_dispatch_package="/root/wuyi/fabric_submit/workspace" \
  --dot_period=10 \
  --ports_num_for_sparse=1 \
  --log_period=50 \
  --num_passes=5 \
  --trainer_count=2 \
  --saving_period=1 \
  --local=0 \
  --config=./trainer_config.py \
  --save_dir=./output \
  --use_gpu=0
paddle/scripts/cluster_train_v2/openmpi/docker_cluster/Dockerfile
已删除
100644 → 0
浏览文件 @
eec133ca
# Build this image: docker build -t mpi .
#
FROM paddlepaddle/paddle:0.10.0rc3

ENV DEBIAN_FRONTEND noninteractive

# Install the SSH server, OpenMPI and Python packages, then configure sshd,
# set the root password to "tutorial" and create a password-less sudo user
# named "tutorial".
RUN apt-get update -y && \
    apt-get upgrade -y && \
    apt-get install -y openssh-server zip unzip vim sudo \
    gcc gfortran openmpi-checkpoint binutils wget curl git openmpi-bin openmpi-common libopenmpi-dev && \
    pip install mpi4py numpy virtualenv scipy matplotlib lxml sqlalchemy suds ipython obspy && \
    mkdir /var/run/sshd && \
    echo 'root:tutorial' | chpasswd && \
    sed -i 's/PermitRootLogin without-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \
    # SSH login fix. Otherwise user is kicked off after login
    sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd && \
    echo "export VISIBLE=now" >> /etc/profile && \
    adduser --disabled-password --gecos "" tutorial && \
    echo "tutorial ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers && \
    mkdir /home/tutorial/.ssh/

ENV HOME /home/tutorial
ENV NOTVISIBLE "in users profile"

# ------------------------------------------------------------
# Set-Up SSH with our Github deploy key
# ------------------------------------------------------------

ADD ssh/config /home/tutorial/.ssh/config
ADD ssh/id_rsa.mpi /home/tutorial/.ssh/id_rsa
ADD ssh/id_rsa.mpi.pub /home/tutorial/.ssh/id_rsa.pub
ADD ssh/id_rsa.mpi.pub /home/tutorial/.ssh/authorized_keys

#---------------------------------------------------------------
#LD_LIBRARY_PATH
#---------------------------------------------------------------

# NOTE(review): a variable exported inside RUN exists only for that build
# step; it is not set in later steps or in the running container. Use an ENV
# instruction instead if this value is meant to persist.
RUN export LD_LIBRARY_PATH=/usr/lib/openmpi/lib/

WORKDIR /home/tutorial

EXPOSE 22

# Run sshd in the foreground as the container's main process.
CMD ["/usr/sbin/sshd", "-D"]
paddle/scripts/cluster_train_v2/openmpi/docker_cluster/head.yaml
已删除
100644 → 0
浏览文件 @
eec133ca
# Deployment of a single mpi-header pod exposing container port 22.
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  name: mpi-header
  labels:
    app: mpi-header
spec:
  replicas: 1
  template:
    metadata:
      labels:
        app: mpi-header
    spec:
      containers:
      - image: typhoon1986/paddle-openmpi
        name: mpi-header
        resources:
          limits:
            cpu: 500m
            memory: 2Gi
          requests:
            cpu: 500m
            memory: 2Gi
        ports:
        - containerPort: 22
paddle/scripts/cluster_train_v2/openmpi/docker_cluster/mpi-nodes.yaml
已删除
100644 → 0
浏览文件 @
eec133ca
# Deployment of three mpi-nodes pods, each exposing container port 22.
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  name: mpi-nodes
  labels:
    app: mpi-nodes
spec:
  replicas: 3
  template:
    metadata:
      labels:
        app: mpi-nodes
    spec:
      containers:
      - image: typhoon1986/paddle-openmpi
        name: mpi-nodes
        resources:
          limits:
            cpu: 500m
            memory: 2Gi
          requests:
            cpu: 500m
            memory: 2Gi
        ports:
        - containerPort: 22
        imagePullPolicy: Always
paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/config
已删除
100644 → 0
浏览文件 @
eec133ca
StrictHostKeyChecking no
paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi
已删除
100644 → 0
浏览文件 @
eec133ca
-----BEGIN RSA PRIVATE KEY-----
MIIEogIBAAKCAQEA7PWLZmgdJ508dD15T6+xqGDvL9Ehzo9SgsnN6xJ+qpUvvOi4
1axW0AqR4MnPTg/uuvk+x4tUpuufOW4w22UTGjsdvmIVWa9ujLtcRiN3YPY+SU+Y
O5FfqKg7r/hBn+/GMcSoffwSs7vVgmhBBnp/mJh2O1cOAFZEe98/47mbg3/kHBAk
36NOQktaU3l48B38EhBTnjWfcEGm1HcTRPFxXV5Wiko6ZhKFEuHcTVKng4ROtUqE
mgHyI0aB7TAxg4na0ejItsYWEPWGeDOw6ms/4MwylxNosWzHFPW9p4zgLCLNr+b6
bDDfYKjXZflAuTQtQhLmJUwD9uuYLAijpSE2fQIDAQABAoIBADgcgRET8Gt0CV/B
OtvKz/f+VEVvcWD3gWNlJDTZIVOFllNWjIZUlA4ZoqenQkbK8Q4nfV1FOht4yjCQ
TlN1oMtiWk297i5Zo4UBzPzy4w774I39oh/g8dT/WXr2/5s+7SDV38xNh6Q2A34o
79T35wUcfUrZ93/O7dKjb/6d8hx2FMha0wVKqY4lmG1lQE3bbx3kakec0PdvU5kO
YHKlpqj3pMR7CpMa+4yL/iXFwWYmnK+uu+zw7JR7PwvH1CzrnvW438wjQ1QmYbSx
mHHOE89X67Lsl5hn81qYWBhpwAlBwi1qscsE0cV9GcFyKqWFqZsj5coM9u3CRfvy
lrWe1OUCgYEA+LBUFEd3Hxs4sFiYElJ8R9SAs1udaqPvAl01hTEijJLfYlMMVs/y
rgNN7j22zjDak2f8QdyMJZX7EZdRmdYcHO0csYOwbYvalzcnwk+U3mxmdD3r4xSo
DSvkJ70fogAqUlcVIg2re6fCmZVJQTvMQYTVEM8zQomJRt/Lb2esSfsCgYEA8+zv
44aToe8uqiDs4w8guRW7LCDkTw4z4IVo9JUibIaPjaAs5bZEBXSB43EEywXCR75H
fML0rU1PVvKh1rqcvZdVzm+XMWVr3asPk0sapaiHaTcmyZvJRDxxqbLFp0zRP1T6
cCtXNFdHWU4KiuKrUi6cDyOKchpfkSZa4seiT+cCgYB+n4FgBfdQPlMB70oW4irn
g/q32CjxuGCk6oKqu5bkzo+xB6obtavSEFqouIGQwO056tNVUY+GP7Rjg5GH663K
yKw4cl3tmS0Gm43B8TVSfw03mKO3rrfWZQe5eCFYIg9qd26KNT2gK435FzsCXQkm
PxUhhu6JrW/ZR2/U3Iur6wKBgADrWLAb1ryagSuE+j+U1AO+kDkHWrTtkcZ72jxp
v3p3O11GSEUJXdJDcSXhTCpTuDq6/dv7hB6PFwh126RKicKxKlKf2wsFndV1Cpb8
hnovW2tLGOtTmfuW2rrQAKyzvmolsNfxYd/BoHQ2thV16z1hDZeFA8WQUeHjKh6G
sBbrAoGATdtQlaUxx4izua6k02ihkxx/cRYwDl2N8UDvDBHokS7vJFMX8b8NpsGg
zMElnqSpu/pe/0UG7N2MtPF6uyMcX8AZzzcsRkiMkDvWJzYt8Jpf+Eyd/uryF+Yv
yrXaOEY83tm6x/fny5ZaZmk8lNth7bfWywuTMkZLX3fYpWtIeE4=
-----END RSA PRIVATE KEY-----
paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi.pub
已删除
100644 → 0
浏览文件 @
eec133ca
ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDs9YtmaB0nnTx0PXlPr7GoYO8v0SHOj1KCyc3rEn6qlS+86LjVrFbQCpHgyc9OD+66+T7Hi1Sm6585bjDbZRMaOx2+YhVZr26Mu1xGI3dg9j5JT5g7kV+oqDuv+EGf78YxxKh9/BKzu9WCaEEGen+YmHY7Vw4AVkR73z/juZuDf+QcECTfo05CS1pTeXjwHfwSEFOeNZ9wQabUdxNE8XFdXlaKSjpmEoUS4dxNUqeDhE61SoSaAfIjRoHtMDGDidrR6Mi2xhYQ9YZ4M7Dqaz/gzDKXE2ixbMcU9b2njOAsIs2v5vpsMN9gqNdl+UC5NC1CEuYlTAP265gsCKOlITZ9 oweidner@peahi
paddle/scripts/cluster_train_v2/openmpi/start_mpi_train.sh
已删除
100644 → 0
浏览文件 @
eec133ca
#!/bin/bash
# General training configurations
# These settings are exported because train.py (started below) reads its
# configuration from environment variables; a plain shell assignment would
# not be visible to that child process.
export NICS=eth0
export PADDLE_INIT_PORT=7164
export PADDLE_INIT_PORTS_NUM=1
export PADDLE_INIT_PORTS_NUM_FOR_SPARSE=1
# Join the lines of the "machines" file into one comma-separated list.
export PADDLE_INIT_PSERVERS="$(cat machines | sed -e ':a' -e 'N' -e '$!ba' -e 's/\n/,/g')"
export PADDLE_INIT_USE_GPU=False

export PADDLE_INIT_NUM_GRADIENT_SERVERS=${OMPI_COMM_WORLD_SIZE}
export PADDLE_INIT_TRAINER_ID=${OMPI_COMM_WORLD_RANK}
export PADDLE_CLUSTER_TRAIN=True

# Print the resulting environment.
env

# The redirections below write into logs/, so make sure it exists.
mkdir -p logs

# start pserver
stdbuf -oL nohup paddle pserver \
  --port=$PADDLE_INIT_PORT \
  --ports_num=$PADDLE_INIT_PORTS_NUM \
  --ports_num_for_sparse=$PADDLE_INIT_PORTS_NUM_FOR_SPARSE \
  --nics=$NICS \
  --comment=paddle_cluster_pserver \
  --num_gradient_servers=$PADDLE_INIT_NUM_GRADIENT_SERVERS \
  &> logs/pserver.log &

# start trainer
# NOTE: train.py will use the above environment variables as configuration
python train.py &> logs/train.log

# kill background pservers when train finishes
ps -ef | grep pserver | awk '{print $2}' | xargs kill
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录