Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
694bc64a
P
Paddle
项目概览
PaddlePaddle
/
Paddle
1 年多 前同步成功
通知
2309
Star
20932
Fork
5423
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
694bc64a
编写于
10月 19, 2017
作者:
D
dangqingqing
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'develop' of
https://github.com/PaddlePaddle/Paddle
into lstm
上级
17e33738
63ffe525
变更
20
展开全部
隐藏空白更改
内联
并排
Showing
20 changed file
with
959 addition
and
190 deletion
+959
-190
doc/design/cluster_train/src/trainer.graffle
doc/design/cluster_train/src/trainer.graffle
+0
-0
doc/howto/usage/cluster/cluster_train_cn.md
doc/howto/usage/cluster/cluster_train_cn.md
+221
-95
doc/howto/usage/cluster/cluster_train_en.md
doc/howto/usage/cluster/cluster_train_en.md
+232
-95
doc/howto/usage/cluster/src/trainer.png
doc/howto/usage/cluster/src/trainer.png
+0
-0
doc/howto/usage/cluster/src/trainer_cn.png
doc/howto/usage/cluster/src/trainer_cn.png
+0
-0
doc/howto/usage/cluster/src/word2vec/api_train_v2.py
doc/howto/usage/cluster/src/word2vec/api_train_v2.py
+100
-0
doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py
doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py
+123
-0
doc/howto/usage/cluster/src/word2vec/prepare.py
doc/howto/usage/cluster/src/word2vec/prepare.py
+41
-0
paddle/parameter/FirstOrderOptimizer.h
paddle/parameter/FirstOrderOptimizer.h
+4
-0
paddle/scripts/cluster_train_v2/fabric/conf.py
paddle/scripts/cluster_train_v2/fabric/conf.py
+39
-0
paddle/scripts/cluster_train_v2/fabric/docker_cluster/Dockerfile
...scripts/cluster_train_v2/fabric/docker_cluster/Dockerfile
+11
-0
paddle/scripts/cluster_train_v2/fabric/docker_cluster/ssh_servers.yaml
...s/cluster_train_v2/fabric/docker_cluster/ssh_servers.yaml
+23
-0
paddle/scripts/cluster_train_v2/fabric/run.sh
paddle/scripts/cluster_train_v2/fabric/run.sh
+14
-0
paddle/scripts/cluster_train_v2/openmpi/docker_cluster/Dockerfile
...cripts/cluster_train_v2/openmpi/docker_cluster/Dockerfile
+43
-0
paddle/scripts/cluster_train_v2/openmpi/docker_cluster/head.yaml
...scripts/cluster_train_v2/openmpi/docker_cluster/head.yaml
+25
-0
paddle/scripts/cluster_train_v2/openmpi/docker_cluster/mpi-nodes.yaml
...ts/cluster_train_v2/openmpi/docker_cluster/mpi-nodes.yaml
+26
-0
paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/config
...cripts/cluster_train_v2/openmpi/docker_cluster/ssh/config
+1
-0
paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi
...ts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi
+27
-0
paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi.pub
...luster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi.pub
+1
-0
paddle/scripts/cluster_train_v2/openmpi/start_mpi_train.sh
paddle/scripts/cluster_train_v2/openmpi/start_mpi_train.sh
+28
-0
未找到文件。
doc/design/cluster_train/src/trainer.graffle
浏览文件 @
694bc64a
无法预览此类型文件
doc/howto/usage/cluster/cluster_train_cn.md
浏览文件 @
694bc64a
此差异已折叠。
点击以展开。
doc/howto/usage/cluster/cluster_train_en.md
浏览文件 @
694bc64a
此差异已折叠。
点击以展开。
doc/howto/usage/cluster/src/trainer.png
0 → 100644
浏览文件 @
694bc64a
141.7 KB
doc/howto/usage/cluster/src/trainer_cn.png
0 → 100644
浏览文件 @
694bc64a
33.1 KB
doc/howto/usage/cluster/src/word2vec/api_train_v2.py
0 → 100644
浏览文件 @
694bc64a
import
gzip
import
math
import
paddle.v2
as
paddle
# Size of each word embedding (passed as `size` to the embedding layer).
embsize = 32
# Size of the hidden fully connected layer.
hiddensize = 256
# Second argument given to the imikolov train/test readers; presumably the
# n-gram length (main() builds five word data layers) -- TODO confirm.
N = 5
def wordemb(inlayer):
    """Return an embedding layer of width ``embsize`` over ``inlayer``.

    Every call attaches a parameter attribute named "_proj", so all layers
    built through this helper use the same parameter name.
    """
    proj_attr = paddle.attr.Param(
        name="_proj",
        initial_std=0.001,
        learning_rate=1,
        l2_rate=0,
        sparse_update=True)
    return paddle.layer.embedding(
        input=inlayer, size=embsize, param_attr=proj_attr)
def main():
    """Build the five-word language model and train it on imikolov.

    Four context words are embedded, concatenated and fed through a sigmoid
    hidden layer and a softmax output that predicts the fifth word. Every
    100 batches the parameters are written to ``batch-<id>.tar.gz`` and the
    model is evaluated on the imikolov test set.
    """
    # for local training
    cluster_train = False

    if cluster_train:
        paddle.init(
            use_gpu=False,
            trainer_count=2,
            port=7164,
            ports_num=1,
            ports_num_for_sparse=1,
            num_gradient_servers=1)
    else:
        paddle.init(use_gpu=False, trainer_count=1)

    word_dict = paddle.dataset.imikolov.build_dict()
    dict_size = len(word_dict)

    def word_input(layer_name):
        # One integer-valued data layer whose range is the dictionary size.
        return paddle.layer.data(
            name=layer_name, type=paddle.data_type.integer_value(dict_size))

    firstword = word_input("firstw")
    secondword = word_input("secondw")
    thirdword = word_input("thirdw")
    fourthword = word_input("fourthw")
    nextword = word_input("fifthw")

    # Embed the four context words in order, then join them.
    context_layers = [
        wordemb(word_layer)
        for word_layer in (firstword, secondword, thirdword, fourthword)
    ]
    contextemb = paddle.layer.concat(input=context_layers)

    hidden_act = paddle.activation.Sigmoid()
    hidden_extra = paddle.attr.Extra(drop_rate=0.5)
    hidden_bias = paddle.attr.Param(learning_rate=2)
    hidden_weights = paddle.attr.Param(
        initial_std=1. / math.sqrt(embsize * 8), learning_rate=1)
    hidden1 = paddle.layer.fc(
        input=contextemb,
        size=hiddensize,
        act=hidden_act,
        layer_attr=hidden_extra,
        bias_attr=hidden_bias,
        param_attr=hidden_weights)

    predictword = paddle.layer.fc(
        input=hidden1,
        size=dict_size,
        bias_attr=paddle.attr.Param(learning_rate=2),
        act=paddle.activation.Softmax())

    def event_handler(event):
        # Only react at the end of every 100th iteration.
        if not isinstance(event, paddle.event.EndIteration):
            return
        if event.batch_id % 100 != 0:
            return

        snapshot_name = "batch-" + str(event.batch_id) + ".tar.gz"
        with gzip.open(snapshot_name, 'w') as f:
            trainer.save_parameter_to_tar(f)

        test_batches = paddle.batch(
            paddle.dataset.imikolov.test(word_dict, N), 32)
        result = trainer.test(test_batches)
        # A single parenthesised argument prints the same text whether
        # `print` is a statement or a function.
        print("Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % (
            event.pass_id, event.batch_id, event.cost, event.metrics,
            result.metrics))

    cost = paddle.layer.classification_cost(input=predictword, label=nextword)
    parameters = paddle.parameters.create(cost)
    adagrad = paddle.optimizer.AdaGrad(
        learning_rate=3e-3,
        regularization=paddle.optimizer.L2Regularization(8e-4))
    # `trainer` is looked up by event_handler when events fire during train().
    trainer = paddle.trainer.SGD(
        cost, parameters, adagrad, is_local=not cluster_train)

    train_batches = paddle.batch(
        paddle.dataset.imikolov.train(word_dict, N), 32)
    trainer.train(
        train_batches, num_passes=30, event_handler=event_handler)


if __name__ == '__main__':
    main()
doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py
0 → 100644
浏览文件 @
694bc64a
import
math
import
os
import
paddle.v2
as
paddle
import
pickle
# Size of each word embedding (passed as `size` to the embedding layer).
embsize = 32
# Size of the hidden fully connected layer.
hiddensize = 256
N = 5

# Base paths of the sharded data; the reader appends "-<rank as 5 digits>".
cluster_train_file = "./train_data_dir/train/train.txt"
cluster_test_file = "./test_data_dir/test/test.txt"

# Rank of this process, read from the OMPI_COMM_WORLD_RANK environment
# variable; it selects which data shard this node reads.
node_id = os.getenv("OMPI_COMM_WORLD_RANK")
if not node_id:
    # Fail at import time: without a rank no shard can be chosen.
    raise EnvironmentError("must provide OMPI_COMM_WORLD_RANK")
def wordemb(inlayer):
    """Return an embedding layer of width ``embsize`` over ``inlayer``.

    Every call attaches a parameter attribute named "_proj", so all layers
    built through this helper use the same parameter name.
    """
    proj_attr = paddle.attr.Param(
        name="_proj",
        initial_std=0.001,
        learning_rate=1,
        l2_rate=0,
        sparse_update=True)
    return paddle.layer.embedding(
        input=inlayer, size=embsize, param_attr=proj_attr)
def cluster_reader_cluster(filename, node_id):
    """Create a reader over one node's shard of a comma-separated file.

    The shard path is ``filename`` joined with the zero-padded five-digit
    ``node_id`` by a dash, e.g. ``train.txt-00002``.

    Args:
        filename: Base path of the sharded data file.
        node_id: Shard index; any value accepted by ``int()``.

    Returns:
        A zero-argument generator function. Each call opens the shard and
        yields one tuple of ints per non-blank line.

    Raises:
        ValueError: While iterating, if ``node_id`` or a cell is not an
            integer.
        IOError: While iterating, if the shard file cannot be opened.
    """

    def cluster_reader():
        # The path is resolved lazily so errors surface when reading starts.
        shard_path = "-".join([filename, "%05d" % int(node_id)])
        with open(shard_path, "r") as f:
            for line in f:
                # A blank line (e.g. a stray trailing newline) carries no
                # record; skip it instead of failing on int("").
                if not line.strip():
                    continue
                yield tuple(int(cell) for cell in line.split(","))

    return cluster_reader
def main():
    """Configure Paddle from environment variables and train the model.

    Environment variables read here (defaults in parentheses):
        PADDLE_CLUSTER_TRAIN ("False"): cluster mode when the value is one
            of the spellings in ``TRUTH``.
        PADDLE_INIT_USE_GPU ("False"): forwarded unchanged, as a string.
        PADDLE_INIT_TRAINER_COUNT ("1")
        Cluster mode only: PADDLE_INIT_PORT ("7164"), PADDLE_INIT_PORTS_NUM
        ("1"), PADDLE_INIT_PORTS_NUM_FOR_SPARSE ("1"),
        PADDLE_INIT_NUM_GRADIENT_SERVERS ("1"), PADDLE_INIT_TRAINER_ID ("0"),
        PADDLE_INIT_PSERVERS ("127.0.0.1").

    Training and test records come from this node's shards of
    ``cluster_train_file`` and ``cluster_test_file``; the model is evaluated
    on the test shard every 100 batches.
    """
    # get arguments from env
    TRUTH = ["true", "True", "TRUE", "1", "yes", "Yes", "YES"]
    cluster_train = os.getenv('PADDLE_CLUSTER_TRAIN', "False") in TRUTH
    use_gpu = os.getenv('PADDLE_INIT_USE_GPU', "False")
    trainer_count = int(os.getenv("PADDLE_INIT_TRAINER_COUNT", "1"))

    if not cluster_train:
        # for local training
        paddle.init(use_gpu=use_gpu, trainer_count=trainer_count)
    else:
        paddle.init(
            use_gpu=use_gpu,
            trainer_count=trainer_count,
            port=int(os.getenv("PADDLE_INIT_PORT", "7164")),
            ports_num=int(os.getenv("PADDLE_INIT_PORTS_NUM", "1")),
            ports_num_for_sparse=int(
                os.getenv("PADDLE_INIT_PORTS_NUM_FOR_SPARSE", "1")),
            num_gradient_servers=int(
                os.getenv("PADDLE_INIT_NUM_GRADIENT_SERVERS", "1")),
            trainer_id=int(os.getenv("PADDLE_INIT_TRAINER_ID", "0")),
            pservers=os.getenv("PADDLE_INIT_PSERVERS", "127.0.0.1"))

    # The context manager closes the file even if unpickling fails.
    # NOTE: pickle.load can execute code embedded in the file; only load a
    # dictionary file from a trusted source.
    with open("thirdparty/wuyi_train_thdpty/word_dict.pickle", "r") as fn:
        word_dict = pickle.load(fn)
    dict_size = len(word_dict)

    firstword = paddle.layer.data(
        name="firstw", type=paddle.data_type.integer_value(dict_size))
    secondword = paddle.layer.data(
        name="secondw", type=paddle.data_type.integer_value(dict_size))
    thirdword = paddle.layer.data(
        name="thirdw", type=paddle.data_type.integer_value(dict_size))
    fourthword = paddle.layer.data(
        name="fourthw", type=paddle.data_type.integer_value(dict_size))
    nextword = paddle.layer.data(
        name="fifthw", type=paddle.data_type.integer_value(dict_size))

    Efirst = wordemb(firstword)
    Esecond = wordemb(secondword)
    Ethird = wordemb(thirdword)
    Efourth = wordemb(fourthword)

    contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth])
    hidden1 = paddle.layer.fc(
        input=contextemb,
        size=hiddensize,
        act=paddle.activation.Sigmoid(),
        layer_attr=paddle.attr.Extra(drop_rate=0.5),
        bias_attr=paddle.attr.Param(learning_rate=2),
        param_attr=paddle.attr.Param(
            initial_std=1. / math.sqrt(embsize * 8), learning_rate=1))
    predictword = paddle.layer.fc(
        input=hidden1,
        size=dict_size,
        bias_attr=paddle.attr.Param(learning_rate=2),
        act=paddle.activation.Softmax())

    def event_handler(event):
        # Evaluate on this node's test shard at the end of every 100th batch.
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % 100 == 0:
                result = trainer.test(
                    paddle.batch(
                        cluster_reader_cluster(cluster_test_file, node_id),
                        32))
                # A single parenthesised argument prints the same text
                # whether `print` is a statement or a function.
                print("Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % (
                    event.pass_id, event.batch_id, event.cost, event.metrics,
                    result.metrics))

    cost = paddle.layer.classification_cost(input=predictword, label=nextword)
    parameters = paddle.parameters.create(cost)
    adagrad = paddle.optimizer.AdaGrad(
        learning_rate=3e-3,
        regularization=paddle.optimizer.L2Regularization(8e-4))
    # `trainer` is looked up by event_handler when events fire during train().
    trainer = paddle.trainer.SGD(
        cost, parameters, adagrad, is_local=not cluster_train)
    trainer.train(
        paddle.batch(cluster_reader_cluster(cluster_train_file, node_id), 32),
        num_passes=30,
        event_handler=event_handler)


if __name__ == '__main__':
    main()
doc/howto/usage/cluster/src/word2vec/prepare.py
0 → 100644
浏览文件 @
694bc64a
import
paddle.v2
as
paddle
import
tarfile
import
os
import
pickle
# Passed as `split_count` when splitting train.txt and test.txt below.
SPLIT_COUNT = 3
# Second argument given to the imikolov train/test readers; presumably the
# n-gram length -- TODO confirm.
N = 5
def file_len(fd):
    """Return the number of lines yielded by iterating ``fd``.

    Args:
        fd: An open file object (or any iterable of lines). It is consumed
            by the call.

    Returns:
        The line count; 0 for empty input. Counting with ``sum`` avoids the
        unbound loop variable that an empty input would otherwise leave
        behind.
    """
    return sum(1 for _ in fd)
def split_from_reader_by_line(filename, reader, split_count):
    """Dump a reader to a CSV file and split that file into shards.

    Each record produced by ``reader()`` becomes one line of comma-joined
    ``str()`` values in ``filename``. The file is then cut by the external
    ``split`` command into pieces named ``<filename>-00000``,
    ``<filename>-00001``, ... of ``total_lines // split_count + 1`` lines.

    Args:
        filename: Path of the CSV file to write; also the shard name prefix.
        reader: Zero-argument callable returning an iterable of records,
            each an iterable of values.
        split_count: Divisor used to derive the number of lines per shard.
    """
    total_line_count = 0
    # The context manager closes the file even if the reader raises.
    with open(filename, "w") as out:
        for batch_data in reader():
            out.write(",".join(str(d) for d in batch_data))
            out.write("\n")
            # Count lines while writing so the file need not be re-read.
            total_line_count += 1

    # Floor division keeps the shard size an integer on Python 2 and 3.
    per_file_lines = total_line_count // split_count + 1
    cmd = "split -d -a 5 -l %d %s %s-" % (per_file_lines, filename, filename)
    os.system(cmd)
# Build the imikolov dictionary and pickle it to word_dict.pickle.
word_dict = paddle.dataset.imikolov.build_dict()
with open("word_dict.pickle", "w") as dict_f:
    pickle.dump(word_dict, dict_f)

# Dump and shard the training set first, then the test set. Each reader is
# created just before its file is written.
for _out_name, _make_reader in (("train.txt", paddle.dataset.imikolov.train),
                                ("test.txt", paddle.dataset.imikolov.test)):
    split_from_reader_by_line(_out_name, _make_reader(word_dict, N),
                              SPLIT_COUNT)
paddle/parameter/FirstOrderOptimizer.h
浏览文件 @
694bc64a
...
...
@@ -265,6 +265,10 @@ public:
addParameterType
(
PARAMETER_SECOND_MOMENTUM
);
}
// Refreshes learningRate_ by calling calcLearningRate() with the given
// sample count and the current pass_.
virtual void startBatch(int64_t numSamplesProcessed) {
  learningRate_ = calcLearningRate(numSamplesProcessed, pass_);
}
// Increments the step counter step_ by one.
virtual void finishBatch() { ++step_; }
virtual
void
update
(
const
VectorPtr
vecs
[],
...
...
paddle/scripts/cluster_train_v2/fabric/conf.py
0 → 100644
浏览文件 @
694bc64a
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Cluster nodes as "user@host" entries (presumably the SSH targets used by
# the fabric launcher -- confirm against paddle.py).
HOSTS = [
    "root@10.1.9.7",
    "root@10.1.18.7",
    "root@10.1.32.9",
]
'''
workspace configuration
'''
# Root dir for the workspace; can be set to any directory owned by a real
# user account.
ROOT_DIR = "/root"
'''
network configuration
'''
# Network interface used by the pservers.
PADDLE_NIC = "eth0"
# pserver port.
PADDLE_PORT = 7164
# Number of pserver ports.
PADDLE_PORTS_NUM = 1
# Number of pserver ports for sparse updates.
PADDLE_PORTS_NUM_FOR_SPARSE = 1
# Whether the trainers use the GPU (kept as a string).
PADDLE_USE_GPU = "False"
# Environment setting applied to all processes in the cluster job.
LD_LIBRARY_PATH = "/usr/local/cuda/lib64:/usr/lib64"
paddle/scripts/cluster_train_v2/fabric/docker_cluster/Dockerfile
0 → 100644
浏览文件 @
694bc64a
# Paddle 0.10.0rc2 image with an OpenSSH server added.
FROM docker.paddlepaddlehub.com/paddle:0.10.0rc2
RUN apt-get update && apt-get install -y openssh-server
# Runtime directory for sshd.
RUN mkdir /var/run/sshd
# Sets the root password to "root".
# NOTE(review): acceptable only for a throwaway demo cluster.
RUN echo 'root:root' |chpasswd
# Permit root logins over SSH and comment out "UsePAM yes" in sshd_config.
RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
EXPOSE 22
# Run sshd in the foreground (-D) as the container command.
CMD    ["/usr/sbin/sshd", "-D"]
paddle/scripts/cluster_train_v2/fabric/docker_cluster/ssh_servers.yaml
0 → 100644
浏览文件 @
694bc64a
apiVersion
:
extensions/v1beta1
kind
:
Deployment
metadata
:
name
:
ssh-servers
spec
:
replicas
:
3
template
:
metadata
:
labels
:
app
:
ssh-servers
spec
:
containers
:
-
name
:
ssh-servers
image
:
docker.paddlepaddlehub.com/paddlessh
resources
:
limits
:
cpu
:
500m
memory
:
1Gi
requests
:
cpu
:
500m
memory
:
1Gi
ports
:
-
containerPort
:
22
paddle/scripts/cluster_train_v2/fabric/run.sh
0 → 100644
浏览文件 @
694bc64a
#!/bin/bash
python paddle.py
\
--job_dispatch_package
=
"/root/wuyi/fabric_submit/workspace"
\
--dot_period
=
10
\
--ports_num_for_sparse
=
1
\
--log_period
=
50
\
--num_passes
=
5
\
--trainer_count
=
2
\
--saving_period
=
1
\
--local
=
0
\
--config
=
./trainer_config.py
\
--save_dir
=
./output
\
--use_gpu
=
0
paddle/scripts/cluster_train_v2/openmpi/docker_cluster/Dockerfile
0 → 100644
浏览文件 @
694bc64a
# Build this image:  docker build -t mpi .
#

FROM paddledev/paddle:0.10.0rc3

ENV DEBIAN_FRONTEND noninteractive

# One layer that: upgrades packages; installs the SSH server, compilers and
# OpenMPI plus Python packages; sets the root password to "tutorial" and
# permits root SSH login; creates a passwordless-sudo user "tutorial".
RUN apt-get update -y && \
    apt-get upgrade -y && \
    apt-get install -y openssh-server zip unzip vim sudo \
gcc gfortran openmpi-checkpoint binutils wget curl git openmpi-bin openmpi-common libopenmpi-dev && \
pip install mpi4py numpy virtualenv scipy matplotlib lxml sqlalchemy suds ipython obspy && \
mkdir /var/run/sshd && \
echo 'root:tutorial' | chpasswd && \
sed -i 's/PermitRootLogin without-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \
# SSH login fix. Otherwise user is kicked off after login
sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd && \
echo "export VISIBLE=now" >> /etc/profile && \
adduser --disabled-password --gecos "" tutorial && \
echo "tutorial ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers && \
mkdir /home/tutorial/.ssh/

ENV HOME /home/tutorial
ENV NOTVISIBLE "in users profile"

# ------------------------------------------------------------
# Set-Up SSH with our Github deploy key
# ------------------------------------------------------------
# The same public key is installed as authorized_keys, so containers built
# from this image can SSH into each other as "tutorial".
# NOTE(review): the private key is baked into the image; use only for
# disposable test clusters.

ADD ssh/config /home/tutorial/.ssh/config
ADD ssh/id_rsa.mpi /home/tutorial/.ssh/id_rsa
ADD ssh/id_rsa.mpi.pub /home/tutorial/.ssh/id_rsa.pub
ADD ssh/id_rsa.mpi.pub /home/tutorial/.ssh/authorized_keys

#---------------------------------------------------------------
#LD_LIBRARY_PATH
#---------------------------------------------------------------
# NOTE(review): `export` inside RUN only affects the shell of this build
# step; the variable is not stored in the image. If it is meant to persist,
# an ENV instruction is needed -- confirm it would not mask a value set by
# the base image.

RUN export LD_LIBRARY_PATH=/usr/lib/openmpi/lib/

WORKDIR /home/tutorial
EXPOSE 22
# Run sshd in the foreground (-D) as the container command.
CMD ["/usr/sbin/sshd", "-D"]
paddle/scripts/cluster_train_v2/openmpi/docker_cluster/head.yaml
0 → 100644
浏览文件 @
694bc64a
apiVersion
:
extensions/v1beta1
kind
:
Deployment
metadata
:
name
:
mpi-header
labels
:
app
:
mpi-header
spec
:
replicas
:
1
template
:
metadata
:
labels
:
app
:
mpi-header
spec
:
containers
:
-
image
:
typhoon1986/paddle-openmpi
name
:
mpi-header
resources
:
limits
:
cpu
:
500m
memory
:
2Gi
requests
:
cpu
:
500m
memory
:
2Gi
ports
:
-
containerPort
:
22
paddle/scripts/cluster_train_v2/openmpi/docker_cluster/mpi-nodes.yaml
0 → 100644
浏览文件 @
694bc64a
apiVersion
:
extensions/v1beta1
kind
:
Deployment
metadata
:
name
:
mpi-nodes
labels
:
app
:
mpi-nodes
spec
:
replicas
:
3
template
:
metadata
:
labels
:
app
:
mpi-nodes
spec
:
containers
:
-
image
:
typhoon1986/paddle-openmpi
name
:
mpi-nodes
resources
:
limits
:
cpu
:
500m
memory
:
2Gi
requests
:
cpu
:
500m
memory
:
2Gi
ports
:
-
containerPort
:
22
imagePullPolicy
:
Always
paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/config
0 → 100644
浏览文件 @
694bc64a
StrictHostKeyChecking no
paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi
0 → 100644
浏览文件 @
694bc64a
-----BEGIN RSA PRIVATE KEY-----
MIIEogIBAAKCAQEA7PWLZmgdJ508dD15T6+xqGDvL9Ehzo9SgsnN6xJ+qpUvvOi4
1axW0AqR4MnPTg/uuvk+x4tUpuufOW4w22UTGjsdvmIVWa9ujLtcRiN3YPY+SU+Y
O5FfqKg7r/hBn+/GMcSoffwSs7vVgmhBBnp/mJh2O1cOAFZEe98/47mbg3/kHBAk
36NOQktaU3l48B38EhBTnjWfcEGm1HcTRPFxXV5Wiko6ZhKFEuHcTVKng4ROtUqE
mgHyI0aB7TAxg4na0ejItsYWEPWGeDOw6ms/4MwylxNosWzHFPW9p4zgLCLNr+b6
bDDfYKjXZflAuTQtQhLmJUwD9uuYLAijpSE2fQIDAQABAoIBADgcgRET8Gt0CV/B
OtvKz/f+VEVvcWD3gWNlJDTZIVOFllNWjIZUlA4ZoqenQkbK8Q4nfV1FOht4yjCQ
TlN1oMtiWk297i5Zo4UBzPzy4w774I39oh/g8dT/WXr2/5s+7SDV38xNh6Q2A34o
79T35wUcfUrZ93/O7dKjb/6d8hx2FMha0wVKqY4lmG1lQE3bbx3kakec0PdvU5kO
YHKlpqj3pMR7CpMa+4yL/iXFwWYmnK+uu+zw7JR7PwvH1CzrnvW438wjQ1QmYbSx
mHHOE89X67Lsl5hn81qYWBhpwAlBwi1qscsE0cV9GcFyKqWFqZsj5coM9u3CRfvy
lrWe1OUCgYEA+LBUFEd3Hxs4sFiYElJ8R9SAs1udaqPvAl01hTEijJLfYlMMVs/y
rgNN7j22zjDak2f8QdyMJZX7EZdRmdYcHO0csYOwbYvalzcnwk+U3mxmdD3r4xSo
DSvkJ70fogAqUlcVIg2re6fCmZVJQTvMQYTVEM8zQomJRt/Lb2esSfsCgYEA8+zv
44aToe8uqiDs4w8guRW7LCDkTw4z4IVo9JUibIaPjaAs5bZEBXSB43EEywXCR75H
fML0rU1PVvKh1rqcvZdVzm+XMWVr3asPk0sapaiHaTcmyZvJRDxxqbLFp0zRP1T6
cCtXNFdHWU4KiuKrUi6cDyOKchpfkSZa4seiT+cCgYB+n4FgBfdQPlMB70oW4irn
g/q32CjxuGCk6oKqu5bkzo+xB6obtavSEFqouIGQwO056tNVUY+GP7Rjg5GH663K
yKw4cl3tmS0Gm43B8TVSfw03mKO3rrfWZQe5eCFYIg9qd26KNT2gK435FzsCXQkm
PxUhhu6JrW/ZR2/U3Iur6wKBgADrWLAb1ryagSuE+j+U1AO+kDkHWrTtkcZ72jxp
v3p3O11GSEUJXdJDcSXhTCpTuDq6/dv7hB6PFwh126RKicKxKlKf2wsFndV1Cpb8
hnovW2tLGOtTmfuW2rrQAKyzvmolsNfxYd/BoHQ2thV16z1hDZeFA8WQUeHjKh6G
sBbrAoGATdtQlaUxx4izua6k02ihkxx/cRYwDl2N8UDvDBHokS7vJFMX8b8NpsGg
zMElnqSpu/pe/0UG7N2MtPF6uyMcX8AZzzcsRkiMkDvWJzYt8Jpf+Eyd/uryF+Yv
yrXaOEY83tm6x/fny5ZaZmk8lNth7bfWywuTMkZLX3fYpWtIeE4=
-----END RSA PRIVATE KEY-----
paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi.pub
0 → 100644
浏览文件 @
694bc64a
ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDs9YtmaB0nnTx0PXlPr7GoYO8v0SHOj1KCyc3rEn6qlS+86LjVrFbQCpHgyc9OD+66+T7Hi1Sm6585bjDbZRMaOx2+YhVZr26Mu1xGI3dg9j5JT5g7kV+oqDuv+EGf78YxxKh9/BKzu9WCaEEGen+YmHY7Vw4AVkR73z/juZuDf+QcECTfo05CS1pTeXjwHfwSEFOeNZ9wQabUdxNE8XFdXlaKSjpmEoUS4dxNUqeDhE61SoSaAfIjRoHtMDGDidrR6Mi2xhYQ9YZ4M7Dqaz/gzDKXE2ixbMcU9b2njOAsIs2v5vpsMN9gqNdl+UC5NC1CEuYlTAP265gsCKOlITZ9 oweidner@peahi
paddle/scripts/cluster_train_v2/openmpi/start_mpi_train.sh
0 → 100644
浏览文件 @
694bc64a
#!/bin/bash
# General training configurations.
#
# Settings that train.py reads from its environment are exported: a plain
# shell assignment is not inherited by child processes, so without `export`
# the trainer would silently fall back to its built-in defaults.

# Network interface passed to the pserver (used only by this script).
NICS=eth0
export PADDLE_INIT_PORT=7164
export PADDLE_INIT_PORTS_NUM=1
export PADDLE_INIT_PORTS_NUM_FOR_SPARSE=1
# Join the lines of ./machines into a single comma-separated list.
export PADDLE_INIT_PSERVERS=$(cat machines | sed -e ':a' -e 'N' -e '$!ba' -e 's/\n/,/g')
export PADDLE_INIT_USE_GPU=False

# OMPI_COMM_WORLD_SIZE / OMPI_COMM_WORLD_RANK are expected in the
# environment this script is launched from.
export PADDLE_INIT_NUM_GRADIENT_SERVERS=${OMPI_COMM_WORLD_SIZE}
export PADDLE_INIT_TRAINER_ID=${OMPI_COMM_WORLD_RANK}
export PADDLE_CLUSTER_TRAIN=True

# Print the environment so the effective configuration is visible.
env

# Both redirections below write into logs/; make sure it exists.
mkdir -p logs

# start pserver
stdbuf -oL nohup paddle pserver --port=$PADDLE_INIT_PORT --ports_num=$PADDLE_INIT_PORTS_NUM \
  --ports_num_for_sparse=$PADDLE_INIT_PORTS_NUM_FOR_SPARSE --nics=$NICS \
  --comment=paddle_cluster_pserver \
  --num_gradient_servers=$PADDLE_INIT_NUM_GRADIENT_SERVERS &> logs/pserver.log &

# start trainer
# NOTE: train.py will use the above environment variables as configuration
python train.py &> logs/train.log

# kill background pservers when train finishes
ps -ef | grep pserver | awk '{print $2}' | xargs kill
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录