Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
694bc64a
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
694bc64a
编写于
10月 19, 2017
作者:
D
dangqingqing
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'develop' of
https://github.com/PaddlePaddle/Paddle
into lstm
上级
17e33738
63ffe525
变更
20
展开全部
隐藏空白更改
内联
并排
Showing
20 changed files
with
959 additions
and
190 deletions
+959
-190
doc/design/cluster_train/src/trainer.graffle
doc/design/cluster_train/src/trainer.graffle
+0
-0
doc/howto/usage/cluster/cluster_train_cn.md
doc/howto/usage/cluster/cluster_train_cn.md
+221
-95
doc/howto/usage/cluster/cluster_train_en.md
doc/howto/usage/cluster/cluster_train_en.md
+232
-95
doc/howto/usage/cluster/src/trainer.png
doc/howto/usage/cluster/src/trainer.png
+0
-0
doc/howto/usage/cluster/src/trainer_cn.png
doc/howto/usage/cluster/src/trainer_cn.png
+0
-0
doc/howto/usage/cluster/src/word2vec/api_train_v2.py
doc/howto/usage/cluster/src/word2vec/api_train_v2.py
+100
-0
doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py
doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py
+123
-0
doc/howto/usage/cluster/src/word2vec/prepare.py
doc/howto/usage/cluster/src/word2vec/prepare.py
+41
-0
paddle/parameter/FirstOrderOptimizer.h
paddle/parameter/FirstOrderOptimizer.h
+4
-0
paddle/scripts/cluster_train_v2/fabric/conf.py
paddle/scripts/cluster_train_v2/fabric/conf.py
+39
-0
paddle/scripts/cluster_train_v2/fabric/docker_cluster/Dockerfile
...scripts/cluster_train_v2/fabric/docker_cluster/Dockerfile
+11
-0
paddle/scripts/cluster_train_v2/fabric/docker_cluster/ssh_servers.yaml
...s/cluster_train_v2/fabric/docker_cluster/ssh_servers.yaml
+23
-0
paddle/scripts/cluster_train_v2/fabric/run.sh
paddle/scripts/cluster_train_v2/fabric/run.sh
+14
-0
paddle/scripts/cluster_train_v2/openmpi/docker_cluster/Dockerfile
...cripts/cluster_train_v2/openmpi/docker_cluster/Dockerfile
+43
-0
paddle/scripts/cluster_train_v2/openmpi/docker_cluster/head.yaml
...scripts/cluster_train_v2/openmpi/docker_cluster/head.yaml
+25
-0
paddle/scripts/cluster_train_v2/openmpi/docker_cluster/mpi-nodes.yaml
...ts/cluster_train_v2/openmpi/docker_cluster/mpi-nodes.yaml
+26
-0
paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/config
...cripts/cluster_train_v2/openmpi/docker_cluster/ssh/config
+1
-0
paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi
...ts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi
+27
-0
paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi.pub
...luster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi.pub
+1
-0
paddle/scripts/cluster_train_v2/openmpi/start_mpi_train.sh
paddle/scripts/cluster_train_v2/openmpi/start_mpi_train.sh
+28
-0
未找到文件。
doc/design/cluster_train/src/trainer.graffle
浏览文件 @
694bc64a
无法预览此类型文件
doc/howto/usage/cluster/cluster_train_cn.md
浏览文件 @
694bc64a
此差异已折叠。
点击以展开。
doc/howto/usage/cluster/cluster_train_en.md
浏览文件 @
694bc64a
此差异已折叠。
点击以展开。
doc/howto/usage/cluster/src/trainer.png
0 → 100644
浏览文件 @
694bc64a
141.7 KB
doc/howto/usage/cluster/src/trainer_cn.png
0 → 100644
浏览文件 @
694bc64a
33.1 KB
doc/howto/usage/cluster/src/word2vec/api_train_v2.py
0 → 100644
浏览文件 @
694bc64a
import
gzip
import
math
import
paddle.v2
as
paddle
embsize
=
32
hiddensize
=
256
N
=
5
def wordemb(inlayer):
    """Attach a word-embedding layer to *inlayer*.

    Every call shares the single parameter named "_proj", so all n-gram
    positions look words up in the same embedding table.  Sparse updates
    keep gradient traffic small for large vocabularies.
    """
    shared_table = paddle.attr.Param(
        name="_proj",
        initial_std=0.001,
        learning_rate=1,
        l2_rate=0,
        sparse_update=True)
    return paddle.layer.embedding(
        input=inlayer, size=embsize, param_attr=shared_table)
def main():
    """Train a 5-gram word2vec model on the imikolov dataset.

    Four context words predict the fifth.  Flip ``cluster_train`` to
    True to initialize PaddlePaddle for the hard-coded two-trainer
    cluster instead of local single-trainer training.
    """
    # for local training
    cluster_train = False

    if not cluster_train:
        paddle.init(use_gpu=False, trainer_count=1)
    else:
        paddle.init(
            use_gpu=False,
            trainer_count=2,
            port=7164,
            ports_num=1,
            ports_num_for_sparse=1,
            num_gradient_servers=1)
    word_dict = paddle.dataset.imikolov.build_dict()
    dict_size = len(word_dict)

    # One integer-valued input layer per n-gram slot; the first four are
    # the context, "fifthw" is the prediction target.
    def _word_input(name):
        return paddle.layer.data(
            name=name, type=paddle.data_type.integer_value(dict_size))

    firstword = _word_input("firstw")
    secondword = _word_input("secondw")
    thirdword = _word_input("thirdw")
    fourthword = _word_input("fourthw")
    nextword = _word_input("fifthw")

    Efirst = wordemb(firstword)
    Esecond = wordemb(secondword)
    Ethird = wordemb(thirdword)
    Efourth = wordemb(fourthword)

    contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth])
    hidden1 = paddle.layer.fc(
        input=contextemb,
        size=hiddensize,
        act=paddle.activation.Sigmoid(),
        layer_attr=paddle.attr.Extra(drop_rate=0.5),
        bias_attr=paddle.attr.Param(learning_rate=2),
        param_attr=paddle.attr.Param(
            initial_std=1. / math.sqrt(embsize * 8), learning_rate=1))
    predictword = paddle.layer.fc(
        input=hidden1,
        size=dict_size,
        bias_attr=paddle.attr.Param(learning_rate=2),
        act=paddle.activation.Softmax())

    def event_handler(event):
        # Every 100 batches: checkpoint parameters and report test metrics.
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % 100 == 0:
                with gzip.open("batch-" + str(event.batch_id) + ".tar.gz",
                               'w') as f:
                    trainer.save_parameter_to_tar(f)
                result = trainer.test(
                    paddle.batch(
                        paddle.dataset.imikolov.test(word_dict, N), 32))
                # print() keeps this runnable under both Python 2 and 3
                # (the original used a Python-2-only print statement).
                print("Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" %
                      (event.pass_id, event.batch_id, event.cost,
                       event.metrics, result.metrics))

    cost = paddle.layer.classification_cost(input=predictword, label=nextword)
    parameters = paddle.parameters.create(cost)
    adagrad = paddle.optimizer.AdaGrad(
        learning_rate=3e-3,
        regularization=paddle.optimizer.L2Regularization(8e-4))
    trainer = paddle.trainer.SGD(cost, parameters, adagrad,
                                 is_local=not cluster_train)
    trainer.train(
        paddle.batch(paddle.dataset.imikolov.train(word_dict, N), 32),
        num_passes=30,
        event_handler=event_handler)
if
__name__
==
'__main__'
:
main
()
doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py
0 → 100644
浏览文件 @
694bc64a
import
math
import
os
import
paddle.v2
as
paddle
import
pickle
embsize
=
32
hiddensize
=
256
N
=
5
cluster_train_file
=
"./train_data_dir/train/train.txt"
cluster_test_file
=
"./test_data_dir/test/test.txt"
node_id
=
os
.
getenv
(
"OMPI_COMM_WORLD_RANK"
)
if
not
node_id
:
raise
EnvironmentError
(
"must provied OMPI_COMM_WORLD_RANK"
)
def wordemb(inlayer):
    """Return an embedding layer over *inlayer*.

    The parameter is named "_proj" so repeated calls reuse one shared
    embedding table; sparse_update avoids dense gradient exchange for
    the large vocabulary.
    """
    projection = paddle.attr.Param(
        name="_proj",
        initial_std=0.001,
        learning_rate=1,
        l2_rate=0,
        sparse_update=True)
    return paddle.layer.embedding(
        input=inlayer, size=embsize, param_attr=projection)
def cluster_reader_cluster(filename, node_id):
    """Build a reader over this node's shard of a pre-split CSV file.

    The shard read is ``<filename>-<node_id padded to 5 digits>`` (the
    naming produced by ``split -d -a 5``).  Each line is a comma-joined
    record of integers; the reader yields one tuple of ints per line.
    """

    def cluster_reader():
        shard_path = "-".join([filename, "%05d" % int(node_id)])
        with open(shard_path, "r") as shard:
            for line in shard:
                # int() tolerates the trailing newline on the last field.
                yield tuple(int(field) for field in line.split(","))

    return cluster_reader
def main():
    """Train the 5-gram word2vec model with cluster-aware configuration.

    All runtime settings come from PADDLE_* environment variables; the
    word dict is loaded from a pre-distributed pickle and the data is
    read from this node's shard via cluster_reader_cluster.
    """
    # get arguments from env
    # for local training
    TRUTH = ["true", "True", "TRUE", "1", "yes", "Yes", "YES"]
    cluster_train = os.getenv('PADDLE_CLUSTER_TRAIN', "False") in TRUTH
    # BUG FIX: the raw env string used to be passed straight to
    # paddle.init(use_gpu=...); any non-empty string — including the
    # default "False" — is truthy.  Normalize to a real bool the same
    # way cluster_train is.
    use_gpu = os.getenv('PADDLE_INIT_USE_GPU', "False") in TRUTH
    if not cluster_train:
        paddle.init(
            use_gpu=use_gpu,
            trainer_count=int(os.getenv("PADDLE_INIT_TRAINER_COUNT", "1")))
    else:
        paddle.init(
            use_gpu=use_gpu,
            trainer_count=int(os.getenv("PADDLE_INIT_TRAINER_COUNT", "1")),
            port=int(os.getenv("PADDLE_INIT_PORT", "7164")),
            ports_num=int(os.getenv("PADDLE_INIT_PORTS_NUM", "1")),
            ports_num_for_sparse=int(
                os.getenv("PADDLE_INIT_PORTS_NUM_FOR_SPARSE", "1")),
            num_gradient_servers=int(
                os.getenv("PADDLE_INIT_NUM_GRADIENT_SERVERS", "1")),
            trainer_id=int(os.getenv("PADDLE_INIT_TRAINER_ID", "0")),
            pservers=os.getenv("PADDLE_INIT_PSERVERS", "127.0.0.1"))
    # `with` guarantees the dict file is closed even if unpickling fails.
    with open("thirdparty/wuyi_train_thdpty/word_dict.pickle", "r") as fn:
        word_dict = pickle.load(fn)
    dict_size = len(word_dict)

    # One integer-valued input layer per n-gram slot; the first four are
    # the context, "fifthw" is the prediction target.
    def _word_input(name):
        return paddle.layer.data(
            name=name, type=paddle.data_type.integer_value(dict_size))

    firstword = _word_input("firstw")
    secondword = _word_input("secondw")
    thirdword = _word_input("thirdw")
    fourthword = _word_input("fourthw")
    nextword = _word_input("fifthw")

    Efirst = wordemb(firstword)
    Esecond = wordemb(secondword)
    Ethird = wordemb(thirdword)
    Efourth = wordemb(fourthword)

    contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth])
    hidden1 = paddle.layer.fc(
        input=contextemb,
        size=hiddensize,
        act=paddle.activation.Sigmoid(),
        layer_attr=paddle.attr.Extra(drop_rate=0.5),
        bias_attr=paddle.attr.Param(learning_rate=2),
        param_attr=paddle.attr.Param(
            initial_std=1. / math.sqrt(embsize * 8), learning_rate=1))
    predictword = paddle.layer.fc(
        input=hidden1,
        size=dict_size,
        bias_attr=paddle.attr.Param(learning_rate=2),
        act=paddle.activation.Softmax())

    def event_handler(event):
        # Every 100 batches, evaluate on this node's test shard.
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % 100 == 0:
                result = trainer.test(
                    paddle.batch(
                        cluster_reader_cluster(cluster_test_file, node_id),
                        32))
                # print() keeps this runnable under both Python 2 and 3
                # (the original used a Python-2-only print statement).
                print("Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" %
                      (event.pass_id, event.batch_id, event.cost,
                       event.metrics, result.metrics))

    cost = paddle.layer.classification_cost(input=predictword, label=nextword)
    parameters = paddle.parameters.create(cost)
    adagrad = paddle.optimizer.AdaGrad(
        learning_rate=3e-3,
        regularization=paddle.optimizer.L2Regularization(8e-4))
    trainer = paddle.trainer.SGD(cost, parameters, adagrad,
                                 is_local=not cluster_train)
    trainer.train(
        paddle.batch(cluster_reader_cluster(cluster_train_file, node_id), 32),
        num_passes=30,
        event_handler=event_handler)
if
__name__
==
'__main__'
:
main
()
doc/howto/usage/cluster/src/word2vec/prepare.py
0 → 100644
浏览文件 @
694bc64a
import
paddle.v2
as
paddle
import
tarfile
import
os
import
pickle
SPLIT_COUNT
=
3
N
=
5
def file_len(fd):
    """Return the number of lines (items) yielded by iterating *fd*.

    Consumes the iterable.  BUG FIX: the original did
    ``for i, l in enumerate(fd): pass; return i + 1``, which raised
    NameError on an empty file because ``i`` was never bound; this
    version correctly returns 0.
    """
    return sum(1 for _ in fd)
def split_from_reader_by_line(filename, reader, split_count):
    """Materialize *reader* into *filename* as CSV, then shard it.

    Each sample from ``reader()`` is written as one comma-joined line,
    then the file is cut into *split_count* shards named
    ``<filename>-00000``, ``<filename>-00001``, ... via the POSIX
    ``split`` utility (so this requires a Unix-like host).

    Improvements over the original: lines are counted while writing
    instead of re-reading the whole file afterwards, and ``with``
    replaces bare open/close so the handle is released on error.
    """
    with open(filename, "w") as out:
        total_line_count = 0
        for sample in reader():
            out.write(",".join(str(field) for field in sample))
            out.write("\n")
            total_line_count += 1
    # // preserves the original Python 2 integer-division semantics.
    per_file_lines = total_line_count // split_count + 1
    cmd = "split -d -a 5 -l %d %s %s-" % (per_file_lines, filename, filename)
    os.system(cmd)
word_dict
=
paddle
.
dataset
.
imikolov
.
build_dict
()
with
open
(
"word_dict.pickle"
,
"w"
)
as
dict_f
:
pickle
.
dump
(
word_dict
,
dict_f
)
split_from_reader_by_line
(
"train.txt"
,
paddle
.
dataset
.
imikolov
.
train
(
word_dict
,
N
),
SPLIT_COUNT
)
split_from_reader_by_line
(
"test.txt"
,
paddle
.
dataset
.
imikolov
.
test
(
word_dict
,
N
),
SPLIT_COUNT
)
paddle/parameter/FirstOrderOptimizer.h
浏览文件 @
694bc64a
...
...
@@ -265,6 +265,10 @@ public:
addParameterType
(
PARAMETER_SECOND_MOMENTUM
);
}
virtual
void
startBatch
(
int64_t
numSamplesProcessed
)
{
learningRate_
=
calcLearningRate
(
numSamplesProcessed
,
pass_
);
}
virtual
void
finishBatch
()
{
++
step_
;
}
virtual
void
update
(
const
VectorPtr
vecs
[],
...
...
paddle/scripts/cluster_train_v2/fabric/conf.py
0 → 100644
浏览文件 @
694bc64a
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
HOSTS
=
[
"root@10.1.9.7"
,
"root@10.1.18.7"
,
"root@10.1.32.9"
,
]
'''
workspace configuration
'''
#root dir for workspace, can be set as any director with real user account
ROOT_DIR
=
"/root"
'''
network configuration
'''
#pserver nics
PADDLE_NIC
=
"eth0"
#pserver port
PADDLE_PORT
=
7164
#pserver ports num
PADDLE_PORTS_NUM
=
1
#pserver sparse ports num
PADDLE_PORTS_NUM_FOR_SPARSE
=
1
#trainer whether use gpu
PADDLE_USE_GPU
=
"False"
#environments setting for all processes in cluster job
LD_LIBRARY_PATH
=
"/usr/local/cuda/lib64:/usr/lib64"
paddle/scripts/cluster_train_v2/fabric/docker_cluster/Dockerfile
0 → 100644
浏览文件 @
694bc64a
FROM
docker.paddlepaddlehub.com/paddle:0.10.0rc2
RUN
apt-get update
&&
apt-get
install
-y
openssh-server
RUN
mkdir
/var/run/sshd
RUN
echo
'root:root'
|chpasswd
RUN
sed
-ri
's/^PermitRootLogin\s+.*/PermitRootLogin yes/'
/etc/ssh/sshd_config
RUN
sed
-ri
's/UsePAM yes/#UsePAM yes/g'
/etc/ssh/sshd_config
EXPOSE
22
CMD
["/usr/sbin/sshd", "-D"]
paddle/scripts/cluster_train_v2/fabric/docker_cluster/ssh_servers.yaml
0 → 100644
浏览文件 @
694bc64a
apiVersion
:
extensions/v1beta1
kind
:
Deployment
metadata
:
name
:
ssh-servers
spec
:
replicas
:
3
template
:
metadata
:
labels
:
app
:
ssh-servers
spec
:
containers
:
-
name
:
ssh-servers
image
:
docker.paddlepaddlehub.com/paddlessh
resources
:
limits
:
cpu
:
500m
memory
:
1Gi
requests
:
cpu
:
500m
memory
:
1Gi
ports
:
-
containerPort
:
22
paddle/scripts/cluster_train_v2/fabric/run.sh
0 → 100644
浏览文件 @
694bc64a
#!/bin/bash
python paddle.py
\
--job_dispatch_package
=
"/root/wuyi/fabric_submit/workspace"
\
--dot_period
=
10
\
--ports_num_for_sparse
=
1
\
--log_period
=
50
\
--num_passes
=
5
\
--trainer_count
=
2
\
--saving_period
=
1
\
--local
=
0
\
--config
=
./trainer_config.py
\
--save_dir
=
./output
\
--use_gpu
=
0
paddle/scripts/cluster_train_v2/openmpi/docker_cluster/Dockerfile
0 → 100644
浏览文件 @
694bc64a
# Build this image: docker build -t mpi .
#
FROM
paddledev/paddle:0.10.0rc3
ENV
DEBIAN_FRONTEND noninteractive
RUN
apt-get update
-y
&&
\
apt-get upgrade
-y
&&
\
apt-get
install
-y
openssh-server zip unzip vim
sudo
\
gcc gfortran openmpi-checkpoint binutils wget curl git openmpi-bin openmpi-common libopenmpi-dev
&&
\
pip
install
mpi4py numpy virtualenv scipy matplotlib lxml sqlalchemy suds ipython obspy
&&
\
mkdir
/var/run/sshd
&&
\
echo
'root:tutorial'
| chpasswd
&&
\
sed
-i
's/PermitRootLogin without-password/PermitRootLogin yes/'
/etc/ssh/sshd_config
&&
\
# SSH login fix. Otherwise user is kicked off after login
sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd && \
echo "export VISIBLE=now" >> /etc/profile && \
adduser --disabled-password --gecos "" tutorial && \
echo "tutorial ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers && \
mkdir /home/tutorial/.ssh/
ENV
HOME /home/tutorial
ENV
NOTVISIBLE "in users profile"
# ------------------------------------------------------------
# Set-Up SSH with our Github deploy key
# ------------------------------------------------------------
ADD
ssh/config /home/tutorial/.ssh/config
ADD
ssh/id_rsa.mpi /home/tutorial/.ssh/id_rsa
ADD
ssh/id_rsa.mpi.pub /home/tutorial/.ssh/id_rsa.pub
ADD
ssh/id_rsa.mpi.pub /home/tutorial/.ssh/authorized_keys
#---------------------------------------------------------------
#LD_LIBRARY_PATH
#---------------------------------------------------------------
RUN
export
LD_LIBRARY_PATH
=
/usr/lib/openmpi/lib/
WORKDIR
/home/tutorial
EXPOSE
22
CMD
["/usr/sbin/sshd", "-D"]
paddle/scripts/cluster_train_v2/openmpi/docker_cluster/head.yaml
0 → 100644
浏览文件 @
694bc64a
apiVersion
:
extensions/v1beta1
kind
:
Deployment
metadata
:
name
:
mpi-header
labels
:
app
:
mpi-header
spec
:
replicas
:
1
template
:
metadata
:
labels
:
app
:
mpi-header
spec
:
containers
:
-
image
:
typhoon1986/paddle-openmpi
name
:
mpi-header
resources
:
limits
:
cpu
:
500m
memory
:
2Gi
requests
:
cpu
:
500m
memory
:
2Gi
ports
:
-
containerPort
:
22
paddle/scripts/cluster_train_v2/openmpi/docker_cluster/mpi-nodes.yaml
0 → 100644
浏览文件 @
694bc64a
apiVersion
:
extensions/v1beta1
kind
:
Deployment
metadata
:
name
:
mpi-nodes
labels
:
app
:
mpi-nodes
spec
:
replicas
:
3
template
:
metadata
:
labels
:
app
:
mpi-nodes
spec
:
containers
:
-
image
:
typhoon1986/paddle-openmpi
name
:
mpi-nodes
resources
:
limits
:
cpu
:
500m
memory
:
2Gi
requests
:
cpu
:
500m
memory
:
2Gi
ports
:
-
containerPort
:
22
imagePullPolicy
:
Always
paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/config
0 → 100644
浏览文件 @
694bc64a
StrictHostKeyChecking no
paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi
0 → 100644
浏览文件 @
694bc64a
-----BEGIN RSA PRIVATE KEY-----
MIIEogIBAAKCAQEA7PWLZmgdJ508dD15T6+xqGDvL9Ehzo9SgsnN6xJ+qpUvvOi4
1axW0AqR4MnPTg/uuvk+x4tUpuufOW4w22UTGjsdvmIVWa9ujLtcRiN3YPY+SU+Y
O5FfqKg7r/hBn+/GMcSoffwSs7vVgmhBBnp/mJh2O1cOAFZEe98/47mbg3/kHBAk
36NOQktaU3l48B38EhBTnjWfcEGm1HcTRPFxXV5Wiko6ZhKFEuHcTVKng4ROtUqE
mgHyI0aB7TAxg4na0ejItsYWEPWGeDOw6ms/4MwylxNosWzHFPW9p4zgLCLNr+b6
bDDfYKjXZflAuTQtQhLmJUwD9uuYLAijpSE2fQIDAQABAoIBADgcgRET8Gt0CV/B
OtvKz/f+VEVvcWD3gWNlJDTZIVOFllNWjIZUlA4ZoqenQkbK8Q4nfV1FOht4yjCQ
TlN1oMtiWk297i5Zo4UBzPzy4w774I39oh/g8dT/WXr2/5s+7SDV38xNh6Q2A34o
79T35wUcfUrZ93/O7dKjb/6d8hx2FMha0wVKqY4lmG1lQE3bbx3kakec0PdvU5kO
YHKlpqj3pMR7CpMa+4yL/iXFwWYmnK+uu+zw7JR7PwvH1CzrnvW438wjQ1QmYbSx
mHHOE89X67Lsl5hn81qYWBhpwAlBwi1qscsE0cV9GcFyKqWFqZsj5coM9u3CRfvy
lrWe1OUCgYEA+LBUFEd3Hxs4sFiYElJ8R9SAs1udaqPvAl01hTEijJLfYlMMVs/y
rgNN7j22zjDak2f8QdyMJZX7EZdRmdYcHO0csYOwbYvalzcnwk+U3mxmdD3r4xSo
DSvkJ70fogAqUlcVIg2re6fCmZVJQTvMQYTVEM8zQomJRt/Lb2esSfsCgYEA8+zv
44aToe8uqiDs4w8guRW7LCDkTw4z4IVo9JUibIaPjaAs5bZEBXSB43EEywXCR75H
fML0rU1PVvKh1rqcvZdVzm+XMWVr3asPk0sapaiHaTcmyZvJRDxxqbLFp0zRP1T6
cCtXNFdHWU4KiuKrUi6cDyOKchpfkSZa4seiT+cCgYB+n4FgBfdQPlMB70oW4irn
g/q32CjxuGCk6oKqu5bkzo+xB6obtavSEFqouIGQwO056tNVUY+GP7Rjg5GH663K
yKw4cl3tmS0Gm43B8TVSfw03mKO3rrfWZQe5eCFYIg9qd26KNT2gK435FzsCXQkm
PxUhhu6JrW/ZR2/U3Iur6wKBgADrWLAb1ryagSuE+j+U1AO+kDkHWrTtkcZ72jxp
v3p3O11GSEUJXdJDcSXhTCpTuDq6/dv7hB6PFwh126RKicKxKlKf2wsFndV1Cpb8
hnovW2tLGOtTmfuW2rrQAKyzvmolsNfxYd/BoHQ2thV16z1hDZeFA8WQUeHjKh6G
sBbrAoGATdtQlaUxx4izua6k02ihkxx/cRYwDl2N8UDvDBHokS7vJFMX8b8NpsGg
zMElnqSpu/pe/0UG7N2MtPF6uyMcX8AZzzcsRkiMkDvWJzYt8Jpf+Eyd/uryF+Yv
yrXaOEY83tm6x/fny5ZaZmk8lNth7bfWywuTMkZLX3fYpWtIeE4=
-----END RSA PRIVATE KEY-----
paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi.pub
0 → 100644
浏览文件 @
694bc64a
ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDs9YtmaB0nnTx0PXlPr7GoYO8v0SHOj1KCyc3rEn6qlS+86LjVrFbQCpHgyc9OD+66+T7Hi1Sm6585bjDbZRMaOx2+YhVZr26Mu1xGI3dg9j5JT5g7kV+oqDuv+EGf78YxxKh9/BKzu9WCaEEGen+YmHY7Vw4AVkR73z/juZuDf+QcECTfo05CS1pTeXjwHfwSEFOeNZ9wQabUdxNE8XFdXlaKSjpmEoUS4dxNUqeDhE61SoSaAfIjRoHtMDGDidrR6Mi2xhYQ9YZ4M7Dqaz/gzDKXE2ixbMcU9b2njOAsIs2v5vpsMN9gqNdl+UC5NC1CEuYlTAP265gsCKOlITZ9 oweidner@peahi
paddle/scripts/cluster_train_v2/openmpi/start_mpi_train.sh
0 → 100644
浏览文件 @
694bc64a
#!/bin/bash
# General trainning configurations
NICS
=
eth0
PADDLE_INIT_PORT
=
7164
PADDLE_INIT_PORTS_NUM
=
1
PADDLE_INIT_PORTS_NUM_FOR_SPARSE
=
1
PADDLE_INIT_PSERVERS
=
$(
cat
machines |
sed
-e
':a'
-e
'N'
-e
'$!ba'
-e
's/\n/,/g'
)
PADDLE_INIT_USE_GPU
=
False
PADDLE_INIT_NUM_GRADIENT_SERVERS
=
${
OMPI_COMM_WORLD_SIZE
}
PADDLE_INIT_TRAINER_ID
=
${
OMPI_COMM_WORLD_RANK
}
PADDLE_CLUSTER_TRAIN
=
True
env
# start pserver
stdbuf
-oL
nohup
paddle pserver
--port
=
$PADDLE_INIT_PORT
--ports_num
=
$PADDLE_INIT_PORTS_NUM
\
--ports_num_for_sparse
=
$PADDLE_INIT_PORTS_NUM_FOR_SPARSE
--nics
=
$NICS
\
--comment
=
paddle_cluster_pserver
\
--num_gradient_servers
=
$PADDLE_INIT_NUM_GRADIENT_SERVERS
&> logs/pserver.log &
# start trainer
# NOTE: train.py will use the above environment variables as configuration
python train.py &> logs/train.log
# kill background pservers when train finishes
ps
-ef
|
grep
pserver |
awk
'{print $2}'
| xargs
kill
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录