Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
PaddleRec
提交
77a2da6d
P
PaddleRec
项目概览
BaiXuePrincess
/
PaddleRec
与 Fork 源项目一致
Fork自
PaddlePaddle / PaddleRec
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleRec
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
77a2da6d
编写于
5月 12, 2020
作者:
T
tangwei
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add qsub submit
上级
7c10b488
变更
5
隐藏空白更改
内联
并排
Showing
5 changed files
with
33 additions
and
47 deletions
+33
-47
fleet_rec/core/engine/cluster/cluster.py
fleet_rec/core/engine/cluster/cluster.py
+3
-12
fleet_rec/core/engine/cluster/master.sh
fleet_rec/core/engine/cluster/master.sh
+12
-16
fleet_rec/run.py
fleet_rec/run.py
+3
-1
models/rank/dnn/submit.sh
models/rank/dnn/submit.sh
+4
-7
models/rank/dnn/worker.sh
models/rank/dnn/worker.sh
+11
-11
未找到文件。
fleet_rec/core/engine/cluster/cluster.py
浏览文件 @
77a2da6d
...
...
@@ -21,6 +21,7 @@ import os
import
copy
from
fleetrec.core.engine.engine
import
Engine
from
fleetrec.core.factory
import
TrainerFactory
from
fleetrec.core.utils
import
envs
...
...
@@ -30,16 +31,8 @@ class ClusterEngine(Engine):
self
.
submit_script
=
os
.
path
.
join
(
abs_dir
,
"master.sh"
)
def
start_worker_procs
(
self
):
default_env
=
os
.
environ
.
copy
()
current_env
=
copy
.
copy
(
default_env
)
current_env
.
pop
(
"http_proxy"
,
None
)
current_env
.
pop
(
"https_proxy"
,
None
)
cmd
=
(
"bash {}"
.
format
(
self
.
submit_script
)).
split
(
" "
)
proc
=
subprocess
.
Popen
(
cmd
,
env
=
current_env
,
cwd
=
os
.
getcwd
())
proc
.
wait
()
print
(
"all workers and parameter servers already completed"
,
file
=
sys
.
stderr
)
trainer
=
TrainerFactory
.
create
(
self
.
trainer
)
trainer
.
run
()
def
start_master_procs
(
self
):
default_env
=
os
.
environ
.
copy
()
...
...
@@ -51,8 +44,6 @@ class ClusterEngine(Engine):
proc
=
subprocess
.
Popen
(
cmd
,
env
=
current_env
,
cwd
=
os
.
getcwd
())
proc
.
wait
()
print
(
"all workers and parameter servers already completed"
,
file
=
sys
.
stderr
)
def
run
(
self
):
role
=
envs
.
get_runtime_environ
(
"engine_role"
)
...
...
fleet_rec/core/engine/cluster/master.sh
浏览文件 @
77a2da6d
...
...
@@ -19,7 +19,6 @@ declare g_run_stage=""
declare
-r
CALL
=
"x"
################################################################################
#-----------------------------------------------------------------------------------------------------------------
# Function: get_cur_path
# Description: get current path
...
...
@@ -31,13 +30,12 @@ declare -r CALL="x"
# Return: 0 -- success; not 0 -- failure
# Others: N/A
#-----------------------------------------------------------------------------------------------------------------
get_cur_path
()
{
get_cur_path
()
{
g_run_stage
=
"get_cur_path"
cd
"
$(
dirname
"
${
BASH_SOURCE
-
$0
}
"
)
"
g_curPath
=
"
${
PWD
}
"
g_scriptName
=
"
$(
basename
"
${
BASH_SOURCE
-
$0
}
"
)
"
cd
-
>
/dev/null
cd
"
$(
dirname
"
${
BASH_SOURCE
-
$0
}
"
)
"
g_curPath
=
"
${
PWD
}
"
g_scriptName
=
"
$(
basename
"
${
BASH_SOURCE
-
$0
}
"
)
"
cd
-
>
/dev/null
}
#-----------------------------------------------------------------------------------------------------------------
...
...
@@ -45,15 +43,13 @@ get_cur_path()
#param : N/A
#return : 0 -- success; not 0 -- failure
#-----------------------------------------------------------------------------------------------------------------
function
check_error
()
{
if
[
${
?
}
-ne
0
]
then
echo
"execute "
+
$g_run_stage
+
" raise exception! please check ..."
exit
1
fi
function
check_error
()
{
if
[
${
?
}
-ne
0
]
;
then
echo
"execute "
+
$g_run_stage
+
" raise exception! please check ..."
exit
1
fi
}
source
${
engine_scrpit
}
source
${
engine_s
ubmit_s
crpit
}
main
\ No newline at end of file
main
fleet_rec/run.py
浏览文件 @
77a2da6d
...
...
@@ -154,8 +154,10 @@ def cluster_engine(args):
cluster_envs
[
"train.trainer.platform"
]
=
envs
.
get_platform
()
print
(
"launch {} engine with cluster to with model: {}"
.
format
(
trainer
,
args
.
model
))
set_runtime_envs
(
cluster_envs
,
args
.
model
)
launch
=
ClusterEngine
(
cluster_envs
,
args
.
model
)
launch
=
LocalClusterEngine
(
cluster_envs
,
args
.
model
)
return
launch
if
args
.
role
==
"worker"
:
return
worker
()
else
:
...
...
models/rank/dnn/submit.sh
浏览文件 @
77a2da6d
...
...
@@ -34,7 +34,7 @@ function package() {
echo
"copy python from "
${
engine_package_python
}
" to "
${
temp
}
mkdir
${
temp
}
/package/whl
cp
${
engine_package_paddlerec
}
${
temp
}
/package/whl/
cp
${
engine_package_paddlerec
}
${
temp
}
/package/whl/
echo
"copy "
${
engine_package_paddlerec
}
" to "
${
temp
}
"/whl/"
}
...
...
@@ -63,13 +63,11 @@ function after_submit() {
#-----------------------------------------------------------------------------------------------------------------
function
submit
()
{
g_run_stage
=
"submit"
g_job_name
=
"paddle_rec_mpi"
g_hdfs_path
=
$g_hdfs_path
g_job_entry
=
"worker.sh"
engine_hdfs_output
=
${
engine_hdfs_output
}
/
`
date
+%Y%m%d%H%M%S
`
engine_hdfs_output
=
${
engine_hdfs_output
}
/
$(
date
+%Y%m%d%H%M%S
)
cd
${
engine_temp_path
}
...
...
@@ -79,9 +77,8 @@ function submit() {
--hdfs
${
engine_hdfs_name
}
\
--ugi
${
engine_hdfs_ugi
}
\
--hout
${
engine_hdfs_output
}
\
--files
${
engine_temp_path
}
\
--files
./package
\
-l
nodes
=
${
engine_submit_nodes
}
,walltime
=
1000:00:00,resource
=
full
${
g_job_entry
}
}
function
main
()
{
...
...
@@ -90,4 +87,4 @@ function main() {
before_submit
submit
after_submit
}
\ No newline at end of file
}
models/rank/dnn/worker.sh
浏览文件 @
77a2da6d
...
...
@@ -41,24 +41,24 @@ function check_error() {
#-----------------------------------------------------------------------------------------------------------------
function
env_prepare
()
{
g_run_stage
=
"env_prepare"
}
WORKDIR
=
$(
pwd
)
mpirun
-npernode
1
mv
package/
*
./
echo
"current:"
$WORKDIR
export
LIBRARY_PATH
=
$WORKDIR
/python/lib:
$LIBRARY_PATH
function
user_define_variables
()
{
echo
"user_define_variables"
g_run_stage
=
"user_define_variables"
mpirun
-npernode
1 python/bin/python
-m
pip
install
whl/fleet_rec-0.0.2-py2-none-any.whl
--index-url
=
http://pip.baidu.com/pypi/simple
--trusted-host
pip.baidu.com
>
/dev/null
check_error
}
function
job
()
{
echo
"job"
g_run_stage
=
"job"
# mpirun -npernode 2 -timestamp-output -tag-output -machinefile ${PBS_NODEFILE} python -u ${g_job_entry}
function
run
()
{
echo
"run"
g_run_stage
=
"run"
mpirun
-npernode
2
-timestamp-output
-tag-output
-machinefile
${
PBS_NODEFILE
}
python/bin/python
-u
-m
fleetrec.run
-m
fleetrec.models.rank.dnn
--engine
cluster
--role
worker
}
function
main
()
{
user_define_variables
env_prepare
job
run
}
main
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录