Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
PaddleRec
提交
e9011c05
P
PaddleRec
项目概览
BaiXuePrincess
/
PaddleRec
与 Fork 源项目一致
Fork自
PaddlePaddle / PaddleRec
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleRec
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
e9011c05
编写于
5月 12, 2020
作者:
T
tangwei
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add mpi cluster
上级
1125b796
变更
9
隐藏空白更改
内联
并排
Showing
9 changed file
with
300 addition
and
2 deletion
+300
-2
fleet_rec/core/engine/engine.py
fleet_rec/core/engine/engine.py
+4
-0
fleet_rec/core/engine/local_cluster.py
fleet_rec/core/engine/local_cluster.py
+0
-0
fleet_rec/core/engine/local_mpi.py
fleet_rec/core/engine/local_mpi.py
+0
-0
fleet_rec/core/engine/mpi_cluster/__init__.py
fleet_rec/core/engine/mpi_cluster/__init__.py
+0
-0
fleet_rec/core/engine/mpi_cluster/cluster.py
fleet_rec/core/engine/mpi_cluster/cluster.py
+45
-0
fleet_rec/core/engine/mpi_cluster/job.sh
fleet_rec/core/engine/mpi_cluster/job.sh
+61
-0
fleet_rec/core/engine/mpi_cluster/submit.sh
fleet_rec/core/engine/mpi_cluster/submit.sh
+149
-0
fleet_rec/run.py
fleet_rec/run.py
+2
-2
models/rank/dnn/engine.yaml
models/rank/dnn/engine.yaml
+39
-0
未找到文件。
fleet_rec/core/engine/engine.py
浏览文件 @
e9011c05
...
@@ -7,6 +7,10 @@ class Engine:
...
@@ -7,6 +7,10 @@ class Engine:
def
__init__
(
self
,
envs
,
trainer
):
def
__init__
(
self
,
envs
,
trainer
):
self
.
envs
=
envs
self
.
envs
=
envs
self
.
trainer
=
trainer
self
.
trainer
=
trainer
self
.
__init_impl__
()
def
__init_impl__
(
self
):
pass
@
abc
.
abstractmethod
@
abc
.
abstractmethod
def
run
(
self
):
def
run
(
self
):
...
...
fleet_rec/core/engine/local_cluster
_engine
.py
→
fleet_rec/core/engine/local_cluster.py
浏览文件 @
e9011c05
文件已移动
fleet_rec/core/engine/local_mpi
_engine
.py
→
fleet_rec/core/engine/local_mpi.py
浏览文件 @
e9011c05
文件已移动
fleet_rec/core/engine/mpi_cluster/__init__.py
0 → 100644
浏览文件 @
e9011c05
fleet_rec/core/engine/mpi_cluster/cluster.py
0 → 100644
浏览文件 @
e9011c05
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
from
__future__
import
unicode_literals
import
subprocess
import
sys
import
os
import
copy
from
fleetrec.core.engine.engine
import
Engine
class QSubClusterEngine(Engine):
    """Engine that submits a job by running the bundled ``submit.sh`` script.

    The paths of ``submit.sh`` and ``job.sh`` are resolved relative to the
    directory containing this module.
    """

    def __init_impl__(self):
        """Resolve the absolute paths of the helper scripts shipped next to this file."""
        abs_dir = os.path.dirname(os.path.abspath(__file__))
        self.submit_script = os.path.join(abs_dir, "submit.sh")
        self.job_script = os.path.join(abs_dir, "job.sh")

    def start_procs(self):
        """Run ``submit.sh`` with bash and block until it exits.

        The child inherits the current environment minus ``http_proxy`` and
        ``https_proxy``, and runs in the current working directory. The
        outcome is reported on stderr.
        """
        # os.environ.copy() already returns a private dict, so no second copy
        # is needed before removing the proxy settings.
        current_env = os.environ.copy()
        current_env.pop("http_proxy", None)
        current_env.pop("https_proxy", None)

        # Pass the command as an argument list: building a string and
        # splitting on spaces breaks when the script path contains a space.
        cmd = ["bash", self.submit_script]
        proc = subprocess.Popen(cmd, env=current_env, cwd=os.getcwd())
        returncode = proc.wait()

        if returncode != 0:
            # Do not claim completion when the submit script failed.
            print(
                "submit script {} exited with code {}".format(
                    self.submit_script, returncode),
                file=sys.stderr)
            return

        print(
            "all workers and parameter servers already completed",
            file=sys.stderr)

    def run(self):
        """Entry point required by ``Engine``; delegates to ``start_procs``."""
        self.start_procs()
fleet_rec/core/engine/mpi_cluster/job.sh
0 → 100644
浏览文件 @
e9011c05
#!/bin/bash
###################################################
# Usage: job.sh
# Description: run job on mpi per node
###################################################
# ---------------------------------------------------------------------------- #
# variable define #
# ---------------------------------------------------------------------------- #
# Script-wide mutable state; g_run_stage is set by each stage function and
# read by check_error when reporting a failure.
declare g_curPath=""
declare g_scriptName=""
declare g_workPath=""
declare g_run_stage=""

# ---------------------------------------------------------------------------- #
#                             const define                                     #
# ---------------------------------------------------------------------------- #
# NOTE(review): FLAGS_communicator_thread_pool_size used to be declared
# read-only twice (first 5, then 20). bash rejects the second declaration with
# a "readonly variable" error, so 5 was the value actually in effect. The
# duplicate is removed and 5 is kept -- confirm whether 20 was intended.
declare -r FLAGS_communicator_thread_pool_size=5
declare -r FLAGS_communicator_send_queue_size=18
declare -r FLAGS_communicator_max_merge_var_num=18
################################################################################
#-----------------------------------------------------------------------------------------------------------------
#fun : check function return code
#param : N/A
#return : 0 -- success; not 0 -- failure
#-----------------------------------------------------------------------------------------------------------------
function check_error() {
    # Must be the first statement: capture the exit status of the command that
    # ran immediately before this function was called.
    local rc=$?

    if [ "${rc}" -ne 0 ]; then
        # bash has no "+" string operator; interpolate the stage name directly
        # so the message does not contain literal "+" signs.
        echo "execute ${g_run_stage} raise exception! please check ..."
        exit 1
    fi
}
#-----------------------------------------------------------------------------------------------------------------
#fun : env_prepare -- environment preparation stage (currently only records the run stage)
#param : N/A
#return : 0 -- success; not 0 -- failure
#-----------------------------------------------------------------------------------------------------------------
env_prepare() {
    # Publish this stage's name in g_run_stage; nothing else happens here.
    local -r stage_name="env_prepare"
    g_run_stage="${stage_name}"
}
user_define_variables() {
    # Announce the stage on stdout, then publish its name in g_run_stage.
    local -r stage_name="user_define_variables"
    printf '%s\n' "user_define_variables"
    g_run_stage="${stage_name}"
}
function job() {
    # Record the stage name like the other stage functions do.
    g_run_stage="job"

    # Without a machine file or an entry script the mpirun command line would
    # be malformed (arguments shift, or python starts with no script), so fail
    # early with a clear message instead.
    if [ -z "${PBS_NODEFILE}" ]; then
        echo "PBS_NODEFILE is not set, can not launch mpirun" >&2
        return 1
    fi
    if [ -z "${g_job_entry}" ]; then
        echo "g_job_entry is not set, nothing to run" >&2
        return 1
    fi

    # g_job_entry is left unquoted on purpose so an entry given as
    # "script.py arg ..." still splits into separate arguments, as before.
    mpirun -npernode 2 -timestamp-output -tag-output \
        -machinefile "${PBS_NODEFILE}" \
        python -u ${g_job_entry}
}
main() {
    # Run the per-node stages in their fixed order; the status of the last
    # stage (job) becomes the status of main.
    local step
    for step in user_define_variables env_prepare job; do
        "${step}"
    done
}

main
fleet_rec/core/engine/mpi_cluster/submit.sh
0 → 100644
浏览文件 @
e9011c05
#!/bin/bash
###################################################
# Usage: submit.sh
# Description: run mpi submit client
###################################################
# ---------------------------------------------------------------------------- #
# variable define #
# ---------------------------------------------------------------------------- #
# Script-wide mutable state; g_run_stage is set by each stage function and
# read by check_error when reporting a failure.
declare g_curPath="" g_scriptName="" g_workPath="" g_run_stage=""

# ----------------------------for hpc submit -------------------------------- #
# Arguments handed to qsub_f by submit(); all start out empty.
declare g_hpc_path="" g_job_name="" g_qsub_conf=""
declare g_hdfs_path="" g_hdfs_ugi="" g_hdfs_output=""
declare g_submit_package="" g_job_nodes="" g_job_entry=""

# ---------------------------------------------------------------------------- #
#                             const define                                     #
# ---------------------------------------------------------------------------- #
declare -r CALL="x"
################################################################################
#-----------------------------------------------------------------------------------------------------------------
# Function: get_cur_path
# Description: get current path
# Parameter:
# input:
# N/A
# output:
# N/A
# Return: 0 -- success; not 0 -- failure
# Others: N/A
#-----------------------------------------------------------------------------------------------------------------
function get_cur_path() {
    # Path this script was loaded from; falls back to $0 when BASH_SOURCE is unset.
    local self_path="${BASH_SOURCE-$0}"

    g_run_stage="get_cur_path"
    g_scriptName="$(basename "${self_path}")"

    # Step into the script's directory to read its resolved location from PWD,
    # then return to the previous directory without echoing it.
    cd "$(dirname "${self_path}")"
    g_curPath="${PWD}"
    cd - >/dev/null
}
#-----------------------------------------------------------------------------------------------------------------
#fun : get argument from env, set it into variables
#param : N/A
#return : 0 -- success; not 0 -- failure
#-----------------------------------------------------------------------------------------------------------------
function vars_get_from_env() {
    # Record the stage name so check_error can report where a failure happened.
    g_run_stage="vars_get_from_env"
    # NOTE(review): "${engine.}" is not a valid parameter expansion (bash
    # reports "bad substitution"). The intended source variable for the HPC
    # client path cannot be determined from this script -- TODO confirm.
    g_hpc_path=${engine.}
    # Second positional parameter of this function. main calls this function
    # without arguments, so this is empty there. g_crontabDate is not among the
    # variables declared at the top of the script.
    g_crontabDate=$2
}
#-----------------------------------------------------------------------------------------------------------------
#fun : check function return code
#param : N/A
#return : 0 -- success; not 0 -- failure
#-----------------------------------------------------------------------------------------------------------------
function check_error() {
    # Must be the first statement: capture the exit status of the command that
    # ran immediately before this function was called.
    local rc=$?

    if [ "${rc}" -ne 0 ]; then
        # bash has no "+" string operator; interpolate the stage name directly
        # so the message does not contain literal "+" signs.
        echo "execute ${g_run_stage} raise exception! please check ..."
        exit 1
    fi
}
#-----------------------------------------------------------------------------------------------------------------
#fun : package
#param : N/A
#return : 0 -- success; not 0 -- failure
#-----------------------------------------------------------------------------------------------------------------
package() {
    # Publish this stage's name in g_run_stage; no packaging work is done here yet.
    local -r stage_name="package"
    g_run_stage="${stage_name}"
}
#-----------------------------------------------------------------------------------------------------------------
#fun : before hook submit to cluster
#param : N/A
#return : 0 -- success; not 0 -- failure
#-----------------------------------------------------------------------------------------------------------------
function before_submit() {
    # Hook called by submit() before the job is handed to qsub_f.
    # bash does not allow an empty function body (it is a syntax error), so
    # the no-op builtin keeps the script parseable until the hook is filled in.
    :
}
#-----------------------------------------------------------------------------------------------------------------
#fun : after hook submit to cluster
#param : N/A
#return : 0 -- success; not 0 -- failure
#-----------------------------------------------------------------------------------------------------------------
function after_submit() {
    # Hook called by submit() after qsub_f has returned.
    # bash does not allow an empty function body (it is a syntax error), so
    # the no-op builtin keeps the script parseable until the hook is filled in.
    :
}
#-----------------------------------------------------------------------------------------------------------------
#fun : submit to cluster
#param : N/A
#return : 0 -- success; not 0 -- failure
#-----------------------------------------------------------------------------------------------------------------
function submit() {
    # Submit the job through the qsub_f client found under g_hpc_path, running
    # the before/after hooks around it.
    # Returns the exit status of qsub_f.
    g_run_stage="submit"
    local rc=0

    before_submit

    # Every expansion is quoted so an empty or space-containing value cannot
    # shift the following options. g_job_entry is left unquoted on purpose so
    # an entry given as "script arg ..." still splits into separate arguments.
    "${g_hpc_path}/bin/qsub_f" \
        -N "${g_job_name}" \
        --conf "${g_qsub_conf}" \
        --hdfs "${g_hdfs_path}" \
        --ugi "${g_hdfs_ugi}" \
        --hout "${g_hdfs_output}" \
        --files "${g_submit_package}" \
        -l "nodes=${g_job_nodes},walltime=1000:00:00,resource=full" \
        ${g_job_entry}
    # Save the status now: the caller's check_error must see the result of
    # qsub_f, not the result of the after hook.
    rc=$?

    after_submit

    return "${rc}"
}
main() {
    # Run the submit stages in their fixed order; check_error aborts the
    # script as soon as a stage returns a non-zero status.
    local step
    for step in get_cur_path vars_get_from_env package submit; do
        "${step}"
        check_error
    done
}

main
fleet_rec/run.py
浏览文件 @
e9011c05
...
@@ -136,7 +136,7 @@ def cluster_mpi_engine(args):
...
@@ -136,7 +136,7 @@ def cluster_mpi_engine(args):
def
local_cluster_engine
(
args
):
def
local_cluster_engine
(
args
):
from
fleetrec.core.engine.local_cluster
_engine
import
LocalClusterEngine
from
fleetrec.core.engine.local_cluster
import
LocalClusterEngine
trainer
=
get_trainer_prefix
(
args
)
+
"ClusterTrainer"
trainer
=
get_trainer_prefix
(
args
)
+
"ClusterTrainer"
cluster_envs
=
{}
cluster_envs
=
{}
...
@@ -162,7 +162,7 @@ def local_cluster_engine(args):
...
@@ -162,7 +162,7 @@ def local_cluster_engine(args):
def
local_mpi_engine
(
args
):
def
local_mpi_engine
(
args
):
print
(
"launch cluster engine with cluster to run model: {}"
.
format
(
args
.
model
))
print
(
"launch cluster engine with cluster to run model: {}"
.
format
(
args
.
model
))
from
fleetrec.core.engine.local_mpi
_engine
import
LocalMPIEngine
from
fleetrec.core.engine.local_mpi
import
LocalMPIEngine
print
(
"use 1X1 MPI ClusterTraining at localhost to run model: {}"
.
format
(
args
.
model
))
print
(
"use 1X1 MPI ClusterTraining at localhost to run model: {}"
.
format
(
args
.
model
))
...
...
models/rank/dnn/engine.yaml
0 → 100755
浏览文件 @
e9011c05
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
engine
:
backend
:
"
MPI"
package
:
build_script
:
"
/xxxx/xx/x/x/"
python
:
"
/home/tangwei/fleet_rec_env/cpython-2.7.11-ucs4"
paddlerec
:
"
/home/tangwei/fleet_rec_env/FleetRec"
paddlepaddle
:
"
/home/tangwei/fleet_rec_env/FleetRec"
submit
:
submit_script
:
"
xx"
job
:
"
xxx"
conf
:
"
qsub_f.conf"
hpc
:
"
/home/tangwei/submit-tieba/smart_client/"
hdfs
:
"
xx"
hout
:
"
xxx"
ugi
:
"
xxxx"
nodes
:
10
before_hook
:
"
"
end_hook
:
"
"
define
:
user1
:
"
user_define1"
user2
:
"
user_define2"
define
:
"
user_define3"
\ No newline at end of file
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录