Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
PaddleRec
提交
15c57177
P
PaddleRec
项目概览
PaddlePaddle
/
PaddleRec
通知
68
Star
12
Fork
5
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
27
列表
看板
标记
里程碑
合并请求
10
Wiki
1
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleRec
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
27
Issue
27
列表
看板
标记
里程碑
合并请求
10
合并请求
10
Pages
分析
分析
仓库分析
DevOps
Wiki
1
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
15c57177
编写于
7月 09, 2020
作者:
C
Chengmo
提交者:
GitHub
7月 09, 2020
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add cluster support (#136)
* add cluster support
上级
b8e17866
变更
15
隐藏空白更改
内联
并排
Showing
15 changed file
with
630 addition
and
75 deletion
+630
-75
core/engine/cluster/cloud/before_hook_cpu.sh.template
core/engine/cluster/cloud/before_hook_cpu.sh.template
+15
-0
core/engine/cluster/cloud/before_hook_gpu.sh.template
core/engine/cluster/cloud/before_hook_gpu.sh.template
+15
-0
core/engine/cluster/cloud/cluster.sh
core/engine/cluster/cloud/cluster.sh
+104
-32
core/engine/cluster/cloud/end_hook.sh.template
core/engine/cluster/cloud/end_hook.sh.template
+1
-0
core/engine/cluster/cloud/k8s_config.ini.template
core/engine/cluster/cloud/k8s_config.ini.template
+31
-0
core/engine/cluster/cloud/k8s_job.sh.template
core/engine/cluster/cloud/k8s_job.sh.template
+35
-0
core/engine/cluster/cloud/mpi_config.ini.template
core/engine/cluster/cloud/mpi_config.ini.template
+29
-0
core/engine/cluster/cloud/mpi_job.sh.template
core/engine/cluster/cloud/mpi_job.sh.template
+31
-0
core/engine/cluster/cluster.py
core/engine/cluster/cluster.py
+233
-13
core/trainers/framework/dataset.py
core/trainers/framework/dataset.py
+2
-1
core/utils/dataloader_instance.py
core/utils/dataloader_instance.py
+1
-1
models/rank/dnn/backend.yaml
models/rank/dnn/backend.yaml
+60
-0
models/rank/dnn/config.yaml
models/rank/dnn/config.yaml
+22
-0
run.py
run.py
+48
-26
setup.py
setup.py
+3
-2
未找到文件。
core/engine/cluster/cloud/before_hook_cpu.sh.template
0 → 100644
浏览文件 @
15c57177
echo "Run before_hook.sh ..."
wget https://paddlerec.bj.bcebos.com/whl/PaddleRec.tar.gz
tar -xf PaddleRec.tar.gz
cd PaddleRec
python setup.py install
pip uninstall -y paddlepaddle
pip install paddlepaddle-gpu==<$ PADDLEPADDLE_VERSION $> --index-url=http://pip.baidu.com/pypi/simple --trusted-host pip.baidu.com
echo "End before_hook.sh ..."
core/engine/cluster/cloud/before_hook_gpu.sh.template
0 → 100644
浏览文件 @
15c57177
echo "Run before_hook.sh ..."
wget https://paddlerec.bj.bcebos.com/whl/PaddleRec.tar.gz
tar -xf PaddleRec.tar.gz
cd PaddleRec
python setup.py install
pip uninstall -y paddlepaddle
pip install paddlepaddle-gpu==<$ PADDLEPADDLE_VERSION $>.post107 --index-url=http://pip.baidu.com/pypi/simple --trusted-host pip.baidu.com
echo "End before_hook.sh ..."
core/engine/cluster/cloud/cluster.sh
浏览文件 @
15c57177
...
...
@@ -16,23 +16,13 @@
###################################################
# Usage: submit.sh
# Description: run
mpi
submit client implement
# Description: run
paddlecloud
submit client implement
###################################################
# ---------------------------------------------------------------------------- #
# variable define #
# ---------------------------------------------------------------------------- #
#-----------------------------------------------------------------------------------------------------------------
#fun : package
#param : N/A
#return : 0 -- success; not 0 -- failure
#-----------------------------------------------------------------------------------------------------------------
function
package_hook
()
{
g_run_stage
=
"package"
package
}
#-----------------------------------------------------------------------------------------------------------------
#fun : before hook submit to cluster
#param : N/A
...
...
@@ -40,17 +30,106 @@ function package_hook() {
#-----------------------------------------------------------------------------------------------------------------
function
_before_submit
()
{
echo
"before_submit"
before_submit_hook
if
[
${
DISTRIBUTE_MODE
}
==
"PS_CPU_MPI"
]
;
then
_gen_cpu_before_hook
_gen_mpi_config
_gen_mpi_job
_gen_end_hook
elif
[
${
DISTRIBUTE_MODE
}
==
"COLLECTIVE_GPU_K8S"
]
;
then
_gen_gpu_before_hook
_gen_k8s_config
_gen_k8s_job
_gen_end_hook
fi
}
function
_gen_mpi_config
()
{
echo
"gen mpi_config.ini"
sed
-e
"s#<
$
FS_NAME
$>
#
$FS_NAME
#g"
\
-e
"s#<
$
FS_UGI
$>
#
$FS_UGI
#g"
\
-e
"s#<
$
TRAIN_DATA_PATH
$>
#
$TRAIN_DATA_PATH
#g"
\
-e
"s#<
$
TEST_DATA_PATH
$>
#
$TEST_DATA_PATH
#g"
\
-e
"s#<
$
OUTPUT_PATH
$>
#
$OUTPUT_PATH
#g"
\
-e
"s#<
$
THIRDPARTY_PATH
$>
#
$THIRDPARTY_PATH
#g"
\
-e
"s#<
$
CPU_NUM
$>
#
$max_thread_num
#g"
\
-e
"s#<
$
FLAGS_communicator_is_sgd_optimizer
$>
#
$FLAGS_communicator_is_sgd_optimizer
#g"
\
-e
"s#<
$
FLAGS_communicator_send_queue_size
$>
#
$FLAGS_communicator_send_queue_size
#g"
\
-e
"s#<
$
FLAGS_communicator_thread_pool_size
$>
#
$FLAGS_communicator_thread_pool_size
#g"
\
-e
"s#<
$
FLAGS_communicator_max_merge_var_num
$>
#
$FLAGS_communicator_max_merge_var_num
#g"
\
-e
"s#<
$
FLAGS_communicator_max_send_grad_num_before_recv
$>
#
$FLAGS_communicator_max_send_grad_num_before_recv
#g"
\
-e
"s#<
$
FLAGS_communicator_fake_rpc
$>
#
$FLAGS_communicator_fake_rpc
#g"
\
-e
"s#<
$
FLAGS_rpc_retry_times
$>
#
$FLAGS_rpc_retry_times
#g"
\
${
abs_dir
}
/cloud/mpi_config.ini.template
>
${
PWD
}
/config.ini
}
function
_gen_k8s_config
()
{
echo
"gen k8s_config.ini"
sed
-e
"s#<
$
FS_NAME
$>
#
$FS_NAME
#g"
\
-e
"s#<
$
FS_UGI
$>
#
$FS_UGI
#g"
\
-e
"s#<
$
AFS_REMOTE_MOUNT_POINT
$>
#
$AFS_REMOTE_MOUNT_POINT
#g"
\
-e
"s#<
$
OUTPUT_PATH
$>
#
$OUTPUT_PATH
#g"
\
-e
"s#<
$
CPU_NUM
$>
#
$max_thread_num
#g"
\
-e
"s#<
$
FLAGS_communicator_is_sgd_optimizer
$>
#
$FLAGS_communicator_is_sgd_optimizer
#g"
\
-e
"s#<
$
FLAGS_communicator_send_queue_size
$>
#
$FLAGS_communicator_send_queue_size
#g"
\
-e
"s#<
$
FLAGS_communicator_thread_pool_size
$>
#
$FLAGS_communicator_thread_pool_size
#g"
\
-e
"s#<
$
FLAGS_communicator_max_merge_var_num
$>
#
$FLAGS_communicator_max_merge_var_num
#g"
\
-e
"s#<
$
FLAGS_communicator_max_send_grad_num_before_recv
$>
#
$FLAGS_communicator_max_send_grad_num_before_recv
#g"
\
-e
"s#<
$
FLAGS_communicator_fake_rpc
$>
#
$FLAGS_communicator_fake_rpc
#g"
\
-e
"s#<
$
FLAGS_rpc_retry_times
$>
#
$FLAGS_rpc_retry_times
#g"
\
${
abs_dir
}
/cloud/k8s_config.ini.template
>
${
PWD
}
/config.ini
}
function
_gen_cpu_before_hook
()
{
echo
"gen cpu before_hook.sh"
sed
-e
"s#<
$
PADDLEPADDLE_VERSION
$>
#
$PADDLE_VERSION
#g"
\
${
abs_dir
}
/cloud/before_hook_cpu.sh.template
>
${
PWD
}
/before_hook.sh
}
function
_gen_gpu_before_hook
()
{
echo
"gen gpu before_hook.sh"
sed
-e
"s#<
$
PADDLEPADDLE_VERSION
$>
#
$PADDLE_VERSION
#g"
\
${
abs_dir
}
/cloud/before_hook_gpu.sh.template
>
${
PWD
}
/before_hook.sh
}
function
_gen_end_hook
()
{
echo
"gen end_hook.sh"
cp
${
abs_dir
}
/cloud/end_hook.sh.template
${
PWD
}
/end_hook.sh
}
function
_gen_mpi_job
()
{
echo
"gen mpi_job.sh"
sed
-e
"s#<
$
GROUP_NAME
$>
#
$GROUP_NAME
#g"
\
-e
"s#<
$
AK
$>
#
$AK
#g"
\
-e
"s#<
$
SK
$>
#
$SK
#g"
\
-e
"s#<
$
MPI_PRIORITY
$>
#
$PRIORITY
#g"
\
-e
"s#<
$
MPI_NODES
$>
#
$MPI_NODES
#g"
\
-e
"s#<
$
START_CMD
$>
#
$START_CMD
#g"
\
${
abs_dir
}
/cloud/mpi_job.sh.template
>
${
PWD
}
/job.sh
}
function
_gen_k8s_job
()
{
echo
"gen k8s_job.sh"
sed
-e
"s#<
$
GROUP_NAME
$>
#
$GROUP_NAME
#g"
\
-e
"s#<
$
AK
$>
#
$AK
#g"
\
-e
"s#<
$
SK
$>
#
$SK
#g"
\
-e
"s#<
$
K8S_PRIORITY
$>
#
$PRIORITY
#g"
\
-e
"s#<
$
K8S_TRAINERS
$>
#
$K8S_TRAINERS
#g"
\
-e
"s#<
$
K8S_GPU_CARD
$>
#
$K8S_GPU_CARD
#g"
\
-e
"s#<
$
START_CMD
$>
#
$START_CMD
#g"
\
${
abs_dir
}
/cloud/k8s_job.sh.template
>
${
PWD
}
/job.sh
}
#-----------------------------------------------------------------------------------------------------------------
#fun : after hook submit to cluster
#param : N/A
#return : 0 -- success; not 0 -- failure
#-----------------------------------------------------------------------------------------------------------------
function
_after_submit
()
{
echo
"after_submit"
after_submit_hook
echo
"end submit"
}
#-----------------------------------------------------------------------------------------------------------------
...
...
@@ -60,23 +139,18 @@ function _after_submit() {
#-----------------------------------------------------------------------------------------------------------------
function
_submit
()
{
g_run_stage
=
"submit"
sh job.sh
}
cd
${
engine_temp_path
}
paddlecloud job
--ak
${
engine_submit_ak
}
--sk
${
engine_submit_sk
}
train
--cluster-name
${
engine_submit_cluster
}
\
--job-version
${
engine_submit_version
}
\
--mpi-priority
${
engine_submit_priority
}
\
--mpi-wall-time
300:59:00
\
--mpi-nodes
${
engine_submit_nodes
}
--is-standalone
0
\
--mpi-memory
110Gi
\
--job-name
${
engine_submit_jobname
}
\
--start-cmd
"
${
g_run_cmd
}
"
\
--group-name
${
engine_submit_group
}
\
--job-conf
${
engine_submit_config
}
\
--files
${
g_submitfiles
}
\
--json
cd
-
function
package_hook
()
{
cur_time
=
`
date
+
"%Y%m%d%H%M"
`
new_job_name
=
"
${
JOB_NAME
}
_
${
cur_time
}
"
export
JOB_NAME
=
${
new_job_name
}
export
job_file_path
=
"
${
PWD
}
/
${
new_job_name
}
"
mkdir
${
job_file_path
}
cp
$FILES
${
job_file_path
}
/
cd
${
job_file_path
}
echo
"The task submission folder is generated at
${
job_file_path
}
"
}
function
submit_hook
()
{
...
...
@@ -86,8 +160,6 @@ function submit_hook() {
}
function
main
()
{
source
${
engine_submit_scrpit
}
package_hook
submit_hook
}
...
...
core/engine/cluster/cloud/end_hook.sh.template
0 → 100644
浏览文件 @
15c57177
echo "Run before_hook.sh ..."
\ No newline at end of file
core/engine/cluster/cloud/k8s_config.ini.template
0 → 100644
浏览文件 @
15c57177
# 必须涵盖的参数
fs_name=<$ FS_NAME $>
fs_ugi=<$ FS_UGI $>
# 模型输出目录
output_path=<$ OUTPUT_PATH $>
# ===================
# 以下是新增参数
# ===================
# 挂载 afs 的开关
mount_afs="true"
# afs 路径的远端挂载点
AFS_REMOTE_MOUNT_POINT=<$ AFS_REMOTE_MOUNT_POINT $>
# 作业运行环境的本地挂载点,/root/paddlejob/workspace/env_run/是一个固定路径,是平台运行时workspace的路径
afs_local_mount_point="/root/paddlejob/workspace/env_run/afs/"
# 可以访问运行时默认文件夹下的 ./afs/ 目录拿到挂载目录的文件
# 新k8s afs挂载帮助文档: http://wiki.baidu.com/pages/viewpage.action?pageId=906443193
PADDLE_PADDLEREC_ROLE=WORKER
CPU_NUM=<$ CPU_NUM $>
GLOG_v=0
FLAGS_communicator_is_sgd_optimizer=<$ FLAGS_communicator_is_sgd_optimizer $>
FLAGS_communicator_send_queue_size=<$ FLAGS_communicator_send_queue_size $>
FLAGS_communicator_thread_pool_size=<$ FLAGS_communicator_thread_pool_size $>
FLAGS_communicator_max_merge_var_num=<$ FLAGS_communicator_max_merge_var_num $>
FLAGS_communicator_max_send_grad_num_before_recv=<$ FLAGS_communicator_max_send_grad_num_before_recv $>
FLAGS_communicator_fake_rpc=<$ FLAGS_communicator_fake_rpc $>
FLAGS_rpc_retry_times=<$ FLAGS_rpc_retry_times $>
\ No newline at end of file
core/engine/cluster/cloud/k8s_job.sh.template
0 → 100644
浏览文件 @
15c57177
#!/bin/bash
###############################################################
## 注意-- 注意--注意 ##
## K8S NCCL2多机作业作业示例 ##
###############################################################
job_name
=
${
JOB_NAME
}
# 作业参数
group_name
=
"<
$
GROUP_NAME
$>
"
job_version
=
"paddle-fluid-v1.7.1"
start_cmd
=
"<
$
START_CMD
$>
"
wall_time
=
"10:00:00"
k8s_priority
=
<
$
K8S_PRIORITY
$>
k8s_trainers
=
<
$
K8S_TRAINERS
$>
k8s_gpu_cards
=
<
$
K8S_GPU_CARD
$>
# 你的ak/sk(可在paddlecloud web页面【个人中心】处获取)
ak
=
<
$
AK
$>
sk
=
<
$
SK
$>
paddlecloud job
--ak
${
ak
}
--sk
${
sk
}
\
train
--job-name
${
job_name
}
\
--group-name
${
group_name
}
\
--job-conf
config.ini
\
--start-cmd
"
${
start_cmd
}
"
\
--files
./
*
\
--job-version
${
job_version
}
\
--k8s-trainers
${
k8s_trainers
}
\
--k8s-gpu-cards
${
k8s_gpu_cards
}
\
--k8s-priority
${
k8s_priority
}
\
--wall-time
${
wall_time
}
\
--is-standalone
0
\
--distribute-job-type
"NCCL2"
\
--json
\ No newline at end of file
core/engine/cluster/cloud/mpi_config.ini.template
0 → 100644
浏览文件 @
15c57177
#type of storage cluster
storage_type="hdfs"
#attention: files for training should be put on hdfs
force_reuse_output_path="True"
# 可以替换成自己的hdfs集群
fs_name=<$ FS_NAME $>
fs_ugi=<$ FS_UGI $>
FLAGS_rpc_deadline=300000
##train data path on hdfs
train_data_path=<$ TRAIN_DATA_PATH $>
test_data_path=<$ TEST_DATA_PATH $>
output_path=<$ OUTPUT_PATH $>
thirdparty_path=<$ THIRDPARTY_PATH $>
PADDLE_PADDLEREC_ROLE=WORKER
CPU_NUM=<$ CPU_NUM $>
GLOG_v=0
FLAGS_communicator_is_sgd_optimizer=<$ FLAGS_communicator_is_sgd_optimizer $>
FLAGS_communicator_send_queue_size=<$ FLAGS_communicator_send_queue_size $>
FLAGS_communicator_thread_pool_size=<$ FLAGS_communicator_thread_pool_size $>
FLAGS_communicator_max_merge_var_num=<$ FLAGS_communicator_max_merge_var_num $>
FLAGS_communicator_max_send_grad_num_before_recv=<$ FLAGS_communicator_max_send_grad_num_before_recv $>
FLAGS_communicator_fake_rpc=<$ FLAGS_communicator_fake_rpc $>
FLAGS_rpc_retry_times=<$ FLAGS_rpc_retry_times $>
core/engine/cluster/cloud/mpi_job.sh.template
0 → 100644
浏览文件 @
15c57177
#!/bin/bash
###############################################################
## 注意--注意--注意 ##
## MPI 类型作业演示 ##
###############################################################
job_name
=
${
JOB_NAME
}
# 作业参数
group_name
=
<
$
GROUP_NAME
$>
job_version
=
"paddle-fluid-v1.7.1"
start_cmd
=
"<
$
START_CMD
$>
"
wall_time
=
"2:00:00"
# 你的ak/sk(可在paddlecloud web页面【个人中心】处获取)
ak
=
<
$
AK
$>
sk
=
<
$
SK
$>
paddlecloud job
--ak
${
ak
}
--sk
${
sk
}
\
train
\
--job-name
${
job_name
}
\
--mpi-priority
<
$
MPI_PRIORITY
$>
\
--group-name
${
group_name
}
\
--mpi-wall-time
${
wall_time
}
\
--mpi-nodes
<
$
MPI_NODES
$>
\
--is-standalone
0
\
--permission
group
\
--job-version
${
job_version
}
\
--job-conf
config.ini
\
--start-cmd
"
${
start_cmd
}
"
\
--files
./
*
\
--json
core/engine/cluster/cluster.py
浏览文件 @
15c57177
...
...
@@ -18,6 +18,7 @@ from __future__ import unicode_literals
import
copy
import
os
import
subprocess
import
warnings
from
paddlerec.core.engine.engine
import
Engine
from
paddlerec.core.factory
import
TrainerFactory
...
...
@@ -26,24 +27,35 @@ from paddlerec.core.utils import envs
class
ClusterEngine
(
Engine
):
def
__init_impl__
(
self
):
self
.
role
=
envs
.
get_runtime_environ
(
"engine_role"
)
if
self
.
role
==
"WORKER"
:
return
abs_dir
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
os
.
environ
[
"abs_dir"
]
=
str
(
abs_dir
)
backend
=
envs
.
get_runtime_environ
(
"engine_
backend"
)
if
not
backend
:
backend
=
""
backend
=
backend
.
upper
()
if
backend
==
"PADDLECLOUD"
:
self
.
backend
=
envs
.
get_runtime_environ
(
"
backend"
)
if
not
self
.
backend
:
self
.
backend
=
""
self
.
backend
=
self
.
backend
.
upper
()
if
self
.
backend
==
"PADDLECLOUD"
:
self
.
submit_script
=
os
.
path
.
join
(
abs_dir
,
"cloud/cluster.sh"
)
elif
backend
==
"KUBERNETES"
:
elif
self
.
backend
==
"KUBERNETES"
:
self
.
submit_script
=
os
.
path
.
join
(
abs_dir
,
"k8s/cluster.sh"
)
else
:
raise
ValueError
(
"{} can not be supported now"
.
format
(
backend
))
raise
ValueError
(
"{} can not be supported now"
.
format
(
self
.
backend
))
def
start_worker_procs
(
self
):
trainer
=
TrainerFactory
.
create
(
self
.
trainer
)
trainer
.
run
()
def
start_master_procs
(
self
):
if
self
.
backend
==
"PADDLECLOUD"
:
self
.
paddlecloud_env_check
()
elif
self
.
backend
==
"KUBERNETES"
:
self
.
kubernetes_env_check
()
default_env
=
os
.
environ
.
copy
()
current_env
=
copy
.
copy
(
default_env
)
current_env
.
pop
(
"http_proxy"
,
None
)
...
...
@@ -55,21 +67,229 @@ class ClusterEngine(Engine):
@
staticmethod
def
workspace_replace
():
workspace
=
envs
.
get_runtime_environ
(
"
engine_
workspace"
)
workspace
=
envs
.
get_runtime_environ
(
"workspace"
)
for
k
,
v
in
os
.
environ
.
items
():
v
=
v
.
replace
(
"{workspace}"
,
workspace
)
os
.
environ
[
k
]
=
str
(
v
)
def
run
(
self
):
role
=
envs
.
get_runtime_environ
(
"engine_role"
)
if
role
==
"MASTER"
:
if
self
.
role
==
"MASTER"
:
self
.
start_master_procs
()
elif
role
==
"WORKER"
:
elif
self
.
role
==
"WORKER"
:
self
.
start_worker_procs
()
else
:
raise
ValueError
(
"role {} error, must in MASTER/WORKER"
.
format
(
role
))
self
.
role
))
def
paddlecloud_env_check
(
self
):
# get fleet mode
fleet_mode
=
envs
.
get_runtime_environ
(
"fleet_mode"
)
# get device
device
=
envs
.
get_runtime_environ
(
"device"
)
# get cluster type
cluster_type
=
envs
.
get_runtime_environ
(
"cluster_type"
)
cluster_env_check_tool
=
None
if
cluster_type
.
upper
()
==
"MPI"
:
if
device
==
"CPU"
and
fleet_mode
==
"PS"
:
cluster_env_check_tool
=
PaddleCloudMpiEnv
()
else
:
raise
ValueError
(
"Paddlecloud with Mpi don't support GPU training, check your config"
)
elif
cluster_type
.
upper
()
==
"K8S"
:
if
fleet_mode
==
"PS"
:
if
device
==
"CPU"
:
raise
ValueError
(
"PS-CPU on paddlecloud is not supported at this time, comming soon"
)
elif
device
==
"GPU"
:
raise
ValueError
(
"PS-GPU on paddlecloud is not supported at this time, comming soon"
)
if
fleet_mode
==
"COLLECTIVE"
:
if
device
==
"GPU"
:
cluster_env_check_tool
=
CloudCollectiveEnv
()
elif
device
==
"CPU"
:
raise
ValueError
(
"Unexpected config -> device: CPU with fleet_mode: Collective, check your config"
)
else
:
raise
ValueError
(
"cluster_type {} error, must in MPI/K8S"
.
format
(
cluster_type
))
cluster_env_check_tool
.
env_check
()
cluster_env_check_tool
.
env_set
()
def
kubernetes_env_check
(
self
):
pass
class
ClusterEnvBase
(
object
):
def
__init__
(
self
):
# get backend env
backend_yaml
=
envs
.
get_runtime_environ
(
"backend_yaml"
)
_env
=
envs
.
load_yaml
(
backend_yaml
)
self
.
backend_env
=
envs
.
flatten_environs
(
_env
,
"."
)
self
.
cluster_env
=
{}
def
env_check
(
self
):
# check common env
# fs_name & fs_ugi
self
.
cluster_env
[
"FS_NAME"
]
=
self
.
backend_env
.
get
(
"config.fs_name"
,
""
)
self
.
cluster_env
[
"FS_UGI"
]
=
self
.
backend_env
.
get
(
"config.fs_ugi"
,
""
)
if
self
.
cluster_env
[
"FS_NAME"
]
==
""
or
self
.
cluster_env
[
"FS_UGI"
]
==
""
:
raise
ValueError
(
"No -- FS_UGI or FS_NAME -- found in your backend.yaml, please check."
)
# output_path
self
.
cluster_env
[
"OUTPUT_PATH"
]
=
self
.
backend_env
.
get
(
"config.output_path"
,
""
)
if
self
.
cluster_env
[
"OUTPUT_PATH"
]
==
""
:
warnings
.
warn
(
"Job output_path not set! Please check your backend yaml."
,
category
=
UserWarning
,
stacklevel
=
2
)
# paddle_version
self
.
cluster_env
[
"PADDLE_VERSION"
]
=
self
.
backend_env
.
get
(
"config.paddle_version"
,
"1.7.2"
)
# communicator
self
.
cluster_env
[
"FLAGS_communicator_is_sgd_optimizer"
]
=
self
.
backend_env
.
get
(
"config.communicator.FLAGS_communicator_is_sgd_optimizer"
,
0
)
self
.
cluster_env
[
"FLAGS_communicator_send_queue_size"
]
=
self
.
backend_env
.
get
(
"config.communicator.FLAGS_communicator_send_queue_size"
,
5
)
self
.
cluster_env
[
"FLAGS_communicator_thread_pool_size"
]
=
self
.
backend_env
.
get
(
"config.communicator.FLAGS_communicator_thread_pool_size"
,
32
)
self
.
cluster_env
[
"FLAGS_communicator_max_merge_var_num"
]
=
self
.
backend_env
.
get
(
"config.communicator.FLAGS_communicator_max_merge_var_num"
,
5
)
self
.
cluster_env
[
"FLAGS_communicator_max_send_grad_num_before_recv"
]
=
self
.
backend_env
.
get
(
"config.communicator.FLAGS_communicator_max_send_grad_num_before_recv"
,
5
)
self
.
cluster_env
[
"FLAGS_communicator_fake_rpc"
]
=
self
.
backend_env
.
get
(
"config.communicator.FLAGS_communicator_fake_rpc"
,
0
)
self
.
cluster_env
[
"FLAGS_rpc_retry_times"
]
=
self
.
backend_env
.
get
(
"config.communicator.FLAGS_rpc_retry_times"
,
3
)
# ak & sk
self
.
cluster_env
[
"AK"
]
=
self
.
backend_env
.
get
(
"submit.ak"
,
""
)
self
.
cluster_env
[
"SK"
]
=
self
.
backend_env
.
get
(
"submit.sk"
,
""
)
if
self
.
cluster_env
[
"AK"
]
==
""
or
self
.
cluster_env
[
"SK"
]
==
""
:
raise
ValueError
(
"No -- AK or SK -- found in your backend.yaml, please check."
)
# priority
self
.
cluster_env
[
"PRIORITY"
]
=
self
.
backend_env
.
get
(
"submit.priority"
,
"high"
)
# job name
self
.
cluster_env
[
"JOB_NAME"
]
=
self
.
backend_env
.
get
(
"submit.job_name"
,
"PaddleRecClusterJob"
)
# group
self
.
cluster_env
[
"GROUP_NAME"
]
=
self
.
backend_env
.
get
(
"submit.group"
,
"paddle"
)
# start_cmd
self
.
cluster_env
[
"START_CMD"
]
=
self
.
backend_env
.
get
(
"submit.start_cmd"
,
"python -m paddlerec.run -m config.yaml"
)
# files
self
.
cluster_env
[
"FILES"
]
=
self
.
backend_env
.
get
(
"submit.files"
,
""
)
if
self
.
cluster_env
[
"FILES"
]
==
""
:
raise
ValueError
(
"No -- files -- found in your backend.yaml, please check."
)
def
env_set
(
self
):
envs
.
set_runtime_environs
(
self
.
cluster_env
)
flattens
=
envs
.
flatten_environs
(
self
.
cluster_env
)
print
(
envs
.
pretty_print_envs
(
flattens
,
(
"Cluster Envs"
,
"Value"
)))
class
PaddleCloudMpiEnv
(
ClusterEnvBase
):
def
__init__
(
self
):
super
(
PaddleCloudMpiEnv
,
self
).
__init__
()
def
env_check
(
self
):
super
(
PaddleCloudMpiEnv
,
self
).
env_check
()
# check mpi env
self
.
cluster_env
[
"DISTRIBUTE_MODE"
]
=
"PS_CPU_MPI"
# train_data_path
self
.
cluster_env
[
"TRAIN_DATA_PATH"
]
=
self
.
backend_env
.
get
(
"config.train_data_path"
,
""
)
if
self
.
cluster_env
[
"TRAIN_DATA_PATH"
]
==
""
:
raise
ValueError
(
"No -- TRAIN_DATA_PATH -- found in your backend.yaml, please check."
)
# test_data_path
self
.
cluster_env
[
"TEST_DATA_PATH"
]
=
self
.
backend_env
.
get
(
"config.test_data_path"
,
""
)
if
self
.
cluster_env
[
"TEST_DATA_PATH"
]
==
""
:
warnings
.
warn
(
"Job test_data_path not set! Please check your backend yaml."
,
category
=
UserWarning
,
stacklevel
=
2
)
# thirdparty_path
self
.
cluster_env
[
"THIRDPARTY_PATH"
]
=
self
.
backend_env
.
get
(
"config.thirdparty_path"
,
""
)
if
self
.
cluster_env
[
"THIRDPARTY_PATH"
]
==
""
:
warnings
.
warn
(
"Job thirdparty_path not set! Please check your backend yaml."
,
category
=
UserWarning
,
stacklevel
=
2
)
# nodes
self
.
cluster_env
[
"MPI_NODES"
]
=
self
.
backend_env
.
get
(
"submit.nodes"
,
1
)
class
PaddleCloudK8sEnv
(
ClusterEnvBase
):
def
__init__
(
self
):
super
(
PaddleCloudK8sEnv
,
self
).
__init__
()
def
env_check
(
self
):
super
(
PaddleCloudK8sEnv
,
self
).
env_check
()
# check afs_remote_mount_point
self
.
cluster_env
[
"AFS_REMOTE_MOUNT_POINT"
]
=
self
.
backend_env
.
get
(
"config.afs_remote_mount_point"
,
""
)
if
self
.
cluster_env
[
"AFS_REMOTE_MOUNT_POINT"
]
==
""
:
warnings
.
warn
(
"Job afs_remote_mount_point not set! Please check your backend yaml."
,
category
=
UserWarning
,
stacklevel
=
2
)
warnings
.
warn
(
"The remote mount point will be mounted to the ./afs/"
,
category
=
UserWarning
,
stacklevel
=
2
)
class
CloudCollectiveEnv
(
PaddleCloudK8sEnv
):
def
__init__
(
self
):
super
(
CloudCollectiveEnv
,
self
).
__init__
()
def
env_check
(
self
):
super
(
CloudCollectiveEnv
,
self
).
env_check
()
self
.
cluster_env
[
"DISTRIBUTE_MODE"
]
=
"COLLECTIVE_GPU_K8S"
self
.
cluster_env
[
"K8S_TRAINERS"
]
=
self
.
backend_env
.
get
(
"submit.k8s_trainers"
,
1
)
self
.
cluster_env
[
"K8S_GPU_CARD"
]
=
self
.
backend_env
.
get
(
"submit.k8s_gpu_card"
,
1
)
self
.
cluster_env
[
"K8S_CPU_CORES"
]
=
self
.
backend_env
.
get
(
"submit.k8s_cpu_cores"
,
1
)
core/trainers/framework/dataset.py
浏览文件 @
15c57177
...
...
@@ -118,6 +118,7 @@ class QueueDataset(DatasetBase):
dataset
.
set_batch_size
(
batch_size
)
dataset
.
set_pipe_command
(
pipe_cmd
)
train_data_path
=
envs
.
get_global_env
(
name
+
"data_path"
)
file_list
=
[
os
.
path
.
join
(
train_data_path
,
x
)
for
x
in
os
.
listdir
(
train_data_path
)
...
...
@@ -125,7 +126,7 @@ class QueueDataset(DatasetBase):
if
context
[
"engine"
]
==
EngineMode
.
LOCAL_CLUSTER
:
file_list
=
split_files
(
file_list
,
context
[
"fleet"
].
worker_index
(),
context
[
"fleet"
].
worker_num
())
print
(
"File_list: {}"
.
format
(
file_list
))
dataset
.
set_filelist
(
file_list
)
for
model_dict
in
context
[
"phases"
]:
if
model_dict
[
"dataset_name"
]
==
dataset_name
:
...
...
core/utils/dataloader_instance.py
浏览文件 @
15c57177
...
...
@@ -42,7 +42,7 @@ def dataloader_by_name(readerclass,
if
context
[
"engine"
]
==
EngineMode
.
LOCAL_CLUSTER
:
files
=
split_files
(
files
,
context
[
"fleet"
].
worker_index
(),
context
[
"fleet"
].
worker_num
())
print
(
"file_list : {}"
.
format
(
files
))
print
(
"file_list : {}"
.
format
(
files
))
reader
=
reader_class
(
yaml_file
)
reader
.
init
()
...
...
models/rank/dnn/backend.yaml
0 → 100644
浏览文件 @
15c57177
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
workspace
:
"
./"
backend
:
"
PaddleCloud"
cluster_type
:
k8s
# k8s 可选
config
:
fs_name
:
"
afs://xxx.com"
fs_ugi
:
"
usr,pwd"
output_path
:
"
"
# 填远程地址,如afs:/user/your/path/ 则此处填 /user/your/path
# for mpi
train_data_path
:
"
"
# 填远程地址,如afs:/user/your/path/ 则此处填 /user/your/path
test_data_path
:
"
"
# 填远程地址,如afs:/user/your/path/ 则此处填 /user/your/path
thirdparty_path
:
"
"
# 填远程地址,如afs:/user/your/path/ 则此处填 /user/your/path
paddle_version
:
"
1.7.2"
# 填写paddle官方版本号 >= 1.7.2
# for k8s
afs_remote_mount_point
:
"
"
# 填远程地址,如afs:/user/your/path/ 则此处填 /user/your/path
# paddle分布式底层超参,无特殊需求不理不改
communicator
:
FLAGS_communicator_is_sgd_optimizer
:
0
FLAGS_communicator_send_queue_size
:
5
FLAGS_communicator_thread_pool_size
:
32
FLAGS_communicator_max_merge_var_num
:
5
FLAGS_communicator_max_send_grad_num_before_recv
:
5
FLAGS_communicator_fake_rpc
:
0
FLAGS_rpc_retry_times
:
3
submit
:
ak
:
"
"
sk
:
"
"
priority
:
"
high"
job_name
:
"
PaddleRec_CTR"
group
:
"
"
start_cmd
:
"
python
-m
paddlerec.run
-m
./config.yaml"
files
:
./*.py ./*.yaml
# for mpi ps-cpu
nodes
:
2
# for k8s gpu
k8s_trainers
:
2
k8s_gpu_card
:
1
models/rank/dnn/config.yaml
浏览文件 @
15c57177
...
...
@@ -80,6 +80,28 @@ runner:
init_model_path
:
"
increment_dnn"
# load model path
phases
:
[
phase2
]
-
name
:
ps_cluster
class
:
cluster_train
epochs
:
2
device
:
cpu
fleet_mode
:
ps
save_checkpoint_interval
:
1
# save model interval of epochs
save_checkpoint_path
:
"
increment_dnn"
# save checkpoint path
init_model_path
:
"
"
# load model path
print_interval
:
1
phases
:
[
phase1
]
-
name
:
collective_cluster
class
:
cluster_train
epochs
:
2
device
:
gpu
fleet_mode
:
collective
save_checkpoint_interval
:
1
# save model interval of epochs
save_checkpoint_path
:
"
increment_dnn"
# save checkpoint path
init_model_path
:
"
"
# load model path
print_interval
:
1
phases
:
[
phase1
]
# runner will run all the phase in each epoch
phase
:
-
name
:
phase1
...
...
run.py
浏览文件 @
15c57177
...
...
@@ -38,7 +38,7 @@ def engine_registry():
engines
[
"TRANSPILER"
][
"TRAIN"
]
=
single_train_engine
engines
[
"TRANSPILER"
][
"INFER"
]
=
single_infer_engine
engines
[
"TRANSPILER"
][
"LOCAL_CLUSTER_TRAIN"
]
=
local_cluster_engine
engines
[
"TRANSPILER"
][
"CLUSTER"
]
=
cluster_engine
engines
[
"TRANSPILER"
][
"CLUSTER
_TRAIN
"
]
=
cluster_engine
engines
[
"PSLIB"
][
"TRAIN"
]
=
local_mpi_engine
engines
[
"PSLIB"
][
"LOCAL_CLUSTER_TRAIN"
]
=
local_mpi_engine
engines
[
"PSLIB"
][
"CLUSTER_TRAIN"
]
=
cluster_mpi_engine
...
...
@@ -111,8 +111,8 @@ def get_engine(args, running_config, mode):
engine
=
running_config
.
get
(
engine_class
,
None
)
if
engine
is
None
:
raise
ValueError
(
"not find {} in
yaml
, please check"
.
format
(
mode
,
engine_class
))
raise
ValueError
(
"not find {} in
engine_class
, please check"
.
format
(
engine
))
device
=
running_config
.
get
(
engine_device
,
None
)
engine
=
engine
.
upper
()
...
...
@@ -262,15 +262,48 @@ def single_infer_engine(args):
def
cluster_engine
(
args
):
def
master
():
from
paddlerec.core.engine.cluster.cluster
import
ClusterEngine
_envs
=
envs
.
load_yaml
(
args
.
backend
)
flattens
=
envs
.
flatten_environs
(
_envs
,
"_"
)
# Get fleet_mode & device
run_extras
=
get_all_inters_from_yaml
(
args
.
model
,
[
"runner."
])
mode
=
envs
.
get_runtime_environ
(
"mode"
)
fleet_class
=
"."
.
join
([
"runner"
,
mode
,
"fleet_mode"
])
device_class
=
"."
.
join
([
"runner"
,
mode
,
"device"
])
fleet_mode
=
run_extras
.
get
(
fleet_class
,
"ps"
)
device
=
run_extras
.
get
(
device_class
,
"cpu"
)
device
=
device
.
upper
()
fleet_mode
=
fleet_mode
.
upper
()
if
fleet_mode
==
"COLLECTIVE"
and
device
!=
"GPU"
:
raise
ValueError
(
"COLLECTIVE can not be used without GPU"
)
# Get Thread nums
model_envs
=
envs
.
load_yaml
(
args
.
model
)
phases_class
=
"."
.
join
([
"runner"
,
mode
,
"phases"
])
phase_names
=
run_extras
.
get
(
phases_class
)
phases
=
[]
all_phases
=
model_envs
.
get
(
"phase"
)
if
phase_names
is
None
:
phases
=
all_phases
else
:
for
phase
in
all_phases
:
if
phase
[
"name"
]
in
phase_names
:
phases
.
append
(
phase
)
thread_num
=
[]
for
phase
in
phases
:
thread_num
.
append
(
int
(
phase
[
"thread_num"
]))
max_thread_num
=
max
(
thread_num
)
backend_envs
=
envs
.
load_yaml
(
args
.
backend
)
flattens
=
envs
.
flatten_environs
(
backend_envs
,
"_"
)
flattens
[
"engine_role"
]
=
"MASTER"
flattens
[
"engine_mode"
]
=
envs
.
get_runtime_environ
(
"mode"
)
flattens
[
"engine_run_config"
]
=
args
.
model
flattens
[
"engine_temp_path"
]
=
tempfile
.
mkdtemp
()
flattens
[
"max_thread_num"
]
=
max_thread_num
flattens
[
"fleet_mode"
]
=
fleet_mode
flattens
[
"device"
]
=
device
flattens
[
"backend_yaml"
]
=
args
.
backend
envs
.
set_runtime_environs
(
flattens
)
ClusterEngine
.
workspace_replace
()
print
(
envs
.
pretty_print_envs
(
flattens
,
(
"Submit Envs"
,
"Value"
)))
launch
=
ClusterEngine
(
None
,
args
.
model
)
return
launch
...
...
@@ -278,40 +311,29 @@ def cluster_engine(args):
def
worker
(
mode
):
if
not
mode
:
raise
ValueError
(
"mode: {} can not be recognized"
)
from
paddlerec.core.engine.cluster.cluster
import
ClusterEngine
run_extras
=
get_all_inters_from_yaml
(
args
.
model
,
[
"runner."
])
trainer_class
=
"."
.
join
([
"runner"
,
mode
,
"trainer_class"
])
fleet_class
=
"."
.
join
([
"runner"
,
mode
,
"fleet_mode"
])
device_class
=
"."
.
join
([
"runner"
,
mode
,
"device"
])
selected_gpus_class
=
"."
.
join
([
"runner"
,
mode
,
"selected_gpus"
])
strategy_class
=
"."
.
join
([
"runner"
,
mode
,
"distribute_strategy"
])
worker_class
=
"."
.
join
([
"runner"
,
mode
,
"worker_num"
])
server_class
=
"."
.
join
([
"runner"
,
mode
,
"server_num"
])
trainer
=
run_extras
.
get
(
trainer_class
,
"GeneralTrainer"
)
fleet_mode
=
run_extras
.
get
(
fleet_class
,
"ps"
)
device
=
run_extras
.
get
(
device_class
,
"cpu"
)
selected_gpus
=
run_extras
.
get
(
selected_gpus_class
,
"0"
)
distributed_strategy
=
run_extras
.
get
(
strategy_class
,
"async"
)
worker_num
=
run_extras
.
get
(
worker_class
,
1
)
server_num
=
run_extras
.
get
(
server_class
,
1
)
executor_mode
=
"train"
device
=
device
.
upper
()
fleet_mode
=
fleet_mode
.
upper
()
if
fleet_mode
==
"COLLECTIVE"
and
device
!=
"GPU"
:
raise
ValueError
(
"COLLECTIVE can not be used with GPU"
)
raise
ValueError
(
"COLLECTIVE can not be used with
out
GPU"
)
cluster_envs
=
{}
if
device
==
"GPU"
:
cluster_envs
[
"selected_gpus"
]
=
selected_gpus
cluster_envs
[
"server_num"
]
=
server_num
cluster_envs
[
"worker_num"
]
=
worker_num
cluster_envs
[
"fleet_mode"
]
=
fleet_mode
cluster_envs
[
"engine_role"
]
=
"WORKER"
cluster_envs
[
"train.trainer.trainer"
]
=
trainer
cluster_envs
[
"train.trainer.engine"
]
=
"cluster"
cluster_envs
[
"train.trainer.executor_mode"
]
=
executor_mode
...
...
@@ -321,15 +343,15 @@ def cluster_engine(args):
cluster_envs
[
"train.trainer.platform"
]
=
envs
.
get_platform
()
print
(
"launch {} engine with cluster to with model: {}"
.
format
(
trainer
,
args
.
model
))
set_runtime_envs
(
cluster_envs
,
args
.
model
)
trainer
=
TrainerFactory
.
create
(
args
.
model
)
return
trainer
set_runtime_envs
(
cluster_envs
,
args
.
model
)
launch
=
ClusterEngine
(
None
,
args
.
model
)
return
launch
role
=
os
.
getenv
(
"PADDLE_PADDLEREC_ROLE"
,
"MASTER"
)
if
role
==
"WORKER"
:
mode
=
os
.
getenv
(
"
PADDLE_PADDLEREC_MODE
"
,
None
)
mode
=
os
.
getenv
(
"
mode
"
,
None
)
return
worker
(
mode
)
else
:
return
master
()
...
...
setup.py
浏览文件 @
15c57177
# coding=utf8
# -*- coding: utf-8 -*-
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
...
...
@@ -69,7 +70,7 @@ def build(dirname):
'Criteo_data/sample_data/train/*'
]
engine_copy
=
[
'*/*.sh'
]
engine_copy
=
[
'*/*.sh'
,
'*/*.template'
]
for
package
in
packages
:
if
package
.
startswith
(
"paddlerec.models."
):
package_data
[
package
]
=
models_copy
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录