Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
PaddleRec
提交
9438cb36
P
PaddleRec
项目概览
BaiXuePrincess
/
PaddleRec
与 Fork 源项目一致
Fork自
PaddlePaddle / PaddleRec
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleRec
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
9438cb36
编写于
5月 13, 2020
作者:
T
tangwei12
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
bug fix
上级
77a2da6d
变更
5
隐藏空白更改
内联
并排
Showing 5 changed files with 28 additions and 14 deletions
+28
-14
fleet_rec/core/trainers/cluster_trainer.py
fleet_rec/core/trainers/cluster_trainer.py
+3
-1
fleet_rec/core/trainers/transpiler_trainer.py
fleet_rec/core/trainers/transpiler_trainer.py
+2
-1
fleet_rec/run.py
fleet_rec/run.py
+4
-4
models/rank/dnn/submit.sh
models/rank/dnn/submit.sh
+2
-2
models/rank/dnn/worker.sh
models/rank/dnn/worker.sh
+17
-6
未找到文件。
fleet_rec/core/trainers/cluster_trainer.py
浏览文件 @
9438cb36
...
...
@@ -23,6 +23,7 @@ import paddle.fluid as fluid
from
paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler
import
fleet
from
paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy
import
StrategyFactory
from
paddle.fluid.incubate.fleet.base.role_maker
import
PaddleCloudRoleMaker
from
paddle.fluid.incubate.fleet.base.role_maker
import
MPISymetricRoleMaker
from
fleetrec.core.utils
import
envs
from
fleetrec.core.trainers.transpiler_trainer
import
TranspileTrainer
...
...
@@ -30,7 +31,8 @@ from fleetrec.core.trainers.transpiler_trainer import TranspileTrainer
class
ClusterTrainer
(
TranspileTrainer
):
def
processor_register
(
self
):
role
=
PaddleCloudRoleMaker
()
#role = PaddleCloudRoleMaker()
role
=
MPISymetricRoleMaker
()
fleet
.
init
(
role
)
if
fleet
.
is_server
():
...
...
fleet_rec/core/trainers/transpiler_trainer.py
浏览文件 @
9438cb36
...
...
@@ -72,7 +72,8 @@ class TranspileTrainer(Trainer):
train_data_path
=
envs
.
get_global_env
(
"test_data_path"
,
None
,
namespace
)
threads
=
int
(
envs
.
get_runtime_environ
(
"train.trainer.threads"
))
#threads = int(envs.get_runtime_environ("train.trainer.threads"))
threads
=
2
batch_size
=
envs
.
get_global_env
(
"batch_size"
,
None
,
namespace
)
reader_class
=
envs
.
get_global_env
(
"class"
,
None
,
namespace
)
abs_dir
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
...
...
fleet_rec/run.py
浏览文件 @
9438cb36
...
...
@@ -110,7 +110,6 @@ def single_engine(args):
def
cluster_engine
(
args
):
from
fleetrec.core.engine.cluster.cluster
import
ClusterEngine
def
update_workspace
(
cluster_envs
):
workspace
=
cluster_envs
.
get
(
"engine_workspace"
,
None
)
...
...
@@ -131,6 +130,7 @@ def cluster_engine(args):
cluster_envs
[
name
]
=
value
def
master
():
from
fleetrec.core.engine.cluster.cluster
import
ClusterEngine
with
open
(
args
.
backend
,
'r'
)
as
rb
:
_envs
=
yaml
.
load
(
rb
.
read
(),
Loader
=
yaml
.
FullLoader
)
...
...
@@ -155,10 +155,10 @@ def cluster_engine(args):
print
(
"launch {} engine with cluster to with model: {}"
.
format
(
trainer
,
args
.
model
))
set_runtime_envs
(
cluster_envs
,
args
.
model
)
launch
=
LocalClusterEngine
(
cluster_envs
,
args
.
model
)
return
launch
trainer
=
TrainerFactory
.
create
(
args
.
model
)
return
trainer
if
args
.
role
==
"
worker
"
:
if
args
.
role
==
"
WORKER
"
:
return
worker
()
else
:
return
master
()
...
...
models/rank/dnn/submit.sh
浏览文件 @
9438cb36
...
...
@@ -29,8 +29,8 @@ function package() {
cp
${
engine_submit_qconf
}
${
temp
}
echo
"copy job.sh from "
${
engine_worker
}
" to "
${
temp
}
mkdir
-p
${
temp
}
/package
/python
cp
-r
${
engine_package_python
}
/
*
${
temp
}
/package/python
/
mkdir
-p
${
temp
}
/package
cp
-r
${
engine_package_python
}
${
temp
}
/package
/
echo
"copy python from "
${
engine_package_python
}
" to "
${
temp
}
mkdir
${
temp
}
/package/whl
...
...
models/rank/dnn/worker.sh
浏览文件 @
9438cb36
...
...
@@ -16,10 +16,10 @@ declare g_run_stage=""
# ---------------------------------------------------------------------------- #
# const define #
# ---------------------------------------------------------------------------- #
declare
-r
FLAGS_communicator_thread_pool_size
=
5
declare
-r
FLAGS_communicator_send_queue_size
=
18
declare
-r
FLAGS_communicator_thread_pool_size
=
20
declare
-r
FLAGS_communicator_max_merge_var_num
=
18
export
FLAGS_communicator_thread_pool_size
=
5
export
FLAGS_communicator_send_queue_size
=
18
export
FLAGS_communicator_thread_pool_size
=
20
export
FLAGS_communicator_max_merge_var_num
=
18
################################################################################
#-----------------------------------------------------------------------------------------------------------------
...
...
@@ -44,9 +44,20 @@ function env_prepare() {
WORKDIR
=
$(
pwd
)
mpirun
-npernode
1
mv
package/
*
./
echo
"current:"
$WORKDIR
export
LIBRARY_PATH
=
$WORKDIR
/python/lib:
$LIBRARY_PATH
mpirun
-npernode
1 python/bin/python
-m
pip
install
whl/fleet_rec-0.0.2-py2-none-any.whl
--index-url
=
http://pip.baidu.com/pypi/simple
--trusted-host
pip.baidu.com
>
/dev/null
mpirun
-npernode
1
tar
-zxvf
python.tar.gz
>
/dev/null
export
PYTHONPATH
=
$WORKDIR
/python/
export
PYTHONROOT
=
$WORKDIR
/python/
export
LIBRARY_PATH
=
$PYTHONPATH
/lib:
$LIBRARY_PATH
export
LD_LIBRARY_PATH
=
$PYTHONPATH
/lib:
$LD_LIBRARY_PATH
export
PATH
=
$PYTHONPATH
/bin:
$PATH
export
LIBRARY_PATH
=
$PYTHONROOT
/lib:
$LIBRARY_PATH
python
-c
"print('heheda')"
mpirun
-npernode
1 python/bin/python
-m
pip uninstall
-y
fleet-rec
mpirun
-npernode
1 python/bin/python
-m
pip
install
whl/fleet_rec-0.0.2-py2-none-any.whl
--index-url
=
http://pip.baidu.com/pypi/simple
--trusted-host
pip.baidu.com
check_error
}
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录