Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
PaddleRec
提交
42347db7
P
PaddleRec
项目概览
BaiXuePrincess
/
PaddleRec
与 Fork 源项目一致
Fork自
PaddlePaddle / PaddleRec
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleRec
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
42347db7
编写于
4月 20, 2020
作者:
T
tangwei
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix bug
上级
eeaf9942
变更
7
隐藏空白更改
内联
并排
Showing
7 changed file
with
27 addition
and
10 deletion
+27
-10
fleetrec/core/factory.py
fleetrec/core/factory.py
+1
-1
fleetrec/core/utils/envs.py
fleetrec/core/utils/envs.py
+11
-1
fleetrec/examples/runtime.yaml
fleetrec/examples/runtime.yaml
+2
-1
fleetrec/examples/user_define/__init__.py
fleetrec/examples/user_define/__init__.py
+0
-0
fleetrec/examples/user_define/user_define_trainer.yaml
fleetrec/examples/user_define/user_define_trainer.yaml
+0
-2
fleetrec/examples/user_define_trainer.py
fleetrec/examples/user_define_trainer.py
+0
-0
fleetrec/run.py
fleetrec/run.py
+13
-5
未找到文件。
fleetrec/core/factory.py
浏览文件 @
42347db7
...
@@ -45,7 +45,7 @@ class TrainerFactory(object):
...
@@ -45,7 +45,7 @@ class TrainerFactory(object):
trainer_abs
=
trainers
.
get
(
train_mode
,
None
)
trainer_abs
=
trainers
.
get
(
train_mode
,
None
)
if
trainer_abs
is
None
:
if
trainer_abs
is
None
:
if
not
os
.
path
.
exists
(
train_mode
)
or
os
.
path
.
isfile
(
train_mode
):
if
not
os
.
path
.
exists
(
train_mode
)
or
not
os
.
path
.
isfile
(
train_mode
):
raise
ValueError
(
"trainer {} can not be recognized"
.
format
(
train_mode
))
raise
ValueError
(
"trainer {} can not be recognized"
.
format
(
train_mode
))
trainer_abs
=
train_mode
trainer_abs
=
train_mode
train_mode
=
"UserDefineTrainer"
train_mode
=
"UserDefineTrainer"
...
...
fleetrec/core/utils/envs.py
浏览文件 @
42347db7
...
@@ -21,8 +21,18 @@ global_envs = {}
...
@@ -21,8 +21,18 @@ global_envs = {}
def
set_runtime_envions
(
envs
):
def
set_runtime_envions
(
envs
):
assert
isinstance
(
envs
,
dict
)
assert
isinstance
(
envs
,
dict
)
def
fatten_env_namespace
(
namespace_nests
,
local_envs
):
for
k
,
v
in
local_envs
.
items
():
if
isinstance
(
v
,
dict
):
nests
=
copy
.
deepcopy
(
namespace_nests
)
nests
.
append
(
k
)
fatten_env_namespace
(
nests
,
v
)
else
:
global_k
=
"."
.
join
(
namespace_nests
+
[
k
])
os
.
environ
[
global_k
]
=
str
(
v
)
for
k
,
v
in
envs
.
items
():
for
k
,
v
in
envs
.
items
():
os
.
environ
[
k
]
=
str
(
v
)
fatten_env_namespace
([
k
],
v
)
def
get_runtime_envion
(
key
):
def
get_runtime_envion
(
key
):
...
...
fleetrec/examples/runtime.yaml
浏览文件 @
42347db7
...
@@ -3,10 +3,11 @@ trainer:
...
@@ -3,10 +3,11 @@ trainer:
threads
:
4
threads
:
4
# for cluster training
# for cluster training
strategy
:
"
async"
communicator
:
communicator
:
strategy
:
"
async"
send_queue_size
:
4
send_queue_size
:
4
min_send_grad_num_before_recv
:
4
min_send_grad_num_before_recv
:
4
thread_pool_size
:
5
thread_pool_size
:
5
max_merge_var_num
:
4
max_merge_var_num
:
4
fleetrec/examples/user_define/__init__.py
已删除
100644 → 0
浏览文件 @
eeaf9942
fleetrec/examples/user_define/user_define_trainer.yaml
已删除
100644 → 0
浏览文件 @
eeaf9942
trainer
:
"
UserDefineTrainer"
location
:
"
/root/FleetRec/fleetrec/examples/user_define_trainer.py"
fleetrec/examples/user_define
/user_define
_trainer.py
→
fleetrec/examples/user_define_trainer.py
浏览文件 @
42347db7
文件已移动
fleetrec/run.py
浏览文件 @
42347db7
...
@@ -21,8 +21,9 @@ def set_runtime_envs(cluster_envs, engine_yaml):
...
@@ -21,8 +21,9 @@ def set_runtime_envs(cluster_envs, engine_yaml):
if
cluster_envs
is
None
:
if
cluster_envs
is
None
:
cluster_envs
=
{}
cluster_envs
=
{}
cluster_envs
.
update
(
cluster_envs
)
cluster_envs
.
update
(
_envs
)
cluster_envs
.
update
(
_envs
)
envs
.
set_runtime_envions
(
cluster_envs
)
#
envs.set_runtime_envions(cluster_envs)
print
(
envs
.
pretty_print_envs
(
cluster_envs
,
(
"Runtime Envs"
,
"Value"
)))
print
(
envs
.
pretty_print_envs
(
cluster_envs
,
(
"Runtime Envs"
,
"Value"
)))
...
@@ -40,7 +41,10 @@ def get_engine(engine):
...
@@ -40,7 +41,10 @@ def get_engine(engine):
def
single_engine
(
args
):
def
single_engine
(
args
):
print
(
"use single engine to run model: {}"
.
format
(
args
.
model
))
print
(
"use single engine to run model: {}"
.
format
(
args
.
model
))
single_envs
=
{
"trainer.trainer"
:
"SingleTrainer"
,
"trainer.threads"
:
"2"
}
single_envs
=
{}
single_envs
[
"trainer.trainer"
]
=
"SingleTrainer"
single_envs
[
"trainer.threads"
]
=
"2"
set_runtime_envs
(
single_envs
,
args
.
engine_extras
)
set_runtime_envs
(
single_envs
,
args
.
engine_extras
)
trainer
=
TrainerFactory
.
create
(
args
.
model
)
trainer
=
TrainerFactory
.
create
(
args
.
model
)
return
trainer
return
trainer
...
@@ -49,7 +53,8 @@ def single_engine(args):
...
@@ -49,7 +53,8 @@ def single_engine(args):
def
cluster_engine
(
args
):
def
cluster_engine
(
args
):
print
(
"launch cluster engine with cluster to run model: {}"
.
format
(
args
.
model
))
print
(
"launch cluster engine with cluster to run model: {}"
.
format
(
args
.
model
))
cluster_envs
=
{
"trainer.trainer"
:
"ClusterTrainer"
}
cluster_envs
=
{}
cluster_envs
[
"trainer.trainer"
]
=
"ClusterTrainer"
set_runtime_envs
(
cluster_envs
,
args
.
engine_extras
)
set_runtime_envs
(
cluster_envs
,
args
.
engine_extras
)
envs
.
set_runtime_envions
(
cluster_envs
)
envs
.
set_runtime_envions
(
cluster_envs
)
...
@@ -60,7 +65,8 @@ def cluster_engine(args):
...
@@ -60,7 +65,8 @@ def cluster_engine(args):
def
cluster_mpi_engine
(
args
):
def
cluster_mpi_engine
(
args
):
print
(
"launch cluster engine with cluster to run model: {}"
.
format
(
args
.
model
))
print
(
"launch cluster engine with cluster to run model: {}"
.
format
(
args
.
model
))
cluster_envs
=
{
"trainer.trainer"
:
"CtrCodingTrainer"
}
cluster_envs
=
{}
cluster_envs
[
"trainer.trainer"
]
=
"CtrCodingTrainer"
set_runtime_envs
(
cluster_envs
,
args
.
engine_extras
)
set_runtime_envs
(
cluster_envs
,
args
.
engine_extras
)
trainer
=
TrainerFactory
.
create
(
args
.
model
)
trainer
=
TrainerFactory
.
create
(
args
.
model
)
...
@@ -77,7 +83,9 @@ def local_cluster_engine(args):
...
@@ -77,7 +83,9 @@ def local_cluster_engine(args):
cluster_envs
[
"start_port"
]
=
36001
cluster_envs
[
"start_port"
]
=
36001
cluster_envs
[
"log_dir"
]
=
"logs"
cluster_envs
[
"log_dir"
]
=
"logs"
cluster_envs
[
"trainer.trainer"
]
=
"ClusterTrainer"
cluster_envs
[
"trainer.trainer"
]
=
"ClusterTrainer"
cluster_envs
[
"trainer.strategy.mode"
]
=
"async"
cluster_envs
[
"trainer.strategy"
]
=
"async"
cluster_envs
[
"trainer.threads"
]
=
"2"
cluster_envs
[
"CPU_NUM"
]
=
"2"
set_runtime_envs
(
cluster_envs
,
args
.
engine_extras
)
set_runtime_envs
(
cluster_envs
,
args
.
engine_extras
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录