Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
一只大灰猫
PaddleRec
提交
2f1f76f3
P
PaddleRec
项目概览
一只大灰猫
/
PaddleRec
与 Fork 源项目一致
Fork自
PaddlePaddle / PaddleRec
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleRec
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
2f1f76f3
编写于
5月 06, 2020
作者:
C
chengmo
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
simple code
上级
a82671c4
变更
6
隐藏空白更改
内联
并排
Showing
6 changed file
with
115 addition
and
315 deletion
+115
-315
fleet_rec/core/trainers/cluster_trainer.py
fleet_rec/core/trainers/cluster_trainer.py
+11
-7
fleet_rec/core/trainers/single_trainer.py
fleet_rec/core/trainers/single_trainer.py
+6
-2
fleet_rec/core/trainers/tdm_cluster_trainer.py
fleet_rec/core/trainers/tdm_cluster_trainer.py
+58
-143
fleet_rec/core/trainers/tdm_single_trainer.py
fleet_rec/core/trainers/tdm_single_trainer.py
+0
-89
fleet_rec/run.py
fleet_rec/run.py
+32
-66
models/recall/tdm/config.yaml
models/recall/tdm/config.yaml
+8
-8
未找到文件。
fleet_rec/core/trainers/cluster_trainer.py
浏览文件 @
2f1f76f3
...
...
@@ -18,6 +18,7 @@ Training use fluid with one node only.
from
__future__
import
print_function
import
os
import
paddle.fluid
as
fluid
from
paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler
import
fleet
from
paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy
import
StrategyFactory
...
...
@@ -39,7 +40,7 @@ class ClusterTrainer(TranspileTrainer):
else
:
self
.
regist_context_processor
(
'uninit'
,
self
.
instance
)
self
.
regist_context_processor
(
'init_pass'
,
self
.
init
)
self
.
regist_context_processor
(
'startup_pass'
,
self
.
startup
)
if
envs
.
get_platform
()
==
"LINUX"
:
self
.
regist_context_processor
(
'train_pass'
,
self
.
dataset_train
)
else
:
...
...
@@ -71,9 +72,11 @@ class ClusterTrainer(TranspileTrainer):
def
init
(
self
,
context
):
self
.
model
.
train_net
()
optimizer
=
self
.
model
.
optimizer
()
optimizer_name
=
envs
.
get_global_env
(
"hyper_parameters.optimizer"
)
if
optimizer_name
in
[
'adam'
,
'ADAM'
,
'Adagrad'
,
'ADAGRAD'
]:
os
.
environ
[
"FLAGS_communicator_is_sgd_optimizer"
]
=
0
optimizer_name
=
envs
.
get_global_env
(
"hyper_parameters.optimizer"
,
None
,
"train.model"
)
if
optimizer_name
not
in
[
""
,
"sgd"
,
"SGD"
,
"Sgd"
]:
os
.
environ
[
"FLAGS_communicator_is_sgd_optimizer"
]
=
'0'
strategy
=
self
.
build_strategy
()
optimizer
=
fleet
.
distributed_optimizer
(
optimizer
,
strategy
)
optimizer
.
minimize
(
self
.
model
.
get_cost_op
())
...
...
@@ -89,16 +92,18 @@ class ClusterTrainer(TranspileTrainer):
if
metrics
:
self
.
fetch_vars
=
metrics
.
values
()
self
.
fetch_alias
=
metrics
.
keys
()
context
[
'status'
]
=
'
train
_pass'
context
[
'status'
]
=
'
startup
_pass'
def
server
(
self
,
context
):
fleet
.
init_server
()
fleet
.
run_server
()
context
[
'is_exit'
]
=
True
def
dataloader_train
(
self
,
context
):
def
startup
(
self
,
context
):
self
.
_exe
.
run
(
fleet
.
startup_program
)
context
[
'status'
]
=
'train_pass'
def
dataloader_train
(
self
,
context
):
fleet
.
init_worker
()
reader
=
self
.
_get_dataloader
()
...
...
@@ -144,7 +149,6 @@ class ClusterTrainer(TranspileTrainer):
context
[
'status'
]
=
'terminal_pass'
def
dataset_train
(
self
,
context
):
self
.
_exe
.
run
(
fleet
.
startup_program
)
fleet
.
init_worker
()
dataset
=
self
.
_get_dataset
()
...
...
fleet_rec/core/trainers/single_trainer.py
浏览文件 @
2f1f76f3
...
...
@@ -33,7 +33,7 @@ class SingleTrainer(TranspileTrainer):
def
processor_register
(
self
):
self
.
regist_context_processor
(
'uninit'
,
self
.
instance
)
self
.
regist_context_processor
(
'init_pass'
,
self
.
init
)
self
.
regist_context_processor
(
'startup_pass'
,
self
.
startup
)
if
envs
.
get_platform
()
==
"LINUX"
:
self
.
regist_context_processor
(
'train_pass'
,
self
.
dataset_train
)
else
:
...
...
@@ -55,10 +55,14 @@ class SingleTrainer(TranspileTrainer):
if
metrics
:
self
.
fetch_vars
=
metrics
.
values
()
self
.
fetch_alias
=
metrics
.
keys
()
context
[
'status'
]
=
'startup_pass'
def
startup
(
self
,
context
):
self
.
_exe
.
run
(
fluid
.
default_startup_program
())
context
[
'status'
]
=
'train_pass'
def
dataloader_train
(
self
,
context
):
self
.
_exe
.
run
(
fluid
.
default_startup_program
())
self
.
model
.
custom_preprocess
()
reader
=
self
.
_get_dataloader
()
...
...
fleet_rec/core/trainers/tdm_cluster_trainer.py
浏览文件 @
2f1f76f3
...
...
@@ -18,7 +18,7 @@ Training use fluid with one node only.
"""
from
__future__
import
print_function
import
logging
import
paddle.fluid
as
fluid
from
paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler
import
fleet
from
paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy
import
StrategyFactory
...
...
@@ -26,85 +26,28 @@ from paddle.fluid.incubate.fleet.base.role_maker import PaddleCloudRoleMaker
from
fleetrec.core.utils
import
envs
from
fleetrec.core.trainers.transpiler_trainer
import
TranspileTrainer
special_param
=
[
"TDM_Tree_Travel"
,
"TDM_Tree_Layer"
,
"TDM_Tree_Info"
]
class
TDMClusterTrainer
(
TranspileTrainer
):
def
processor_register
(
self
):
role
=
PaddleCloudRoleMaker
()
fleet
.
init
(
role
)
if
fleet
.
is_server
():
self
.
regist_context_processor
(
'uninit'
,
self
.
instance
)
self
.
regist_context_processor
(
'init_pass'
,
self
.
init
)
self
.
regist_context_processor
(
'server_pass'
,
self
.
server
)
else
:
self
.
regist_context_processor
(
'uninit'
,
self
.
instance
)
self
.
regist_context_processor
(
'init_pass'
,
self
.
init
)
self
.
regist_context_processor
(
'trainer_startup_pass'
,
self
.
trainer_startup
)
if
envs
.
get_platform
()
==
"LINUX"
:
self
.
regist_context_processor
(
'train_pass'
,
self
.
dataset_train
)
else
:
self
.
regist_context_processor
(
'train_pass'
,
self
.
dataloader_train
)
self
.
regist_context_processor
(
'terminal_pass'
,
self
.
terminal
)
def
build_strategy
(
self
):
mode
=
envs
.
get_runtime_environ
(
"train.trainer.strategy"
)
assert
mode
in
[
"async"
,
"geo"
,
"sync"
,
"half_async"
]
strategy
=
None
if
mode
==
"async"
:
strategy
=
StrategyFactory
.
create_async_strategy
()
elif
mode
==
"geo"
:
push_num
=
envs
.
get_global_env
(
"train.strategy.mode.push_num"
,
100
)
strategy
=
StrategyFactory
.
create_geo_strategy
(
push_num
)
elif
mode
==
"sync"
:
strategy
=
StrategyFactory
.
create_sync_strategy
()
elif
mode
==
"half_async"
:
strategy
=
StrategyFactory
.
create_half_async_strategy
()
assert
strategy
is
not
None
self
.
strategy
=
strategy
return
strategy
def
init
(
self
,
context
):
self
.
model
.
train_net
()
optimizer
=
self
.
model
.
optimizer
()
optimizer_name
=
envs
.
get_global_env
(
"hyper_parameters.optimizer"
)
if
optimizer_name
in
[
'adam'
,
'ADAM'
,
'Adagrad'
,
'ADAGRAD'
]:
os
.
environ
[
"FLAGS_communicator_is_sgd_optimizer"
]
=
0
strategy
=
self
.
build_strategy
()
optimizer
=
fleet
.
distributed_optimizer
(
optimizer
,
strategy
)
optimizer
.
minimize
(
self
.
model
.
get_cost_op
())
if
fleet
.
is_server
():
context
[
'status'
]
=
'server_pass'
else
:
self
.
fetch_vars
=
[]
self
.
fetch_alias
=
[]
self
.
fetch_period
=
self
.
model
.
get_fetch_period
()
logging
.
basicConfig
(
format
=
"%(asctime)s - %(levelname)s - %(message)s"
)
logger
=
logging
.
getLogger
(
"fluid"
)
logger
.
setLevel
(
logging
.
INFO
)
special_param
=
[
"TDM_Tree_Travel"
,
"TDM_Tree_Layer"
,
"TDM_Tree_Info"
]
metrics
=
self
.
model
.
get_metrics
()
if
metrics
:
self
.
fetch_vars
=
metrics
.
values
()
self
.
fetch_alias
=
metrics
.
keys
()
context
[
'status'
]
=
'trainer_startup_pass'
class
TDMClusterTrainer
(
TranspileTrainer
):
def
server
(
self
,
context
):
namespace
=
"train.startup"
model_path
=
envs
.
get_global_env
(
"cluster.model_path"
,
""
,
namespace
)
assert
not
model_path
,
"Cluster train must has init_model for TDM"
fleet
.
init_server
(
model_path
)
init_model_path
=
envs
.
get_global_env
(
"cluster.init_model_path"
,
""
,
namespace
)
assert
init_model_path
!=
""
,
"Cluster train must has init_model for TDM"
fleet
.
init_server
(
init_model_path
)
logger
.
info
(
"TDM: load model from {}"
.
format
(
init_model_path
))
fleet
.
run_server
()
context
[
'is_exit'
]
=
True
def
trainer_startup
(
self
,
context
):
def
startup
(
self
,
context
):
self
.
_exe
.
run
(
fleet
.
startup_program
)
namespace
=
"train.startup"
load_tree
=
envs
.
get_global_env
(
"cluster.load_tree"
,
True
,
namespace
)
...
...
@@ -119,7 +62,6 @@ class TDMClusterTrainer(TranspileTrainer):
"cluster.save_init_model"
,
False
,
namespace
)
init_model_path
=
envs
.
get_global_env
(
"cluster.init_model_path"
,
""
,
namespace
)
self
.
_exe
.
run
(
fluid
.
default_startup_program
())
if
load_tree
:
# 将明文树结构及数据,set到组网中的Variale中
...
...
@@ -137,74 +79,47 @@ class TDMClusterTrainer(TranspileTrainer):
context
[
'status'
]
=
'train_pass'
def
dataloader_train
(
self
,
context
):
self
.
_exe
.
run
(
fleet
.
startup_program
)
fleet
.
init_worker
()
reader
=
self
.
_get_dataloader
()
epochs
=
envs
.
get_global_env
(
"train.epochs"
)
program
=
fluid
.
compiler
.
CompiledProgram
(
fleet
.
main_program
).
with_data_parallel
(
loss_name
=
self
.
model
.
get_cost_op
().
name
,
build_strategy
=
self
.
strategy
.
get_build_strategy
(),
exec_strategy
=
self
.
strategy
.
get_execute_strategy
())
metrics_varnames
=
[]
metrics_format
=
[]
metrics_format
.
append
(
"{}: {{}}"
.
format
(
"epoch"
))
metrics_format
.
append
(
"{}: {{}}"
.
format
(
"batch"
))
for
name
,
var
in
self
.
model
.
get_metrics
().
items
():
metrics_varnames
.
append
(
var
.
name
)
metrics_format
.
append
(
"{}: {{}}"
.
format
(
name
))
metrics_format
=
", "
.
join
(
metrics_format
)
for
epoch
in
range
(
epochs
):
reader
.
start
()
batch_id
=
0
try
:
while
True
:
metrics_rets
=
self
.
_exe
.
run
(
program
=
program
,
fetch_list
=
metrics_varnames
)
metrics
=
[
epoch
,
batch_id
]
metrics
.
extend
(
metrics_rets
)
if
batch_id
%
10
==
0
and
batch_id
!=
0
:
print
(
metrics_format
.
format
(
*
metrics
))
batch_id
+=
1
except
fluid
.
core
.
EOFException
:
reader
.
reset
()
fleet
.
stop_worker
()
context
[
'status'
]
=
'terminal_pass'
def
dataset_train
(
self
,
context
):
self
.
_exe
.
run
(
fleet
.
startup_program
)
fleet
.
init_worker
()
dataset
=
self
.
_get_dataset
()
epochs
=
envs
.
get_global_env
(
"train.epochs"
)
for
i
in
range
(
epochs
):
self
.
_exe
.
train_from_dataset
(
program
=
fluid
.
default_main_program
(),
dataset
=
dataset
,
fetch_list
=
self
.
fetch_vars
,
fetch_info
=
self
.
fetch_alias
,
print_period
=
self
.
fetch_period
)
self
.
save
(
i
,
"train"
,
is_fleet
=
True
)
fleet
.
stop_worker
()
context
[
'status'
]
=
'terminal_pass'
def
infer
(
self
,
context
):
context
[
'status'
]
=
'terminal_pass'
def
terminal
(
self
,
context
):
for
model
in
self
.
increment_models
:
print
(
"epoch :{}, dir: {}"
.
format
(
model
[
0
],
model
[
1
]))
context
[
'is_exit'
]
=
True
def
tdm_prepare
(
self
,
param_name
):
if
param_name
==
"TDM_Tree_Travel"
:
travel_array
=
self
.
tdm_travel_prepare
()
return
travel_array
elif
param_name
==
"TDM_Tree_Layer"
:
layer_array
,
_
=
self
.
tdm_layer_prepare
()
return
layer_array
elif
param_name
==
"TDM_Tree_Info"
:
info_array
=
self
.
tdm_info_prepare
()
return
info_array
else
:
raise
" {} is not a special tdm param name"
.
format
(
param_name
)
def
tdm_travel_prepare
(
self
):
"""load tdm tree param from npy/list file"""
travel_array
=
np
.
load
(
self
.
tree_travel_path
)
logger
.
info
(
"TDM Tree leaf node nums: {}"
.
format
(
travel_array
.
shape
[
0
]))
return
travel_array
def
tdm_layer_prepare
(
self
):
"""load tdm tree param from npy/list file"""
layer_list
=
[]
layer_list_flat
=
[]
with
open
(
self
.
tree_layer_path
,
'r'
)
as
fin
:
for
line
in
fin
.
readlines
():
l
=
[]
layer
=
(
line
.
split
(
'
\n
'
))[
0
].
split
(
','
)
for
node
in
layer
:
if
node
:
layer_list_flat
.
append
(
node
)
l
.
append
(
node
)
layer_list
.
append
(
l
)
layer_array
=
np
.
array
(
layer_list_flat
)
layer_array
=
layer_array
.
reshape
([
-
1
,
1
])
logger
.
info
(
"TDM Tree max layer: {}"
.
format
(
len
(
layer_list
)))
logger
.
info
(
"TDM Tree layer_node_num_list: {}"
.
format
(
[
len
(
i
)
for
i
in
layer_list
]))
return
layer_array
,
layer_list
def
tdm_info_prepare
(
self
):
"""load tdm tree param from list file"""
info_array
=
np
.
load
(
self
.
tree_info_path
)
return
info_array
fleet_rec/core/trainers/tdm_single_trainer.py
浏览文件 @
2f1f76f3
...
...
@@ -34,34 +34,6 @@ special_param = ["TDM_Tree_Travel", "TDM_Tree_Layer",
class
TDMSingleTrainer
(
SingleTrainer
):
def
processor_register
(
self
):
self
.
regist_context_processor
(
'uninit'
,
self
.
instance
)
self
.
regist_context_processor
(
'init_pass'
,
self
.
init
)
self
.
regist_context_processor
(
'startup_pass'
,
self
.
startup
)
if
envs
.
get_platform
()
==
"LINUX"
:
self
.
regist_context_processor
(
'train_pass'
,
self
.
dataset_train
)
else
:
self
.
regist_context_processor
(
'train_pass'
,
self
.
dataloader_train
)
self
.
regist_context_processor
(
'infer_pass'
,
self
.
infer
)
self
.
regist_context_processor
(
'terminal_pass'
,
self
.
terminal
)
def
init
(
self
,
context
):
self
.
model
.
train_net
()
optimizer
=
self
.
model
.
optimizer
()
optimizer
.
minimize
((
self
.
model
.
get_cost_op
()))
self
.
fetch_vars
=
[]
self
.
fetch_alias
=
[]
self
.
fetch_period
=
self
.
model
.
get_fetch_period
()
metrics
=
self
.
model
.
get_metrics
()
if
metrics
:
self
.
fetch_vars
=
metrics
.
values
()
self
.
fetch_alias
=
metrics
.
keys
()
context
[
'status'
]
=
'startup_pass'
def
startup
(
self
,
context
):
namespace
=
"train.startup"
load_persistables
=
envs
.
get_global_env
(
...
...
@@ -114,67 +86,6 @@ class TDMSingleTrainer(SingleTrainer):
context
[
'status'
]
=
'train_pass'
def
dataloader_train
(
self
,
context
):
reader
=
self
.
_get_dataloader
()
epochs
=
envs
.
get_global_env
(
"train.epochs"
)
program
=
fluid
.
compiler
.
CompiledProgram
(
fluid
.
default_main_program
()).
with_data_parallel
(
loss_name
=
self
.
model
.
get_cost_op
().
name
)
metrics_varnames
=
[]
metrics_format
=
[]
metrics_format
.
append
(
"{}: {{}}"
.
format
(
"epoch"
))
metrics_format
.
append
(
"{}: {{}}"
.
format
(
"batch"
))
for
name
,
var
in
self
.
model
.
get_metrics
().
items
():
metrics_varnames
.
append
(
var
.
name
)
metrics_format
.
append
(
"{}: {{}}"
.
format
(
name
))
metrics_format
=
", "
.
join
(
metrics_format
)
for
epoch
in
range
(
epochs
):
reader
.
start
()
batch_id
=
0
try
:
while
True
:
metrics_rets
=
self
.
_exe
.
run
(
program
=
program
,
fetch_list
=
metrics_varnames
)
metrics
=
[
epoch
,
batch_id
]
metrics
.
extend
(
metrics_rets
)
if
batch_id
%
10
==
0
and
batch_id
!=
0
:
print
(
metrics_format
.
format
(
*
metrics
))
batch_id
+=
1
except
fluid
.
core
.
EOFException
:
reader
.
reset
()
context
[
'status'
]
=
'infer_pass'
def
dataset_train
(
self
,
context
):
dataset
=
self
.
_get_dataset
()
epochs
=
envs
.
get_global_env
(
"train.epochs"
)
for
i
in
range
(
epochs
):
self
.
_exe
.
train_from_dataset
(
program
=
fluid
.
default_main_program
(),
dataset
=
dataset
,
fetch_list
=
self
.
fetch_vars
,
fetch_info
=
self
.
fetch_alias
,
print_period
=
self
.
fetch_period
)
self
.
save
(
i
,
"train"
,
is_fleet
=
False
)
context
[
'status'
]
=
'infer_pass'
def
infer
(
self
,
context
):
context
[
'status'
]
=
'terminal_pass'
def
terminal
(
self
,
context
):
for
model
in
self
.
increment_models
:
print
(
"epoch :{}, dir: {}"
.
format
(
model
[
0
],
model
[
1
]))
context
[
'is_exit'
]
=
True
def
tdm_prepare
(
self
,
param_name
):
if
param_name
==
"TDM_Tree_Travel"
:
travel_array
=
self
.
tdm_travel_prepare
()
...
...
fleet_rec/run.py
浏览文件 @
2f1f76f3
...
...
@@ -10,6 +10,7 @@ from fleetrec.core.utils import util
engines
=
{}
device
=
[
"CPU"
,
"GPU"
]
clusters
=
[
"SINGLE"
,
"LOCAL_CLUSTER"
,
"CLUSTER"
]
custom_model
=
[
'tdm'
]
def
engine_registry
():
...
...
@@ -31,9 +32,12 @@ def engine_registry():
engines
[
"GPU"
]
=
gpu
def
get_engine
(
engine
,
device
):
def
get_engine
(
args
):
device
=
args
.
device
d_engine
=
engines
[
device
]
transpiler
=
get_transpiler
()
engine
=
get_custom_model_engine
(
args
)
run_engine
=
d_engine
[
transpiler
].
get
(
engine
,
None
)
if
run_engine
is
None
:
...
...
@@ -42,6 +46,16 @@ def get_engine(engine, device):
return
run_engine
def
get_custom_model_engine
(
args
):
model
=
args
.
model
model_name
=
model
.
split
(
'.'
)[
1
]
if
model_name
in
custom_model
:
engine
=
"_"
.
join
((
model_name
.
upper
(),
args
.
engine
))
else
:
engine
=
args
.
engine
return
engine
def
get_transpiler
():
FNULL
=
open
(
os
.
devnull
,
'w'
)
cmd
=
[
"python"
,
"-c"
,
...
...
@@ -81,30 +95,23 @@ def set_runtime_envs(cluster_envs, engine_yaml):
print
(
envs
.
pretty_print_envs
(
need_print
,
(
"Runtime Envs"
,
"Value"
)))
def
single_engine
(
args
):
print
(
"use single engine to run model: {}"
.
format
(
args
.
model
))
single_envs
=
{}
single_envs
[
"train.trainer.trainer"
]
=
"SingleTrainer"
single_envs
[
"train.trainer.threads"
]
=
"2"
single_envs
[
"train.trainer.engine"
]
=
"single"
single_envs
[
"train.trainer.device"
]
=
args
.
device
single_envs
[
"train.trainer.platform"
]
=
envs
.
get_platform
()
set_runtime_envs
(
single_envs
,
args
.
model
)
trainer
=
TrainerFactory
.
create
(
args
.
model
)
return
trainer
def
get_trainer_prefix
(
args
):
model
=
args
.
model
model_name
=
model
.
split
(
'.'
)[
1
]
if
model_name
in
custom_model
:
return
model_name
.
upper
()
return
""
def
tdm_single_engine
(
args
):
print
(
"use tdm single engine to run model: {}"
.
format
(
args
.
model
))
def
single_engine
(
args
):
trainer
=
get_trainer_prefix
(
args
)
+
"SingleTrainer"
single_envs
=
{}
single_envs
[
"train.trainer.trainer"
]
=
"TDMSingleTrainer"
single_envs
[
"train.trainer.trainer"
]
=
trainer
single_envs
[
"train.trainer.threads"
]
=
"2"
single_envs
[
"train.trainer.engine"
]
=
"single"
single_envs
[
"train.trainer.device"
]
=
args
.
device
single_envs
[
"train.trainer.platform"
]
=
envs
.
get_platform
()
print
(
"use {} engine to run model: {}"
.
format
(
trainer
,
args
.
model
))
set_runtime_envs
(
single_envs
,
args
.
model
)
trainer
=
TrainerFactory
.
create
(
args
.
model
)
...
...
@@ -112,31 +119,15 @@ def tdm_single_engine(args):
def
cluster_engine
(
args
):
print
(
"launch cluster engine with cluster to run model: {}"
.
format
(
args
.
model
))
cluster_envs
=
{}
cluster_envs
[
"train.trainer.trainer"
]
=
"ClusterTrainer"
cluster_envs
[
"train.trainer.engine"
]
=
"cluster"
cluster_envs
[
"train.trainer.device"
]
=
args
.
device
cluster_envs
[
"train.trainer.platform"
]
=
envs
.
get_platform
()
set_runtime_envs
(
cluster_envs
,
args
.
model
)
trainer
=
TrainerFactory
.
create
(
args
.
model
)
return
trainer
def
tdm_cluster_engine
(
args
):
print
(
"launch tdm cluster engine with cluster to run model: {}"
.
format
(
args
.
model
))
trainer
=
get_trainer_prefix
(
args
)
+
"ClusterTrainer"
cluster_envs
=
{}
cluster_envs
[
"train.trainer.trainer"
]
=
"TDMClusterTrainer"
cluster_envs
[
"train.trainer.trainer"
]
=
trainer
cluster_envs
[
"train.trainer.engine"
]
=
"cluster"
cluster_envs
[
"train.trainer.device"
]
=
args
.
device
cluster_envs
[
"train.trainer.platform"
]
=
envs
.
get_platform
()
print
(
"launch {} engine with cluster to run model: {}"
.
format
(
trainer
,
args
.
model
))
set_runtime_envs
(
cluster_envs
,
args
.
model
)
trainer
=
TrainerFactory
.
create
(
args
.
model
)
return
trainer
...
...
@@ -156,40 +147,15 @@ def cluster_mpi_engine(args):
def
local_cluster_engine
(
args
):
print
(
"launch cluster engine with cluster to run model: {}"
.
format
(
args
.
model
))
from
fleetrec.core.engine.local_cluster_engine
import
LocalClusterEngine
cluster_envs
=
{}
cluster_envs
[
"server_num"
]
=
1
cluster_envs
[
"worker_num"
]
=
1
cluster_envs
[
"start_port"
]
=
36001
cluster_envs
[
"log_dir"
]
=
"logs"
cluster_envs
[
"train.trainer.trainer"
]
=
"ClusterTrainer"
cluster_envs
[
"train.trainer.strategy"
]
=
"async"
cluster_envs
[
"train.trainer.threads"
]
=
"2"
cluster_envs
[
"train.trainer.engine"
]
=
"local_cluster"
cluster_envs
[
"train.trainer.device"
]
=
args
.
device
cluster_envs
[
"train.trainer.platform"
]
=
envs
.
get_platform
()
cluster_envs
[
"CPU_NUM"
]
=
"2"
set_runtime_envs
(
cluster_envs
,
args
.
model
)
launch
=
LocalClusterEngine
(
cluster_envs
,
args
.
model
)
return
launch
def
tdm_local_cluster_engine
(
args
):
print
(
"launch tdm cluster engine with cluster to run model: {}"
.
format
(
args
.
model
))
from
fleetrec.core.engine.local_cluster_engine
import
LocalClusterEngine
trainer
=
get_trainer_prefix
(
args
)
+
"ClusterTrainer"
cluster_envs
=
{}
cluster_envs
[
"server_num"
]
=
1
cluster_envs
[
"worker_num"
]
=
1
cluster_envs
[
"start_port"
]
=
36001
cluster_envs
[
"log_dir"
]
=
"logs"
cluster_envs
[
"train.trainer.trainer"
]
=
"TDMClusterTrainer"
cluster_envs
[
"train.trainer.trainer"
]
=
trainer
cluster_envs
[
"train.trainer.strategy"
]
=
"async"
cluster_envs
[
"train.trainer.threads"
]
=
"2"
cluster_envs
[
"train.trainer.engine"
]
=
"local_cluster"
...
...
@@ -198,9 +164,9 @@ def tdm_local_cluster_engine(args):
cluster_envs
[
"train.trainer.platform"
]
=
envs
.
get_platform
()
cluster_envs
[
"CPU_NUM"
]
=
"2"
print
(
"launch {} engine with cluster to run model: {}"
.
format
(
trainer
,
args
.
model
))
set_runtime_envs
(
cluster_envs
,
args
.
model
)
launch
=
LocalClusterEngine
(
cluster_envs
,
args
.
model
)
return
launch
...
...
@@ -258,7 +224,7 @@ if __name__ == "__main__":
args
.
model
=
get_abs_model
(
args
.
model
)
engine_registry
()
which_engine
=
get_engine
(
args
.
engine
,
args
.
device
)
which_engine
=
get_engine
(
args
)
engine
=
which_engine
(
args
)
engine
.
run
()
models/recall/tdm/config.yaml
浏览文件 @
2f1f76f3
...
...
@@ -46,22 +46,22 @@ train:
startup
:
single
:
# 建议tree只load一次,保存为paddle tensor,之后从paddle模型热启
load_persistables
:
False
persistables_model_path
:
"
"
tree
:
# 单机训练建议tree只load一次,保存为paddle tensor,之后从paddle模型热启
# 分布式训练trainer需要独立load
load_tree
:
True
tree_layer_path
:
"
{workspace}/tree/layer_list.txt"
tree_travel_path
:
"
{workspace}/tree/travel_list.npy"
tree_info_path
:
"
{workspace}/tree/tree_info.npy"
tree_emb_path
:
"
{workspace}/tree/tree_emb.npy"
single
:
load_persistables
:
False
persistables_model_path
:
"
"
save_init_model
:
True
init_model_path
:
"
"
init_model_path
:
"
{workspace}/init_model
"
cluster
:
load_persistables
:
True
persistables_model_path
:
"
"
init_model_path
:
"
{workspace}/init_model"
save
:
increment
:
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录