Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
PaddleRec
提交
457f693c
P
PaddleRec
项目概览
BaiXuePrincess
/
PaddleRec
与 Fork 源项目一致
Fork自
PaddlePaddle / PaddleRec
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleRec
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
457f693c
编写于
8月 31, 2020
作者:
L
liuyuhui
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix a bug , transfer from print() to logging
上级
2caf374d
变更
19
隐藏空白更改
内联
并排
Showing
19 changed file
with
189 addition
and
89 deletion
+189
-89
core/engine/cluster/cluster.py
core/engine/cluster/cluster.py
+7
-1
core/engine/local_cluster.py
core/engine/local_cluster.py
+8
-2
core/engine/local_mpi.py
core/engine/local_mpi.py
+6
-1
core/factory.py
core/factory.py
+6
-1
core/trainer.py
core/trainer.py
+15
-9
core/trainers/finetuning_trainer.py
core/trainers/finetuning_trainer.py
+1
-1
core/trainers/framework/dataset.py
core/trainers/framework/dataset.py
+9
-3
core/trainers/framework/instance.py
core/trainers/framework/instance.py
+9
-4
core/trainers/framework/network.py
core/trainers/framework/network.py
+10
-5
core/trainers/framework/runner.py
core/trainers/framework/runner.py
+21
-16
core/trainers/framework/startup.py
core/trainers/framework/startup.py
+14
-9
core/trainers/framework/terminal.py
core/trainers/framework/terminal.py
+7
-2
core/trainers/general_trainer.py
core/trainers/general_trainer.py
+6
-1
core/utils/dataloader_instance.py
core/utils/dataloader_instance.py
+9
-4
core/utils/dataset_holder.py
core/utils/dataset_holder.py
+6
-1
core/utils/envs.py
core/utils/envs.py
+9
-3
core/utils/fs.py
core/utils/fs.py
+7
-1
core/utils/util.py
core/utils/util.py
+7
-2
core/utils/validation.py
core/utils/validation.py
+32
-23
未找到文件。
core/engine/cluster/cluster.py
浏览文件 @
457f693c
...
...
@@ -19,11 +19,16 @@ import copy
import
os
import
subprocess
import
warnings
import
logging
from
paddlerec.core.engine.engine
import
Engine
from
paddlerec.core.factory
import
TrainerFactory
from
paddlerec.core.utils
import
envs
logging
.
basicConfig
(
format
=
"%(asctime)s - %(levelname)s - %(message)s"
)
logger
=
logging
.
getLogger
()
logger
.
setLevel
(
logging
.
INFO
)
class
ClusterEngine
(
Engine
):
def
__init_impl__
(
self
):
...
...
@@ -220,7 +225,8 @@ class ClusterEnvBase(object):
def
env_set
(
self
):
envs
.
set_runtime_environs
(
self
.
cluster_env
)
flattens
=
envs
.
flatten_environs
(
self
.
cluster_env
)
print
(
envs
.
pretty_print_envs
(
flattens
,
(
"Cluster Envs"
,
"Value"
)))
logger
.
info
(
envs
.
pretty_print_envs
(
flattens
,
(
"Cluster Envs"
,
"Value"
)))
class
PaddleCloudMpiEnv
(
ClusterEnvBase
):
...
...
core/engine/local_cluster.py
浏览文件 @
457f693c
...
...
@@ -19,10 +19,15 @@ import copy
import
os
import
sys
import
subprocess
import
logging
from
paddlerec.core.engine.engine
import
Engine
from
paddlerec.core.utils
import
envs
logging
.
basicConfig
(
format
=
"%(asctime)s - %(levelname)s - %(message)s"
)
logger
=
logging
.
getLogger
()
logger
.
setLevel
(
logging
.
INFO
)
class
LocalClusterEngine
(
Engine
):
def
start_procs
(
self
):
...
...
@@ -57,7 +62,8 @@ class LocalClusterEngine(Engine):
]
factory
=
"paddlerec.core.factory"
cmd
=
[
sys
.
executable
,
"-u"
,
"-m"
,
factory
,
self
.
trainer
]
cmd
=
[
sys
.
executable
,
"-u"
,
"-m"
,
factory
,
self
.
trainer
]
#problems
for
i
in
range
(
server_num
):
current_env
.
update
({
...
...
@@ -145,7 +151,7 @@ class LocalClusterEngine(Engine):
if
len
(
log_fns
)
>
0
:
log_fns
[
i
].
close
()
procs
[
i
].
terminate
()
print
(
logger
.
info
(
"all workers already completed, you can view logs under the `{}` directory"
.
format
(
logs_dir
),
file
=
sys
.
stderr
)
...
...
core/engine/local_mpi.py
浏览文件 @
457f693c
...
...
@@ -19,9 +19,14 @@ import copy
import
os
import
sys
import
subprocess
import
logging
from
paddlerec.core.engine.engine
import
Engine
logging
.
basicConfig
(
format
=
"%(asctime)s - %(levelname)s - %(message)s"
)
logger
=
logging
.
getLogger
()
logger
.
setLevel
(
logging
.
INFO
)
class
LocalMPIEngine
(
Engine
):
def
start_procs
(
self
):
...
...
@@ -51,7 +56,7 @@ class LocalMPIEngine(Engine):
if
len
(
log_fns
)
>
0
:
log_fns
[
i
].
close
()
procs
[
i
].
wait
()
print
(
logger
.
info
(
"all workers and parameter servers already completed"
,
file
=
sys
.
stderr
)
...
...
core/factory.py
浏览文件 @
457f693c
...
...
@@ -14,8 +14,13 @@
import
os
import
sys
import
logging
from
paddlerec.core.utils
import
envs
logging
.
basicConfig
(
format
=
"%(asctime)s - %(levelname)s - %(message)s"
)
logger
=
logging
.
getLogger
(
"fluid"
)
logger
.
setLevel
(
logging
.
INFO
)
trainer_abs
=
os
.
path
.
join
(
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
)),
"trainers"
)
trainers
=
{}
...
...
@@ -53,7 +58,7 @@ class TrainerFactory(object):
@
staticmethod
def
_build_trainer
(
yaml_path
):
print
(
envs
.
pretty_print_envs
(
envs
.
get_global_envs
()))
logger
.
info
(
envs
.
pretty_print_envs
(
envs
.
get_global_envs
()))
train_mode
=
envs
.
get_trainer
()
trainer_abs
=
trainers
.
get
(
train_mode
,
None
)
...
...
core/trainer.py
浏览文件 @
457f693c
...
...
@@ -17,11 +17,16 @@ import os
import
time
import
sys
import
traceback
import
logging
from
paddle
import
fluid
from
paddlerec.core.utils
import
envs
logging
.
basicConfig
(
format
=
"%(asctime)s - %(levelname)s - %(message)s"
)
logger
=
logging
.
getLogger
(
"fluid"
)
logger
.
setLevel
(
logging
.
INFO
)
class
EngineMode
:
"""
...
...
@@ -88,7 +93,7 @@ class Trainer(object):
phases
.
append
(
phase
)
self
.
_context
[
"phases"
]
=
phases
print
(
"PaddleRec: Runner {} Begin"
.
format
(
self
.
_runner_name
))
logger
.
info
(
"PaddleRec: Runner {} Begin"
.
format
(
self
.
_runner_name
))
self
.
which_engine
()
self
.
which_device
()
self
.
which_fleet_mode
()
...
...
@@ -107,7 +112,7 @@ class Trainer(object):
self
.
device
=
Device
.
GPU
gpu_id
=
int
(
os
.
environ
.
get
(
'FLAGS_selected_gpus'
,
0
))
self
.
_place
=
fluid
.
CUDAPlace
(
gpu_id
)
print
(
"PaddleRec run on device GPU: {}"
.
format
(
gpu_id
))
logger
.
info
(
"PaddleRec run on device GPU: {}"
.
format
(
gpu_id
))
self
.
_exe
=
fluid
.
Executor
(
self
.
_place
)
elif
device
==
"CPU"
:
self
.
device
=
Device
.
CPU
...
...
@@ -169,7 +174,7 @@ class Trainer(object):
def
which_cluster_type
(
self
):
cluster_type
=
os
.
getenv
(
"PADDLEREC_CLUSTER_TYPE"
,
"MPI"
)
print
(
"PADDLEREC_CLUSTER_TYPE: {}"
.
format
(
cluster_type
))
logger
.
info
(
"PADDLEREC_CLUSTER_TYPE: {}"
.
format
(
cluster_type
))
if
cluster_type
and
cluster_type
.
upper
()
==
"K8S"
:
self
.
_context
[
"cluster_type"
]
=
"K8S"
else
:
...
...
@@ -184,7 +189,7 @@ class Trainer(object):
self
.
is_infer
=
False
else
:
self
.
is_infer
=
True
print
(
"Executor Mode: {}"
.
format
(
executor_mode
))
logger
.
info
(
"Executor Mode: {}"
.
format
(
executor_mode
))
self
.
_context
[
"is_infer"
]
=
self
.
is_infer
def
legality_check
(
self
):
...
...
@@ -224,7 +229,7 @@ class Trainer(object):
Return:
None, just sleep in base
"""
print
(
'unknow context_status:%s, do nothing'
%
context
[
'status'
])
logger
.
info
(
'unknow context_status:%s, do nothing'
%
context
[
'status'
])
time
.
sleep
(
60
)
def
handle_processor_exception
(
self
,
context
,
exception
):
...
...
@@ -233,9 +238,10 @@ class Trainer(object):
Return:
bool exit_app or not
"""
print
(
"
\n
--------------------------------
\n
PaddleRec Error Message "
"Summary:
\n
--------------------------------
\n
"
)
print
(
logger
.
info
(
"
\n
--------------------------------
\n
PaddleRec Error Message "
"Summary:
\n
--------------------------------
\n
"
)
logger
.
info
(
'Exit PaddleRec. catch exception in precoss status: [%s], except: %s'
%
(
context
[
'status'
],
str
(
exception
)))
return
True
...
...
@@ -258,7 +264,7 @@ class Trainer(object):
break
except
Exception
as
err
:
traceback
.
print_exc
()
print
(
'Catch Exception:%s'
%
str
(
err
))
logger
.
info
(
'Catch Exception:%s'
%
str
(
err
))
sys
.
stdout
.
flush
()
self
.
handle_processor_exception
(
self
.
_context
,
err
)
sys
.
exit
(
type
(
err
).
__name__
)
core/trainers/finetuning_trainer.py
浏览文件 @
457f693c
...
...
@@ -34,7 +34,7 @@ class FineTuningTrainer(Trainer):
self
.
runner_env_name
=
"runner."
+
self
.
_context
[
"runner_name"
]
def
processor_register
(
self
):
print
(
"processor_register begin"
)
logger
.
info
(
"processor_register begin"
)
self
.
regist_context_processor
(
'uninit'
,
self
.
instance
)
self
.
regist_context_processor
(
'network_pass'
,
self
.
network
)
self
.
regist_context_processor
(
'startup_pass'
,
self
.
startup
)
...
...
core/trainers/framework/dataset.py
浏览文件 @
457f693c
...
...
@@ -15,6 +15,7 @@
from
__future__
import
print_function
import
os
import
logging
import
paddle.fluid
as
fluid
from
paddlerec.core.utils
import
envs
...
...
@@ -25,6 +26,10 @@ from paddlerec.core.utils.util import split_files, check_filelist
__all__
=
[
"DatasetBase"
,
"DataLoader"
,
"QueueDataset"
]
logging
.
basicConfig
(
format
=
"%(asctime)s - %(levelname)s - %(message)s"
)
logger
=
logging
.
getLogger
(
"fluid"
)
logger
.
setLevel
(
logging
.
INFO
)
class
DatasetBase
(
object
):
"""R
...
...
@@ -83,7 +88,8 @@ class QueueDataset(DatasetBase):
name
=
"dataset."
+
dataset_name
+
"."
type_name
=
envs
.
get_global_env
(
name
+
"type"
)
if
envs
.
get_platform
()
!=
"LINUX"
:
print
(
"platform "
,
envs
.
get_platform
(),
"Reader To Dataloader"
)
logger
.
info
(
"platform "
,
envs
.
get_platform
(),
"Reader To Dataloader"
)
type_name
=
"DataLoader"
if
type_name
==
"DataLoader"
:
...
...
@@ -126,7 +132,7 @@ class QueueDataset(DatasetBase):
data_file_list
=
[],
train_data_path
=
train_data_path
)
if
(
hidden_file_list
is
not
None
):
print
(
logger
.
info
(
"Warning:please make sure there are no hidden files in the dataset folder and check these hidden files:{}"
.
format
(
hidden_file_list
))
...
...
@@ -143,7 +149,7 @@ class QueueDataset(DatasetBase):
if
need_split_files
:
file_list
=
split_files
(
file_list
,
context
[
"fleet"
].
worker_index
(),
context
[
"fleet"
].
worker_num
())
print
(
"File_list: {}"
.
format
(
file_list
))
logger
.
info
(
"File_list: {}"
.
format
(
file_list
))
dataset
.
set_filelist
(
file_list
)
for
model_dict
in
context
[
"phases"
]:
...
...
core/trainers/framework/instance.py
浏览文件 @
457f693c
...
...
@@ -15,6 +15,7 @@
from
__future__
import
print_function
import
warnings
import
logging
import
paddle.fluid
as
fluid
from
paddlerec.core.utils
import
envs
...
...
@@ -24,6 +25,10 @@ __all__ = [
"CollectiveInstance"
]
logging
.
basicConfig
(
format
=
"%(asctime)s - %(levelname)s - %(message)s"
)
logger
=
logging
.
getLogger
(
"fluid"
)
logger
.
setLevel
(
logging
.
INFO
)
class
InstanceBase
(
object
):
"""R
...
...
@@ -38,7 +43,7 @@ class InstanceBase(object):
class
SingleInstance
(
InstanceBase
):
def
__init__
(
self
,
context
):
print
(
"Running SingleInstance."
)
logger
.
info
(
"Running SingleInstance."
)
pass
def
instance
(
self
,
context
):
...
...
@@ -47,7 +52,7 @@ class SingleInstance(InstanceBase):
class
PSInstance
(
InstanceBase
):
def
__init__
(
self
,
context
):
print
(
"Running PSInstance."
)
logger
.
info
(
"Running PSInstance."
)
pass
def
instance
(
self
,
context
):
...
...
@@ -61,7 +66,7 @@ class PSInstance(InstanceBase):
class
PslibInstance
(
InstanceBase
):
def
__init__
(
self
,
context
):
print
(
"Running PslibInstance."
)
logger
.
info
(
"Running PslibInstance."
)
pass
def
instance
(
self
,
context
):
...
...
@@ -73,7 +78,7 @@ class PslibInstance(InstanceBase):
class
CollectiveInstance
(
InstanceBase
):
def
__init__
(
self
,
context
):
print
(
"Running CollectiveInstance."
)
logger
.
info
(
"Running CollectiveInstance."
)
pass
def
instance
(
self
,
context
):
...
...
core/trainers/framework/network.py
浏览文件 @
457f693c
...
...
@@ -16,6 +16,7 @@ from __future__ import print_function
import
os
import
warnings
import
logging
import
paddle.fluid
as
fluid
from
paddlerec.core.utils
import
envs
...
...
@@ -26,6 +27,10 @@ __all__ = [
"CollectiveNetwork"
,
"FineTuningNetwork"
]
logging
.
basicConfig
(
format
=
"%(asctime)s - %(levelname)s - %(message)s"
)
logger
=
logging
.
getLogger
(
"fluid"
)
logger
.
setLevel
(
logging
.
INFO
)
class
NetworkBase
(
object
):
"""R
...
...
@@ -43,7 +48,7 @@ class SingleNetwork(NetworkBase):
"""
def
__init__
(
self
,
context
):
print
(
"Running SingleNetwork."
)
logger
.
info
(
"Running SingleNetwork."
)
pass
def
build_network
(
self
,
context
):
...
...
@@ -114,7 +119,7 @@ class FineTuningNetwork(NetworkBase):
"""
def
__init__
(
self
,
context
):
print
(
"Running FineTuningNetwork."
)
logger
.
info
(
"Running FineTuningNetwork."
)
def
build_network
(
self
,
context
):
context
[
"model"
]
=
{}
...
...
@@ -193,7 +198,7 @@ class FineTuningNetwork(NetworkBase):
class
PSNetwork
(
NetworkBase
):
def
__init__
(
self
,
context
):
print
(
"Running PSNetwork."
)
logger
.
info
(
"Running PSNetwork."
)
pass
def
build_network
(
self
,
context
):
...
...
@@ -285,7 +290,7 @@ class PSNetwork(NetworkBase):
class
PslibNetwork
(
NetworkBase
):
def
__init__
(
self
,
context
):
print
(
"Running PslibNetwork."
)
logger
.
info
(
"Running PslibNetwork."
)
pass
def
build_network
(
self
,
context
):
...
...
@@ -357,7 +362,7 @@ class PslibNetwork(NetworkBase):
class
CollectiveNetwork
(
NetworkBase
):
def
__init__
(
self
,
context
):
print
(
"Running CollectiveNetwork."
)
logger
.
info
(
"Running CollectiveNetwork."
)
pass
def
build_network
(
self
,
context
):
...
...
core/trainers/framework/runner.py
浏览文件 @
457f693c
...
...
@@ -17,6 +17,7 @@ from __future__ import print_function
import
os
import
time
import
warnings
import
logging
import
numpy
as
np
import
paddle.fluid
as
fluid
...
...
@@ -27,6 +28,10 @@ __all__ = [
"RunnerBase"
,
"SingleRunner"
,
"PSRunner"
,
"CollectiveRunner"
,
"PslibRunner"
]
logging
.
basicConfig
(
format
=
"%(asctime)s - %(levelname)s - %(message)s"
)
logger
=
logging
.
getLogger
(
"fluid"
)
logger
.
setLevel
(
logging
.
INFO
)
def
as_numpy
(
tensor
):
"""
...
...
@@ -169,7 +174,7 @@ class RunnerBase(object):
metrics
.
extend
(
metrics_rets
)
if
batch_id
%
fetch_period
==
0
and
batch_id
!=
0
:
print
(
metrics_format
.
format
(
*
metrics
))
logger
.
info
(
metrics_format
.
format
(
*
metrics
))
batch_id
+=
1
except
fluid
.
core
.
EOFException
:
reader
.
reset
()
...
...
@@ -365,7 +370,7 @@ class SingleRunner(RunnerBase):
"""
def
__init__
(
self
,
context
):
print
(
"Running SingleRunner."
)
logger
.
info
(
"Running SingleRunner."
)
pass
def
run
(
self
,
context
):
...
...
@@ -381,7 +386,7 @@ class SingleRunner(RunnerBase):
result
=
self
.
_run
(
context
,
model_dict
)
end_time
=
time
.
time
()
seconds
=
end_time
-
begin_time
message
=
"epoch {} done, use time: {}"
.
format
(
epoch
,
seconds
)
message
=
"epoch {} done, use time: {}
s
"
.
format
(
epoch
,
seconds
)
metrics_result
=
[]
for
key
in
metrics
:
if
isinstance
(
metrics
[
key
],
Metric
):
...
...
@@ -394,7 +399,7 @@ class SingleRunner(RunnerBase):
metrics_result
.
append
(
_str
)
if
len
(
metrics_result
)
>
0
:
message
+=
", global metrics: "
+
", "
.
join
(
metrics_result
)
print
(
message
)
logger
.
info
(
message
)
with
fluid
.
scope_guard
(
context
[
"model"
][
model_dict
[
"name"
]][
"scope"
]):
...
...
@@ -409,7 +414,7 @@ class SingleRunner(RunnerBase):
class
PSRunner
(
RunnerBase
):
def
__init__
(
self
,
context
):
print
(
"Running PSRunner."
)
logger
.
info
(
"Running PSRunner."
)
pass
def
run
(
self
,
context
):
...
...
@@ -424,7 +429,7 @@ class PSRunner(RunnerBase):
result
=
self
.
_run
(
context
,
model_dict
)
end_time
=
time
.
time
()
seconds
=
end_time
-
begin_time
message
=
"epoch {} done, use time: {}"
.
format
(
epoch
,
seconds
)
message
=
"epoch {} done, use time: {}
s
"
.
format
(
epoch
,
seconds
)
# TODO, wait for PaddleCloudRoleMaker supports gloo
from
paddle.fluid.incubate.fleet.base.role_maker
import
GeneralRoleMaker
...
...
@@ -442,7 +447,7 @@ class PSRunner(RunnerBase):
metrics_result
.
append
(
_str
)
if
len
(
metrics_result
)
>
0
:
message
+=
", global metrics: "
+
", "
.
join
(
metrics_result
)
print
(
message
)
logger
.
info
(
message
)
with
fluid
.
scope_guard
(
context
[
"model"
][
model_dict
[
"name"
]][
"scope"
]):
train_prog
=
context
[
"model"
][
model_dict
[
"name"
]][
...
...
@@ -456,7 +461,7 @@ class PSRunner(RunnerBase):
class
CollectiveRunner
(
RunnerBase
):
def
__init__
(
self
,
context
):
print
(
"Running CollectiveRunner."
)
logger
.
info
(
"Running CollectiveRunner."
)
pass
def
run
(
self
,
context
):
...
...
@@ -469,7 +474,7 @@ class CollectiveRunner(RunnerBase):
self
.
_run
(
context
,
model_dict
)
end_time
=
time
.
time
()
seconds
=
end_time
-
begin_time
print
(
"epoch {} done, use time: {}
"
.
format
(
epoch
,
seconds
))
logger
.
info
(
"epoch {} done, use time: {}s
"
.
format
(
epoch
,
seconds
))
with
fluid
.
scope_guard
(
context
[
"model"
][
model_dict
[
"name"
]][
"scope"
]):
train_prog
=
context
[
"model"
][
model_dict
[
"name"
]][
...
...
@@ -483,7 +488,7 @@ class CollectiveRunner(RunnerBase):
class
PslibRunner
(
RunnerBase
):
def
__init__
(
self
,
context
):
print
(
"Running PSRunner."
)
logger
.
info
(
"Running PSRunner."
)
pass
def
run
(
self
,
context
):
...
...
@@ -497,7 +502,7 @@ class PslibRunner(RunnerBase):
self
.
_run
(
context
,
model_dict
)
end_time
=
time
.
time
()
seconds
=
end_time
-
begin_time
print
(
"epoch {} done, use time: {}
"
.
format
(
epoch
,
seconds
))
logger
.
info
(
"epoch {} done, use time: {}s
"
.
format
(
epoch
,
seconds
))
"""
# online Training Can do more, As shown below:
...
...
@@ -527,7 +532,7 @@ class PslibRunner(RunnerBase):
self._run(context, model_dict)
end_time = time.time()
seconds = end_time - begin_time
print
("epoch {} done, use time: {}".format(epoch, seconds))
logger.info
("epoch {} done, use time: {}".format(epoch, seconds))
with fluid.scope_guard(context["model"][model_dict["name"]]
["scope"]):
train_prog = context["model"][model_dict["name"]][
...
...
@@ -543,7 +548,7 @@ class PslibRunner(RunnerBase):
class
SingleInferRunner
(
RunnerBase
):
def
__init__
(
self
,
context
):
print
(
"Running SingleInferRunner."
)
logger
.
info
(
"Running SingleInferRunner."
)
pass
def
run
(
self
,
context
):
...
...
@@ -559,7 +564,7 @@ class SingleInferRunner(RunnerBase):
result
=
self
.
_run
(
context
,
model_dict
)
end_time
=
time
.
time
()
seconds
=
end_time
-
begin_time
message
=
"Infer {} of epoch {} done, use time: {}"
.
format
(
message
=
"Infer {} of epoch {} done, use time: {}
s
"
.
format
(
model_dict
[
"name"
],
epoch_name
,
seconds
)
metrics_result
=
[]
for
key
in
metrics
:
...
...
@@ -573,14 +578,14 @@ class SingleInferRunner(RunnerBase):
metrics_result
.
append
(
_str
)
if
len
(
metrics_result
)
>
0
:
message
+=
", global metrics: "
+
", "
.
join
(
metrics_result
)
print
(
message
)
logger
.
info
(
message
)
context
[
"status"
]
=
"terminal_pass"
def
_load
(
self
,
context
,
model_dict
,
model_path
):
if
model_path
is
None
or
model_path
==
""
:
return
print
(
"load persistables from"
,
model_path
)
logger
.
info
(
"load persistables from"
,
model_path
)
with
fluid
.
scope_guard
(
context
[
"model"
][
model_dict
[
"name"
]][
"scope"
]):
train_prog
=
context
[
"model"
][
model_dict
[
"name"
]][
"main_program"
]
...
...
core/trainers/framework/startup.py
浏览文件 @
457f693c
...
...
@@ -15,6 +15,7 @@
from
__future__
import
print_function
import
warnings
import
logging
import
paddle.fluid
as
fluid
import
paddle.fluid.core
as
core
...
...
@@ -25,6 +26,10 @@ __all__ = [
"FineTuningStartup"
]
logging
.
basicConfig
(
format
=
"%(asctime)s - %(levelname)s - %(message)s"
)
logger
=
logging
.
getLogger
(
"fluid"
)
logger
.
setLevel
(
logging
.
INFO
)
class
StartupBase
(
object
):
"""R
...
...
@@ -41,10 +46,10 @@ class StartupBase(object):
"runner."
+
context
[
"runner_name"
]
+
".init_model_path"
,
None
)
if
dirname
is
None
or
dirname
==
""
:
return
print
(
"going to load "
,
dirname
)
logger
.
info
(
"going to load "
,
dirname
)
fluid
.
io
.
load_persistables
(
context
[
"exe"
],
dirname
,
main_program
=
main_program
)
print
(
"load from {} success"
.
format
(
dirname
))
logger
.
info
(
"load from {} success"
.
format
(
dirname
))
class
SingleStartup
(
StartupBase
):
...
...
@@ -52,7 +57,7 @@ class SingleStartup(StartupBase):
"""
def
__init__
(
self
,
context
):
print
(
"Running SingleStartup."
)
logger
.
info
(
"Running SingleStartup."
)
pass
def
startup
(
self
,
context
):
...
...
@@ -79,7 +84,7 @@ class FineTuningStartup(StartupBase):
self
.
self
.
op_role_var_attr_name
=
core
.
op_proto_and_checker_maker
.
kOpRoleVarAttrName
(
)
print
(
"Running SingleStartup."
)
logger
.
info
(
"Running SingleStartup."
)
def
_is_opt_role_op
(
self
,
op
):
# NOTE: depend on oprole to find out whether this op is for
...
...
@@ -155,7 +160,7 @@ class FineTuningStartup(StartupBase):
"runner."
+
context
[
"runner_name"
]
+
".init_model_path"
,
None
)
if
dirname
is
None
or
dirname
==
""
:
return
print
(
"going to load "
,
dirname
)
logger
.
info
(
"going to load "
,
dirname
)
params_grads
=
self
.
_get_params_grads
(
main_program
)
update_params
=
[
p
for
p
,
_
in
params_grads
]
...
...
@@ -169,7 +174,7 @@ class FineTuningStartup(StartupBase):
fluid
.
io
.
load_vars
(
context
[
"exe"
],
dirname
,
main_program
,
need_load_vars
)
print
(
"load from {} success"
.
format
(
dirname
))
logger
.
info
(
"load from {} success"
.
format
(
dirname
))
def
startup
(
self
,
context
):
for
model_dict
in
context
[
"phases"
]:
...
...
@@ -187,7 +192,7 @@ class FineTuningStartup(StartupBase):
class
PSStartup
(
StartupBase
):
def
__init__
(
self
,
context
):
print
(
"Running PSStartup."
)
logger
.
info
(
"Running PSStartup."
)
pass
def
startup
(
self
,
context
):
...
...
@@ -204,7 +209,7 @@ class PSStartup(StartupBase):
class
CollectiveStartup
(
StartupBase
):
def
__init__
(
self
,
context
):
print
(
"Running CollectiveStartup."
)
logger
.
info
(
"Running CollectiveStartup."
)
pass
def
startup
(
self
,
context
):
...
...
@@ -222,7 +227,7 @@ class CollectiveStartup(StartupBase):
class
SingleInferStartup
(
StartupBase
):
def
__init__
(
self
,
context
):
print
(
"Running SingleInferStartup."
)
logger
.
info
(
"Running SingleInferStartup."
)
pass
def
startup
(
self
,
context
):
...
...
core/trainers/framework/terminal.py
浏览文件 @
457f693c
...
...
@@ -15,12 +15,17 @@
from
__future__
import
print_function
import
warnings
import
logging
import
paddle.fluid
as
fluid
from
paddlerec.core.utils
import
envs
__all__
=
[
"TerminalBase"
,
"PSTerminalBase"
]
logging
.
basicConfig
(
format
=
"%(asctime)s - %(levelname)s - %(message)s"
)
logger
=
logging
.
getLogger
(
"fluid"
)
logger
.
setLevel
(
logging
.
INFO
)
class
TerminalBase
(
object
):
"""R
...
...
@@ -30,7 +35,7 @@ class TerminalBase(object):
pass
def
terminal
(
self
,
context
):
print
(
"PaddleRec Finish"
)
logger
.
info
(
"PaddleRec Finish"
)
class
PSTerminal
(
TerminalBase
):
...
...
@@ -42,4 +47,4 @@ class PSTerminal(TerminalBase):
def
terminal
(
self
,
context
):
context
[
"fleet"
].
stop_worker
()
print
(
"PaddleRec Finish"
)
logger
.
info
(
"PaddleRec Finish"
)
core/trainers/general_trainer.py
浏览文件 @
457f693c
...
...
@@ -17,10 +17,15 @@ General Trainer, applicable to many situations: Single/Cluster/Local_Cluster + P
from
__future__
import
print_function
import
os
import
logging
from
paddlerec.core.utils
import
envs
from
paddlerec.core.trainer
import
Trainer
,
EngineMode
,
FleetMode
logging
.
basicConfig
(
format
=
"%(asctime)s - %(levelname)s - %(message)s"
)
logger
=
logging
.
getLogger
()
logger
.
setLevel
(
logging
.
INFO
)
class
GeneralTrainer
(
Trainer
):
"""
...
...
@@ -34,7 +39,7 @@ class GeneralTrainer(Trainer):
self
.
runner_env_name
=
"runner."
+
self
.
_context
[
"runner_name"
]
def
processor_register
(
self
):
print
(
"processor_register begin"
)
logger
.
info
(
"processor_register begin"
)
self
.
regist_context_processor
(
'uninit'
,
self
.
instance
)
self
.
regist_context_processor
(
'network_pass'
,
self
.
network
)
self
.
regist_context_processor
(
'startup_pass'
,
self
.
startup
)
...
...
core/utils/dataloader_instance.py
浏览文件 @
457f693c
...
...
@@ -14,6 +14,7 @@
from
__future__
import
print_function
import
os
import
logging
from
paddlerec.core.utils.envs
import
lazy_instance_by_fliename
from
paddlerec.core.utils.envs
import
get_global_env
from
paddlerec.core.utils.envs
import
get_runtime_environ
...
...
@@ -21,6 +22,10 @@ from paddlerec.core.reader import SlotReader
from
paddlerec.core.trainer
import
EngineMode
from
paddlerec.core.utils.util
import
split_files
,
check_filelist
logging
.
basicConfig
(
format
=
"%(asctime)s - %(levelname)s - %(message)s"
)
logger
=
logging
.
getLogger
()
logger
.
setLevel
(
logging
.
INFO
)
def
dataloader_by_name
(
readerclass
,
dataset_name
,
...
...
@@ -41,7 +46,7 @@ def dataloader_by_name(readerclass,
hidden_file_list
,
files
=
check_filelist
(
hidden_file_list
=
[],
data_file_list
=
[],
train_data_path
=
data_path
)
if
(
hidden_file_list
is
not
None
):
print
(
logger
.
info
(
"Warning:please make sure there are no hidden files in the dataset folder and check these hidden files:{}"
.
format
(
hidden_file_list
))
...
...
@@ -55,7 +60,7 @@ def dataloader_by_name(readerclass,
"cluster_type"
]
==
"K8S"
:
# for k8s mount mode, split files for every node
need_split_files
=
True
print
(
"need_split_files: {}"
.
format
(
need_split_files
))
logger
.
info
(
"need_split_files: {}"
.
format
(
need_split_files
))
if
need_split_files
:
files
=
split_files
(
files
,
context
[
"fleet"
].
worker_index
(),
context
[
"fleet"
].
worker_num
())
...
...
@@ -103,7 +108,7 @@ def slotdataloader_by_name(readerclass, dataset_name, yaml_file, context):
hidden_file_list
,
files
=
check_filelist
(
hidden_file_list
=
[],
data_file_list
=
[],
train_data_path
=
data_path
)
if
(
hidden_file_list
is
not
None
):
print
(
logger
.
info
(
"Warning:please make sure there are no hidden files in the dataset folder and check these hidden files:{}"
.
format
(
hidden_file_list
))
...
...
@@ -173,7 +178,7 @@ def slotdataloader(readerclass, train, yaml_file, context):
hidden_file_list
,
files
=
check_filelist
(
hidden_file_list
=
[],
data_file_list
=
[],
train_data_path
=
data_path
)
if
(
hidden_file_list
is
not
None
):
print
(
logger
.
info
(
"Warning:please make sure there are no hidden files in the dataset folder and check these hidden files:{}"
.
format
(
hidden_file_list
))
...
...
core/utils/dataset_holder.py
浏览文件 @
457f693c
...
...
@@ -15,12 +15,17 @@
import
abc
import
datetime
import
time
import
logging
import
paddle.fluid
as
fluid
from
paddlerec.core.utils
import
fs
as
fs
from
paddlerec.core.utils
import
util
as
util
logging
.
basicConfig
(
format
=
"%(asctime)s - %(levelname)s - %(message)s"
)
logger
=
logging
.
getLogger
(
"fluid"
)
logger
.
setLevel
(
logging
.
INFO
)
class
DatasetHolder
(
object
):
"""
...
...
@@ -187,7 +192,7 @@ class TimeSplitDatasetHolder(DatasetHolder):
windown_min
=
params
[
'time_window_min'
]
if
begin_time
not
in
self
.
_datasets
:
while
self
.
check_ready
(
begin_time
,
windown_min
)
==
False
:
print
(
"dataset not ready, time:"
+
begin_time
)
logger
.
info
(
"dataset not ready, time:"
+
begin_time
)
time
.
sleep
(
30
)
file_list
=
self
.
get_file_list
(
begin_time
,
windown_min
,
params
[
'node_num'
],
...
...
core/utils/envs.py
浏览文件 @
457f693c
...
...
@@ -21,6 +21,12 @@ import sys
import
six
import
traceback
import
six
import
time
import
logging
logging
.
basicConfig
(
format
=
"%(asctime)s - %(levelname)s - %(message)s"
)
logger
=
logging
.
getLogger
()
logger
.
setLevel
(
logging
.
INFO
)
global_envs
=
{}
global_envs_flatten
=
{}
...
...
@@ -104,7 +110,7 @@ def set_global_envs(envs):
global_envs
[
name
]
=
"DataLoader"
if
get_platform
()
==
"LINUX"
and
six
.
PY3
:
print
(
"QueueDataset can not support PY3, change to DataLoader"
)
logger
.
info
(
"QueueDataset can not support PY3, change to DataLoader"
)
for
dataset
in
envs
[
"dataset"
]:
name
=
"."
.
join
([
"dataset"
,
dataset
[
"name"
],
"type"
])
global_envs
[
name
]
=
"DataLoader"
...
...
@@ -207,7 +213,7 @@ def lazy_instance_by_package(package, class_name):
return
instance
except
Exception
as
err
:
traceback
.
print_exc
()
print
(
'Catch Exception:%s'
%
str
(
err
))
logger
.
info
(
'Catch Exception:%s'
%
str
(
err
))
return
None
...
...
@@ -223,7 +229,7 @@ def lazy_instance_by_fliename(abs, class_name):
return
instance
except
Exception
as
err
:
traceback
.
print_exc
()
print
(
'Catch Exception:%s'
%
str
(
err
))
logger
.
info
(
'Catch Exception:%s'
%
str
(
err
))
return
None
...
...
core/utils/fs.py
浏览文件 @
457f693c
...
...
@@ -13,9 +13,15 @@
# limitations under the License.
import
os
import
time
import
logging
from
paddle.fluid.incubate.fleet.utils.hdfs
import
HDFSClient
logging
.
basicConfig
(
format
=
"%(asctime)s - %(levelname)s - %(message)s"
)
logger
=
logging
.
getLogger
()
logger
.
setLevel
(
logging
.
INFO
)
def
is_afs_path
(
path
):
"""is_afs_path
...
...
@@ -177,4 +183,4 @@ class FileHandler(object):
return
self
.
_hdfs_client
.
upload
(
dest_path
,
org_path
)
if
org_is_afs
and
not
dest_is_afs
:
return
self
.
_hdfs_client
.
download
(
org_path
,
dest_path
)
print
(
"Not Suppor hdfs cp currently"
)
logger
.
info
(
"Not Suppor hdfs cp currently"
)
core/utils/util.py
浏览文件 @
457f693c
...
...
@@ -16,9 +16,14 @@ import datetime
import
os
import
sys
import
time
import
logging
import
numpy
as
np
from
paddle
import
fluid
logging
.
basicConfig
(
format
=
"%(asctime)s - %(levelname)s - %(message)s"
)
logger
=
logging
.
getLogger
(
"fluid"
)
logger
.
setLevel
(
logging
.
INFO
)
def
save_program_proto
(
path
,
program
=
None
):
if
program
is
None
:
...
...
@@ -146,9 +151,9 @@ def print_log(log_str, params):
log_str
=
time_str
+
" "
+
log_str
if
'master'
in
params
and
params
[
'master'
]:
if
'index'
in
params
and
params
[
'index'
]
==
0
:
print
(
log_str
)
logger
.
info
(
log_str
)
else
:
print
(
log_str
)
logger
.
info
(
log_str
)
sys
.
stdout
.
flush
()
if
'stdout'
in
params
:
params
[
'stdout'
]
+=
log_str
+
'
\n
'
...
...
core/utils/validation.py
浏览文件 @
457f693c
...
...
@@ -12,8 +12,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import
time
import
logging
from
paddlerec.core.utils
import
envs
logging
.
basicConfig
(
format
=
"%(asctime)s - %(levelname)s - %(message)s"
)
logger
=
logging
.
getLogger
()
logger
.
setLevel
(
logging
.
INFO
)
class
ValueFormat
:
def
__init__
(
self
,
value_type
,
value
,
value_handler
,
required
=
False
):
...
...
@@ -41,64 +47,67 @@ class ValueFormat:
def
is_type_valid
(
self
,
name
,
value
):
if
self
.
value_type
==
"int"
:
if
not
isinstance
(
value
,
int
):
print
(
"
\n
attr {} should be int, but {} now
\n
"
.
format
(
logger
.
info
(
"
\n
attr {} should be int, but {} now
\n
"
.
format
(
name
,
type
(
value
)))
return
False
return
True
elif
self
.
value_type
==
"str"
:
if
not
isinstance
(
value
,
str
):
print
(
"
\n
attr {} should be str, but {} now
\n
"
.
format
(
logger
.
info
(
"
\n
attr {} should be str, but {} now
\n
"
.
format
(
name
,
type
(
value
)))
return
False
return
True
elif
self
.
value_type
==
"strs"
:
if
not
isinstance
(
value
,
list
):
print
(
"
\n
attr {} should be list(str), but {} now
\n
"
.
format
(
name
,
type
(
value
)))
logger
.
info
(
"
\n
attr {} should be list(str), but {} now
\n
"
.
format
(
name
,
type
(
value
)))
return
False
for
v
in
value
:
if
not
isinstance
(
v
,
str
):
print
(
"
\n
attr {} should be list(str), but list({}) now
\n
"
.
format
(
name
,
type
(
v
)))
logger
.
info
(
"
\n
attr {} should be list(str), but list({}) now
\n
"
.
format
(
name
,
type
(
v
)))
return
False
return
True
elif
self
.
value_type
==
"dict"
:
if
not
isinstance
(
value
,
dict
):
print
(
"
\n
attr {} should be str, but {} now
\n
"
.
format
(
logger
.
info
(
"
\n
attr {} should be str, but {} now
\n
"
.
format
(
name
,
type
(
value
)))
return
False
return
True
elif
self
.
value_type
==
"dicts"
:
if
not
isinstance
(
value
,
list
):
print
(
"
\n
attr {} should be list(dist), but {} now
\n
"
.
format
(
name
,
type
(
value
)))
logger
.
info
(
"
\n
attr {} should be list(dist), but {} now
\n
"
.
format
(
name
,
type
(
value
)))
return
False
for
v
in
value
:
if
not
isinstance
(
v
,
dict
):
print
(
"
\n
attr {} should be list(dist), but list({}) now
\n
"
.
format
(
name
,
type
(
v
)))
logger
.
info
(
"
\n
attr {} should be list(dist), but list({}) now
\n
"
.
format
(
name
,
type
(
v
)))
return
False
return
True
elif
self
.
value_type
==
"ints"
:
if
not
isinstance
(
value
,
list
):
print
(
"
\n
attr {} should be list(int), but {} now
\n
"
.
format
(
name
,
type
(
value
)))
logger
.
info
(
"
\n
attr {} should be list(int), but {} now
\n
"
.
format
(
name
,
type
(
value
)))
return
False
for
v
in
value
:
if
not
isinstance
(
v
,
int
):
print
(
"
\n
attr {} should be list(int), but list({}) now
\n
"
.
format
(
name
,
type
(
v
)))
logger
.
info
(
"
\n
attr {} should be list(int), but list({}) now
\n
"
.
format
(
name
,
type
(
v
)))
return
False
return
True
else
:
print
(
"
\n
attr {}'s type is {}, can not be supported now
\n
"
.
format
(
name
,
type
(
value
)))
logger
.
info
(
"
\n
attr {}'s type is {}, can not be supported now
\n
"
.
format
(
name
,
type
(
value
)))
return
False
def
is_value_valid
(
self
,
name
,
value
):
...
...
@@ -108,7 +117,7 @@ class ValueFormat:
def
in_value_handler
(
name
,
value
,
values
):
if
value
not
in
values
:
print
(
"
\n
attr {}'s value is {}, but {} is expected
\n
"
.
format
(
logger
.
info
(
"
\n
attr {}'s value is {}, but {} is expected
\n
"
.
format
(
name
,
value
,
values
))
return
False
return
True
...
...
@@ -116,7 +125,7 @@ def in_value_handler(name, value, values):
def
eq_value_handler
(
name
,
value
,
values
):
if
value
!=
values
:
print
(
"
\n
attr {}'s value is {}, but == {} is expected
\n
"
.
format
(
logger
.
info
(
"
\n
attr {}'s value is {}, but == {} is expected
\n
"
.
format
(
name
,
value
,
values
))
return
False
return
True
...
...
@@ -124,7 +133,7 @@ def eq_value_handler(name, value, values):
def
ge_value_handler
(
name
,
value
,
values
):
if
value
<
values
:
print
(
"
\n
attr {}'s value is {}, but >= {} is expected
\n
"
.
format
(
logger
.
info
(
"
\n
attr {}'s value is {}, but >= {} is expected
\n
"
.
format
(
name
,
value
,
values
))
return
False
return
True
...
...
@@ -132,7 +141,7 @@ def ge_value_handler(name, value, values):
def
le_value_handler
(
name
,
value
,
values
):
if
value
>
values
:
print
(
"
\n
attr {}'s value is {}, but <= {} is expected
\n
"
.
format
(
logger
.
info
(
"
\n
attr {}'s value is {}, but <= {} is expected
\n
"
.
format
(
name
,
value
,
values
))
return
False
return
True
...
...
@@ -160,8 +169,8 @@ def yaml_validation(config):
for
required
in
require_checkers
:
if
required
not
in
_config
.
keys
():
print
(
"
\n
can not find {} in yaml, which is required
\n
"
.
format
(
required
))
logger
.
info
(
"
\n
can not find {} in yaml, which is required
\n
"
.
format
(
required
))
return
False
for
name
,
value
in
_config
.
items
():
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录