Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
PaddleRec
提交
457f693c
P
PaddleRec
项目概览
PaddlePaddle
/
PaddleRec
通知
68
Star
12
Fork
5
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
27
列表
看板
标记
里程碑
合并请求
10
Wiki
1
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleRec
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
27
Issue
27
列表
看板
标记
里程碑
合并请求
10
合并请求
10
Pages
分析
分析
仓库分析
DevOps
Wiki
1
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
457f693c
编写于
8月 31, 2020
作者:
L
liuyuhui
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix a bug , transfer from print() to logging
上级
2caf374d
变更
19
隐藏空白更改
内联
并排
Showing
19 changed file
with
189 addition
and
89 deletion
+189
-89
core/engine/cluster/cluster.py
core/engine/cluster/cluster.py
+7
-1
core/engine/local_cluster.py
core/engine/local_cluster.py
+8
-2
core/engine/local_mpi.py
core/engine/local_mpi.py
+6
-1
core/factory.py
core/factory.py
+6
-1
core/trainer.py
core/trainer.py
+15
-9
core/trainers/finetuning_trainer.py
core/trainers/finetuning_trainer.py
+1
-1
core/trainers/framework/dataset.py
core/trainers/framework/dataset.py
+9
-3
core/trainers/framework/instance.py
core/trainers/framework/instance.py
+9
-4
core/trainers/framework/network.py
core/trainers/framework/network.py
+10
-5
core/trainers/framework/runner.py
core/trainers/framework/runner.py
+21
-16
core/trainers/framework/startup.py
core/trainers/framework/startup.py
+14
-9
core/trainers/framework/terminal.py
core/trainers/framework/terminal.py
+7
-2
core/trainers/general_trainer.py
core/trainers/general_trainer.py
+6
-1
core/utils/dataloader_instance.py
core/utils/dataloader_instance.py
+9
-4
core/utils/dataset_holder.py
core/utils/dataset_holder.py
+6
-1
core/utils/envs.py
core/utils/envs.py
+9
-3
core/utils/fs.py
core/utils/fs.py
+7
-1
core/utils/util.py
core/utils/util.py
+7
-2
core/utils/validation.py
core/utils/validation.py
+32
-23
未找到文件。
core/engine/cluster/cluster.py
浏览文件 @
457f693c
...
...
@@ -19,11 +19,16 @@ import copy
import
os
import
subprocess
import
warnings
import
logging
from
paddlerec.core.engine.engine
import
Engine
from
paddlerec.core.factory
import
TrainerFactory
from
paddlerec.core.utils
import
envs
logging
.
basicConfig
(
format
=
"%(asctime)s - %(levelname)s - %(message)s"
)
logger
=
logging
.
getLogger
()
logger
.
setLevel
(
logging
.
INFO
)
class
ClusterEngine
(
Engine
):
def
__init_impl__
(
self
):
...
...
@@ -220,7 +225,8 @@ class ClusterEnvBase(object):
def
env_set
(
self
):
envs
.
set_runtime_environs
(
self
.
cluster_env
)
flattens
=
envs
.
flatten_environs
(
self
.
cluster_env
)
print
(
envs
.
pretty_print_envs
(
flattens
,
(
"Cluster Envs"
,
"Value"
)))
logger
.
info
(
envs
.
pretty_print_envs
(
flattens
,
(
"Cluster Envs"
,
"Value"
)))
class
PaddleCloudMpiEnv
(
ClusterEnvBase
):
...
...
core/engine/local_cluster.py
浏览文件 @
457f693c
...
...
@@ -19,10 +19,15 @@ import copy
import
os
import
sys
import
subprocess
import
logging
from
paddlerec.core.engine.engine
import
Engine
from
paddlerec.core.utils
import
envs
logging
.
basicConfig
(
format
=
"%(asctime)s - %(levelname)s - %(message)s"
)
logger
=
logging
.
getLogger
()
logger
.
setLevel
(
logging
.
INFO
)
class
LocalClusterEngine
(
Engine
):
def
start_procs
(
self
):
...
...
@@ -57,7 +62,8 @@ class LocalClusterEngine(Engine):
]
factory
=
"paddlerec.core.factory"
cmd
=
[
sys
.
executable
,
"-u"
,
"-m"
,
factory
,
self
.
trainer
]
cmd
=
[
sys
.
executable
,
"-u"
,
"-m"
,
factory
,
self
.
trainer
]
#problems
for
i
in
range
(
server_num
):
current_env
.
update
({
...
...
@@ -145,7 +151,7 @@ class LocalClusterEngine(Engine):
if
len
(
log_fns
)
>
0
:
log_fns
[
i
].
close
()
procs
[
i
].
terminate
()
print
(
logger
.
info
(
"all workers already completed, you can view logs under the `{}` directory"
.
format
(
logs_dir
),
file
=
sys
.
stderr
)
...
...
core/engine/local_mpi.py
浏览文件 @
457f693c
...
...
@@ -19,9 +19,14 @@ import copy
import
os
import
sys
import
subprocess
import
logging
from
paddlerec.core.engine.engine
import
Engine
logging
.
basicConfig
(
format
=
"%(asctime)s - %(levelname)s - %(message)s"
)
logger
=
logging
.
getLogger
()
logger
.
setLevel
(
logging
.
INFO
)
class
LocalMPIEngine
(
Engine
):
def
start_procs
(
self
):
...
...
@@ -51,7 +56,7 @@ class LocalMPIEngine(Engine):
if
len
(
log_fns
)
>
0
:
log_fns
[
i
].
close
()
procs
[
i
].
wait
()
print
(
logger
.
info
(
"all workers and parameter servers already completed"
,
file
=
sys
.
stderr
)
...
...
core/factory.py
浏览文件 @
457f693c
...
...
@@ -14,8 +14,13 @@
import
os
import
sys
import
logging
from
paddlerec.core.utils
import
envs
logging
.
basicConfig
(
format
=
"%(asctime)s - %(levelname)s - %(message)s"
)
logger
=
logging
.
getLogger
(
"fluid"
)
logger
.
setLevel
(
logging
.
INFO
)
trainer_abs
=
os
.
path
.
join
(
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
)),
"trainers"
)
trainers
=
{}
...
...
@@ -53,7 +58,7 @@ class TrainerFactory(object):
@
staticmethod
def
_build_trainer
(
yaml_path
):
print
(
envs
.
pretty_print_envs
(
envs
.
get_global_envs
()))
logger
.
info
(
envs
.
pretty_print_envs
(
envs
.
get_global_envs
()))
train_mode
=
envs
.
get_trainer
()
trainer_abs
=
trainers
.
get
(
train_mode
,
None
)
...
...
core/trainer.py
浏览文件 @
457f693c
...
...
@@ -17,11 +17,16 @@ import os
import
time
import
sys
import
traceback
import
logging
from
paddle
import
fluid
from
paddlerec.core.utils
import
envs
logging
.
basicConfig
(
format
=
"%(asctime)s - %(levelname)s - %(message)s"
)
logger
=
logging
.
getLogger
(
"fluid"
)
logger
.
setLevel
(
logging
.
INFO
)
class
EngineMode
:
"""
...
...
@@ -88,7 +93,7 @@ class Trainer(object):
phases
.
append
(
phase
)
self
.
_context
[
"phases"
]
=
phases
print
(
"PaddleRec: Runner {} Begin"
.
format
(
self
.
_runner_name
))
logger
.
info
(
"PaddleRec: Runner {} Begin"
.
format
(
self
.
_runner_name
))
self
.
which_engine
()
self
.
which_device
()
self
.
which_fleet_mode
()
...
...
@@ -107,7 +112,7 @@ class Trainer(object):
self
.
device
=
Device
.
GPU
gpu_id
=
int
(
os
.
environ
.
get
(
'FLAGS_selected_gpus'
,
0
))
self
.
_place
=
fluid
.
CUDAPlace
(
gpu_id
)
print
(
"PaddleRec run on device GPU: {}"
.
format
(
gpu_id
))
logger
.
info
(
"PaddleRec run on device GPU: {}"
.
format
(
gpu_id
))
self
.
_exe
=
fluid
.
Executor
(
self
.
_place
)
elif
device
==
"CPU"
:
self
.
device
=
Device
.
CPU
...
...
@@ -169,7 +174,7 @@ class Trainer(object):
def
which_cluster_type
(
self
):
cluster_type
=
os
.
getenv
(
"PADDLEREC_CLUSTER_TYPE"
,
"MPI"
)
print
(
"PADDLEREC_CLUSTER_TYPE: {}"
.
format
(
cluster_type
))
logger
.
info
(
"PADDLEREC_CLUSTER_TYPE: {}"
.
format
(
cluster_type
))
if
cluster_type
and
cluster_type
.
upper
()
==
"K8S"
:
self
.
_context
[
"cluster_type"
]
=
"K8S"
else
:
...
...
@@ -184,7 +189,7 @@ class Trainer(object):
self
.
is_infer
=
False
else
:
self
.
is_infer
=
True
print
(
"Executor Mode: {}"
.
format
(
executor_mode
))
logger
.
info
(
"Executor Mode: {}"
.
format
(
executor_mode
))
self
.
_context
[
"is_infer"
]
=
self
.
is_infer
def
legality_check
(
self
):
...
...
@@ -224,7 +229,7 @@ class Trainer(object):
Return:
None, just sleep in base
"""
print
(
'unknow context_status:%s, do nothing'
%
context
[
'status'
])
logger
.
info
(
'unknow context_status:%s, do nothing'
%
context
[
'status'
])
time
.
sleep
(
60
)
def
handle_processor_exception
(
self
,
context
,
exception
):
...
...
@@ -233,9 +238,10 @@ class Trainer(object):
Return:
bool exit_app or not
"""
print
(
"
\n
--------------------------------
\n
PaddleRec Error Message "
"Summary:
\n
--------------------------------
\n
"
)
print
(
logger
.
info
(
"
\n
--------------------------------
\n
PaddleRec Error Message "
"Summary:
\n
--------------------------------
\n
"
)
logger
.
info
(
'Exit PaddleRec. catch exception in precoss status: [%s], except: %s'
%
(
context
[
'status'
],
str
(
exception
)))
return
True
...
...
@@ -258,7 +264,7 @@ class Trainer(object):
break
except
Exception
as
err
:
traceback
.
print_exc
()
print
(
'Catch Exception:%s'
%
str
(
err
))
logger
.
info
(
'Catch Exception:%s'
%
str
(
err
))
sys
.
stdout
.
flush
()
self
.
handle_processor_exception
(
self
.
_context
,
err
)
sys
.
exit
(
type
(
err
).
__name__
)
core/trainers/finetuning_trainer.py
浏览文件 @
457f693c
...
...
@@ -34,7 +34,7 @@ class FineTuningTrainer(Trainer):
self
.
runner_env_name
=
"runner."
+
self
.
_context
[
"runner_name"
]
def
processor_register
(
self
):
print
(
"processor_register begin"
)
logger
.
info
(
"processor_register begin"
)
self
.
regist_context_processor
(
'uninit'
,
self
.
instance
)
self
.
regist_context_processor
(
'network_pass'
,
self
.
network
)
self
.
regist_context_processor
(
'startup_pass'
,
self
.
startup
)
...
...
core/trainers/framework/dataset.py
浏览文件 @
457f693c
...
...
@@ -15,6 +15,7 @@
from
__future__
import
print_function
import
os
import
logging
import
paddle.fluid
as
fluid
from
paddlerec.core.utils
import
envs
...
...
@@ -25,6 +26,10 @@ from paddlerec.core.utils.util import split_files, check_filelist
__all__
=
[
"DatasetBase"
,
"DataLoader"
,
"QueueDataset"
]
logging
.
basicConfig
(
format
=
"%(asctime)s - %(levelname)s - %(message)s"
)
logger
=
logging
.
getLogger
(
"fluid"
)
logger
.
setLevel
(
logging
.
INFO
)
class
DatasetBase
(
object
):
"""R
...
...
@@ -83,7 +88,8 @@ class QueueDataset(DatasetBase):
name
=
"dataset."
+
dataset_name
+
"."
type_name
=
envs
.
get_global_env
(
name
+
"type"
)
if
envs
.
get_platform
()
!=
"LINUX"
:
print
(
"platform "
,
envs
.
get_platform
(),
"Reader To Dataloader"
)
logger
.
info
(
"platform "
,
envs
.
get_platform
(),
"Reader To Dataloader"
)
type_name
=
"DataLoader"
if
type_name
==
"DataLoader"
:
...
...
@@ -126,7 +132,7 @@ class QueueDataset(DatasetBase):
data_file_list
=
[],
train_data_path
=
train_data_path
)
if
(
hidden_file_list
is
not
None
):
print
(
logger
.
info
(
"Warning:please make sure there are no hidden files in the dataset folder and check these hidden files:{}"
.
format
(
hidden_file_list
))
...
...
@@ -143,7 +149,7 @@ class QueueDataset(DatasetBase):
if
need_split_files
:
file_list
=
split_files
(
file_list
,
context
[
"fleet"
].
worker_index
(),
context
[
"fleet"
].
worker_num
())
print
(
"File_list: {}"
.
format
(
file_list
))
logger
.
info
(
"File_list: {}"
.
format
(
file_list
))
dataset
.
set_filelist
(
file_list
)
for
model_dict
in
context
[
"phases"
]:
...
...
core/trainers/framework/instance.py
浏览文件 @
457f693c
...
...
@@ -15,6 +15,7 @@
from
__future__
import
print_function
import
warnings
import
logging
import
paddle.fluid
as
fluid
from
paddlerec.core.utils
import
envs
...
...
@@ -24,6 +25,10 @@ __all__ = [
"CollectiveInstance"
]
logging
.
basicConfig
(
format
=
"%(asctime)s - %(levelname)s - %(message)s"
)
logger
=
logging
.
getLogger
(
"fluid"
)
logger
.
setLevel
(
logging
.
INFO
)
class
InstanceBase
(
object
):
"""R
...
...
@@ -38,7 +43,7 @@ class InstanceBase(object):
class
SingleInstance
(
InstanceBase
):
def
__init__
(
self
,
context
):
print
(
"Running SingleInstance."
)
logger
.
info
(
"Running SingleInstance."
)
pass
def
instance
(
self
,
context
):
...
...
@@ -47,7 +52,7 @@ class SingleInstance(InstanceBase):
class
PSInstance
(
InstanceBase
):
def
__init__
(
self
,
context
):
print
(
"Running PSInstance."
)
logger
.
info
(
"Running PSInstance."
)
pass
def
instance
(
self
,
context
):
...
...
@@ -61,7 +66,7 @@ class PSInstance(InstanceBase):
class
PslibInstance
(
InstanceBase
):
def
__init__
(
self
,
context
):
print
(
"Running PslibInstance."
)
logger
.
info
(
"Running PslibInstance."
)
pass
def
instance
(
self
,
context
):
...
...
@@ -73,7 +78,7 @@ class PslibInstance(InstanceBase):
class
CollectiveInstance
(
InstanceBase
):
def
__init__
(
self
,
context
):
print
(
"Running CollectiveInstance."
)
logger
.
info
(
"Running CollectiveInstance."
)
pass
def
instance
(
self
,
context
):
...
...
core/trainers/framework/network.py
浏览文件 @
457f693c
...
...
@@ -16,6 +16,7 @@ from __future__ import print_function
import
os
import
warnings
import
logging
import
paddle.fluid
as
fluid
from
paddlerec.core.utils
import
envs
...
...
@@ -26,6 +27,10 @@ __all__ = [
"CollectiveNetwork"
,
"FineTuningNetwork"
]
logging
.
basicConfig
(
format
=
"%(asctime)s - %(levelname)s - %(message)s"
)
logger
=
logging
.
getLogger
(
"fluid"
)
logger
.
setLevel
(
logging
.
INFO
)
class
NetworkBase
(
object
):
"""R
...
...
@@ -43,7 +48,7 @@ class SingleNetwork(NetworkBase):
"""
def
__init__
(
self
,
context
):
print
(
"Running SingleNetwork."
)
logger
.
info
(
"Running SingleNetwork."
)
pass
def
build_network
(
self
,
context
):
...
...
@@ -114,7 +119,7 @@ class FineTuningNetwork(NetworkBase):
"""
def
__init__
(
self
,
context
):
print
(
"Running FineTuningNetwork."
)
logger
.
info
(
"Running FineTuningNetwork."
)
def
build_network
(
self
,
context
):
context
[
"model"
]
=
{}
...
...
@@ -193,7 +198,7 @@ class FineTuningNetwork(NetworkBase):
class
PSNetwork
(
NetworkBase
):
def
__init__
(
self
,
context
):
print
(
"Running PSNetwork."
)
logger
.
info
(
"Running PSNetwork."
)
pass
def
build_network
(
self
,
context
):
...
...
@@ -285,7 +290,7 @@ class PSNetwork(NetworkBase):
class
PslibNetwork
(
NetworkBase
):
def
__init__
(
self
,
context
):
print
(
"Running PslibNetwork."
)
logger
.
info
(
"Running PslibNetwork."
)
pass
def
build_network
(
self
,
context
):
...
...
@@ -357,7 +362,7 @@ class PslibNetwork(NetworkBase):
class
CollectiveNetwork
(
NetworkBase
):
def
__init__
(
self
,
context
):
print
(
"Running CollectiveNetwork."
)
logger
.
info
(
"Running CollectiveNetwork."
)
pass
def
build_network
(
self
,
context
):
...
...
core/trainers/framework/runner.py
浏览文件 @
457f693c
...
...
@@ -17,6 +17,7 @@ from __future__ import print_function
import
os
import
time
import
warnings
import
logging
import
numpy
as
np
import
paddle.fluid
as
fluid
...
...
@@ -27,6 +28,10 @@ __all__ = [
"RunnerBase"
,
"SingleRunner"
,
"PSRunner"
,
"CollectiveRunner"
,
"PslibRunner"
]
logging
.
basicConfig
(
format
=
"%(asctime)s - %(levelname)s - %(message)s"
)
logger
=
logging
.
getLogger
(
"fluid"
)
logger
.
setLevel
(
logging
.
INFO
)
def
as_numpy
(
tensor
):
"""
...
...
@@ -169,7 +174,7 @@ class RunnerBase(object):
metrics
.
extend
(
metrics_rets
)
if
batch_id
%
fetch_period
==
0
and
batch_id
!=
0
:
print
(
metrics_format
.
format
(
*
metrics
))
logger
.
info
(
metrics_format
.
format
(
*
metrics
))
batch_id
+=
1
except
fluid
.
core
.
EOFException
:
reader
.
reset
()
...
...
@@ -365,7 +370,7 @@ class SingleRunner(RunnerBase):
"""
def
__init__
(
self
,
context
):
print
(
"Running SingleRunner."
)
logger
.
info
(
"Running SingleRunner."
)
pass
def
run
(
self
,
context
):
...
...
@@ -381,7 +386,7 @@ class SingleRunner(RunnerBase):
result
=
self
.
_run
(
context
,
model_dict
)
end_time
=
time
.
time
()
seconds
=
end_time
-
begin_time
message
=
"epoch {} done, use time: {}"
.
format
(
epoch
,
seconds
)
message
=
"epoch {} done, use time: {}
s
"
.
format
(
epoch
,
seconds
)
metrics_result
=
[]
for
key
in
metrics
:
if
isinstance
(
metrics
[
key
],
Metric
):
...
...
@@ -394,7 +399,7 @@ class SingleRunner(RunnerBase):
metrics_result
.
append
(
_str
)
if
len
(
metrics_result
)
>
0
:
message
+=
", global metrics: "
+
", "
.
join
(
metrics_result
)
print
(
message
)
logger
.
info
(
message
)
with
fluid
.
scope_guard
(
context
[
"model"
][
model_dict
[
"name"
]][
"scope"
]):
...
...
@@ -409,7 +414,7 @@ class SingleRunner(RunnerBase):
class
PSRunner
(
RunnerBase
):
def
__init__
(
self
,
context
):
print
(
"Running PSRunner."
)
logger
.
info
(
"Running PSRunner."
)
pass
def
run
(
self
,
context
):
...
...
@@ -424,7 +429,7 @@ class PSRunner(RunnerBase):
result
=
self
.
_run
(
context
,
model_dict
)
end_time
=
time
.
time
()
seconds
=
end_time
-
begin_time
message
=
"epoch {} done, use time: {}"
.
format
(
epoch
,
seconds
)
message
=
"epoch {} done, use time: {}
s
"
.
format
(
epoch
,
seconds
)
# TODO, wait for PaddleCloudRoleMaker supports gloo
from
paddle.fluid.incubate.fleet.base.role_maker
import
GeneralRoleMaker
...
...
@@ -442,7 +447,7 @@ class PSRunner(RunnerBase):
metrics_result
.
append
(
_str
)
if
len
(
metrics_result
)
>
0
:
message
+=
", global metrics: "
+
", "
.
join
(
metrics_result
)
print
(
message
)
logger
.
info
(
message
)
with
fluid
.
scope_guard
(
context
[
"model"
][
model_dict
[
"name"
]][
"scope"
]):
train_prog
=
context
[
"model"
][
model_dict
[
"name"
]][
...
...
@@ -456,7 +461,7 @@ class PSRunner(RunnerBase):
class
CollectiveRunner
(
RunnerBase
):
def
__init__
(
self
,
context
):
print
(
"Running CollectiveRunner."
)
logger
.
info
(
"Running CollectiveRunner."
)
pass
def
run
(
self
,
context
):
...
...
@@ -469,7 +474,7 @@ class CollectiveRunner(RunnerBase):
self
.
_run
(
context
,
model_dict
)
end_time
=
time
.
time
()
seconds
=
end_time
-
begin_time
print
(
"epoch {} done, use time: {}
"
.
format
(
epoch
,
seconds
))
logger
.
info
(
"epoch {} done, use time: {}s
"
.
format
(
epoch
,
seconds
))
with
fluid
.
scope_guard
(
context
[
"model"
][
model_dict
[
"name"
]][
"scope"
]):
train_prog
=
context
[
"model"
][
model_dict
[
"name"
]][
...
...
@@ -483,7 +488,7 @@ class CollectiveRunner(RunnerBase):
class
PslibRunner
(
RunnerBase
):
def
__init__
(
self
,
context
):
print
(
"Running PSRunner."
)
logger
.
info
(
"Running PSRunner."
)
pass
def
run
(
self
,
context
):
...
...
@@ -497,7 +502,7 @@ class PslibRunner(RunnerBase):
self
.
_run
(
context
,
model_dict
)
end_time
=
time
.
time
()
seconds
=
end_time
-
begin_time
print
(
"epoch {} done, use time: {}
"
.
format
(
epoch
,
seconds
))
logger
.
info
(
"epoch {} done, use time: {}s
"
.
format
(
epoch
,
seconds
))
"""
# online Training Can do more, As shown below:
...
...
@@ -527,7 +532,7 @@ class PslibRunner(RunnerBase):
self._run(context, model_dict)
end_time = time.time()
seconds = end_time - begin_time
print
("epoch {} done, use time: {}".format(epoch, seconds))
logger.info
("epoch {} done, use time: {}".format(epoch, seconds))
with fluid.scope_guard(context["model"][model_dict["name"]]
["scope"]):
train_prog = context["model"][model_dict["name"]][
...
...
@@ -543,7 +548,7 @@ class PslibRunner(RunnerBase):
class
SingleInferRunner
(
RunnerBase
):
def
__init__
(
self
,
context
):
print
(
"Running SingleInferRunner."
)
logger
.
info
(
"Running SingleInferRunner."
)
pass
def
run
(
self
,
context
):
...
...
@@ -559,7 +564,7 @@ class SingleInferRunner(RunnerBase):
result
=
self
.
_run
(
context
,
model_dict
)
end_time
=
time
.
time
()
seconds
=
end_time
-
begin_time
message
=
"Infer {} of epoch {} done, use time: {}"
.
format
(
message
=
"Infer {} of epoch {} done, use time: {}
s
"
.
format
(
model_dict
[
"name"
],
epoch_name
,
seconds
)
metrics_result
=
[]
for
key
in
metrics
:
...
...
@@ -573,14 +578,14 @@ class SingleInferRunner(RunnerBase):
metrics_result
.
append
(
_str
)
if
len
(
metrics_result
)
>
0
:
message
+=
", global metrics: "
+
", "
.
join
(
metrics_result
)
print
(
message
)
logger
.
info
(
message
)
context
[
"status"
]
=
"terminal_pass"
def
_load
(
self
,
context
,
model_dict
,
model_path
):
if
model_path
is
None
or
model_path
==
""
:
return
print
(
"load persistables from"
,
model_path
)
logger
.
info
(
"load persistables from"
,
model_path
)
with
fluid
.
scope_guard
(
context
[
"model"
][
model_dict
[
"name"
]][
"scope"
]):
train_prog
=
context
[
"model"
][
model_dict
[
"name"
]][
"main_program"
]
...
...
core/trainers/framework/startup.py
浏览文件 @
457f693c
...
...
@@ -15,6 +15,7 @@
from
__future__
import
print_function
import
warnings
import
logging
import
paddle.fluid
as
fluid
import
paddle.fluid.core
as
core
...
...
@@ -25,6 +26,10 @@ __all__ = [
"FineTuningStartup"
]
logging
.
basicConfig
(
format
=
"%(asctime)s - %(levelname)s - %(message)s"
)
logger
=
logging
.
getLogger
(
"fluid"
)
logger
.
setLevel
(
logging
.
INFO
)
class
StartupBase
(
object
):
"""R
...
...
@@ -41,10 +46,10 @@ class StartupBase(object):
"runner."
+
context
[
"runner_name"
]
+
".init_model_path"
,
None
)
if
dirname
is
None
or
dirname
==
""
:
return
print
(
"going to load "
,
dirname
)
logger
.
info
(
"going to load "
,
dirname
)
fluid
.
io
.
load_persistables
(
context
[
"exe"
],
dirname
,
main_program
=
main_program
)
print
(
"load from {} success"
.
format
(
dirname
))
logger
.
info
(
"load from {} success"
.
format
(
dirname
))
class
SingleStartup
(
StartupBase
):
...
...
@@ -52,7 +57,7 @@ class SingleStartup(StartupBase):
"""
def
__init__
(
self
,
context
):
print
(
"Running SingleStartup."
)
logger
.
info
(
"Running SingleStartup."
)
pass
def
startup
(
self
,
context
):
...
...
@@ -79,7 +84,7 @@ class FineTuningStartup(StartupBase):
self
.
self
.
op_role_var_attr_name
=
core
.
op_proto_and_checker_maker
.
kOpRoleVarAttrName
(
)
print
(
"Running SingleStartup."
)
logger
.
info
(
"Running SingleStartup."
)
def
_is_opt_role_op
(
self
,
op
):
# NOTE: depend on oprole to find out whether this op is for
...
...
@@ -155,7 +160,7 @@ class FineTuningStartup(StartupBase):
"runner."
+
context
[
"runner_name"
]
+
".init_model_path"
,
None
)
if
dirname
is
None
or
dirname
==
""
:
return
print
(
"going to load "
,
dirname
)
logger
.
info
(
"going to load "
,
dirname
)
params_grads
=
self
.
_get_params_grads
(
main_program
)
update_params
=
[
p
for
p
,
_
in
params_grads
]
...
...
@@ -169,7 +174,7 @@ class FineTuningStartup(StartupBase):
fluid
.
io
.
load_vars
(
context
[
"exe"
],
dirname
,
main_program
,
need_load_vars
)
print
(
"load from {} success"
.
format
(
dirname
))
logger
.
info
(
"load from {} success"
.
format
(
dirname
))
def
startup
(
self
,
context
):
for
model_dict
in
context
[
"phases"
]:
...
...
@@ -187,7 +192,7 @@ class FineTuningStartup(StartupBase):
class
PSStartup
(
StartupBase
):
def
__init__
(
self
,
context
):
print
(
"Running PSStartup."
)
logger
.
info
(
"Running PSStartup."
)
pass
def
startup
(
self
,
context
):
...
...
@@ -204,7 +209,7 @@ class PSStartup(StartupBase):
class
CollectiveStartup
(
StartupBase
):
def
__init__
(
self
,
context
):
print
(
"Running CollectiveStartup."
)
logger
.
info
(
"Running CollectiveStartup."
)
pass
def
startup
(
self
,
context
):
...
...
@@ -222,7 +227,7 @@ class CollectiveStartup(StartupBase):
class
SingleInferStartup
(
StartupBase
):
def
__init__
(
self
,
context
):
print
(
"Running SingleInferStartup."
)
logger
.
info
(
"Running SingleInferStartup."
)
pass
def
startup
(
self
,
context
):
...
...
core/trainers/framework/terminal.py
浏览文件 @
457f693c
...
...
@@ -15,12 +15,17 @@
from
__future__
import
print_function
import
warnings
import
logging
import
paddle.fluid
as
fluid
from
paddlerec.core.utils
import
envs
__all__
=
[
"TerminalBase"
,
"PSTerminalBase"
]
logging
.
basicConfig
(
format
=
"%(asctime)s - %(levelname)s - %(message)s"
)
logger
=
logging
.
getLogger
(
"fluid"
)
logger
.
setLevel
(
logging
.
INFO
)
class
TerminalBase
(
object
):
"""R
...
...
@@ -30,7 +35,7 @@ class TerminalBase(object):
pass
def
terminal
(
self
,
context
):
print
(
"PaddleRec Finish"
)
logger
.
info
(
"PaddleRec Finish"
)
class
PSTerminal
(
TerminalBase
):
...
...
@@ -42,4 +47,4 @@ class PSTerminal(TerminalBase):
def
terminal
(
self
,
context
):
context
[
"fleet"
].
stop_worker
()
print
(
"PaddleRec Finish"
)
logger
.
info
(
"PaddleRec Finish"
)
core/trainers/general_trainer.py
浏览文件 @
457f693c
...
...
@@ -17,10 +17,15 @@ General Trainer, applicable to many situations: Single/Cluster/Local_Cluster + P
from
__future__
import
print_function
import
os
import
logging
from
paddlerec.core.utils
import
envs
from
paddlerec.core.trainer
import
Trainer
,
EngineMode
,
FleetMode
logging
.
basicConfig
(
format
=
"%(asctime)s - %(levelname)s - %(message)s"
)
logger
=
logging
.
getLogger
()
logger
.
setLevel
(
logging
.
INFO
)
class
GeneralTrainer
(
Trainer
):
"""
...
...
@@ -34,7 +39,7 @@ class GeneralTrainer(Trainer):
self
.
runner_env_name
=
"runner."
+
self
.
_context
[
"runner_name"
]
def
processor_register
(
self
):
print
(
"processor_register begin"
)
logger
.
info
(
"processor_register begin"
)
self
.
regist_context_processor
(
'uninit'
,
self
.
instance
)
self
.
regist_context_processor
(
'network_pass'
,
self
.
network
)
self
.
regist_context_processor
(
'startup_pass'
,
self
.
startup
)
...
...
core/utils/dataloader_instance.py
浏览文件 @
457f693c
...
...
@@ -14,6 +14,7 @@
from
__future__
import
print_function
import
os
import
logging
from
paddlerec.core.utils.envs
import
lazy_instance_by_fliename
from
paddlerec.core.utils.envs
import
get_global_env
from
paddlerec.core.utils.envs
import
get_runtime_environ
...
...
@@ -21,6 +22,10 @@ from paddlerec.core.reader import SlotReader
from
paddlerec.core.trainer
import
EngineMode
from
paddlerec.core.utils.util
import
split_files
,
check_filelist
logging
.
basicConfig
(
format
=
"%(asctime)s - %(levelname)s - %(message)s"
)
logger
=
logging
.
getLogger
()
logger
.
setLevel
(
logging
.
INFO
)
def
dataloader_by_name
(
readerclass
,
dataset_name
,
...
...
@@ -41,7 +46,7 @@ def dataloader_by_name(readerclass,
hidden_file_list
,
files
=
check_filelist
(
hidden_file_list
=
[],
data_file_list
=
[],
train_data_path
=
data_path
)
if
(
hidden_file_list
is
not
None
):
print
(
logger
.
info
(
"Warning:please make sure there are no hidden files in the dataset folder and check these hidden files:{}"
.
format
(
hidden_file_list
))
...
...
@@ -55,7 +60,7 @@ def dataloader_by_name(readerclass,
"cluster_type"
]
==
"K8S"
:
# for k8s mount mode, split files for every node
need_split_files
=
True
print
(
"need_split_files: {}"
.
format
(
need_split_files
))
logger
.
info
(
"need_split_files: {}"
.
format
(
need_split_files
))
if
need_split_files
:
files
=
split_files
(
files
,
context
[
"fleet"
].
worker_index
(),
context
[
"fleet"
].
worker_num
())
...
...
@@ -103,7 +108,7 @@ def slotdataloader_by_name(readerclass, dataset_name, yaml_file, context):
hidden_file_list
,
files
=
check_filelist
(
hidden_file_list
=
[],
data_file_list
=
[],
train_data_path
=
data_path
)
if
(
hidden_file_list
is
not
None
):
print
(
logger
.
info
(
"Warning:please make sure there are no hidden files in the dataset folder and check these hidden files:{}"
.
format
(
hidden_file_list
))
...
...
@@ -173,7 +178,7 @@ def slotdataloader(readerclass, train, yaml_file, context):
hidden_file_list
,
files
=
check_filelist
(
hidden_file_list
=
[],
data_file_list
=
[],
train_data_path
=
data_path
)
if
(
hidden_file_list
is
not
None
):
print
(
logger
.
info
(
"Warning:please make sure there are no hidden files in the dataset folder and check these hidden files:{}"
.
format
(
hidden_file_list
))
...
...
core/utils/dataset_holder.py
浏览文件 @
457f693c
...
...
@@ -15,12 +15,17 @@
import
abc
import
datetime
import
time
import
logging
import
paddle.fluid
as
fluid
from
paddlerec.core.utils
import
fs
as
fs
from
paddlerec.core.utils
import
util
as
util
logging
.
basicConfig
(
format
=
"%(asctime)s - %(levelname)s - %(message)s"
)
logger
=
logging
.
getLogger
(
"fluid"
)
logger
.
setLevel
(
logging
.
INFO
)
class
DatasetHolder
(
object
):
"""
...
...
@@ -187,7 +192,7 @@ class TimeSplitDatasetHolder(DatasetHolder):
windown_min
=
params
[
'time_window_min'
]
if
begin_time
not
in
self
.
_datasets
:
while
self
.
check_ready
(
begin_time
,
windown_min
)
==
False
:
print
(
"dataset not ready, time:"
+
begin_time
)
logger
.
info
(
"dataset not ready, time:"
+
begin_time
)
time
.
sleep
(
30
)
file_list
=
self
.
get_file_list
(
begin_time
,
windown_min
,
params
[
'node_num'
],
...
...
core/utils/envs.py
浏览文件 @
457f693c
...
...
@@ -21,6 +21,12 @@ import sys
import
six
import
traceback
import
six
import
time
import
logging
logging
.
basicConfig
(
format
=
"%(asctime)s - %(levelname)s - %(message)s"
)
logger
=
logging
.
getLogger
()
logger
.
setLevel
(
logging
.
INFO
)
global_envs
=
{}
global_envs_flatten
=
{}
...
...
@@ -104,7 +110,7 @@ def set_global_envs(envs):
global_envs
[
name
]
=
"DataLoader"
if
get_platform
()
==
"LINUX"
and
six
.
PY3
:
print
(
"QueueDataset can not support PY3, change to DataLoader"
)
logger
.
info
(
"QueueDataset can not support PY3, change to DataLoader"
)
for
dataset
in
envs
[
"dataset"
]:
name
=
"."
.
join
([
"dataset"
,
dataset
[
"name"
],
"type"
])
global_envs
[
name
]
=
"DataLoader"
...
...
@@ -207,7 +213,7 @@ def lazy_instance_by_package(package, class_name):
return
instance
except
Exception
as
err
:
traceback
.
print_exc
()
print
(
'Catch Exception:%s'
%
str
(
err
))
logger
.
info
(
'Catch Exception:%s'
%
str
(
err
))
return
None
...
...
@@ -223,7 +229,7 @@ def lazy_instance_by_fliename(abs, class_name):
return
instance
except
Exception
as
err
:
traceback
.
print_exc
()
print
(
'Catch Exception:%s'
%
str
(
err
))
logger
.
info
(
'Catch Exception:%s'
%
str
(
err
))
return
None
...
...
core/utils/fs.py
浏览文件 @
457f693c
...
...
@@ -13,9 +13,15 @@
# limitations under the License.
import
os
import
time
import
logging
from
paddle.fluid.incubate.fleet.utils.hdfs
import
HDFSClient
logging
.
basicConfig
(
format
=
"%(asctime)s - %(levelname)s - %(message)s"
)
logger
=
logging
.
getLogger
()
logger
.
setLevel
(
logging
.
INFO
)
def
is_afs_path
(
path
):
"""is_afs_path
...
...
@@ -177,4 +183,4 @@ class FileHandler(object):
return
self
.
_hdfs_client
.
upload
(
dest_path
,
org_path
)
if
org_is_afs
and
not
dest_is_afs
:
return
self
.
_hdfs_client
.
download
(
org_path
,
dest_path
)
print
(
"Not Suppor hdfs cp currently"
)
logger
.
info
(
"Not Suppor hdfs cp currently"
)
core/utils/util.py
浏览文件 @
457f693c
...
...
@@ -16,9 +16,14 @@ import datetime
import
os
import
sys
import
time
import
logging
import
numpy
as
np
from
paddle
import
fluid
logging
.
basicConfig
(
format
=
"%(asctime)s - %(levelname)s - %(message)s"
)
logger
=
logging
.
getLogger
(
"fluid"
)
logger
.
setLevel
(
logging
.
INFO
)
def
save_program_proto
(
path
,
program
=
None
):
if
program
is
None
:
...
...
@@ -146,9 +151,9 @@ def print_log(log_str, params):
log_str
=
time_str
+
" "
+
log_str
if
'master'
in
params
and
params
[
'master'
]:
if
'index'
in
params
and
params
[
'index'
]
==
0
:
print
(
log_str
)
logger
.
info
(
log_str
)
else
:
print
(
log_str
)
logger
.
info
(
log_str
)
sys
.
stdout
.
flush
()
if
'stdout'
in
params
:
params
[
'stdout'
]
+=
log_str
+
'
\n
'
...
...
core/utils/validation.py
浏览文件 @
457f693c
...
...
@@ -12,8 +12,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import
time
import
logging
from
paddlerec.core.utils
import
envs
logging
.
basicConfig
(
format
=
"%(asctime)s - %(levelname)s - %(message)s"
)
logger
=
logging
.
getLogger
()
logger
.
setLevel
(
logging
.
INFO
)
class
ValueFormat
:
def
__init__
(
self
,
value_type
,
value
,
value_handler
,
required
=
False
):
...
...
@@ -41,64 +47,67 @@ class ValueFormat:
def
is_type_valid
(
self
,
name
,
value
):
if
self
.
value_type
==
"int"
:
if
not
isinstance
(
value
,
int
):
print
(
"
\n
attr {} should be int, but {} now
\n
"
.
format
(
logger
.
info
(
"
\n
attr {} should be int, but {} now
\n
"
.
format
(
name
,
type
(
value
)))
return
False
return
True
elif
self
.
value_type
==
"str"
:
if
not
isinstance
(
value
,
str
):
print
(
"
\n
attr {} should be str, but {} now
\n
"
.
format
(
logger
.
info
(
"
\n
attr {} should be str, but {} now
\n
"
.
format
(
name
,
type
(
value
)))
return
False
return
True
elif
self
.
value_type
==
"strs"
:
if
not
isinstance
(
value
,
list
):
print
(
"
\n
attr {} should be list(str), but {} now
\n
"
.
format
(
name
,
type
(
value
)))
logger
.
info
(
"
\n
attr {} should be list(str), but {} now
\n
"
.
format
(
name
,
type
(
value
)))
return
False
for
v
in
value
:
if
not
isinstance
(
v
,
str
):
print
(
"
\n
attr {} should be list(str), but list({}) now
\n
"
.
format
(
name
,
type
(
v
)))
logger
.
info
(
"
\n
attr {} should be list(str), but list({}) now
\n
"
.
format
(
name
,
type
(
v
)))
return
False
return
True
elif
self
.
value_type
==
"dict"
:
if
not
isinstance
(
value
,
dict
):
print
(
"
\n
attr {} should be str, but {} now
\n
"
.
format
(
logger
.
info
(
"
\n
attr {} should be str, but {} now
\n
"
.
format
(
name
,
type
(
value
)))
return
False
return
True
elif
self
.
value_type
==
"dicts"
:
if
not
isinstance
(
value
,
list
):
print
(
"
\n
attr {} should be list(dist), but {} now
\n
"
.
format
(
name
,
type
(
value
)))
logger
.
info
(
"
\n
attr {} should be list(dist), but {} now
\n
"
.
format
(
name
,
type
(
value
)))
return
False
for
v
in
value
:
if
not
isinstance
(
v
,
dict
):
print
(
"
\n
attr {} should be list(dist), but list({}) now
\n
"
.
format
(
name
,
type
(
v
)))
logger
.
info
(
"
\n
attr {} should be list(dist), but list({}) now
\n
"
.
format
(
name
,
type
(
v
)))
return
False
return
True
elif
self
.
value_type
==
"ints"
:
if
not
isinstance
(
value
,
list
):
print
(
"
\n
attr {} should be list(int), but {} now
\n
"
.
format
(
name
,
type
(
value
)))
logger
.
info
(
"
\n
attr {} should be list(int), but {} now
\n
"
.
format
(
name
,
type
(
value
)))
return
False
for
v
in
value
:
if
not
isinstance
(
v
,
int
):
print
(
"
\n
attr {} should be list(int), but list({}) now
\n
"
.
format
(
name
,
type
(
v
)))
logger
.
info
(
"
\n
attr {} should be list(int), but list({}) now
\n
"
.
format
(
name
,
type
(
v
)))
return
False
return
True
else
:
print
(
"
\n
attr {}'s type is {}, can not be supported now
\n
"
.
format
(
name
,
type
(
value
)))
logger
.
info
(
"
\n
attr {}'s type is {}, can not be supported now
\n
"
.
format
(
name
,
type
(
value
)))
return
False
def
is_value_valid
(
self
,
name
,
value
):
...
...
@@ -108,7 +117,7 @@ class ValueFormat:
def
in_value_handler
(
name
,
value
,
values
):
if
value
not
in
values
:
print
(
"
\n
attr {}'s value is {}, but {} is expected
\n
"
.
format
(
logger
.
info
(
"
\n
attr {}'s value is {}, but {} is expected
\n
"
.
format
(
name
,
value
,
values
))
return
False
return
True
...
...
@@ -116,7 +125,7 @@ def in_value_handler(name, value, values):
def
eq_value_handler
(
name
,
value
,
values
):
if
value
!=
values
:
print
(
"
\n
attr {}'s value is {}, but == {} is expected
\n
"
.
format
(
logger
.
info
(
"
\n
attr {}'s value is {}, but == {} is expected
\n
"
.
format
(
name
,
value
,
values
))
return
False
return
True
...
...
@@ -124,7 +133,7 @@ def eq_value_handler(name, value, values):
def
ge_value_handler
(
name
,
value
,
values
):
if
value
<
values
:
print
(
"
\n
attr {}'s value is {}, but >= {} is expected
\n
"
.
format
(
logger
.
info
(
"
\n
attr {}'s value is {}, but >= {} is expected
\n
"
.
format
(
name
,
value
,
values
))
return
False
return
True
...
...
@@ -132,7 +141,7 @@ def ge_value_handler(name, value, values):
def
le_value_handler
(
name
,
value
,
values
):
if
value
>
values
:
print
(
"
\n
attr {}'s value is {}, but <= {} is expected
\n
"
.
format
(
logger
.
info
(
"
\n
attr {}'s value is {}, but <= {} is expected
\n
"
.
format
(
name
,
value
,
values
))
return
False
return
True
...
...
@@ -160,8 +169,8 @@ def yaml_validation(config):
for
required
in
require_checkers
:
if
required
not
in
_config
.
keys
():
print
(
"
\n
can not find {} in yaml, which is required
\n
"
.
format
(
required
))
logger
.
info
(
"
\n
can not find {} in yaml, which is required
\n
"
.
format
(
required
))
return
False
for
name
,
value
in
_config
.
items
():
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录