Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
PaddleRec
提交
15d8501e
P
PaddleRec
项目概览
BaiXuePrincess
/
PaddleRec
与 Fork 源项目一致
Fork自
PaddlePaddle / PaddleRec
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleRec
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
15d8501e
编写于
8月 18, 2020
作者:
L
liuyuhui
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
mv check_filelist to util.py and add Warning for hidden files,test=develop
上级
d229f76f
变更
3
隐藏空白更改
内联
并排
Showing
3 changed files
with
46 additions
and
68 deletions
+46
-68
core/trainers/framework/dataset.py
core/trainers/framework/dataset.py
+9
-25
core/utils/dataloader_instance.py
core/utils/dataloader_instance.py
+15
-43
core/utils/util.py
core/utils/util.py
+22
-0
未找到文件。
core/trainers/framework/dataset.py
浏览文件 @
15d8501e
...
...
@@ -21,7 +21,7 @@ from paddlerec.core.utils import envs
from
paddlerec.core.utils
import
dataloader_instance
from
paddlerec.core.reader
import
SlotReader
from
paddlerec.core.trainer
import
EngineMode
from
paddlerec.core.utils.util
import
split_files
from
paddlerec.core.utils.util
import
split_files
,
check_filelist
__all__
=
[
"DatasetBase"
,
"DataLoader"
,
"QueueDataset"
]
...
...
@@ -89,24 +89,6 @@ class QueueDataset(DatasetBase):
else
:
return
self
.
_get_dataset
(
dataset_name
,
context
)
def check_filelist(self, file_list, train_data_path):
    """Collect the paths of all non-hidden files under ``train_data_path``.

    Walks the directory tree rooted at ``train_data_path``, skipping any
    file or directory whose name starts with ``'.'`` (hidden entries are
    also pruned from descent), and appends each remaining file's full path
    to ``file_list``.

    Args:
        file_list (list): accumulator the discovered paths are appended to
            (mutated in place).
        train_data_path (str): root directory to scan.

    Returns:
        list: the same ``file_list`` object, for caller convenience. An
        empty or nonexistent directory yields ``file_list`` unchanged
        (never ``None``, so callers can iterate the result safely).
    """
    for root, dirs, files in os.walk(train_data_path):
        # Drop hidden files; mutating dirs in place (dirs[:]) prunes
        # os.walk's own descent into hidden directories.
        files = [f for f in files if not f[0] == '.']
        dirs[:] = [d for d in dirs if not d[0] == '.']
        for file_name in files:
            # os.walk's root is already the correct parent for this level,
            # so no manual recursion is needed.
            file_list.append(os.path.join(root, file_name))
    return file_list
def
_get_dataset
(
self
,
dataset_name
,
context
):
name
=
"dataset."
+
dataset_name
+
"."
reader_class
=
envs
.
get_global_env
(
name
+
"data_converter"
)
...
...
@@ -137,12 +119,14 @@ class QueueDataset(DatasetBase):
dataset
.
set_pipe_command
(
pipe_cmd
)
train_data_path
=
envs
.
get_global_env
(
name
+
"data_path"
)
# file_list = [
# os.path.join(train_data_path, x)
# for x in os.listdir(train_data_path)
# ]
file_list
=
[]
file_list
=
self
.
check_filelist
(
file_list
,
train_data_path
)
hidden_file_list
,
file_list
=
check_filelist
(
hidden_file_list
=
[],
data_file_list
=
[],
train_data_path
=
train_data_path
)
if
(
hidden_file_list
is
not
None
):
print
(
"Warning:please make sure there are no hidden files in the dataset folder and check these hidden files:{}"
.
format
(
hidden_file_list
))
if
context
[
"engine"
]
==
EngineMode
.
LOCAL_CLUSTER
:
file_list
=
split_files
(
file_list
,
context
[
"fleet"
].
worker_index
(),
...
...
core/utils/dataloader_instance.py
浏览文件 @
15d8501e
...
...
@@ -19,7 +19,7 @@ from paddlerec.core.utils.envs import get_global_env
from
paddlerec.core.utils.envs
import
get_runtime_environ
from
paddlerec.core.reader
import
SlotReader
from
paddlerec.core.trainer
import
EngineMode
from
paddlerec.core.utils.util
import
split_files
from
paddlerec.core.utils.util
import
split_files
,
check_filelist
def
dataloader_by_name
(
readerclass
,
...
...
@@ -38,27 +38,13 @@ def dataloader_by_name(readerclass,
assert
package_base
is
not
None
data_path
=
os
.
path
.
join
(
package_base
,
data_path
.
split
(
"::"
)[
1
])
def check_filelist(file_list, train_data_path):
    """Collect the paths of all non-hidden files under ``train_data_path``.

    Hidden files and directories (names starting with ``'.'``) are skipped,
    and hidden directories are pruned from descent entirely.

    Args:
        file_list (list): accumulator the discovered paths are appended to
            (mutated in place).
        train_data_path (str): root directory to scan.

    Returns:
        list: the same ``file_list`` object. An empty or nonexistent
        directory returns ``file_list`` unchanged (never ``None``).
    """
    for root, dirs, files in os.walk(train_data_path):
        # Mutating dirs in place (dirs[:]) prunes os.walk's descent into
        # hidden directories; filtering files drops hidden files.
        files = [f for f in files if not f[0] == '.']
        dirs[:] = [d for d in dirs if not d[0] == '.']
        for file_name in files:
            # root is the correct parent directory at every depth, so
            # os.walk replaces the previous hand-rolled recursion.
            file_list.append(os.path.join(root, file_name))
    return file_list
#files = [str(data_path) + "/%s" % x for x in os.listdir(data_path)]
files
=
[]
files
=
check_filelist
(
files
,
data_path
)
hidden_file_list
,
files
=
check_filelist
(
hidden_file_list
=
[],
data_file_list
=
[],
train_data_path
=
data_path
)
if
(
hidden_file_list
is
not
None
):
print
(
"Warning:please make sure there are no hidden files in the dataset folder and check these hidden files:{}"
.
format
(
hidden_file_list
))
if
context
[
"engine"
]
==
EngineMode
.
LOCAL_CLUSTER
:
files
=
split_files
(
files
,
context
[
"fleet"
].
worker_index
(),
context
[
"fleet"
].
worker_num
())
...
...
@@ -100,27 +86,13 @@ def slotdataloader_by_name(readerclass, dataset_name, yaml_file, context):
assert
package_base
is
not
None
data_path
=
os
.
path
.
join
(
package_base
,
data_path
.
split
(
"::"
)[
1
])
def check_filelist(file_list, train_data_path):
    """Collect the paths of all non-hidden files under ``train_data_path``.

    Hidden files and directories (names starting with ``'.'``) are skipped,
    and hidden directories are pruned from descent entirely.

    Args:
        file_list (list): accumulator the discovered paths are appended to
            (mutated in place).
        train_data_path (str): root directory to scan.

    Returns:
        list: the same ``file_list`` object. An empty or nonexistent
        directory returns ``file_list`` unchanged (never ``None``).
    """
    for root, dirs, files in os.walk(train_data_path):
        # Mutating dirs in place (dirs[:]) prunes os.walk's descent into
        # hidden directories; filtering files drops hidden files.
        files = [f for f in files if not f[0] == '.']
        dirs[:] = [d for d in dirs if not d[0] == '.']
        for file_name in files:
            # root is the correct parent directory at every depth, so
            # os.walk replaces the previous hand-rolled recursion.
            file_list.append(os.path.join(root, file_name))
    return file_list
#files = [str(data_path) + "/%s" % x for x in os.listdir(data_path)]
files
=
[]
files
=
check_filelist
(
files
,
data_path
)
hidden_file_list
,
files
=
check_filelist
(
hidden_file_list
=
[],
data_file_list
=
[],
train_data_path
=
data_path
)
if
(
hidden_file_list
is
not
None
):
print
(
"Warning:please make sure there are no hidden files in the dataset folder and check these hidden files:{}"
.
format
(
hidden_file_list
))
if
context
[
"engine"
]
==
EngineMode
.
LOCAL_CLUSTER
:
files
=
split_files
(
files
,
context
[
"fleet"
].
worker_index
(),
context
[
"fleet"
].
worker_num
())
...
...
core/utils/util.py
浏览文件 @
15d8501e
...
...
@@ -201,6 +201,28 @@ def split_files(files, trainer_id, trainers):
return
trainer_files
[
trainer_id
]
def check_filelist(hidden_file_list, data_file_list, train_data_path):
    """Partition the contents of ``train_data_path`` into hidden and data entries.

    Non-hidden files are recorded as data files; hidden files (names
    starting with ``'.'``) and hidden directories are recorded in
    ``hidden_file_list`` so callers can warn about them. Non-hidden
    directories are recursed into; a hidden directory is recorded as a
    whole and its contents are not scanned.

    Args:
        hidden_file_list (list): accumulator for hidden file/dir paths
            (mutated in place).
        data_file_list (list): accumulator for regular data file paths
            (mutated in place).
        train_data_path (str): directory to scan.

    Returns:
        tuple: ``(hidden_file_list, data_file_list)`` — the same list
        objects passed in. An empty or nonexistent directory returns the
        accumulators unchanged.
    """
    for root, dirs, files in os.walk(train_data_path):
        for file_name in files:
            file_path = os.path.join(train_data_path, file_name)
            if file_name.startswith('.'):
                hidden_file_list.append(file_path)
            else:
                data_file_list.append(file_path)
        for dir_name in dirs:
            dir_path = os.path.join(train_data_path, dir_name)
            if dir_name.startswith('.'):
                hidden_file_list.append(dir_path)
            else:
                # Recurse manually so nested paths are joined against their
                # own parent directory rather than the top-level root.
                check_filelist(hidden_file_list, data_file_list, dir_path)
        # Only the first os.walk() entry (train_data_path itself) is needed;
        # subdirectories are handled by the recursive calls above. The old
        # `files == None and dirs == None` guard was dead code: os.walk
        # yields lists, never None.
        break
    return hidden_file_list, data_file_list
class
CostPrinter
(
object
):
"""
For count cost time && print cost log
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录