Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
PaddleRec
提交
af7c0e7d
P
PaddleRec
项目概览
BaiXuePrincess
/
PaddleRec
与 Fork 源项目一致
Fork自
PaddlePaddle / PaddleRec
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleRec
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
af7c0e7d
编写于
8月 21, 2020
作者:
W
wuzhihua
提交者:
GitHub
8月 21, 2020
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #174 from vslyu/vslyu-fixhidefiles
fix read hide files bug for reader bug
上级
12fc8c82
373b88d3
变更
3
显示空白变更内容
内联
并排
Showing
3 changed files
with
54 additions
and
11 deletions
+54
-11
core/trainers/framework/dataset.py
core/trainers/framework/dataset.py
+10
-5
core/utils/dataloader_instance.py
core/utils/dataloader_instance.py
+22
-6
core/utils/util.py
core/utils/util.py
+22
-0
未找到文件。
core/trainers/framework/dataset.py
浏览文件 @
af7c0e7d
...
...
@@ -21,7 +21,7 @@ from paddlerec.core.utils import envs
from
paddlerec.core.utils
import
dataloader_instance
from
paddlerec.core.reader
import
SlotReader
from
paddlerec.core.trainer
import
EngineMode
from
paddlerec.core.utils.util
import
split_files
from
paddlerec.core.utils.util
import
split_files
,
check_filelist
__all__
=
[
"DatasetBase"
,
"DataLoader"
,
"QueueDataset"
]
...
...
@@ -119,10 +119,15 @@ class QueueDataset(DatasetBase):
dataset
.
set_pipe_command
(
pipe_cmd
)
train_data_path
=
envs
.
get_global_env
(
name
+
"data_path"
)
file_list
=
[
os
.
path
.
join
(
train_data_path
,
x
)
for
x
in
os
.
listdir
(
train_data_path
)
]
hidden_file_list
,
file_list
=
check_filelist
(
hidden_file_list
=
[],
data_file_list
=
[],
train_data_path
=
train_data_path
)
if
(
hidden_file_list
is
not
None
):
print
(
"Warning:please make sure there are no hidden files in the dataset folder and check these hidden files:{}"
.
format
(
hidden_file_list
))
file_list
.
sort
()
need_split_files
=
False
if
context
[
"engine"
]
==
EngineMode
.
LOCAL_CLUSTER
:
...
...
core/utils/dataloader_instance.py
浏览文件 @
af7c0e7d
...
...
@@ -19,7 +19,7 @@ from paddlerec.core.utils.envs import get_global_env
from
paddlerec.core.utils.envs
import
get_runtime_environ
from
paddlerec.core.reader
import
SlotReader
from
paddlerec.core.trainer
import
EngineMode
from
paddlerec.core.utils.util
import
split_files
from
paddlerec.core.utils.util
import
split_files
,
check_filelist
def
dataloader_by_name
(
readerclass
,
...
...
@@ -38,7 +38,13 @@ def dataloader_by_name(readerclass,
assert
package_base
is
not
None
data_path
=
os
.
path
.
join
(
package_base
,
data_path
.
split
(
"::"
)[
1
])
files
=
[
str
(
data_path
)
+
"/%s"
%
x
for
x
in
os
.
listdir
(
data_path
)]
hidden_file_list
,
files
=
check_filelist
(
hidden_file_list
=
[],
data_file_list
=
[],
train_data_path
=
data_path
)
if
(
hidden_file_list
is
not
None
):
print
(
"Warning:please make sure there are no hidden files in the dataset folder and check these hidden files:{}"
.
format
(
hidden_file_list
))
files
.
sort
()
need_split_files
=
False
...
...
@@ -54,8 +60,6 @@ def dataloader_by_name(readerclass,
files
=
split_files
(
files
,
context
[
"fleet"
].
worker_index
(),
context
[
"fleet"
].
worker_num
())
print
(
"file_list : {}"
.
format
(
files
))
reader
=
reader_class
(
yaml_file
)
reader
.
init
()
...
...
@@ -92,7 +96,13 @@ def slotdataloader_by_name(readerclass, dataset_name, yaml_file, context):
assert
package_base
is
not
None
data_path
=
os
.
path
.
join
(
package_base
,
data_path
.
split
(
"::"
)[
1
])
files
=
[
str
(
data_path
)
+
"/%s"
%
x
for
x
in
os
.
listdir
(
data_path
)]
hidden_file_list
,
files
=
check_filelist
(
hidden_file_list
=
[],
data_file_list
=
[],
train_data_path
=
data_path
)
if
(
hidden_file_list
is
not
None
):
print
(
"Warning:please make sure there are no hidden files in the dataset folder and check these hidden files:{}"
.
format
(
hidden_file_list
))
files
.
sort
()
need_split_files
=
False
...
...
@@ -156,7 +166,13 @@ def slotdataloader(readerclass, train, yaml_file, context):
assert
package_base
is
not
None
data_path
=
os
.
path
.
join
(
package_base
,
data_path
.
split
(
"::"
)[
1
])
files
=
[
str
(
data_path
)
+
"/%s"
%
x
for
x
in
os
.
listdir
(
data_path
)]
hidden_file_list
,
files
=
check_filelist
(
hidden_file_list
=
[],
data_file_list
=
[],
train_data_path
=
data_path
)
if
(
hidden_file_list
is
not
None
):
print
(
"Warning:please make sure there are no hidden files in the dataset folder and check these hidden files:{}"
.
format
(
hidden_file_list
))
files
.
sort
()
need_split_files
=
False
...
...
core/utils/util.py
浏览文件 @
af7c0e7d
...
...
@@ -201,6 +201,28 @@ def split_files(files, trainer_id, trainers):
return
trainer_files
[
trainer_id
]
def check_filelist(hidden_file_list, data_file_list, train_data_path):
    """Split everything under ``train_data_path`` into hidden and data entries.

    The directory tree is traversed once. An entry counts as hidden when its
    own name starts with a dot.

    Args:
        hidden_file_list: list that receives the full path of every hidden
            file and every hidden directory that is found. It is extended
            in place.
        data_file_list: list that receives the full path of every
            non-hidden file located in a non-hidden directory. It is
            extended in place.
        train_data_path: directory to scan.

    Returns:
        The tuple ``(hidden_file_list, data_file_list)``, i.e. the very same
        list objects that were passed in. Both lists are returned unchanged
        when ``train_data_path`` does not exist or contains nothing.
    """
    # os.walk already visits every sub-directory, so no manual recursion is
    # needed. followlinks=True keeps symlinked data directories in scope.
    for root, dirs, files in os.walk(train_data_path, followlinks=True):
        for file_name in files:
            # Join with the directory currently being visited (root), not
            # with the top-level path, so nested files get their real path.
            file_path = os.path.join(root, file_name)
            if file_name.startswith('.'):
                hidden_file_list.append(file_path)
            else:
                data_file_list.append(file_path)

        visible_dirs = []
        for dirs_name in dirs:
            if dirs_name.startswith('.'):
                # Report the hidden directory itself; its content is skipped.
                hidden_file_list.append(os.path.join(root, dirs_name))
            else:
                visible_dirs.append(dirs_name)
        # Editing dirs in place tells os.walk which directories to descend
        # into, which prunes the hidden ones from the traversal.
        dirs[:] = visible_dirs

    return hidden_file_list, data_file_list
class
CostPrinter
(
object
):
"""
For count cost time && print cost log
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录