Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
PaddleRec
提交
75e238df
P
PaddleRec
项目概览
BaiXuePrincess
/
PaddleRec
与 Fork 源项目一致
Fork自
PaddlePaddle / PaddleRec
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleRec
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
75e238df
编写于
3月 05, 2020
作者:
X
xiexionghang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
commit kagle for paddle
上级
dcf92119
变更
1
隐藏空白更改
内联
并排
Showing
1 changed file
with
16 addition
and
8 deletion
+16
-8
kagle/kagle_dataset.py
kagle/kagle_dataset.py
+16
-8
未找到文件。
kagle/kagle_dataset.py
浏览文件 @
75e238df
"""
"""
import
copy
import
copy
import
yaml
import
yaml
import
time
import
time
...
@@ -13,7 +15,8 @@ class Dataset(object):
...
@@ -13,7 +15,8 @@ class Dataset(object):
"""
"""
__metaclass__
=
abc
.
ABCMeta
__metaclass__
=
abc
.
ABCMeta
def
__init__
(
self
,
config
):
def
__init__
(
self
,
config
):
""" """
"""
"""
self
.
_datasets
=
{}
self
.
_datasets
=
{}
self
.
_config
=
config
self
.
_config
=
config
...
@@ -28,19 +31,23 @@ class Dataset(object):
...
@@ -28,19 +31,23 @@ class Dataset(object):
@
abc
.
abstractmethod
@
abc
.
abstractmethod
def
load_dataset
(
self
,
params
):
def
load_dataset
(
self
,
params
):
""" """
"""
"""
pass
pass
@
abc
.
abstractmethod
@
abc
.
abstractmethod
def
preload_dataset
(
self
,
params
):
def
preload_dataset
(
self
,
params
):
""" """
"""
"""
pass
pass
@
abc
.
abstractmethod
@
abc
.
abstractmethod
def
release_dataset
(
self
,
params
):
def
release_dataset
(
self
,
params
):
""" """
"""
"""
pass
pass
class
TimeSplitDataset
(
Dataset
):
class
TimeSplitDataset
(
Dataset
):
"""
"""
Dataset with time split dir. root_path/$DAY/$HOUR
Dataset with time split dir. root_path/$DAY/$HOUR
...
@@ -52,7 +59,7 @@ class TimeSplitDataset(Dataset):
...
@@ -52,7 +59,7 @@ class TimeSplitDataset(Dataset):
Dataset
.
__init__
(
self
,
config
)
Dataset
.
__init__
(
self
,
config
)
if
'data_donefile'
not
in
config
or
config
[
'data_donefile'
]
is
None
:
if
'data_donefile'
not
in
config
or
config
[
'data_donefile'
]
is
None
:
config
[
'data_donefile'
]
=
config
[
'data_path'
]
+
"/to.hadoop.done"
config
[
'data_donefile'
]
=
config
[
'data_path'
]
+
"/to.hadoop.done"
self
.
_path_generator
=
kagle_util
.
PathGenerator
({
'templates'
:
[
self
.
_path_generator
=
kagle_util
.
PathGenerator
({
'templates'
:
[
{
'name'
:
'data_path'
,
'template'
:
config
[
'data_path'
]},
{
'name'
:
'data_path'
,
'template'
:
config
[
'data_path'
]},
{
'name'
:
'donefile_path'
,
'template'
:
config
[
'data_donefile'
]}
{
'name'
:
'donefile_path'
,
'template'
:
config
[
'data_donefile'
]}
]})
]})
...
@@ -72,7 +79,7 @@ class TimeSplitDataset(Dataset):
...
@@ -72,7 +79,7 @@ class TimeSplitDataset(Dataset):
skip_mins
=
self
.
_split_interval
-
(
mins_of_day
%
self
.
_split_interval
)
skip_mins
=
self
.
_split_interval
-
(
mins_of_day
%
self
.
_split_interval
)
data_time
=
data_time
+
datetime
.
timedelta
(
minutes
=
skip_mins
)
data_time
=
data_time
+
datetime
.
timedelta
(
minutes
=
skip_mins
)
time_window_mins
=
time_window_mins
-
skip_mins
time_window_mins
=
time_window_mins
-
skip_mins
return
data_time
,
time_window_mins
return
data_time
,
time_window_mins
def
check_ready
(
self
,
daytime_str
,
time_window_mins
):
def
check_ready
(
self
,
daytime_str
,
time_window_mins
):
"""
"""
...
@@ -84,7 +91,7 @@ class TimeSplitDataset(Dataset):
...
@@ -84,7 +91,7 @@ class TimeSplitDataset(Dataset):
True/False
True/False
"""
"""
is_ready
=
True
is_ready
=
True
data_time
,
windows_mins
=
self
.
_format_data_time
(
daytime_str
,
time_window_mins
)
data_time
,
windows_mins
=
self
.
_format_data_time
(
daytime_str
,
time_window_mins
)
while
time_window_mins
>
0
:
while
time_window_mins
>
0
:
file_path
=
self
.
_path_generator
.
generate_path
(
'donefile_path'
,
{
'time_format'
:
data_time
})
file_path
=
self
.
_path_generator
.
generate_path
(
'donefile_path'
,
{
'time_format'
:
data_time
})
if
not
self
.
_data_file_handler
.
is_exist
(
file_path
):
if
not
self
.
_data_file_handler
.
is_exist
(
file_path
):
...
@@ -106,7 +113,7 @@ class TimeSplitDataset(Dataset):
...
@@ -106,7 +113,7 @@ class TimeSplitDataset(Dataset):
list, data_shard[node_idx]
list, data_shard[node_idx]
"""
"""
data_file_list
=
[]
data_file_list
=
[]
data_time
,
windows_mins
=
self
.
_format_data_time
(
daytime_str
,
time_window_mins
)
data_time
,
windows_mins
=
self
.
_format_data_time
(
daytime_str
,
time_window_mins
)
while
time_window_mins
>
0
:
while
time_window_mins
>
0
:
file_path
=
self
.
_path_generator
.
generate_path
(
'data_path'
,
{
'time_format'
:
data_time
})
file_path
=
self
.
_path_generator
.
generate_path
(
'data_path'
,
{
'time_format'
:
data_time
})
sub_file_list
=
self
.
_data_file_handler
.
ls
(
file_path
)
sub_file_list
=
self
.
_data_file_handler
.
ls
(
file_path
)
...
@@ -120,6 +127,7 @@ class TimeSplitDataset(Dataset):
...
@@ -120,6 +127,7 @@ class TimeSplitDataset(Dataset):
data_time
=
data_time
+
datetime
.
timedelta
(
minutes
=
self
.
_split_interval
)
data_time
=
data_time
+
datetime
.
timedelta
(
minutes
=
self
.
_split_interval
)
return
data_file_list
return
data_file_list
class
FluidTimeSplitDataset
(
TimeSplitDataset
):
class
FluidTimeSplitDataset
(
TimeSplitDataset
):
"""
"""
A Dataset with time split for PaddleFluid
A Dataset with time split for PaddleFluid
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录