Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
PaddleRec
提交
dcf92119
P
PaddleRec
项目概览
BaiXuePrincess
/
PaddleRec
与 Fork 源项目一致
Fork自
PaddlePaddle / PaddleRec
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleRec
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
dcf92119
编写于
3月 05, 2020
作者:
X
xiexionghang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
commit kagle for paddle
上级
71fd9646
变更
1
显示空白变更内容
内联
并排
Showing
1 changed file
with
51 addition
and
7 deletion
+51
-7
kagle/kagle_dataset.py
kagle/kagle_dataset.py
+51
-7
未找到文件。
kagle/kagle_dataset.py
浏览文件 @
dcf92119
...
...
@@ -6,32 +6,49 @@ import kagle_fs
import
kagle_util
import
kagle_layer
import
paddle.fluid
as
fluid
from
abc
import
ABCMeta
,
abstractmethod
import
abc
class
Dataset
(
object
):
__metaclass__
=
ABCMeta
"""
"""
__metaclass__
=
abc
.
ABCMeta
def
__init__
(
self
,
config
):
""" """
self
.
_datasets
=
{}
self
.
_config
=
config
@
abstractmethod
@
ab
c
.
ab
stractmethod
def
check_ready
(
self
,
params
):
"""
check data ready or not
Return:
True/False
"""
pass
@
abstractmethod
@
ab
c
.
ab
stractmethod
def
load_dataset
(
self
,
params
):
""" """
pass
@
abstractmethod
@
ab
c
.
ab
stractmethod
def
preload_dataset
(
self
,
params
):
""" """
pass
@
abstractmethod
@
ab
c
.
ab
stractmethod
def
release_dataset
(
self
,
params
):
""" """
pass
class
TimeSplitDataset
(
Dataset
):
"""
Dataset with time split dir. root_path/$DAY/$HOUR
"""
def
__init__
(
self
,
config
):
"""
init data root_path, time_split_interval, data_path_format
"""
Dataset
.
__init__
(
self
,
config
)
if
'data_donefile'
not
in
config
or
config
[
'data_donefile'
]
is
None
:
config
[
'data_donefile'
]
=
config
[
'data_path'
]
+
"/to.hadoop.done"
...
...
@@ -43,6 +60,7 @@ class TimeSplitDataset(Dataset):
self
.
_data_file_handler
=
kagle_fs
.
FileHandler
(
config
)
def
_format_data_time
(
self
,
daytime_str
,
time_window_mins
):
""" """
data_time
=
kagle_util
.
make_datetime
(
daytime_str
)
mins_of_day
=
data_time
.
hour
*
60
+
data_time
.
minute
begin_stage
=
mins_of_day
/
self
.
_split_interval
...
...
@@ -57,6 +75,14 @@ class TimeSplitDataset(Dataset):
return
data_time
,
time_window_mins
def
check_ready
(
self
,
daytime_str
,
time_window_mins
):
"""
data in [daytime_str, daytime_str + time_window_mins] is ready or not
Args:
daytime_str: datetime with str format, such as "202001122200" meanings "2020-01-12 22:00"
time_window_mins(int): from daytime_str to daytime_str + time_window_mins
Return:
True/False
"""
is_ready
=
True
data_time
,
windows_mins
=
self
.
_format_data_time
(
daytime_str
,
time_window_mins
)
while
time_window_mins
>
0
:
...
...
@@ -69,6 +95,16 @@ class TimeSplitDataset(Dataset):
return
is_ready
def
get_file_list
(
self
,
daytime_str
,
time_window_mins
,
node_num
=
1
,
node_idx
=
0
):
"""
data in [daytime_str, daytime_str + time_window_mins], random shard to node_num, return shard[node_idx]
Args:
daytime_str: datetime with str format, such as "202001122200" meanings "2020-01-12 22:00"
time_window_mins(int): from daytime_str to daytime_str + time_window_mins
node_num(int): data split shard num
node_idx(int): shard_idx
Return:
list, data_shard[node_idx]
"""
data_file_list
=
[]
data_time
,
windows_mins
=
self
.
_format_data_time
(
daytime_str
,
time_window_mins
)
while
time_window_mins
>
0
:
...
...
@@ -85,10 +121,15 @@ class TimeSplitDataset(Dataset):
return
data_file_list
class
FluidTimeSplitDataset
(
TimeSplitDataset
):
"""
A Dataset with time split for PaddleFluid
"""
def
__init__
(
self
,
config
):
""" """
TimeSplitDataset
.
__init__
(
self
,
config
)
def
_alloc_dataset
(
self
,
file_list
):
""" """
dataset
=
fluid
.
DatasetFactory
().
create_dataset
(
self
.
_config
[
'dataset_type'
])
dataset
.
set_batch_size
(
self
.
_config
[
'batch_size'
])
dataset
.
set_thread
(
self
.
_config
[
'load_thread'
])
...
...
@@ -101,6 +142,7 @@ class FluidTimeSplitDataset(TimeSplitDataset):
return
dataset
def
load_dataset
(
self
,
params
):
""" """
begin_time
=
params
[
'begin_time'
]
windown_min
=
params
[
'time_window_min'
]
if
begin_time
not
in
self
.
_datasets
:
...
...
@@ -115,6 +157,7 @@ class FluidTimeSplitDataset(TimeSplitDataset):
return
self
.
_datasets
[
begin_time
]
def
preload_dataset
(
self
,
params
):
""" """
begin_time
=
params
[
'begin_time'
]
windown_min
=
params
[
'time_window_min'
]
if
begin_time
not
in
self
.
_datasets
:
...
...
@@ -126,6 +169,7 @@ class FluidTimeSplitDataset(TimeSplitDataset):
return
False
def
release_dataset
(
self
,
params
):
""" """
begin_time
=
params
[
'begin_time'
]
windown_min
=
params
[
'time_window_min'
]
if
begin_time
in
self
.
_datasets
:
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录