Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Greenplum
Pytorch Widedeep
提交
8813ceea
P
Pytorch Widedeep
项目概览
Greenplum
/
Pytorch Widedeep
大约 1 年 前同步成功
通知
9
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Pytorch Widedeep
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
8813ceea
编写于
7月 30, 2023
作者:
P
Pavol Mulinka
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
added movielens dataset and tests
上级
04e9d38b
变更
6
隐藏空白更改
内联
并排
Showing
6 changed files
with
155 additions
and
7 deletions
+155
-7
pytorch_widedeep/datasets/__init__.py
pytorch_widedeep/datasets/__init__.py
+2
-0
pytorch_widedeep/datasets/_base.py
pytorch_widedeep/datasets/_base.py
+109
-7
pytorch_widedeep/datasets/data/MovieLens100k_data.parquet.brotli
..._widedeep/datasets/data/MovieLens100k_data.parquet.brotli
+0
-0
pytorch_widedeep/datasets/data/MovieLens100k_items.parquet.brotli
...widedeep/datasets/data/MovieLens100k_items.parquet.brotli
+0
-0
pytorch_widedeep/datasets/data/MovieLens100k_users.parquet.brotli
...widedeep/datasets/data/MovieLens100k_users.parquet.brotli
+0
-0
tests/test_datasets/test_datasets.py
tests/test_datasets/test_datasets.py
+44
-0
未找到文件。
pytorch_widedeep/datasets/__init__.py
浏览文件 @
8813ceea
...
...
@@ -4,6 +4,7 @@ from ._base import (
load_birds
,
load_ecoli
,
load_bio_kdd04
,
load_movielens100k
,
load_womens_ecommerce
,
load_california_housing
,
)
...
...
@@ -16,4 +17,5 @@ __all__ = [
"load_birds"
,
"load_rf1"
,
"load_womens_ecommerce"
,
"load_movielens100k"
,
]
pytorch_widedeep/datasets/_base.py
浏览文件 @
8813ceea
# dataframes are saved as parquet, pyarrow, brotli
# pd.to_parquet(path=None, engine="auto", compression="brotli", index=False)
# see related post: https://python.plainenglish.io/storing-pandas-98-faster-disk-reads-and-72-less-space-208e2e2be8bb
from
typing
import
Tuple
,
Union
from
importlib
import
resources
import
numpy
as
np
import
pandas
as
pd
def
load_bio_kdd04
(
as_frame
:
bool
=
False
):
def
load_bio_kdd04
(
as_frame
:
bool
=
False
)
->
Union
[
np
.
ndarray
,
pd
.
DataFrame
]
:
"""Load and return the higly imbalanced binary classification Protein Homology
Dataset from [KDD cup 2004](https://www.kdd.org/kdd-cup/view/kdd-cup-2004/Data).
This datasets include only bio_train.dat part of the dataset
...
...
@@ -39,7 +41,7 @@ def load_bio_kdd04(as_frame: bool = False):
return
df
.
to_numpy
()
def
load_adult
(
as_frame
:
bool
=
False
):
def
load_adult
(
as_frame
:
bool
=
False
)
->
Union
[
np
.
ndarray
,
pd
.
DataFrame
]
:
"""Load and return the higly imbalanced binary classification [adult income datatest](http://www.cs.toronto.edu/~delve/data/adult/desc.html).
you may find detailed description [here](http://www.cs.toronto.edu/~delve/data/adult/adultDetail.html)
"""
...
...
@@ -55,7 +57,7 @@ def load_adult(as_frame: bool = False):
return
df
.
to_numpy
()
def
load_ecoli
(
as_frame
:
bool
=
False
):
def
load_ecoli
(
as_frame
:
bool
=
False
)
->
Union
[
np
.
ndarray
,
pd
.
DataFrame
]
:
"""Load and return the higly imbalanced multiclass classification e.coli dataset
Dataset from [UCI Machine learning Repository](https://archive.ics.uci.edu/ml/datasets/ecoli).
...
...
@@ -142,7 +144,7 @@ def load_ecoli(as_frame: bool = False):
return
df
.
to_numpy
()
def
load_california_housing
(
as_frame
:
bool
=
False
):
def
load_california_housing
(
as_frame
:
bool
=
False
)
->
Union
[
np
.
ndarray
,
pd
.
DataFrame
]
:
"""Load and return the higly imbalanced regression California housing dataset.
Characteristics:
...
...
@@ -190,7 +192,7 @@ def load_california_housing(as_frame: bool = False):
return
df
.
to_numpy
()
def
load_birds
(
as_frame
:
bool
=
False
):
def
load_birds
(
as_frame
:
bool
=
False
)
->
Union
[
np
.
ndarray
,
pd
.
DataFrame
]
:
"""Load and return the multi-label classification bird dataset.
References
...
...
@@ -216,7 +218,7 @@ def load_birds(as_frame: bool = False):
return
df
.
to_numpy
()
def
load_rf1
(
as_frame
:
bool
=
False
):
def
load_rf1
(
as_frame
:
bool
=
False
)
->
Union
[
np
.
ndarray
,
pd
.
DataFrame
]
:
"""Load and return the multi-target regression River Flow(RF1) dataset.
Characterisctics:
...
...
@@ -243,7 +245,7 @@ def load_rf1(as_frame: bool = False):
return
df
.
to_numpy
()
def
load_womens_ecommerce
(
as_frame
:
bool
=
False
):
def
load_womens_ecommerce
(
as_frame
:
bool
=
False
)
->
Union
[
np
.
ndarray
,
pd
.
DataFrame
]
:
"""
Context
This is a Women’s Clothing E-Commerce dataset revolving around the reviews written by customers.
...
...
@@ -279,3 +281,103 @@ def load_womens_ecommerce(as_frame: bool = False):
return
df
else
:
return
df
.
to_numpy
()
def load_movielens100k(
    as_frame: bool = False,
) -> Union[
    Tuple[np.ndarray, np.ndarray, np.ndarray],
    Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame],
]:
    """Load and return the MovieLens 100k dataset as three separate tables.

    The data was collected by the GroupLens Research Project at the
    University of Minnesota through the MovieLens web site
    (movielens.umn.edu) from September 19th, 1997 through April 22nd,
    1998, and consists of 100,000 ratings (1-5) from 943 users on 1682
    movies; every user has rated at least 20 movies, and simple
    demographic info (age, gender, occupation, zip) is included for each
    user. The dataset may be used for research purposes only, under the
    GroupLens conditions: no implied endorsement, no redistribution, no
    commercial use without permission, and the dataset must be
    acknowledged in resulting publications. Questions and comments go to
    GroupLens <grouplens-info@cs.umn.edu>.

    To acknowledge use of the dataset in publications, please cite:

    F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens
    Datasets: History and Context. ACM Transactions on Interactive
    Intelligent Systems (TiiS) 5, 4, Article 19 (December 2015),
    19 pages. DOI=http://dx.doi.org/10.1145/2827872

    Parameters
    ----------
    as_frame: bool, default = False
        If ``True`` return the three tables as ``pd.DataFrame``,
        otherwise as ``np.ndarray``.

    Returns
    -------
    df_data: Union[np.ndarray, pd.DataFrame]
        The full u data set, 100000 ratings by 943 users on 1682 items.
        Each user has rated at least 20 movies. Users and items are
        numbered consecutively from 1. The data is randomly
        ordered. The time stamps are unix seconds since 1/1/1970 UTC
    df_items: Union[np.ndarray, pd.DataFrame]
        Information about the items (movies).
        The last 19 fields are the genres, a 1 indicates the movie
        is of that genre, a 0 indicates it is not; movies can be in
        several genres at once.
        The movie ids are the ones used in the df_data data set.
    df_users: Union[np.ndarray, pd.DataFrame]
        Demographic information about the users.
        The user ids are the ones used in the df_data data set.
    """

    def _read(fname: str) -> pd.DataFrame:
        # Resolve the brotli-compressed parquet file shipped inside the
        # package data directory and load it. Previously this `with`
        # block was repeated verbatim three times, once per file.
        with resources.path(
            "pytorch_widedeep.datasets.data",
            fname,
        ) as fpath:
            return pd.read_parquet(fpath)

    df_data = _read("MovieLens100k_data.parquet.brotli")
    df_items = _read("MovieLens100k_items.parquet.brotli")
    df_users = _read("MovieLens100k_users.parquet.brotli")

    if as_frame:
        return df_data, df_items, df_users
    else:
        return df_data.to_numpy(), df_items.to_numpy(), df_users.to_numpy()
pytorch_widedeep/datasets/data/MovieLens100k_data.parquet.brotli
0 → 100644
浏览文件 @
8813ceea
文件已添加
pytorch_widedeep/datasets/data/MovieLens100k_items.parquet.brotli
0 → 100644
浏览文件 @
8813ceea
文件已添加
pytorch_widedeep/datasets/data/MovieLens100k_users.parquet.brotli
0 → 100644
浏览文件 @
8813ceea
文件已添加
tests/test_datasets/test_datasets.py
浏览文件 @
8813ceea
...
...
@@ -10,6 +10,7 @@ from pytorch_widedeep.datasets import (
load_bio_kdd04
,
load_womens_ecommerce
,
load_california_housing
,
load_movielens100k
,
)
...
...
@@ -116,3 +117,46 @@ def test_load_california_housing(as_frame):
assert
(
df
.
shape
,
type
(
df
))
==
((
20640
,
9
),
pd
.
DataFrame
)
else
:
assert
(
df
.
shape
,
type
(
df
))
==
((
20640
,
9
),
np
.
ndarray
)
@pytest.mark.parametrize(
    "as_frame",
    [
        (True),
        (False),
    ],
)
def test_load_movielens100k(as_frame):
    """Check shapes and container types of the three MovieLens 100k tables."""
    df_data, df_items, df_users = load_movielens100k(as_frame=as_frame)
    # The two branches previously duplicated every shape check; only the
    # expected container type actually depends on `as_frame`.
    expected_type = pd.DataFrame if as_frame else np.ndarray
    # 100000 ratings, 1682 movies (5 id/title cols + 19 genre flags),
    # 943 users with 4 demographic fields plus the id.
    assert df_data.shape == (100000, 4)
    assert df_items.shape == (1682, 24)
    assert df_users.shape == (943, 5)
    for df in (df_data, df_items, df_users):
        assert type(df) == expected_type
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录