diff --git a/pytorch_widedeep/datasets/__init__.py b/pytorch_widedeep/datasets/__init__.py index 9792d454b280897f7a86319c17bf2c816de09e7b..4c9b901616bc410b32fe7beffa3b0633b4d71f05 100644 --- a/pytorch_widedeep/datasets/__init__.py +++ b/pytorch_widedeep/datasets/__init__.py @@ -4,6 +4,7 @@ from ._base import ( load_birds, load_ecoli, load_bio_kdd04, + load_movielens100k, load_womens_ecommerce, load_california_housing, ) @@ -16,4 +17,5 @@ __all__ = [ "load_birds", "load_rf1", "load_womens_ecommerce", + "load_movielens100k", ] diff --git a/pytorch_widedeep/datasets/_base.py b/pytorch_widedeep/datasets/_base.py index 34e18cd17547e01ee1fa029072d0af26eb945459..235b5f9a84e8a42bd298e1598a3b2caa1cb03549 100644 --- a/pytorch_widedeep/datasets/_base.py +++ b/pytorch_widedeep/datasets/_base.py @@ -1,12 +1,14 @@ # dataframes are saved as parquet, pyarrow, brotli # pd.to_parquet(path=None, engine="auto", compression="brotli", index=False) # see related post: https://python.plainenglish.io/storing-pandas-98-faster-disk-reads-and-72-less-space-208e2e2be8bb +from typing import Tuple, Union from importlib import resources +import numpy as np import pandas as pd -def load_bio_kdd04(as_frame: bool = False): +def load_bio_kdd04(as_frame: bool = False) -> Union[np.ndarray, pd.DataFrame]: """Load and return the higly imbalanced binary classification Protein Homology Dataset from [KDD cup 2004](https://www.kdd.org/kdd-cup/view/kdd-cup-2004/Data). This datasets include only bio_train.dat part of the dataset @@ -39,7 +41,7 @@ def load_bio_kdd04(as_frame: bool = False): return df.to_numpy() -def load_adult(as_frame: bool = False): +def load_adult(as_frame: bool = False) -> Union[np.ndarray, pd.DataFrame]: """Load and return the higly imbalanced binary classification [adult income datatest](http://www.cs.toronto.edu/~delve/data/adult/desc.html). 
you may find detailed description [here](http://www.cs.toronto.edu/~delve/data/adult/adultDetail.html) """ @@ -55,7 +57,7 @@ def load_adult(as_frame: bool = False): return df.to_numpy() -def load_ecoli(as_frame: bool = False): +def load_ecoli(as_frame: bool = False) -> Union[np.ndarray, pd.DataFrame]: """Load and return the higly imbalanced multiclass classification e.coli dataset Dataset from [UCI Machine learning Repository](https://archive.ics.uci.edu/ml/datasets/ecoli). @@ -142,7 +144,7 @@ def load_ecoli(as_frame: bool = False): return df.to_numpy() -def load_california_housing(as_frame: bool = False): +def load_california_housing(as_frame: bool = False) -> Union[np.ndarray, pd.DataFrame]: """Load and return the higly imbalanced regression California housing dataset. Characteristics: @@ -190,7 +192,7 @@ def load_california_housing(as_frame: bool = False): return df.to_numpy() -def load_birds(as_frame: bool = False): +def load_birds(as_frame: bool = False) -> Union[np.ndarray, pd.DataFrame]: """Load and return the multi-label classification bird dataset. References @@ -216,7 +218,7 @@ def load_birds(as_frame: bool = False): return df.to_numpy() -def load_rf1(as_frame: bool = False): +def load_rf1(as_frame: bool = False) -> Union[np.ndarray, pd.DataFrame]: """Load and return the multi-target regression River Flow(RF1) dataset. Characterisctics: @@ -243,7 +245,7 @@ def load_rf1(as_frame: bool = False): return df.to_numpy() -def load_womens_ecommerce(as_frame: bool = False): +def load_womens_ecommerce(as_frame: bool = False) -> Union[np.ndarray, pd.DataFrame]: """ Context This is a Women’s Clothing E-Commerce dataset revolving around the reviews written by customers. 
def load_movielens100k(
    as_frame: bool = False,
) -> Union[
    Tuple[np.ndarray, np.ndarray, np.ndarray],
    Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame],
]:
    """Load the three tables that make up the MovieLens 100k dataset.

    SUMMARY & USAGE LICENSE:
    =============================================
    MovieLens data sets were collected by the GroupLens Research Project
    at the University of Minnesota.

    This data set consists of:
        * 100,000 ratings (1-5) from 943 users on 1682 movies.
        * Each user has rated at least 20 movies.
        * Simple demographic info for the users (age, gender, occupation,
          zip)

    The data was collected through the MovieLens web site
    (movielens.umn.edu) during the seven-month period from September 19th,
    1997 through April 22nd, 1998. This data has been cleaned up - users
    who had less than 20 ratings or did not have complete demographic
    information were removed from this data set. The individual tables are
    described in the Returns section below.

    Neither the University of Minnesota nor any of the researchers
    involved can guarantee the correctness of the data, its suitability
    for any particular purpose, or the validity of results based on the
    use of the data set. The data set may be used for any research
    purposes under the following conditions:

        * The user may not state or imply any endorsement from the
          University of Minnesota or the GroupLens Research Group.

        * The user must acknowledge the use of the data set in
          publications resulting from the use of the data set
          (see below for citation information).

        * The user may not redistribute the data without separate
          permission.

        * The user may not use this information for any commercial or
          revenue-bearing purposes without first obtaining permission
          from a faculty member of the GroupLens Research Project at the
          University of Minnesota.

    If you have any further questions or comments, please contact
    GroupLens.

    CITATION:
    =============================================
    To acknowledge use of the dataset in publications, please cite the
    following paper:

    F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets:
    History and Context. ACM Transactions on Interactive Intelligent
    Systems (TiiS) 5, 4, Article 19 (December 2015), 19 pages.
    DOI=http://dx.doi.org/10.1145/2827872

    Parameters
    ----------
    as_frame: bool, default = False
        If truthy, the three tables are returned as pandas DataFrames;
        otherwise each one is converted with ``DataFrame.to_numpy()``.

    Returns
    -------
    df_data: Union[np.ndarray, pd.DataFrame]
        The full u data set, 100000 ratings by 943 users on 1682 items.
        Each user has rated at least 20 movies. Users and items are
        numbered consecutively from 1. The data is randomly
        ordered. The time stamps are unix seconds since 1/1/1970 UTC
    df_items: Union[np.ndarray, pd.DataFrame]
        Information about the items (movies).
        The last 19 fields are the genres, a 1 indicates the movie
        is of that genre, a 0 indicates it is not; movies can be in
        several genres at once.
        The movie ids are the ones used in the df_data data set.
    df_users: Union[np.ndarray, pd.DataFrame]
        Demographic information about the users.
        The user ids are the ones used in the df_data data set.
    """
    # Order matters: it fixes the (data, items, users) layout of the result.
    parquet_names = (
        "MovieLens100k_data.parquet.brotli",
        "MovieLens100k_items.parquet.brotli",
        "MovieLens100k_users.parquet.brotli",
    )

    tables = []
    for parquet_name in parquet_names:
        # resources.path yields a real filesystem path to the packaged file
        # for the duration of the ``with`` block.
        with resources.path(
            "pytorch_widedeep.datasets.data", parquet_name
        ) as parquet_path:
            tables.append(pd.read_parquet(parquet_path))

    if not as_frame:
        return tuple(table.to_numpy() for table in tables)
    return tuple(tables)
@pytest.mark.parametrize("as_frame", [True, False])
def test_load_movielens100k(as_frame):
    """Check shape and container type of each MovieLens 100k table.

    Runs once with ``as_frame=True`` (pandas DataFrames expected) and once
    with ``as_frame=False`` (numpy arrays expected).
    """
    # The container type is the only expectation that depends on as_frame.
    container = pd.DataFrame if as_frame else np.ndarray

    ratings, items, users = load_movielens100k(as_frame=as_frame)

    observed = [(table.shape, type(table)) for table in (ratings, items, users)]
    expected = [
        ((100000, 4), container),
        ((1682, 24), container),
        ((943, 5), container),
    ]
    assert observed == expected