Commit d30203a0 authored by Javier

Fixed a bug related to the padding idx and the fastai transforms. Also adjusted the scripts to show how one can use the 'load_movielens100k' function in the library.
Parent b6a10336
@@ -4,34 +4,17 @@
# https://github.com/jrzaurin/pytorch-widedeep/issues/133 In this script we
# simply prepare the data that will later be used for a custom Wide and Deep
# model and for Wide and Deep models created using this library
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
raw_data_path = Path("~/ml_projects/wide_deep_learning_for_recsys/ml-100k")
from pytorch_widedeep.datasets import load_movielens100k
save_path = Path("prepared_data")
if not save_path.exists():
save_path.mkdir(parents=True, exist_ok=True)
data, users, items = load_movielens100k(as_frame=True)
# Load the Ratings/Interaction (triplets (user, item, rating) plus timestamp)
data = pd.read_csv(raw_data_path / "u.data", sep="\t", header=None)
data.columns = ["user_id", "movie_id", "rating", "timestamp"]
# Load the User features
users = pd.read_csv(raw_data_path / "u.user", sep="|", encoding="latin-1", header=None)
users.columns = ["user_id", "age", "gender", "occupation", "zip_code"]
# Load the Item features
items = pd.read_csv(raw_data_path / "u.item", sep="|", encoding="latin-1", header=None)
items.columns = [
"movie_id",
"movie_title",
"release_date",
"video_release_date",
"IMDb_URL",
# Alternatively, as specified in the docs: 'The last 19 fields are the genres' so:
# list_of_genres = items.columns.tolist()[-19:]
list_of_genres = [
"unknown",
"Action",
"Adventure",
@@ -53,10 +36,6 @@ items.columns = [
"Western",
]
list_of_genres = pd.read_csv(
raw_data_path / "u.genre", sep="|", header=None, usecols=[0]
)[0].tolist()
list_of_genres
# adding a column with the number of movies watched per user
dataset = data.sort_values(["user_id", "timestamp"]).reset_index(drop=True)
@@ -95,9 +74,6 @@ dataset[list_of_genres] = dataset[list_of_genres].apply(
lambda x: x / dataset["num_watched"]
)
# Adding user features
dataset = dataset.merge(users, on="user_id", how="left")
# Again, we use the same settings as those in the Kaggle notebook,
# but 'COLD_START_TRESH' is pretty aggressive
COLD_START_TRESH = 5
@@ -118,6 +94,10 @@ df_train = train_data.drop(cols_to_drop, axis=1)
df_valid = valid_data.drop(cols_to_drop, axis=1)
df_test = test_data.drop(cols_to_drop, axis=1)
save_path = Path("prepared_data")
if not save_path.exists():
save_path.mkdir(parents=True, exist_ok=True)
df_train.to_pickle(save_path / "df_train.pkl")
df_valid.to_pickle(save_path / "df_valid.pkl")
df_test.to_pickle(save_path / "df_test.pkl")
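Not part of the commit, but as an illustration: once the script above has run, the prepared datasets could be read back for the modeling scripts along these lines (assuming the same 'prepared_data' directory):

import pandas as pd
from pathlib import Path

save_path = Path("prepared_data")
# read back the pickles written by the preparation script above
df_train = pd.read_pickle(save_path / "df_train.pkl")
df_valid = pd.read_pickle(save_path / "df_valid.pkl")
df_test = pd.read_pickle(save_path / "df_test.pkl")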
@@ -378,6 +378,6 @@ def load_movielens100k(
df_users = pd.read_parquet(fpath)
if as_frame:
return df_data, df_items, df_users
return df_data, df_users, df_items
else:
return df_data.to_numpy(), df_items.to_numpy(), df_users.to_numpy()
return df_data.to_numpy(), df_users.to_numpy(), df_items.to_numpy()
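As a quick reference (not part of the diff), a minimal sketch of the corrected call; the shapes in the comments come from the tests updated in this commit:

from pytorch_widedeep.datasets import load_movielens100k

# after this fix the return order matches the names: (data, users, items)
data, users, items = load_movielens100k(as_frame=True)
# per the updated tests: data (100000, 4), users (943, 5), items (1682, 24)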
@@ -9,6 +9,7 @@ from pytorch_widedeep.utils.text_utils import (
pad_sequences,
build_embeddings_matrix,
)
from pytorch_widedeep.utils.general_utils import Alias
from pytorch_widedeep.utils.fastai_transforms import Vocab
from pytorch_widedeep.preprocessing.base_preprocessor import (
BasePreprocessor,
@@ -16,7 +17,6 @@ from pytorch_widedeep.preprocessing.base_preprocessor import (
)
# TODO: Add alias to already_processed
class TextPreprocessor(BasePreprocessor):
r"""Preprocessor to prepare the ``deeptext`` input dataset
@@ -36,8 +36,12 @@ class TextPreprocessor(BasePreprocessor):
pad_idx: int, default = 1
padding index. Fastai's Tokenizer leaves 0 for the 'unknown' token.
already_processed: bool, Optional, default = False
Boolean indicating if the text is already processed and we simply
want to tokenize it
Boolean indicating if the sequence of elements is already processed or
prepared. If this is the case, this Preprocessor will simply tokenize
and pad the sequence.
Param aliases: `not_text`. <br/>
word_vectors_path: str, Optional
Path to the pretrained word vectors
n_cpus: int, Optional, default = None
@@ -70,6 +74,7 @@ class TextPreprocessor(BasePreprocessor):
array([[ 1, 1, 9, 16, 17, 18, 11, 0, 0, 13]], dtype=int32)
"""
@Alias("already_processed", "not_text")
def __init__(
self,
text_col: str,
@@ -112,7 +117,10 @@
texts = df[self.text_col].tolist()
tokens = get_texts(texts, self.already_processed, self.n_cpus)
self.vocab = Vocab.create(
tokens, max_vocab=self.max_vocab, min_freq=self.min_freq
tokens,
max_vocab=self.max_vocab,
min_freq=self.min_freq,
pad_idx=self.pad_idx,
)
if self.verbose:
print("The vocabulary contains {} tokens".format(len(self.vocab.stoi)))
......
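A minimal usage sketch for the new pad_idx argument and the 'not_text' alias introduced above; the DataFrame and the 'review' column are made up for illustration:

import pandas as pd
from pytorch_widedeep.preprocessing import TextPreprocessor

df = pd.DataFrame({"review": ["this movie was really good", "this movie was really bad"]})
# pad_idx is now forwarded to Vocab.create during fit
text_preprocessor = TextPreprocessor(text_col="review", max_vocab=100, min_freq=1, pad_idx=1)
X_text = text_preprocessor.fit_transform(df)
# via the @Alias decorator, already_processed=True can also be passed as not_text=True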
@@ -391,7 +391,13 @@ class Vocab:
pickle.dump(self.itos, open(path, "wb"))
@classmethod
def create(cls, tokens: Tokens, max_vocab: int, min_freq: int) -> "Vocab":
def create(
cls,
tokens: Tokens,
max_vocab: int,
min_freq: int,
pad_idx: Optional[int] = None,
) -> "Vocab":
r"""Create a vocabulary object from a set of tokens.
Parameters
@@ -402,9 +408,9 @@
strings (e.g. list of tokenized sentences)
max_vocab: int
maximum vocabulary size
min_freq: int
minimum frequency that a token has to appear to be part of the
vocabulary
pad_idx: int, Optional, default = None
padding index. If None, Fastai's Tokenizer reserves 0 for the
'unknown' token and the padding index defaults to 1.
Examples
--------
@@ -427,12 +433,18 @@
Vocab
An instance of a `Vocab` object
"""
freq = Counter(p for o in tokens for p in o)
itos = [o for o, c in freq.most_common(max_vocab) if c >= min_freq]
for o in reversed(defaults.text_spec_tok):
if o in itos:
itos.remove(o)
itos.insert(0, o)
if pad_idx is not None:
itos.remove(PAD)
itos.insert(pad_idx, PAD)
itos = itos[:max_vocab]
if (
len(itos) < max_vocab
......
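Similarly, a sketch of Vocab.create with the new pad_idx parameter, assuming Fastai-style special tokens where the padding token is 'xxpad':

from pytorch_widedeep.utils.fastai_transforms import Vocab

tokens = [["xxbos", "the", "film", "was", "good"], ["xxbos", "the", "film", "was", "bad"]]
vocab = Vocab.create(tokens, max_vocab=100, min_freq=1, pad_idx=1)
assert vocab.itos[1] == "xxpad"  # PAD is moved to the requested index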
@@ -8,9 +8,9 @@ from pytorch_widedeep.datasets import (
load_birds,
load_ecoli,
load_bio_kdd04,
load_movielens100k,
load_womens_ecommerce,
load_california_housing,
load_movielens100k,
)
@@ -127,19 +127,19 @@ def test_load_california_housing(as_frame):
],
)
def test_load_movielens100k(as_frame):
df_data, df_items, df_users = load_movielens100k(as_frame=as_frame)
df_data, df_users, df_items = load_movielens100k(as_frame=as_frame)
if as_frame:
assert (
df_data.shape,
df_items.shape,
df_users.shape,
df_items.shape,
type(df_data),
type(df_items),
type(df_users),
type(df_items),
) == (
(100000, 4),
(1682, 24),
(943, 5),
(1682, 24),
pd.DataFrame,
pd.DataFrame,
pd.DataFrame,
@@ -147,15 +147,15 @@ def test_load_movielens100k(as_frame):
else:
assert (
df_data.shape,
df_items.shape,
df_users.shape,
df_items.shape,
type(df_data),
type(df_items),
type(df_users),
type(df_items),
) == (
(100000, 4),
(1682, 24),
(943, 5),
(1682, 24),
np.ndarray,
np.ndarray,
np.ndarray,
......