Commit d30203a0 authored by Javier

Fixed a bug related to the padding idx and the fastai transforms. Also adjusted the scripts to show how one can use the 'load_movielens100k' function in the library.
Parent b6a10336
@@ -4,34 +4,17 @@
# https://github.com/jrzaurin/pytorch-widedeep/issues/133 In this script we
# simply prepare the data that will later be used for a custom Wide and Deep
# model and for Wide and Deep models created using this library
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
raw_data_path = Path("~/ml_projects/wide_deep_learning_for_recsys/ml-100k")
from pytorch_widedeep.datasets import load_movielens100k
save_path = Path("prepared_data")
if not save_path.exists():
save_path.mkdir(parents=True, exist_ok=True)
data, users, items = load_movielens100k(as_frame=True)
# Load the Ratings/Interaction (triplets (user, item, rating) plus timestamp)
data = pd.read_csv(raw_data_path / "u.data", sep="\t", header=None)
data.columns = ["user_id", "movie_id", "rating", "timestamp"]
# Load the User features
users = pd.read_csv(raw_data_path / "u.user", sep="|", encoding="latin-1", header=None)
users.columns = ["user_id", "age", "gender", "occupation", "zip_code"]
# Load the Item features
items = pd.read_csv(raw_data_path / "u.item", sep="|", encoding="latin-1", header=None)
items.columns = [
"movie_id",
"movie_title",
"release_date",
"video_release_date",
"IMDb_URL",
# Alternatively, as specified in the docs: 'The last 19 fields are the genres' so:
# list_of_genres = items.columns.tolist()[-19:]
list_of_genres = [
"unknown",
"Action",
"Adventure",
@@ -53,10 +36,6 @@ items.columns = [
"Western",
]
list_of_genres = pd.read_csv(
raw_data_path / "u.genre", sep="|", header=None, usecols=[0]
)[0].tolist()
list_of_genres
# adding a column with the number of movies watched per user
dataset = data.sort_values(["user_id", "timestamp"]).reset_index(drop=True)
@@ -95,9 +74,6 @@ dataset[list_of_genres] = dataset[list_of_genres].apply(
lambda x: x / dataset["num_watched"]
)
# Adding user features
dataset = dataset.merge(users, on="user_id", how="left")
# Again, we use the same settings as those in the Kaggle notebook,
# but 'COLD_START_TRESH' is pretty aggressive
COLD_START_TRESH = 5
@@ -118,6 +94,10 @@ df_train = train_data.drop(cols_to_drop, axis=1)
df_valid = valid_data.drop(cols_to_drop, axis=1)
df_test = test_data.drop(cols_to_drop, axis=1)
save_path = Path("prepared_data")
if not save_path.exists():
save_path.mkdir(parents=True, exist_ok=True)
df_train.to_pickle(save_path / "df_train.pkl")
df_valid.to_pickle(save_path / "df_valid.pkl")
df_test.to_pickle(save_path / "df_test.pkl")
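Not part of the commit, but as an illustration: once the script above has run, the prepared datasets could be read back for the modeling scripts along these lines (assuming the same 'prepared_data' directory):

import pandas as pd
from pathlib import Path

save_path = Path("prepared_data")
# read back the pickles written by the preparation script above
df_train = pd.read_pickle(save_path / "df_train.pkl")
df_valid = pd.read_pickle(save_path / "df_valid.pkl")
df_test = pd.read_pickle(save_path / "df_test.pkl")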
@@ -378,6 +378,6 @@ def load_movielens100k(
df_users = pd.read_parquet(fpath)
if as_frame:
return df_data, df_items, df_users
return df_data, df_users, df_items
else:
return df_data.to_numpy(), df_items.to_numpy(), df_users.to_numpy()
return df_data.to_numpy(), df_users.to_numpy(), df_items.to_numpy()
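As a quick reference (not part of the diff), a minimal sketch of the corrected call; the shapes in the comments come from the tests updated in this commit:

from pytorch_widedeep.datasets import load_movielens100k

# after this fix the return order matches the names: (data, users, items)
data, users, items = load_movielens100k(as_frame=True)
# per the updated tests: data (100000, 4), users (943, 5), items (1682, 24)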
@@ -9,6 +9,7 @@ from pytorch_widedeep.utils.text_utils import (
pad_sequences,
build_embeddings_matrix,
)
from pytorch_widedeep.utils.general_utils import Alias
from pytorch_widedeep.utils.fastai_transforms import Vocab
from pytorch_widedeep.preprocessing.base_preprocessor import (
BasePreprocessor,
@@ -16,7 +17,6 @@ from pytorch_widedeep.preprocessing.base_preprocessor import (
)
# TODO: Add alias to already_processed
class TextPreprocessor(BasePreprocessor):
r"""Preprocessor to prepare the ``deeptext`` input dataset
@@ -36,8 +36,12 @@ class TextPreprocessor(BasePreprocessor):
pad_idx: int, default = 1
padding index. Fastai's Tokenizer leaves 0 for the 'unknown' token.
already_processed: bool, Optional, default = False
Boolean indicating if the text is already processed and we simply
want to tokenize it
Boolean indicating if the sequence of elements is already processed or
prepared. If this is the case, this Preprocessor will simply tokenize
and pad the sequence.
Param aliases: `not_text`. <br/>
word_vectors_path: str, Optional
Path to the pretrained word vectors
n_cpus: int, Optional, default = None
@@ -70,6 +74,7 @@ class TextPreprocessor(BasePreprocessor):
array([[ 1, 1, 9, 16, 17, 18, 11, 0, 0, 13]], dtype=int32)
"""
@Alias("already_processed", "not_text")
def __init__(
self,
text_col: str,
@@ -112,7 +117,10 @@
texts = df[self.text_col].tolist()
tokens = get_texts(texts, self.already_processed, self.n_cpus)
self.vocab = Vocab.create(
tokens, max_vocab=self.max_vocab, min_freq=self.min_freq
tokens,
max_vocab=self.max_vocab,
min_freq=self.min_freq,
pad_idx=self.pad_idx,
)
if self.verbose:
print("The vocabulary contains {} tokens".format(len(self.vocab.stoi)))
......
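A minimal usage sketch for the new pad_idx argument and the 'not_text' alias introduced above; the DataFrame and the 'review' column are made up for illustration:

import pandas as pd
from pytorch_widedeep.preprocessing import TextPreprocessor

df = pd.DataFrame({"review": ["this movie was really good", "this movie was really bad"]})
# pad_idx is now forwarded to Vocab.create during fit
text_preprocessor = TextPreprocessor(text_col="review", max_vocab=100, min_freq=1, pad_idx=1)
X_text = text_preprocessor.fit_transform(df)
# via the @Alias decorator, already_processed=True can also be passed as not_text=True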
@@ -391,7 +391,13 @@ class Vocab:
pickle.dump(self.itos, open(path, "wb"))
@classmethod
def create(cls, tokens: Tokens, max_vocab: int, min_freq: int) -> "Vocab":
def create(
cls,
tokens: Tokens,
max_vocab: int,
min_freq: int,
pad_idx: Optional[int] = None,
) -> "Vocab":
r"""Create a vocabulary object from a set of tokens.
Parameters
@@ -402,9 +408,9 @@
strings (e.g. list of tokenized sentences)
max_vocab: int
maximum vocabulary size
min_freq: int
minimum frequency that a token has to appear to be part of the
vocabulary
pad_idx: int, Optional, default = None
padding index. If None, Fastai's Tokenizer reserves 0 for the
'unknown' token and the padding index defaults to 1.
Examples
--------
@@ -427,12 +433,18 @@
Vocab
An instance of a `Vocab` object
"""
freq = Counter(p for o in tokens for p in o)
itos = [o for o, c in freq.most_common(max_vocab) if c >= min_freq]
for o in reversed(defaults.text_spec_tok):
if o in itos:
itos.remove(o)
itos.insert(0, o)
if pad_idx is not None:
itos.remove(PAD)
itos.insert(pad_idx, PAD)
itos = itos[:max_vocab]
if (
len(itos) < max_vocab
......
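Similarly, a sketch of Vocab.create with the new pad_idx parameter, assuming Fastai-style special tokens where the padding token is 'xxpad':

from pytorch_widedeep.utils.fastai_transforms import Vocab

tokens = [["xxbos", "the", "film", "was", "good"], ["xxbos", "the", "film", "was", "bad"]]
vocab = Vocab.create(tokens, max_vocab=100, min_freq=1, pad_idx=1)
assert vocab.itos[1] == "xxpad"  # PAD is moved to the requested index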
@@ -8,9 +8,9 @@ from pytorch_widedeep.datasets import (
load_birds,
load_ecoli,
load_bio_kdd04,
load_movielens100k,
load_womens_ecommerce,
load_california_housing,
load_movielens100k,
)
@@ -127,19 +127,19 @@ def test_load_california_housing(as_frame):
],
)
def test_load_movielens100k(as_frame):
df_data, df_items, df_users = load_movielens100k(as_frame=as_frame)
df_data, df_users, df_items = load_movielens100k(as_frame=as_frame)
if as_frame:
assert (
df_data.shape,
df_items.shape,
df_users.shape,
df_items.shape,
type(df_data),
type(df_items),
type(df_users),
type(df_items),
) == (
(100000, 4),
(1682, 24),
(943, 5),
(1682, 24),
pd.DataFrame,
pd.DataFrame,
pd.DataFrame,
@@ -147,15 +147,15 @@ def test_load_movielens100k(as_frame):
else:
assert (
df_data.shape,
df_items.shape,
df_users.shape,
df_items.shape,
type(df_data),
type(df_items),
type(df_users),
type(df_items),
) == (
(100000, 4),
(1682, 24),
(943, 5),
(1682, 24),
np.ndarray,
np.ndarray,
np.ndarray,
......