Commit 52ae96b5 authored by: J Javier

Merge remote-tracking branch 'origin/master' into flash_attention

......@@ -21,6 +21,7 @@ tmp_dir/
weights/
pretrained_weights/
model_weights/
prepared_data/
# Unit Tests/Coverage
.coverage
......
cff-version: "1.2.0"
authors:
- family-names: Zaurin
given-names: Javier Rodriguez
orcid: "https://orcid.org/0000-0002-1082-1107"
- family-names: Mulinka
given-names: Pavol
orcid: "https://orcid.org/0000-0002-9394-8794"
doi: 10.5281/zenodo.7908172
message: If you use this software, please cite our article in the
Journal of Open Source Software.
preferred-citation:
authors:
- family-names: Zaurin
given-names: Javier Rodriguez
orcid: "https://orcid.org/0000-0002-1082-1107"
- family-names: Mulinka
given-names: Pavol
orcid: "https://orcid.org/0000-0002-9394-8794"
date-published: 2023-06-24
doi: 10.21105/joss.05027
issn: 2475-9066
issue: 86
journal: Journal of Open Source Software
publisher:
name: Open Journals
start: 5027
title: "pytorch-widedeep: A flexible package for multimodal deep
learning"
type: article
url: "https://joss.theoj.org/papers/10.21105/joss.05027"
volume: 8
title: "pytorch-widedeep: A flexible package for multimodal deep
learning"
\ No newline at end of file
......@@ -12,6 +12,7 @@
[![Maintenance](https://img.shields.io/badge/Maintained%3F-yes-green.svg)](https://github.com/jrzaurin/pytorch-widedeep/graphs/commit-activity)
[![contributions welcome](https://img.shields.io/badge/contributions-welcome-brightgreen.svg?style=flat)](https://github.com/jrzaurin/pytorch-widedeep/issues)
[![Slack](https://img.shields.io/badge/slack-chat-green.svg?logo=slack)](https://join.slack.com/t/pytorch-widedeep/shared_invite/zt-soss7stf-iXpVuLeKZz8lGTnxxtHtTw)
[![DOI](https://joss.theoj.org/papers/10.21105/joss.05027/status.svg)](https://doi.org/10.21105/joss.05027)
# pytorch-widedeep
......@@ -38,6 +39,9 @@ The content of this document is organized as follows:
- [How to Contribute](#how-to-contribute)
- [Acknowledgments](#acknowledgments)
- [License](#license)
- [Cite](#cite)
- [BibTex](#bibtex)
- [APA](#apa)
### Introduction
......@@ -82,7 +86,7 @@ without a ``deephead`` component can be formulated as:
Where σ is the sigmoid function, *'W'* are the weight matrices applied to the wide model and to the final
activations of the deep models, *'a'* are these final activations,
φ(x) are the cross product transformations of the original features *'x'*, and *'b'* is the bias term.
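A minimal sketch (not in the original README, with made-up feature names) of what a wide input with a cross product transformation could look like for two binary features; the cross product is 1 only when both features are 1:

```python
# toy illustration: raw binary features plus their cross product transformation
gender_female, language_en = 1, 1
wide_input = [gender_female, language_en, gender_female * language_en]
```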
In case you are wondering what are *"cross product transformations"*, here is
......@@ -331,4 +335,31 @@ Vision](https://www.pyimagesearch.com/deep-learning-computer-vision-python-book/
This work is dual-licensed under Apache 2.0 and MIT (or any later version).
You can choose between one of them if you use this work.
`SPDX-License-Identifier: Apache-2.0 AND MIT`
### Cite
#### BibTex
```
@article{Zaurin_pytorch-widedeep_A_flexible_2023,
author = {Zaurin, Javier Rodriguez and Mulinka, Pavol},
doi = {10.21105/joss.05027},
journal = {Journal of Open Source Software},
month = jun,
number = {86},
pages = {5027},
title = {{pytorch-widedeep: A flexible package for multimodal deep learning}},
url = {https://joss.theoj.org/papers/10.21105/joss.05027},
volume = {8},
year = {2023}
}
```
#### APA
```
Zaurin, J. R., & Mulinka, P. (2023). pytorch-widedeep: A flexible package for
multimodal deep learning. Journal of Open Source Software, 8(86), 5027.
https://doi.org/10.21105/joss.05027
```
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This is the second of the two notebooks where we aim to illustrate how one could use this library to build recommendation algorithms using the example in this [Kaggle notebook](https://www.kaggle.com/code/matanivanov/wide-deep-learning-for-recsys-with-pytorch) as guidance. In the previous notebook we used `pytorch-widedeep` to build a model that replicated almost exactly that in the notebook. In this, shorter notebook we will show how one could use the library to explore other models, following the same problem formulation, this is: given a state of a user at a certain point in time having watched a series of movies, our goal is to predict which movie the user will watch next. \n",
"\n",
"Assuming that one has read (and run) the previous notebook, the required data will be stored in a local dir called `prepared_data`, so let's read it:"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"\n",
"import numpy as np\n",
"import torch\n",
"import pandas as pd\n",
"from torch import nn\n",
"\n",
"from pytorch_widedeep import Trainer\n",
"from pytorch_widedeep.utils import pad_sequences\n",
"from pytorch_widedeep.models import TabMlp, WideDeep, Transformer\n",
"from pytorch_widedeep.preprocessing import TabPreprocessor"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"save_path = Path(\"prepared_data\")\n",
"\n",
"PAD_IDX = 0\n",
"\n",
"id_cols = [\"user_id\", \"movie_id\"]\n",
"\n",
"df_train = pd.read_pickle(save_path / \"df_train.pkl\")\n",
"df_valid = pd.read_pickle(save_path / \"df_valid.pkl\")\n",
"df_test = pd.read_pickle(save_path / \"df_test.pkl\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"...remember that in the previous notebook we explained that we are not going to use a validation set here (in a real-world example, or simply a more realistic example, one should always use it).\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"df_test = pd.concat([df_valid, df_test], ignore_index=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Also remember that, in the previous notebook we discussed that the `'maxlen'` and `'max_movie_index'` parameters should be computed using only the train set. In particular, to properly do the tokenization, one would have to use ONLY train tokens and add a token for new 'unknown'/'unseen' movies in the test set. This can also be done with this library or manually, so I will leave it to the reader to implement that tokenzation appraoch."
]
},
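{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a rough sketch (not used in the rest of this notebook), a train-only tokenization could map any movie id not seen during training to a reserved index, here called `UNK_IDX` (an assumed name, not something provided by the library):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# illustrative only: restrict the vocabulary to TRAIN movie ids and map\n",
"# unseen test movie ids to a reserved UNK index\n",
"train_tokens = set(int(m) for seq in df_train.prev_movies for m in seq)\n",
"UNK_IDX = max(train_tokens) + 1\n",
"test_sequences_unk = [\n",
"    [int(m) if int(m) in train_tokens else UNK_IDX for m in seq]\n",
"    for seq in df_test.prev_movies\n",
"]"
]
},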
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"maxlen = max(\n",
" df_train.prev_movies.apply(lambda x: len(x)).max(),\n",
" df_test.prev_movies.apply(lambda x: len(x)).max(),\n",
")\n",
"\n",
"max_movie_index = max(df_train.movie_id.max(), df_test.movie_id.max())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"From now one things are pretty simple, moreover bearing in mind that in this example we are not going to use a wide component since, in pple, one would believe that the information in that component is also 'carried' by the movie sequences (However in the previous notebook, if one performs ablation studies, these suggest that most of the prediction power comes from the linear, wide model).\n",
"\n",
"In the example here we are going to explore one (of many) possibilities. We are simply going to encode the triplet `(user, item, rating)` and use it as a `deeptabular` component and the sequences of previously watched movies as the `deeptext` component. For the `deeptext` component we are going to use a basic encoder-only transformer model.\n",
"\n",
"Let's start with the tabular data preparation\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"df_train_user_item = df_train[[\"user_id\", \"movie_id\", \"rating\"]]\n",
"train_movies_sequences = df_train.prev_movies.apply(\n",
" lambda x: [int(el) for el in x]\n",
").to_list()\n",
"y_train = df_train.target.values.astype(int)\n",
"\n",
"df_test_user_item = df_train[[\"user_id\", \"movie_id\", \"rating\"]]\n",
"test_movies_sequences = df_test.prev_movies.apply(\n",
" lambda x: [int(el) for el in x]\n",
").to_list()\n",
"y_test = df_test.target.values.astype(int)\n",
"\n",
"tab_preprocessor = tab_preprocessor = TabPreprocessor(\n",
" cat_embed_cols=[\"user_id\", \"movie_id\", \"rating\"],\n",
")\n",
"X_train_tab = tab_preprocessor.fit_transform(df_train_user_item)\n",
"X_test_tab = tab_preprocessor.transform(df_test_user_item)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"And not the text component, simply padding the sequences:"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"X_train_text = np.array(\n",
" [\n",
" pad_sequences(\n",
" s,\n",
" maxlen=maxlen,\n",
" pad_first=False,\n",
" pad_idx=PAD_IDX,\n",
" )\n",
" for s in train_movies_sequences\n",
" ]\n",
")\n",
"X_test_text = np.array(\n",
" [\n",
" pad_sequences(\n",
" s,\n",
" maxlen=maxlen,\n",
" pad_first=False,\n",
" pad_idx=0,\n",
" )\n",
" for s in test_movies_sequences\n",
" ]\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We now define the model components and the wide and deep model."
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"tab_mlp = TabMlp(\n",
" column_idx=tab_preprocessor.column_idx,\n",
" cat_embed_input=tab_preprocessor.cat_embed_input,\n",
" mlp_hidden_dims=[1024, 512, 256],\n",
" mlp_activation=\"relu\",\n",
")\n",
"\n",
"# plenty of options here, see the docs\n",
"transformer = Transformer(\n",
" vocab_size=max_movie_index + 1,\n",
" embed_dim=32,\n",
" n_heads=2,\n",
" n_blocks=2,\n",
" seq_length=maxlen,\n",
")\n",
"\n",
"wide_deep_model = WideDeep(\n",
" deeptabular=tab_mlp, deeptext=transformer, pred_dim=max_movie_index + 1\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"WideDeep(\n",
" (deeptabular): Sequential(\n",
" (0): TabMlp(\n",
" (cat_and_cont_embed): DiffSizeCatAndContEmbeddings(\n",
" (cat_embed): DiffSizeCatEmbeddings(\n",
" (embed_layers): ModuleDict(\n",
" (emb_layer_user_id): Embedding(749, 65, padding_idx=0)\n",
" (emb_layer_movie_id): Embedding(1612, 100, padding_idx=0)\n",
" (emb_layer_rating): Embedding(6, 4, padding_idx=0)\n",
" )\n",
" (embedding_dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (encoder): MLP(\n",
" (mlp): Sequential(\n",
" (dense_layer_0): Sequential(\n",
" (0): Dropout(p=0.1, inplace=False)\n",
" (1): Linear(in_features=169, out_features=1024, bias=True)\n",
" (2): ReLU(inplace=True)\n",
" )\n",
" (dense_layer_1): Sequential(\n",
" (0): Dropout(p=0.1, inplace=False)\n",
" (1): Linear(in_features=1024, out_features=512, bias=True)\n",
" (2): ReLU(inplace=True)\n",
" )\n",
" (dense_layer_2): Sequential(\n",
" (0): Dropout(p=0.1, inplace=False)\n",
" (1): Linear(in_features=512, out_features=256, bias=True)\n",
" (2): ReLU(inplace=True)\n",
" )\n",
" )\n",
" )\n",
" )\n",
" (1): Linear(in_features=256, out_features=1683, bias=True)\n",
" )\n",
" (deeptext): Sequential(\n",
" (0): Transformer(\n",
" (embedding): Embedding(1683, 32)\n",
" (pos_encoder): PositionalEncoding(\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (encoder): Sequential(\n",
" (transformer_block0): TransformerEncoder(\n",
" (attn): MultiHeadedAttention(\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (q_proj): Linear(in_features=32, out_features=32, bias=False)\n",
" (kv_proj): Linear(in_features=32, out_features=64, bias=False)\n",
" (out_proj): Linear(in_features=32, out_features=32, bias=False)\n",
" )\n",
" (ff): FeedForward(\n",
" (w_1): Linear(in_features=32, out_features=128, bias=True)\n",
" (w_2): Linear(in_features=128, out_features=32, bias=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (activation): GELU(approximate='none')\n",
" )\n",
" (attn_addnorm): AddNorm(\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (ln): LayerNorm((32,), eps=1e-05, elementwise_affine=True)\n",
" )\n",
" (ff_addnorm): AddNorm(\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (ln): LayerNorm((32,), eps=1e-05, elementwise_affine=True)\n",
" )\n",
" )\n",
" (transformer_block1): TransformerEncoder(\n",
" (attn): MultiHeadedAttention(\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (q_proj): Linear(in_features=32, out_features=32, bias=False)\n",
" (kv_proj): Linear(in_features=32, out_features=64, bias=False)\n",
" (out_proj): Linear(in_features=32, out_features=32, bias=False)\n",
" )\n",
" (ff): FeedForward(\n",
" (w_1): Linear(in_features=32, out_features=128, bias=True)\n",
" (w_2): Linear(in_features=128, out_features=32, bias=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (activation): GELU(approximate='none')\n",
" )\n",
" (attn_addnorm): AddNorm(\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (ln): LayerNorm((32,), eps=1e-05, elementwise_affine=True)\n",
" )\n",
" (ff_addnorm): AddNorm(\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (ln): LayerNorm((32,), eps=1e-05, elementwise_affine=True)\n",
" )\n",
" )\n",
" )\n",
" )\n",
" (1): Linear(in_features=23552, out_features=1683, bias=True)\n",
" )\n",
")"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wide_deep_model"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"And as in the previous notebook, let's train (you will need a GPU for this)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"trainer = Trainer(\n",
" model=wide_deep_model,\n",
" objective=\"multiclass\",\n",
" custom_loss_function=nn.CrossEntropyLoss(ignore_index=PAD_IDX),\n",
" optimizers=torch.optim.Adam(wide_deep_model.parameters(), lr=1e-3),\n",
")\n",
"\n",
"trainer.fit(\n",
" X_train={\n",
" \"X_tab\": X_train_tab,\n",
" \"X_text\": X_train_text,\n",
" \"target\": y_train,\n",
" },\n",
" X_val={\n",
" \"X_tab\": X_test_tab,\n",
" \"X_text\": X_test_text,\n",
" \"target\": y_test,\n",
" },\n",
" n_epochs=10,\n",
" batch_size=521,\n",
" shuffle=False,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.15"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
# This script is mostly a copy/paste from the Kaggle notebook
# https://www.kaggle.com/code/matanivanov/wide-deep-learning-for-recsys-with-pytorch.
# It is a response to the issue:
# https://github.com/jrzaurin/pytorch-widedeep/issues/133.
# In this script we run the exact same model used in that Kaggle notebook
from pathlib import Path
import numpy as np
import torch
import pandas as pd
from torch import nn, cat, mean
from scipy.sparse import coo_matrix
device = "cuda" if torch.cuda.is_available() else "cpu"
save_path = Path("prepared_data")
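# Build the (row, col) index lists of a COO sparse matrix from a list of lists
# (one list of watched movie ids per row)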
def get_coo_indexes(lil):
rows = []
cols = []
for i, el in enumerate(lil):
if type(el) != list:
el = [el]
for j in el:
rows.append(i)
cols.append(j)
return rows, cols
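# Multi-hot sparse matrix of the given shape: a 1 at (row, movie) for every
# movie in that row's list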
def get_sparse_features(series, shape):
coo_indexes = get_coo_indexes(series.tolist())
sparse_df = coo_matrix(
(np.ones(len(coo_indexes[0])), (coo_indexes[0], coo_indexes[1])), shape=shape
)
return sparse_df
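# Convert the sparse multi-hot matrix back into fixed-length index sequences,
# padding shorter rows with pad_idx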
def sparse_to_idx(data, pad_idx=-1):
indexes = data.nonzero()
indexes_df = pd.DataFrame()
indexes_df["rows"] = indexes[0]
indexes_df["cols"] = indexes[1]
mdf = indexes_df.groupby("rows").apply(lambda x: x["cols"].tolist())
max_len = mdf.apply(lambda x: len(x)).max()
return mdf.apply(lambda x: pd.Series(x + [pad_idx] * (max_len - len(x)))).values
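# One-hot encode a single index into a dense vector of length sparse_dim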
def idx_to_sparse(idx, sparse_dim):
sparse = np.zeros(sparse_dim)
sparse[int(idx)] = 1
return pd.Series(sparse, dtype=int)
def process_cats_as_kaggle_notebook(df):
df["gender"] = (df["gender"] == "M").astype(int)
df = pd.concat(
[
df.drop("occupation", axis=1),
pd.get_dummies(df["occupation"]).astype(int),
],
axis=1,
)
df.drop("other", axis=1, inplace=True)
df.drop("zip_code", axis=1, inplace=True)
return df
id_cols = ["user_id", "movie_id"]
df_train = pd.read_pickle(save_path / "df_train.pkl")
df_valid = pd.read_pickle(save_path / "df_valid.pkl")
df_test = pd.read_pickle(save_path / "df_test.pkl")
df_test = pd.concat([df_valid, df_test], ignore_index=True)
df_train = process_cats_as_kaggle_notebook(df_train)
df_test = process_cats_as_kaggle_notebook(df_test)
# Here is another caveat: using the whole dataset to build 'train_movies_watched'
# when in reality one should use only the training set
max_movie_index = max(df_train.movie_id.max(), df_test.movie_id.max())
X_train = df_train.drop(id_cols + ["prev_movies", "target"], axis=1)
y_train = df_train.target.values
train_movies_watched = get_sparse_features(
df_train["prev_movies"], (len(df_train), max_movie_index + 1)
)
X_test = df_test.drop(id_cols + ["prev_movies", "target"], axis=1)
y_test = df_test.target.values
test_movies_watched = get_sparse_features(
df_test["prev_movies"], (len(df_test), max_movie_index + 1)
)
PAD_IDX = 0
X_train_tensor = torch.Tensor(X_train.fillna(0).values).to(device)
train_movies_watched_tensor = (
torch.sparse_coo_tensor(
indices=train_movies_watched.nonzero(),
values=[1] * len(train_movies_watched.nonzero()[0]),
size=train_movies_watched.shape,
)
.to_dense()
.to(device)
)
movies_train_sequences = (
torch.Tensor(
sparse_to_idx(train_movies_watched, pad_idx=PAD_IDX),
)
.long()
.to(device)
)
target_train = torch.Tensor(y_train).long().to(device)
X_test_tensor = torch.Tensor(X_test.fillna(0).values).to(device)
test_movies_watched_tensor = (
torch.sparse_coo_tensor(
indices=test_movies_watched.nonzero(),
values=[1] * len(test_movies_watched.nonzero()[0]),
size=test_movies_watched.shape,
)
.to_dense()
.to(device)
)
movies_test_sequences = (
torch.Tensor(
sparse_to_idx(test_movies_watched, pad_idx=PAD_IDX),
)
.long()
.to(device)
)
target_test = torch.Tensor(y_test).long().to(device)
class WideAndDeep(nn.Module):
def __init__(
self,
continious_feature_shape, # number of continuous features
embed_size, # size of embedding for binary features
embed_dict_len, # number of unique binary features
pad_idx, # padding index
):
super(WideAndDeep, self).__init__()
self.embed = nn.Embedding(embed_dict_len, embed_size, padding_idx=pad_idx)
self.linear_relu_stack = nn.Sequential(
nn.Linear(embed_size + continious_feature_shape, 1024),
nn.ReLU(),
nn.Linear(1024, 512),
nn.ReLU(),
nn.Linear(512, 256),
nn.ReLU(),
)
self.head = nn.Sequential(
nn.Linear(embed_dict_len + 256, embed_dict_len),
)
def forward(self, continious, binary, binary_idx):
# get embeddings for sequence of indexes
binary_embed = self.embed(binary_idx)
binary_embed_mean = mean(binary_embed, dim=1)
# get logits for "deep" part: continious features + binary embeddings
deep_logits = self.linear_relu_stack(
cat((continious, binary_embed_mean), dim=1)
)
# get final logits from the "deep" output concatenated with the raw binary features
total_logits = self.head(cat((deep_logits, binary), dim=1))
return total_logits
model = WideAndDeep(X_train.shape[1], 16, max_movie_index + 1, PAD_IDX).to(device)
print(model)
EPOCHS = 10
loss_fn = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
for t in range(EPOCHS):
model.train()
pred_train = model(
X_train_tensor, train_movies_watched_tensor, movies_train_sequences
)
loss_train = loss_fn(pred_train, target_train)
# Backpropagation
optimizer.zero_grad()
loss_train.backward()
optimizer.step()
model.eval()
with torch.no_grad():
pred_test = model(
X_test_tensor, test_movies_watched_tensor, movies_test_sequences
)
loss_test = loss_fn(pred_test, target_test)
print(f"Epoch {t}")
print(f"Train loss: {loss_train:>7f}")
print(f"Test loss: {loss_test:>7f}")
# This script is mostly a copy/paste from the Kaggle notebook
# https://www.kaggle.com/code/matanivanov/wide-deep-learning-for-recsys-with-pytorch.
# It is a response to the issue:
# https://github.com/jrzaurin/pytorch-widedeep/issues/133. In this script we
# simply prepare the data that will later be used for a custom Wide and Deep
# model and for Wide and Deep models created using this library
from pathlib import Path
from sklearn.model_selection import train_test_split
from pytorch_widedeep.datasets import load_movielens100k
data, user, items = load_movielens100k(as_frame=True)
# Alternatively, as specified in the docs: 'The last 19 fields are the genres' so:
# list_of_genres = items.columns.tolist()[-19:]
list_of_genres = [
"unknown",
"Action",
"Adventure",
"Animation",
"Children's",
"Comedy",
"Crime",
"Documentary",
"Drama",
"Fantasy",
"Film-Noir",
"Horror",
"Musical",
"Mystery",
"Romance",
"Sci-Fi",
"Thriller",
"War",
"Western",
]
# adding a column with the number of movies watched per user
dataset = data.sort_values(["user_id", "timestamp"]).reset_index(drop=True)
dataset["one"] = 1
dataset["num_watched"] = dataset.groupby("user_id")["one"].cumsum()
dataset.drop("one", axis=1, inplace=True)
# adding a column with the mean rating at a point in time per user
dataset["mean_rate"] = (
dataset.groupby("user_id")["rating"].cumsum() / dataset["num_watched"]
)
# In this particular exercise the problem is formulated as predicting the
# next movie that will be watched (in consequence, the last interactions will be discarded)
dataset["target"] = dataset.groupby("user_id")["movie_id"].shift(-1)
# Here the author builds the sequences
dataset["prev_movies"] = dataset["movie_id"].apply(lambda x: str(x))
dataset["prev_movies"] = (
dataset.groupby("user_id")["prev_movies"]
.apply(lambda x: (x + " ").cumsum().str.strip())
.reset_index(drop=True)
)
dataset["prev_movies"] = dataset["prev_movies"].apply(lambda x: x.split())
# Adding a genre_rate column as the mean rating of all the movies watched for a
# given genre, per user
dataset = dataset.merge(items[["movie_id"] + list_of_genres], on="movie_id", how="left")
for genre in list_of_genres:
dataset[f"{genre}_rate"] = dataset[genre] * dataset["rating"]
dataset[genre] = dataset.groupby("user_id")[genre].cumsum()
dataset[f"{genre}_rate"] = (
dataset.groupby("user_id")[f"{genre}_rate"].cumsum() / dataset[genre]
)
dataset[list_of_genres] = dataset[list_of_genres].apply(
lambda x: x / dataset["num_watched"]
)
# Again, we use the same settings as those in the Kaggle notebook,
# but 'COLD_START_TRESH' is pretty aggressive
COLD_START_TRESH = 5
filtred_data = dataset[
(dataset["num_watched"] >= COLD_START_TRESH) & ~(dataset["target"].isna())
].sort_values("timestamp")
train_data, _test_data = train_test_split(filtred_data, test_size=0.2, shuffle=False)
valid_data, test_data = train_test_split(_test_data, test_size=0.5, shuffle=False)
cols_to_drop = [
# "rating",
"timestamp",
"num_watched",
]
df_train = train_data.drop(cols_to_drop, axis=1)
df_valid = valid_data.drop(cols_to_drop, axis=1)
df_test = test_data.drop(cols_to_drop, axis=1)
save_path = Path("prepared_data")
if not save_path.exists():
save_path.mkdir(parents=True, exist_ok=True)
df_train.to_pickle(save_path / "df_train.pkl")
df_valid.to_pickle(save_path / "df_valid.pkl")
df_test.to_pickle(save_path / "df_test.pkl")
# In this script I illustrate how one could use our library to reproduce
# almost exactly the same model used in the Kaggle Notebook
from pathlib import Path
import numpy as np
import torch
import pandas as pd
from torch import nn
from scipy.sparse import coo_matrix
from pytorch_widedeep import Trainer
from pytorch_widedeep.models import TabMlp, BasicRNN, WideDeep
from pytorch_widedeep.preprocessing import TabPreprocessor
device = "cuda" if torch.cuda.is_available() else "cpu"
save_path = Path("prepared_data")
PAD_IDX = 0
def get_coo_indexes(lil):
rows = []
cols = []
for i, el in enumerate(lil):
if type(el) != list:
el = [el]
for j in el:
rows.append(i)
cols.append(j)
return rows, cols
def get_sparse_features(series, shape):
coo_indexes = get_coo_indexes(series.tolist())
sparse_df = coo_matrix(
(np.ones(len(coo_indexes[0])), (coo_indexes[0], coo_indexes[1])), shape=shape
)
return sparse_df
def sparse_to_idx(data, pad_idx=-1):
indexes = data.nonzero()
indexes_df = pd.DataFrame()
indexes_df["rows"] = indexes[0]
indexes_df["cols"] = indexes[1]
mdf = indexes_df.groupby("rows").apply(lambda x: x["cols"].tolist())
max_len = mdf.apply(lambda x: len(x)).max()
return mdf.apply(lambda x: pd.Series(x + [pad_idx] * (max_len - len(x)))).values
id_cols = ["user_id", "movie_id"]
df_train = pd.read_pickle(save_path / "df_train.pkl")
df_valid = pd.read_pickle(save_path / "df_valid.pkl")
df_test = pd.read_pickle(save_path / "df_test.pkl")
df_test = pd.concat([df_valid, df_test], ignore_index=True)
# Here is another caveat: using the whole dataset to build 'train_movies_watched'
# when in reality one should use only the training set
max_movie_index = max(df_train.movie_id.max(), df_test.movie_id.max())
X_train = df_train.drop(id_cols + ["rating", "prev_movies", "target"], axis=1)
y_train = np.array(df_train.target.values, dtype="int64")
train_movies_watched = get_sparse_features(
df_train["prev_movies"], (len(df_train), max_movie_index + 1)
)
X_test = df_test.drop(id_cols + ["rating", "prev_movies", "target"], axis=1)
y_test = np.array(df_test.target.values, dtype="int64")
test_movies_watched = get_sparse_features(
df_test["prev_movies"], (len(df_test), max_movie_index + 1)
)
cat_cols = ["gender", "occupation", "zip_code"]
cont_cols = [c for c in X_train if c not in cat_cols]
tab_preprocessor = TabPreprocessor(
cat_embed_cols=cat_cols,
continuous_cols=cont_cols,
)
# The sparse matrices need to be turned dense, whether at the array or at the
# tensor stage. This is one of the reasons why the wide component in our
# library is implemented as Embeddings. However, our implementation is still
# not suitable for the type of pre-processing that the author of the Kaggle
# notebook did to come up with what would be the wide component (a sparse
# matrix with 1s at those locations corresponding to the movies that a user
# has seen at a point in time). Therefore, we will have to code a Wide model
# (fairly simple, since it is just a linear layer)
X_train_wide = np.array(train_movies_watched.todense())
X_test_wide = np.array(test_movies_watched.todense())
# Here our tabular component is a bit more elaborated than that in the
# notebook, just a bit...
X_train_tab = tab_preprocessor.fit_transform(X_train.fillna(0))
X_test_tab = tab_preprocessor.transform(X_test.fillna(0))
# The text component is the sequence of movies watched. There is an element
# of information redundancy here in my opinion. This is because the wide and
# text components carry implicitly the same information, just in a different
# form. Anyway, we want to reproduce the Kaggle notebook as closely as
# possible.
X_train_text = sparse_to_idx(train_movies_watched, pad_idx=PAD_IDX)
X_test_text = sparse_to_idx(test_movies_watched, pad_idx=PAD_IDX)
class Wide(nn.Module):
def __init__(self, input_dim: int, pred_dim: int):
super().__init__()
self.input_dim = input_dim
self.pred_dim = pred_dim
# The way I coded the library I never thought that someone would ever
# want to code their own wide component. However, if you do, the
# wide component must have a 'wide_linear' attribute. In other words,
# the linear layer must be called 'wide_linear'
self.wide_linear = nn.Linear(input_dim, pred_dim)
def forward(self, X):
out = self.wide_linear(X.type(torch.float32))
return out
wide = Wide(X_train_wide.shape[1], max_movie_index + 1)
class SimpleEmbed(nn.Module):
def __init__(self, vocab_size: int, embed_dim: int, pad_idx: int):
super().__init__()
self.vocab_size = vocab_size
self.embed_dim = embed_dim
self.pad_idx = pad_idx
# The sequences of movies watched are simply embedded in the Kaggle
# notebook. No RNN, Transformer or any other model is used
self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
def forward(self, X):
embed = self.embed(X)
embed_mean = torch.mean(embed, dim=1)
return embed_mean
@property
def output_dim(self) -> int:
return self.embed_dim
# In the notebook the author uses simply embeddings
simple_embed = SimpleEmbed(max_movie_index + 1, 16, 0)
# but maybe one would like to use an RNN to account for the sequence nature of
# the problem formulation
basic_rnn = BasicRNN(
vocab_size=max_movie_index + 1,
embed_dim=16,
hidden_dim=32,
n_layers=2,
rnn_type="gru",
)
tab_mlp = TabMlp(
column_idx=tab_preprocessor.column_idx,
cat_embed_input=tab_preprocessor.cat_embed_input,
continuous_cols=tab_preprocessor.continuous_cols,
cont_norm_layer=None,
mlp_hidden_dims=[1024, 512, 256],
mlp_activation="relu",
)
# The main difference between this wide and deep model and the Wide and Deep
# model in the Kaggle notebook is that in that notebook the author
# concatenates the embeddings and the tabular features (which he refers to
# as 'continuous'), then passes this concatenation through a stack of
# linear + ReLU layers. He then concatenates this output with the binary
# features and connects the result to the final linear layer. Our
# implementation follows the notation of the original paper and, instead of
# concatenating the tabular, text and wide components, we first compute their
# outputs and then add them (see here: https://arxiv.org/pdf/1606.07792.pdf,
# their Eq 3). Note that this is effectively the same, with the caveat that
# while in one case we initialise one big weight matrix at once, in our
# implementation we initialise different matrices for the different components.
# Anyway, let's give it a go.
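# A quick numerical check of that equivalence (illustration only, independent
# of the model below): a single linear layer applied to the concatenation of
# two inputs equals the sum of per-input linear layers whose weights are the
# corresponding column slices of the big weight matrix.
_a, _b = torch.randn(4, 3), torch.randn(4, 5)
_W = torch.randn(2, 8)
assert torch.allclose(
    torch.cat([_a, _b], dim=1) @ _W.t(),
    _a @ _W[:, :3].t() + _b @ _W[:, 3:].t(),
    atol=1e-5,
)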
wide_deep_model = WideDeep(
wide=wide, deeptabular=tab_mlp, deeptext=simple_embed, pred_dim=max_movie_index + 1
)
# # To use an RNN, simply
# wide_deep_model = WideDeep(
# wide=wide, deeptabular=tab_mlp, deeptext=basic_rnn, pred_dim=max_movie_index + 1
# )
trainer = Trainer(
model=wide_deep_model,
objective="multiclass",
custom_loss_function=nn.CrossEntropyLoss(ignore_index=PAD_IDX),
optimizers=torch.optim.Adam(wide_deep_model.parameters(), lr=1e-3),
)
trainer.fit(
X_train={
"X_wide": X_train_wide,
"X_tab": X_train_tab,
"X_text": X_train_text,
"target": y_train,
},
X_val={
"X_wide": X_test_wide,
"X_tab": X_test_tab,
"X_text": X_test_text,
"target": y_test,
},
n_epochs=10,
batch_size=512,
shuffle=False,
)
from pathlib import Path
import numpy as np
import torch
import pandas as pd
from torch import nn
from pytorch_widedeep import Trainer
from pytorch_widedeep.utils import pad_sequences
from pytorch_widedeep.models import TabMlp, WideDeep, Transformer
from pytorch_widedeep.preprocessing import TabPreprocessor
save_path = Path("prepared_data")
PAD_IDX = 0
id_cols = ["user_id", "movie_id"]
df_train = pd.read_pickle(save_path / "df_train.pkl")
df_valid = pd.read_pickle(save_path / "df_valid.pkl")
df_test = pd.read_pickle(save_path / "df_test.pkl")
df_test = pd.concat([df_valid, df_test], ignore_index=True)
# sequence length. Shorter sequences will be padded to this length. This is
# identical to the Kaggle notebook's implementation
maxlen = max(
df_train.prev_movies.apply(lambda x: len(x)).max(),
df_test.prev_movies.apply(lambda x: len(x)).max(),
)
# Here there is a caveat. In principle, we are using (as in the Kaggle
# notebook) all indexes to compute the number of tokens in the dataset. To do
# this properly, one would have to use ONLY train tokens and add a token for
# new unknown/unseen movies in the test set. This can also be done with this
# library or manually, so I will leave it to the reader to implement that
# tokenization approach
max_movie_index = max(df_train.movie_id.max(), df_test.movie_id.max())
# From now on things are pretty simple, especially bearing in mind that in this
# example we are not going to use a wide component since, in principle, I believe
# the information in that component is also 'carried' by the movie sequences
# (also, in the previous scripts one can see that most of the prediction power
# comes from the linear, wide model)
df_train_user_item = df_train[["user_id", "movie_id", "rating"]]
train_movies_sequences = df_train.prev_movies.apply(
lambda x: [int(el) for el in x]
).to_list()
y_train = df_train.target.values.astype(int)
df_test_user_item = df_test[["user_id", "movie_id", "rating"]]
test_movies_sequences = df_test.prev_movies.apply(
lambda x: [int(el) for el in x]
).to_list()
y_test = df_test.target.values.astype(int)
# As a tabular component we are simply going to encode the triplets
# (user, item, rating)
tab_preprocessor = TabPreprocessor(
cat_embed_cols=["user_id", "movie_id", "rating"],
)
X_train_tab = tab_preprocessor.fit_transform(df_train_user_item)
X_test_tab = tab_preprocessor.transform(df_test_user_item)
# And here we pad the sequences and define a transformer model for the text
# component that is, in this case, the sequences of movies watched
X_train_text = np.array(
[
pad_sequences(
s,
maxlen=maxlen,
pad_first=False,
pad_idx=PAD_IDX,
)
for s in train_movies_sequences
]
)
X_test_text = np.array(
[
pad_sequences(
s,
maxlen=maxlen,
pad_first=False,
pad_idx=0,
)
for s in test_movies_sequences
]
)
tab_mlp = TabMlp(
column_idx=tab_preprocessor.column_idx,
cat_embed_input=tab_preprocessor.cat_embed_input,
mlp_hidden_dims=[1024, 512, 256],
mlp_activation="relu",
)
# plenty of options here, see the docs
transformer = Transformer(
vocab_size=max_movie_index + 1,
embed_dim=16,
n_heads=2,
n_blocks=2,
seq_length=maxlen,
)
wide_deep_model = WideDeep(
deeptabular=tab_mlp, deeptext=transformer, pred_dim=max_movie_index + 1
)
trainer = Trainer(
model=wide_deep_model,
objective="multiclass",
custom_loss_function=nn.CrossEntropyLoss(ignore_index=PAD_IDX),
optimizers=torch.optim.Adam(wide_deep_model.parameters(), lr=1e-3),
)
trainer.fit(
X_train={
"X_tab": X_train_tab,
"X_text": X_train_text,
"target": y_train,
},
X_val={
"X_tab": X_test_tab,
"X_text": X_test_text,
"target": y_test,
},
n_epochs=10,
batch_size=512,
shuffle=False,
)
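# A possible next step (sketch only; it assumes `Trainer.predict` accepts the
# same `X_tab` / `X_text` keyword arguments as `fit` and returns the predicted
# class, i.e. movie index, per test interaction):
# preds = trainer.predict(X_tab=X_test_tab, X_text=X_test_text, batch_size=512)
# hit_rate = (preds == y_test).mean()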
......@@ -4,6 +4,7 @@ from ._base import (
load_birds,
load_ecoli,
load_bio_kdd04,
load_movielens100k,
load_womens_ecommerce,
load_california_housing,
)
......@@ -16,4 +17,5 @@ __all__ = [
"load_birds",
"load_rf1",
"load_womens_ecommerce",
"load_movielens100k",
]
# dataframes are saved as parquet, pyarrow, brotli
# pd.to_parquet(path=None, engine="auto", compression="brotli", index=False)
# see related post: https://python.plainenglish.io/storing-pandas-98-faster-disk-reads-and-72-less-space-208e2e2be8bb
from typing import Tuple, Union
from importlib import resources
import numpy as np
import pandas as pd
def load_bio_kdd04(as_frame: bool = False):
def load_bio_kdd04(as_frame: bool = False) -> Union[np.ndarray, pd.DataFrame]:
"""Load and return the higly imbalanced binary classification Protein Homology
Dataset from [KDD cup 2004](https://www.kdd.org/kdd-cup/view/kdd-cup-2004/Data).
This datasets include only bio_train.dat part of the dataset
......@@ -39,7 +41,7 @@ def load_bio_kdd04(as_frame: bool = False):
return df.to_numpy()
def load_adult(as_frame: bool = False):
def load_adult(as_frame: bool = False) -> Union[np.ndarray, pd.DataFrame]:
"""Load and return the higly imbalanced binary classification [adult income datatest](http://www.cs.toronto.edu/~delve/data/adult/desc.html).
you may find detailed description [here](http://www.cs.toronto.edu/~delve/data/adult/adultDetail.html)
"""
......@@ -55,7 +57,7 @@ def load_adult(as_frame: bool = False):
return df.to_numpy()
def load_ecoli(as_frame: bool = False):
def load_ecoli(as_frame: bool = False) -> Union[np.ndarray, pd.DataFrame]:
"""Load and return the higly imbalanced multiclass classification e.coli dataset
Dataset from [UCI Machine learning Repository](https://archive.ics.uci.edu/ml/datasets/ecoli).
......@@ -142,7 +144,7 @@ def load_ecoli(as_frame: bool = False):
return df.to_numpy()
def load_california_housing(as_frame: bool = False):
def load_california_housing(as_frame: bool = False) -> Union[np.ndarray, pd.DataFrame]:
"""Load and return the higly imbalanced regression California housing dataset.
Characteristics:
......@@ -190,7 +192,7 @@ def load_california_housing(as_frame: bool = False):
return df.to_numpy()
def load_birds(as_frame: bool = False):
def load_birds(as_frame: bool = False) -> Union[np.ndarray, pd.DataFrame]:
"""Load and return the multi-label classification bird dataset.
References
......@@ -216,7 +218,7 @@ def load_birds(as_frame: bool = False):
return df.to_numpy()
def load_rf1(as_frame: bool = False):
def load_rf1(as_frame: bool = False) -> Union[np.ndarray, pd.DataFrame]:
"""Load and return the multi-target regression River Flow(RF1) dataset.
Characteristics:
......@@ -243,7 +245,7 @@ def load_rf1(as_frame: bool = False):
return df.to_numpy()
def load_womens_ecommerce(as_frame: bool = False):
def load_womens_ecommerce(as_frame: bool = False) -> Union[np.ndarray, pd.DataFrame]:
"""
Context
This is a Women’s Clothing E-Commerce dataset revolving around the reviews written by customers.
......@@ -279,3 +281,103 @@ def load_womens_ecommerce(as_frame: bool = False):
return df
else:
return df.to_numpy()
def load_movielens100k(
as_frame: bool = False,
) -> Union[
Tuple[np.ndarray, np.ndarray, np.ndarray],
Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame],
]:
"""Load and return the MovieLens 100k dataset in 3 separate files.
SUMMARY & USAGE LICENSE:
=============================================
MovieLens data sets were collected by the GroupLens Research Project
at the University of Minnesota.
This data set consists of:
* 100,000 ratings (1-5) from 943 users on 1682 movies.
* Each user has rated at least 20 movies.
* Simple demographic info for the users (age, gender, occupation, zip)
The data was collected through the MovieLens web site
(movielens.umn.edu) during the seven-month period from September 19th,
1997 through April 22nd, 1998. This data has been cleaned up - users
who had less than 20 ratings or did not have complete demographic
information were removed from this data set. Detailed descriptions of
the data file can be found at the end of this file.
Neither the University of Minnesota nor any of the researchers
involved can guarantee the correctness of the data, its suitability
for any particular purpose, or the validity of results based on the
use of the data set. The data set may be used for any research
purposes under the following conditions:
* The user may not state or imply any endorsement from the
University of Minnesota or the GroupLens Research Group.
* The user must acknowledge the use of the data set in
publications resulting from the use of the data set
(see below for citation information).
* The user may not redistribute the data without separate
permission.
* The user may not use this information for any commercial or
revenue-bearing purposes without first obtaining permission
from a faculty member of the GroupLens Research Project at the
University of Minnesota.
If you have any further questions or comments, please contact GroupLens
<grouplens-info@cs.umn.edu>.
CITATION:
=============================================
To acknowledge use of the dataset in publications, please cite the
following paper:
F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets:
History and Context. ACM Transactions on Interactive Intelligent
Systems (TiiS) 5, 4, Article 19 (December 2015), 19 pages.
DOI=http://dx.doi.org/10.1145/2827872
Returns
-------
df_data: Union[np.ndarray, pd.DataFrame]
The full u data set, 100000 ratings by 943 users on 1682 items.
Each user has rated at least 20 movies. Users and items are
numbered consecutively from 1. The data is randomly
ordered. The time stamps are unix seconds since 1/1/1970 UTC
df_users: Union[np.ndarray, pd.DataFrame]
Demographic information about the users.
The user ids are the ones used in the df_data data set.
df_items: Union[np.ndarray, pd.DataFrame]
Information about the items (movies).
The last 19 fields are the genres, a 1 indicates the movie
is of that genre, a 0 indicates it is not; movies can be in
several genres at once.
The movie ids are the ones used in the df_data data set.
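Examples
--------
A minimal usage sketch (mirroring the data preparation script above):

>>> from pytorch_widedeep.datasets import load_movielens100k
>>> df_data, df_users, df_items = load_movielens100k(as_frame=True)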
"""
with resources.path(
"pytorch_widedeep.datasets.data",
"MovieLens100k_data.parquet.brotli",
) as fpath:
df_data = pd.read_parquet(fpath)
with resources.path(
"pytorch_widedeep.datasets.data",
"MovieLens100k_items.parquet.brotli",
) as fpath:
df_items = pd.read_parquet(fpath)
with resources.path(
"pytorch_widedeep.datasets.data",
"MovieLens100k_users.parquet.brotli",
) as fpath:
df_users = pd.read_parquet(fpath)
if as_frame:
return df_data, df_users, df_items
else:
return df_data.to_numpy(), df_users.to_numpy(), df_items.to_numpy()
from pytorch_widedeep.models.text import (
BasicRNN,
Transformer,
AttentiveRNN,
StackedAttentiveRNN,
)
......
......@@ -22,16 +22,20 @@ class FeedForward(nn.Module):
self,
input_dim: int,
dropout: float,
mult: float,
activation: str,
mult: float = 4.0,
*,
ff_hidden_dim: Optional[int] = None,
):
super(FeedForward, self).__init__()
ff_hidden_dim = int(input_dim * mult)
ff_hid_dim = (
ff_hidden_dim if ff_hidden_dim is not None else int(input_dim * mult)
)
self.w_1 = nn.Linear(
input_dim,
ff_hidden_dim * 2 if activation.endswith("glu") else ff_hidden_dim,
ff_hid_dim * 2 if activation.endswith("glu") else ff_hid_dim,
)
self.w_2 = nn.Linear(ff_hidden_dim, input_dim)
self.w_2 = nn.Linear(ff_hid_dim, input_dim)
self.dropout = nn.Dropout(dropout)
self.activation = get_activation_fn(activation)
......
......@@ -20,6 +20,7 @@ class TransformerEncoder(nn.Module):
use_bias: bool,
attn_dropout: float,
ff_dropout: float,
ff_factor: int,
activation: str,
):
super(TransformerEncoder, self).__init__()
......@@ -30,7 +31,7 @@ class TransformerEncoder(nn.Module):
use_bias,
attn_dropout,
)
self.ff = FeedForward(input_dim, ff_dropout, activation)
self.ff = FeedForward(input_dim, ff_dropout, ff_factor, activation)
self.attn_addnorm = AddNorm(input_dim, attn_dropout)
self.ff_addnorm = AddNorm(input_dim, ff_dropout)
......@@ -48,6 +49,7 @@ class SaintEncoder(nn.Module):
use_bias: bool,
attn_dropout: float,
ff_dropout: float,
ff_factor: int,
activation: str,
n_feat: int,
):
......@@ -61,7 +63,7 @@ class SaintEncoder(nn.Module):
use_bias,
attn_dropout,
)
self.col_attn_ff = FeedForward(input_dim, ff_dropout, activation)
self.col_attn_ff = FeedForward(input_dim, ff_dropout, ff_factor, activation)
self.col_attn_addnorm = AddNorm(input_dim, attn_dropout)
self.col_attn_ff_addnorm = AddNorm(input_dim, ff_dropout)
......@@ -71,7 +73,12 @@ class SaintEncoder(nn.Module):
use_bias,
attn_dropout,
)
self.row_attn_ff = FeedForward(n_feat * input_dim, ff_dropout, activation)
self.row_attn_ff = FeedForward(
n_feat * input_dim,
ff_dropout,
ff_factor,
activation,
)
self.row_attn_addnorm = AddNorm(n_feat * input_dim, attn_dropout)
self.row_attn_ff_addnorm = AddNorm(n_feat * input_dim, ff_dropout)
......@@ -94,10 +101,10 @@ class FTTransformerEncoder(nn.Module):
use_bias: bool,
attn_dropout: float,
ff_dropout: float,
ff_factor: float,
kv_compression_factor: float,
kv_sharing: bool,
activation: str,
ff_factor: float,
first_block: bool,
):
super(FTTransformerEncoder, self).__init__()
......@@ -113,7 +120,7 @@ class FTTransformerEncoder(nn.Module):
kv_compression_factor,
kv_sharing,
)
self.ff = FeedForward(input_dim, ff_dropout, activation, ff_factor)
self.ff = FeedForward(input_dim, ff_dropout, ff_factor, activation)
self.attn_normadd = NormAdd(input_dim, attn_dropout)
self.ff_normadd = NormAdd(input_dim, ff_dropout)
......@@ -134,6 +141,7 @@ class PerceiverEncoder(nn.Module):
use_bias: bool,
attn_dropout: float,
ff_dropout: float,
ff_factor: int,
activation: str,
query_dim: Optional[int] = None,
):
......@@ -147,7 +155,7 @@ class PerceiverEncoder(nn.Module):
query_dim,
)
attn_dim_out = query_dim if query_dim is not None else input_dim
self.ff = FeedForward(attn_dim_out, ff_dropout, activation)
self.ff = FeedForward(attn_dim_out, ff_dropout, ff_factor, activation)
self.ln_q = nn.LayerNorm(attn_dim_out)
self.ln_kv = nn.LayerNorm(input_dim)
......@@ -171,6 +179,7 @@ class FastFormerEncoder(nn.Module):
use_bias: bool,
attn_dropout: float,
ff_dropout: float,
ff_factor: int,
share_qv_weights: bool,
activation: str,
):
......@@ -184,7 +193,7 @@ class FastFormerEncoder(nn.Module):
share_qv_weights,
)
self.ff = FeedForward(input_dim, ff_dropout, activation)
self.ff = FeedForward(input_dim, ff_dropout, ff_factor, activation)
self.attn_addnorm = AddNorm(input_dim, attn_dropout)
self.ff_addnorm = AddNorm(input_dim, ff_dropout)
......
......@@ -90,13 +90,13 @@ class FTTransformer(BaseTabularModelWithAttention):
Dropout that will be applied to the Linear-Attention layers
ff_dropout: float, default = 0.1
Dropout that will be applied to the FeedForward network
transformer_activation: str, default = "gelu"
Transformer Encoder activation function. _'tanh'_, _'relu'_,
_'leaky_relu'_, _'gelu'_, _'geglu'_ and _'reglu'_ are supported
ff_factor: float, default = 4 / 3
Multiplicative factor applied to the first layer of the FF network in
each Transformer block. This is normally set to 4, but they use 4/3
in the paper.
transformer_activation: str, default = "gelu"
Transformer Encoder activation function. _'tanh'_, _'relu'_,
_'leaky_relu'_, _'gelu'_, _'geglu'_ and _'reglu'_ are supported
mlp_hidden_dims: List, Optional, default = None
MLP hidden dimensions. If not provided no MLP on top of the final
FTTransformer block will be used
......@@ -162,8 +162,8 @@ class FTTransformer(BaseTabularModelWithAttention):
n_blocks: int = 4,
attn_dropout: float = 0.2,
ff_dropout: float = 0.1,
transformer_activation: str = "reglu",
ff_factor: float = 1.33,
transformer_activation: str = "reglu",
mlp_hidden_dims: Optional[List[int]] = None,
mlp_activation: str = "relu",
mlp_dropout: float = 0.1,
......@@ -197,8 +197,8 @@ class FTTransformer(BaseTabularModelWithAttention):
self.n_blocks = n_blocks
self.attn_dropout = attn_dropout
self.ff_dropout = ff_dropout
self.transformer_activation = transformer_activation
self.ff_factor = ff_factor
self.transformer_activation = transformer_activation
self.mlp_hidden_dims = mlp_hidden_dims
self.mlp_activation = mlp_activation
......@@ -226,10 +226,10 @@ class FTTransformer(BaseTabularModelWithAttention):
use_qkv_bias,
attn_dropout,
ff_dropout,
ff_factor,
kv_compression_factor,
kv_sharing,
transformer_activation,
ff_factor,
is_first,
),
)
......
......@@ -80,6 +80,9 @@ class SAINT(BaseTabularModelWithAttention):
row layers
ff_dropout: float, default = 0.1
Dropout that will be applied to the FeedForward network
ff_factor: float, default = 4
Multiplicative factor applied to the first layer of the FF network in
each Transformer block. This is normally set to 4.
transformer_activation: str, default = "gelu"
Transformer Encoder activation function. _'tanh'_, _'relu'_,
_'leaky_relu'_, _'gelu'_, _'geglu'_ and _'reglu'_ are supported
......@@ -146,6 +149,7 @@ class SAINT(BaseTabularModelWithAttention):
n_blocks: int = 2,
attn_dropout: float = 0.1,
ff_dropout: float = 0.2,
ff_factor: int = 4,
transformer_activation: str = "gelu",
mlp_hidden_dims: Optional[List[int]] = None,
mlp_activation: str = "relu",
......@@ -178,6 +182,7 @@ class SAINT(BaseTabularModelWithAttention):
self.n_blocks = n_blocks
self.attn_dropout = attn_dropout
self.ff_dropout = ff_dropout
self.ff_factor = ff_factor
self.transformer_activation = transformer_activation
self.mlp_hidden_dims = mlp_hidden_dims
......@@ -204,6 +209,7 @@ class SAINT(BaseTabularModelWithAttention):
use_qkv_bias,
attn_dropout,
ff_dropout,
ff_factor,
transformer_activation,
self.n_feats,
),
......
......@@ -84,6 +84,9 @@ class TabFastFormer(BaseTabularModelWithAttention):
Dropout that will be applied to the Additive Attention layers
ff_dropout: float, default = 0.1
Dropout that will be applied to the FeedForward network
ff_factor: float, default = 4
Multiplicative factor applied to the first layer of the FF network in
each Transformer block. This is normally set to 4.
share_qv_weights: bool, default = False
Following the paper, this is a boolean indicating if the Value ($V$) and
the Query ($Q$) transformation parameters will be shared.
......@@ -159,6 +162,7 @@ class TabFastFormer(BaseTabularModelWithAttention):
n_blocks: int = 4,
attn_dropout: float = 0.1,
ff_dropout: float = 0.2,
ff_factor: int = 4,
share_qv_weights: bool = False,
share_weights: bool = False,
transformer_activation: str = "relu",
......@@ -193,6 +197,7 @@ class TabFastFormer(BaseTabularModelWithAttention):
self.n_blocks = n_blocks
self.attn_dropout = attn_dropout
self.ff_dropout = ff_dropout
self.ff_factor = ff_factor
self.share_qv_weights = share_qv_weights
self.share_weights = share_weights
self.transformer_activation = transformer_activation
......@@ -218,6 +223,7 @@ class TabFastFormer(BaseTabularModelWithAttention):
use_bias,
attn_dropout,
ff_dropout,
ff_factor,
share_qv_weights,
transformer_activation,
)
......@@ -236,6 +242,7 @@ class TabFastFormer(BaseTabularModelWithAttention):
use_bias,
attn_dropout,
ff_dropout,
ff_factor,
share_qv_weights,
transformer_activation,
),
......
......@@ -108,6 +108,9 @@ class TabPerceiver(BaseTabularModelWithAttention):
Dropout that will be applied to the Multi-Head Attention layers
ff_dropout: float, default = 0.1
Dropout that will be applied to the FeedForward network
ff_factor: float, default = 4
Multiplicative factor applied to the first layer of the FF network in
each Transformer block. This is normally set to 4.
transformer_activation: str, default = "gelu"
Transformer Encoder activation function. _'tanh'_, _'relu'_,
_'leaky_relu'_, _'gelu'_, _'geglu'_ and _'reglu'_ are supported
......@@ -183,6 +186,7 @@ class TabPerceiver(BaseTabularModelWithAttention):
share_weights: bool = False,
attn_dropout: float = 0.1,
ff_dropout: float = 0.1,
ff_factor: int = 4,
transformer_activation: str = "geglu",
mlp_hidden_dims: Optional[List[int]] = None,
mlp_activation: str = "relu",
......@@ -220,6 +224,7 @@ class TabPerceiver(BaseTabularModelWithAttention):
self.share_weights = share_weights
self.attn_dropout = attn_dropout
self.ff_dropout = ff_dropout
self.ff_factor = ff_factor
self.transformer_activation = transformer_activation
self.mlp_hidden_dims = mlp_hidden_dims
......@@ -343,6 +348,7 @@ class TabPerceiver(BaseTabularModelWithAttention):
False, # use_bias
self.attn_dropout,
self.ff_dropout,
self.ff_factor,
self.transformer_activation,
self.latent_dim, # q_dim,
),
......@@ -360,6 +366,7 @@ class TabPerceiver(BaseTabularModelWithAttention):
False, # use_bias
self.attn_dropout,
self.ff_dropout,
self.ff_factor,
self.transformer_activation,
),
)
......
......@@ -86,6 +86,9 @@ class TabTransformer(BaseTabularModelWithAttention):
Dropout that will be applied to the Multi-Head Attention layers
ff_dropout: float, default = 0.1
Dropout that will be applied to the FeedForward network
ff_factor: float, default = 4
Multiplicative factor applied to the first layer of the FF network in
each Transformer block. This is normally set to 4.
transformer_activation: str, default = "gelu"
Transformer Encoder activation function. _'tanh'_, _'relu'_,
_'leaky_relu'_, _'gelu'_, _'geglu'_ and _'reglu'_ are supported
......@@ -153,6 +156,7 @@ class TabTransformer(BaseTabularModelWithAttention):
n_blocks: int = 4,
attn_dropout: float = 0.2,
ff_dropout: float = 0.1,
ff_factor: int = 4,
transformer_activation: str = "gelu",
mlp_hidden_dims: Optional[List[int]] = None,
mlp_activation: str = "relu",
......@@ -186,6 +190,7 @@ class TabTransformer(BaseTabularModelWithAttention):
self.attn_dropout = attn_dropout
self.ff_dropout = ff_dropout
self.transformer_activation = transformer_activation
self.ff_factor = ff_factor
self.mlp_hidden_dims = mlp_hidden_dims
self.mlp_activation = mlp_activation
......@@ -215,6 +220,7 @@ class TabTransformer(BaseTabularModelWithAttention):
use_qkv_bias,
attn_dropout,
ff_dropout,
ff_factor,
transformer_activation,
),
)
......
from pytorch_widedeep.models.text.basic_rnn import BasicRNN
from pytorch_widedeep.models.text.attentive_rnn import AttentiveRNN
from pytorch_widedeep.models.text.basic_transformer import Transformer
from pytorch_widedeep.models.text.stacked_attentive_rnn import (
StackedAttentiveRNN,
)
import math
import torch
from torch import nn
from pytorch_widedeep.wdtypes import Union, Tensor, Optional
from pytorch_widedeep.utils.general_utils import Alias
from pytorch_widedeep.models.tabular.transformers._encoders import (
TransformerEncoder,
)
class Transformer(nn.Module):
r"""Basic Encoder-Only Transformer Model for text classification/regression.
As all other models in the library this model can be used as the
`deeptext` component of a Wide & Deep model or independently by itself.
**NOTE**: This model is introduced in the context of recommendation
systems and intended for sequences of any nature (e.g. items). It can,
of course, still be used for text. However, at this stage, we have
decided not to include the possibility of loading pretrained word
vectors since we aim to integrate the library with Hugging Face in the
(hopefully) near future
Parameters
----------
vocab_size: int
Number of words in the vocabulary
input_dim: int
Dimension of the token embeddings
Param aliases: `embed_dim`, `d_model`. <br/>
seq_length: int
Input sequence length
n_heads: int
Number of attention heads per Transformer block
n_blocks: int
Number of Transformer blocks
attn_dropout: float, default = 0.1
Dropout that will be applied to the Multi-Head Attention layers
ff_dropout: float, default = 0.1
Dropout that will be applied to the FeedForward network
ff_factor: float, default = 4
Multiplicative factor applied to the first layer of the FF network in
each Transformer block. This is normally set to 4.
activation: str, default = "gelu"
Transformer Encoder activation function. _'tanh'_, _'relu'_,
_'leaky_relu'_, _'gelu'_, _'geglu'_ and _'reglu'_ are supported
with_cls_token: bool, default = False
Boolean indicating if a `'[CLS]'` token is included in the tokenized
sequences. If present, the final hidden state corresponding to this
token is used as the aggregated representation for classification and
regression tasks. **NOTE**: if included in the tokenized sequences it
must be inserted as the first token in the sequences.
with_pos_encoding: bool, default = True
Boolean indicating if positional encoding will be used
pos_encoding_dropout: float, default = 0.1
Positional encoding dropout
pos_encoder: nn.Module, Optional, default = None
This model uses by default a standard positional encoding approach.
However, any custom positional encoder can also be used and passed to
the Transformer model via the 'pos_encoder' parameter
Attributes
----------
embedding: nn.Module
Standard token embedding layer
pos_encoder: nn.Module
Positional Encoder
encoder: nn.Module
Sequence of Transformer blocks
"""
@Alias("input_dim", ["embed_dim", "d_model"])
@Alias("seq_length", ["max_length", "maxlen"])
def __init__(
self,
vocab_size: int,
seq_length: int,
input_dim: int,
n_heads: int,
n_blocks: int,
attn_dropout: float = 0.1,
ff_dropout: float = 0.1,
ff_factor: int = 4,
activation: str = "gelu",
with_cls_token: bool = False,
*, # from here on pos encoding args
with_pos_encoding: bool = True,
pos_encoding_dropout: float = 0.1,
pos_encoder: Optional[nn.Module] = None,
):
super().__init__()
self.input_dim = input_dim
self.seq_length = seq_length
self.n_heads = n_heads
self.n_blocks = n_blocks
self.attn_dropout = attn_dropout
self.ff_dropout = ff_dropout
self.ff_factor = ff_factor
self.activation = activation
self.with_cls_token = with_cls_token
self.with_pos_encoding = with_pos_encoding
self.pos_encoding_dropout = pos_encoding_dropout
self.embedding = nn.Embedding(vocab_size, input_dim)
if with_pos_encoding:
if pos_encoder is not None:
self.pos_encoder: Union[
nn.Module, nn.Identity, PositionalEncoding
] = pos_encoder
else:
self.pos_encoder = PositionalEncoding(
input_dim, pos_encoding_dropout, seq_length
)
else:
self.pos_encoder = nn.Identity()
self.encoder = nn.Sequential()
for i in range(n_blocks):
self.encoder.add_module(
"transformer_block" + str(i),
TransformerEncoder(
input_dim,
n_heads,
False, # use_qkv_bias
attn_dropout,
ff_dropout,
ff_factor,
activation,
),
)
def forward(self, X: Tensor) -> Tensor:
x = self.embedding(X)
x = self.pos_encoder(x)
x = self.encoder(x)
if self.with_cls_token:
x = x[:, 0, :]
else:
x = x.flatten(1)
return x
@property
def output_dim(self) -> int:
if self.with_cls_token:
output_dim = self.input_dim
else:
output_dim = self.input_dim * self.seq_length
return output_dim
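# Usage sketch (illustrative, not executed): a batch of 8 padded sequences of
# length 50 over a vocabulary of 1000 tokens
#   model = Transformer(vocab_size=1000, seq_length=50, input_dim=32, n_heads=4, n_blocks=2)
#   out = model(torch.randint(0, 1000, (8, 50)))  # (8, 50 * 32) since with_cls_token=False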
class PositionalEncoding(nn.Module):
"""Positional Encoding copied and pasted directly from [The Beginners'
Tutorial]
(https://pytorch.org/tutorials/beginner/transformer_tutorial.html) at the
PyTorch site. Here it is simply adapted so that the input sequence length
must be specified and, in our implementation, the input tensor dimensions
are arranged as `[batch_size, seq_len, embedding_dim]` instead of
`[seq_len, batch_size, embedding_dim]`, as in the aforementioned
tutorial
Parameters
----------
input_dim: int
Dimension of the token embeddings
dropout: float
Positional encoding dropout
seq_length: int
Input sequence length
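Examples
--------
A minimal shape check; the random input is only an illustration and the
class is assumed to be importable from this module:

>>> import torch
>>> pos_encoder = PositionalEncoding(input_dim=8, dropout=0.1, seq_length=48)
>>> X = torch.randn(32, 48, 8)  # [batch_size, seq_len, embedding_dim]
>>> pos_encoder(X).shape
torch.Size([32, 48, 8])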
"""
def __init__(self, input_dim: int, dropout: float, seq_length: int):
super().__init__()
self.dropout = nn.Dropout(p=dropout)
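# standard sinusoidal encoding: PE(pos, 2i) = sin(pos / 10000^(2i / d)) and
# PE(pos, 2i + 1) = cos(pos / 10000^(2i / d)), computed via exp/log for
# numerical stability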
position = torch.arange(seq_length).unsqueeze(1)
div_term = torch.exp(
torch.arange(0, input_dim, 2) * (-math.log(10000.0) / input_dim)
)
pe = torch.zeros(1, seq_length, input_dim)
pe[0, :, 0::2] = torch.sin(position * div_term)
pe[0, :, 1::2] = torch.cos(position * div_term)
self.register_buffer("pe", pe)
def forward(self, X: Tensor) -> Tensor:
return self.dropout(X + self.pe)
......@@ -9,6 +9,7 @@ from pytorch_widedeep.utils.text_utils import (
pad_sequences,
build_embeddings_matrix,
)
from pytorch_widedeep.utils.general_utils import Alias
from pytorch_widedeep.utils.fastai_transforms import Vocab
from pytorch_widedeep.preprocessing.base_preprocessor import (
BasePreprocessor,
......@@ -34,6 +35,13 @@ class TextPreprocessor(BasePreprocessor):
end of the sequences
pad_idx: int, default = 1
padding index. Fastai's Tokenizer leaves 0 for the 'unknown' token.
already_processed: bool, Optional, default = False
Boolean indicating if the sequence of elements is already processed or
prepared. If this is the case, this Preprocessor will simply tokenize
and pad the sequence.
Param aliases: `not_text`. <br/>
word_vectors_path: str, Optional
Path to the pretrained word vectors
n_cpus: int, Optional, default = None
......@@ -66,6 +74,7 @@ class TextPreprocessor(BasePreprocessor):
array([[ 1, 1, 9, 16, 17, 18, 11, 0, 0, 13]], dtype=int32)
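If the text has already been cleaned or tokenized upstream, the cleaning
step can be skipped (a minimal sketch; the `"text"` column name is just an
illustration):

>>> processed_tp = TextPreprocessor(text_col="text", already_processed=True)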
"""
@Alias("already_processed", "not_text")
def __init__(
self,
text_col: str,
......@@ -74,6 +83,7 @@ class TextPreprocessor(BasePreprocessor):
maxlen: int = 80,
pad_first: bool = True,
pad_idx: int = 1,
already_processed: Optional[bool] = False,
word_vectors_path: Optional[str] = None,
n_cpus: Optional[int] = None,
verbose: int = 1,
......@@ -86,6 +96,7 @@ class TextPreprocessor(BasePreprocessor):
self.maxlen = maxlen
self.pad_first = pad_first
self.pad_idx = pad_idx
self.already_processed = already_processed
self.word_vectors_path = word_vectors_path
self.verbose = verbose
self.n_cpus = n_cpus if n_cpus is not None else os.cpu_count()
......@@ -104,9 +115,12 @@ class TextPreprocessor(BasePreprocessor):
`TextPreprocessor` fitted object
"""
texts = df[self.text_col].tolist()
tokens = get_texts(texts, self.n_cpus)
tokens = get_texts(texts, self.already_processed, self.n_cpus)
self.vocab = Vocab.create(
tokens, max_vocab=self.max_vocab, min_freq=self.min_freq
tokens,
max_vocab=self.max_vocab,
min_freq=self.min_freq,
pad_idx=self.pad_idx,
)
if self.verbose:
print("The vocabulary contains {} tokens".format(len(self.vocab.stoi)))
......@@ -131,7 +145,7 @@ class TextPreprocessor(BasePreprocessor):
"""
check_is_fitted(self, attributes=["vocab"])
texts = df[self.text_col].tolist()
self.tokens = get_texts(texts, self.n_cpus)
self.tokens = get_texts(texts, self.already_processed, self.n_cpus)
sequences = [self.vocab.numericalize(t) for t in self.tokens]
padded_seq = np.array(
[
......
......@@ -1127,6 +1127,7 @@ class Trainer(BaseTrainer):
@staticmethod
def _extract_kwargs(kwargs):
dataloader_params = [
"shuffle",
"sampler",
"batch_sampler",
"num_workers",
......
......@@ -338,6 +338,7 @@ class Tokenizer:
)
# TODO: Fix bug regarding token num 0
class Vocab:
r"""Contains the correspondence between numbers and tokens.
......@@ -390,7 +391,13 @@ class Vocab:
pickle.dump(self.itos, open(path, "wb"))
@classmethod
def create(cls, tokens: Tokens, max_vocab: int, min_freq: int) -> "Vocab":
def create(
cls,
tokens: Tokens,
max_vocab: int,
min_freq: int,
pad_idx: Optional[int] = None,
) -> "Vocab":
r"""Create a vocabulary object from a set of tokens.
Parameters
......@@ -401,9 +408,9 @@ class Vocab:
strings (e.g. list of tokenized sentences)
max_vocab: int
maximum vocabulary size
min_freq: int
minimum frequency that a token has to appear to be part of the
vocabulary
pad_idx: int, Optional, default = None
padding index. If None, Fastai's Tokenizer leaves 0 for the 'unknown'
token and the padding index defaults to 1.
Examples
--------
......@@ -426,12 +433,18 @@ class Vocab:
Vocab
An instance of a `Vocab` object
"""
freq = Counter(p for o in tokens for p in o)
itos = [o for o, c in freq.most_common(max_vocab) if c >= min_freq]
for o in reversed(defaults.text_spec_tok):
if o in itos:
itos.remove(o)
itos.insert(0, o)
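# if a padding index is given, relocate the PAD token so that it sits at
# exactly that position in the vocabulary (itos)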
if pad_idx is not None:
itos.remove(PAD)
itos.insert(pad_idx, PAD)
itos = itos[:max_vocab]
if (
len(itos) < max_vocab
......
......@@ -54,7 +54,11 @@ def simple_preprocess(
return tokens
def get_texts(texts: List[str], n_cpus: Optional[int] = None) -> List[List[str]]:
def get_texts(
texts: List[str],
already_processed: Optional[bool] = False,
n_cpus: Optional[int] = None,
) -> List[List[str]]:
r"""Tokenization using `Fastai`'s `Tokenizer` because it does a
series of very convenient things during the tokenization process
......@@ -64,6 +68,9 @@ def get_texts(texts: List[str], n_cpus: Optional[int] = None) -> List[List[str]]
----------
texts: List
List of str with the texts (or documents). One str per document
already_processed: bool, Optional, default = False
Boolean indicating if the text is already processed and we simply
want to tokenize it
n_cpus: int, Optional, default = None
number of CPUs to use during the tokenization process
......@@ -89,8 +96,11 @@ def get_texts(texts: List[str], n_cpus: Optional[int] = None) -> List[List[str]]
num_cpus = n_cpus if n_cpus is not None else os.cpu_count()
processed_textx = [" ".join(simple_preprocess(t)) for t in texts]
tok = Tokenizer(n_cpus=num_cpus).process_all(processed_textx)
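# run the cleaning step only when the input text has not been processed
# upstream; otherwise pass the strings straight to the tokenizer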
if not already_processed:
processed_texts = [" ".join(simple_preprocess(t)) for t in texts]
else:
processed_texts = texts
tok = Tokenizer(n_cpus=num_cpus).process_all(processed_texts)
return tok
......
__version__ = "1.3.0"
__version__ = "1.3.1"
......@@ -8,6 +8,7 @@ from pytorch_widedeep.datasets import (
load_birds,
load_ecoli,
load_bio_kdd04,
load_movielens100k,
load_womens_ecommerce,
load_california_housing,
)
......@@ -116,3 +117,46 @@ def test_load_california_housing(as_frame):
assert (df.shape, type(df)) == ((20640, 9), pd.DataFrame)
else:
assert (df.shape, type(df)) == ((20640, 9), np.ndarray)
@pytest.mark.parametrize(
"as_frame",
[
(True),
(False),
],
)
def test_load_movielens100k(as_frame):
df_data, df_users, df_items = load_movielens100k(as_frame=as_frame)
if as_frame:
assert (
df_data.shape,
df_users.shape,
df_items.shape,
type(df_data),
type(df_users),
type(df_items),
) == (
(100000, 4),
(943, 5),
(1682, 24),
pd.DataFrame,
pd.DataFrame,
pd.DataFrame,
)
else:
assert (
df_data.shape,
df_users.shape,
df_items.shape,
type(df_data),
type(df_users),
type(df_items),
) == (
(100000, 4),
(943, 5),
(1682, 24),
np.ndarray,
np.ndarray,
np.ndarray,
)
......@@ -2,7 +2,12 @@ import numpy as np
import torch
import pytest
from pytorch_widedeep.models import BasicRNN, AttentiveRNN, StackedAttentiveRNN
from pytorch_widedeep.models import (
BasicRNN,
Transformer,
AttentiveRNN,
StackedAttentiveRNN,
)
padded_sequences = np.random.choice(np.arange(1, 100), (100, 48))
padded_sequences = np.hstack(
......@@ -302,3 +307,80 @@ def test_attn_weights(stacked):
)
else:
assert attn_w.size() == torch.Size([100, 50])
# ###############################################################################
# # Test Basic Transformer
# ###############################################################################
@pytest.mark.parametrize(
"with_cls_token",
[True, False],
)
def test_basic_transformer(with_cls_token):
if with_cls_token:
# if we use a 'CLS' token it must be inserted at the beginning of the
# sequence
_padded_sequences = np.zeros(
(padded_sequences.shape[0], padded_sequences.shape[1] + 1), dtype=int
)
_padded_sequences[:, 0] = padded_sequences.max() + 1
_padded_sequences[:, 1:] = padded_sequences
else:
_padded_sequences = padded_sequences
model = Transformer(
vocab_size=_padded_sequences.max() + 1,
seq_length=_padded_sequences.shape[1],
input_dim=8,
n_heads=2,
n_blocks=2,
with_pos_encoding=False,
with_cls_token=with_cls_token,
)
out = model(torch.from_numpy(_padded_sequences))
res = []
res.append(out.size(0) == _padded_sequences.shape[0])
res.append(out.size(1) == model.output_dim)
assert all(res)
# ###############################################################################
# # Test Custom Positional Encoder
# ###############################################################################
class DummyPositionalEncoding(torch.nn.Module):
def __init__(self, input_dim: int, seq_length: int):
super().__init__()
pe = torch.ones(1, seq_length, input_dim)
self.register_buffer("pe", pe)
def forward(self, X):
return X + self.pe
def test_custom_pos_encoder():
model = Transformer(
vocab_size=padded_sequences.max() + 1,
seq_length=padded_sequences.shape[1],
input_dim=8,
n_heads=2,
n_blocks=2,
pos_encoder=DummyPositionalEncoding(
input_dim=8, seq_length=padded_sequences.shape[1]
),
)
out = model(torch.from_numpy(padded_sequences))
res = []
res.append(out.size(0) == padded_sequences.shape[0])
res.append(out.size(1) == model.output_dim)
assert all(res)
......@@ -17,6 +17,7 @@ from pytorch_widedeep.models import (
BasicRNN,
WideDeep,
TabResnet,
Transformer,
TabTransformer,
)
from pytorch_widedeep.metrics import Accuracy, Precision
......@@ -89,7 +90,16 @@ tabnet = TabNet(
continuous_cols=colnames[5:],
ghost_bn=False,
)
deeptext = BasicRNN(vocab_size=vocab_size, embed_dim=32, padding_idx=0)
basic_rnn = BasicRNN(vocab_size=vocab_size, embed_dim=32, padding_idx=0)
basic_transformer = Transformer(
vocab_size=X_text.max() + 1,
maxlen=X_text.shape[1],
embed_dim=8,
n_heads=2,
n_blocks=2,
)
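# note: `maxlen` and `embed_dim` are aliases for `seq_length` and
# `input_dim` (see the `@Alias` decorators on `Transformer.__init__`)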
deepimage = Vision(pretrained_model_setup="resnet18", n_trainable=0)
###############################################################################
......@@ -209,7 +219,8 @@ def test_basic_run_with_metrics_multiclass():
(None, tabmlp, None, None, None, X_tab, None, None, target),
(None, tabresnet, None, None, None, X_tab, None, None, target),
(None, tabtransformer, None, None, None, X_tab, None, None, target),
(None, None, deeptext, None, None, None, X_text, None, target),
(None, None, basic_rnn, None, None, None, X_text, None, target),
(None, None, basic_transformer, None, None, None, X_text, None, target),
(None, None, None, deepimage, None, None, None, X_img, target),
],
)
......