In [1]:
import os
import tempfile

%matplotlib inline
import matplotlib.pyplot as plt

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

import tensorflow_recommenders as tfrs

# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8-*"
# and later releases dropped the old aliases. Prefer the new name when this
# installation provides it, and fall back to the legacy name otherwise.
plt.style.use(
    'seaborn-v0_8-whitegrid'
    if 'seaborn-v0_8-whitegrid' in plt.style.available
    else 'seaborn-whitegrid'
)

In [2]:
# Load the MovieLens 100K rating log and the movie catalogue.
ratings = tfds.load("movielens/100k-ratings", split="train")
movies = tfds.load("movielens/100k-movies", split="train")

# Keep only the three rating features the models below consume.
ratings = ratings.map(
    lambda example: {
        feature: example[feature]
        for feature in ("movie_title", "user_id", "timestamp")
    }
)
# Reduce every catalogue entry to its title.
movies = movies.map(lambda example: example["movie_title"])

In [3]:
# Collect every rating timestamp into a single NumPy array.
timestamp_batches = ratings.map(lambda rating: rating["timestamp"]).batch(100)
timestamps = np.concatenate(list(timestamp_batches))

max_timestamp = timestamps.max()
min_timestamp = timestamps.min()

# 1000 evenly spaced boundaries spanning the observed timestamp range; used
# to discretise timestamps before embedding them.
timestamp_buckets = np.linspace(min_timestamp, max_timestamp, num=1000)

# Vocabularies for the string-lookup layers.
title_batches = movies.batch(1000)
unique_movie_titles = np.unique(np.concatenate(list(title_batches)))

user_id_batches = ratings.batch(1_000).map(lambda batch: batch["user_id"])
unique_user_ids = np.unique(np.concatenate(list(user_id_batches)))

In [4]:
class UserModel(tf.keras.Model):
  """Turns raw user features into a single feature vector.

  The output concatenates, per example:
    * a 32-dimensional embedding of the user id,
    * a 32-dimensional embedding of the bucketised timestamp,
    * the normalised timestamp as one extra column.

  Relies on the notebook-level `unique_user_ids`, `timestamp_buckets` and
  `timestamps` arrays being defined before instantiation.
  """

  def __init__(self):
    super().__init__()

    # User id -> integer index -> embedding. The "+ 1" leaves room for the
    # out-of-vocabulary index produced by StringLookup.
    self.user_embedding = tf.keras.Sequential([
        tf.keras.layers.experimental.preprocessing.StringLookup(
            vocabulary=unique_user_ids, mask_token=None),
        tf.keras.layers.Embedding(len(unique_user_ids) + 1, 32),
    ])

    # Timestamp -> bucket index -> embedding. N boundaries produce N + 1
    # buckets, hence the "+ 1".
    self.timestamp_embedding = tf.keras.Sequential([
        tf.keras.layers.experimental.preprocessing.Discretization(
            timestamp_buckets.tolist()),
        tf.keras.layers.Embedding(len(timestamp_buckets) + 1, 32),
    ])

    # The timestamp is a scalar feature, so normalise it with one scalar
    # mean/variance (axis=None) instead of the default per-feature axis,
    # which is not meaningful for rank-1 data.
    self.normalized_timestamp = (
        tf.keras.layers.experimental.preprocessing.Normalization(axis=None))
    self.normalized_timestamp.adapt(timestamps)

  def call(self, inputs):
    """Embeds a batch of user features.

    Args:
      inputs: A dict with "user_id" and "timestamp" entries. Assumed to be
        rank-1 batches, as produced by batching the ratings dataset.

    Returns:
      The three feature representations concatenated along axis 1.
    """
    return tf.concat([
        self.user_embedding(inputs["user_id"]),
        self.timestamp_embedding(inputs["timestamp"]),
        # Force an explicit (batch, 1) column so the concat along axis 1 is
        # valid whether the layer returns rank-1 or rank-2 output.
        tf.reshape(self.normalized_timestamp(inputs["timestamp"]), (-1, 1)),
    ], axis=1)

In [5]:
class QueryModel(tf.keras.Model):
  """Query tower: user feature embeddings followed by a stack of dense layers."""

  def __init__(self, layer_sizes):
    """Builds the query tower.

    Args:
      layer_sizes:
        A list of integers where the i-th entry represents the number of units
        the i-th layer contains.
    """
    super().__init__()

    # Feature embeddings come from the user model.
    self.embedding_model = UserModel()

    # Dense stack: every layer uses ReLU except the final one, which is
    # left linear.
    self.dense_layers = tf.keras.Sequential()
    final_index = len(layer_sizes) - 1
    for index, units in enumerate(layer_sizes):
      activation = "relu" if index < final_index else None
      self.dense_layers.add(
          tf.keras.layers.Dense(units, activation=activation))

  def call(self, inputs):
    """Runs `inputs` through the embedding model and then the dense stack."""
    return self.dense_layers(self.embedding_model(inputs))

In [6]:
class MovieModel(tf.keras.Model):
  """Embeds movie titles both as whole-string ids and as averaged word tokens.

  Relies on the notebook-level `unique_movie_titles` array and `movies`
  dataset being defined before instantiation.
  """

  def __init__(self):
    super().__init__()

    preprocessing = tf.keras.layers.experimental.preprocessing
    vocabulary_limit = 10_000

    # Whole-title lookup; the "+ 1" leaves room for the out-of-vocabulary
    # index produced by StringLookup.
    title_lookup = preprocessing.StringLookup(
        vocabulary=unique_movie_titles, mask_token=None)
    title_table = tf.keras.layers.Embedding(len(unique_movie_titles) + 1, 32)
    self.title_embedding = tf.keras.Sequential([title_lookup, title_table])

    # Token-level view of the title: vectorise, embed each token (index 0
    # is masked), then average over the token axis.
    self.title_vectorizer = preprocessing.TextVectorization(
        max_tokens=vocabulary_limit)
    token_table = tf.keras.layers.Embedding(
        vocabulary_limit, 32, mask_zero=True)
    token_pooling = tf.keras.layers.GlobalAveragePooling1D()
    self.title_text_embedding = tf.keras.Sequential([
        self.title_vectorizer,
        token_table,
        token_pooling,
    ])

    # Learn the token vocabulary from the movie titles.
    self.title_vectorizer.adapt(movies)

  def call(self, titles):
    """Returns the id embedding and the text embedding joined along axis 1."""
    id_embedding = self.title_embedding(titles)
    text_embedding = self.title_text_embedding(titles)
    return tf.concat([id_embedding, text_embedding], axis=1)

In [7]:
class CandidateModel(tf.keras.Model):
  """Candidate tower: movie feature embeddings followed by dense layers."""

  def __init__(self, layer_sizes):
    """Builds the candidate tower.

    Args:
      layer_sizes:
        A list of integers where the i-th entry represents the number of units
        the i-th layer contains.
    """
    super().__init__()

    # Feature embeddings come from the movie model.
    self.embedding_model = MovieModel()

    # Pair every layer size with its activation: ReLU for all layers except
    # the last, which has none.
    activations = ["relu"] * (len(layer_sizes) - 1) + [None]

    self.dense_layers = tf.keras.Sequential()
    for units, activation in zip(layer_sizes, activations):
      self.dense_layers.add(
          tf.keras.layers.Dense(units, activation=activation))

  def call(self, inputs):
    """Runs `inputs` through the embedding model and then the dense stack."""
    embedded = self.embedding_model(inputs)
    return self.dense_layers(embedded)

In [8]:
class MovielensModel(tfrs.models.Model):
  """Two-tower retrieval model built from a query tower and a candidate tower."""

  def __init__(self, layer_sizes):
    """Creates both towers with the same dense-layer sizes and the retrieval task.

    Args:
      layer_sizes: List of integers forwarded unchanged to `QueryModel` and
        `CandidateModel`.
    """
    super().__init__()

    self.query_model = QueryModel(layer_sizes)
    self.candidate_model = CandidateModel(layer_sizes)

    # Top-K metrics are computed against the embeddings of every movie.
    candidate_embeddings = movies.batch(128).map(self.candidate_model)
    top_k_metrics = tfrs.metrics.FactorizedTopK(
        candidates=candidate_embeddings)
    self.task = tfrs.tasks.Retrieval(metrics=top_k_metrics)

  def compute_loss(self, features, training=False):
    """Computes the retrieval task's loss for one batch of rating features.

    Args:
      features: Dict containing "user_id", "timestamp" and "movie_title".
      training: Accepted for interface compatibility; not used here.

    Returns:
      The value returned by the retrieval task.
    """
    # Only the user id and timestamp are handed to the query model, so the
    # training inputs have the same keys as the inputs used at query time.
    # A mismatch in input structure would cause an error when loading the
    # query model after saving it.
    query_features = {
        name: features[name] for name in ("user_id", "timestamp")
    }
    query_embeddings = self.query_model(query_features)
    movie_embeddings = self.candidate_model(features["movie_title"])

    return self.task(query_embeddings, movie_embeddings)

In [9]:
# Fix the seed and shuffle once, without reshuffling on later iterations, so
# the train/test split below is the same every time the data is read.
tf.random.set_seed(42)
shuffled = ratings.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

# 80% / 20% split of the 100K ratings.
train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)

# Cache the training examples *before* the per-epoch shuffle: the load, map
# and split pipeline then runs only once, while each epoch still sees the
# examples in a fresh order. Previously `cached_train` was not cached at all.
cached_train = train.cache().shuffle(100_000).batch(2048)
# Evaluation batches never change, so they are cached after batching.
cached_test = test.batch(4096).cache()

In [10]:
# Number of training epochs.
num_epochs = 20

# Shallow baseline: each tower ends in a single 32-unit dense layer.
model = MovielensModel(layer_sizes=[32])
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

one_layer_history = model.fit(x=cached_train, epochs=num_epochs, verbose=2)

Epoch 1/20
Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.


Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.














40/40 - 11s - factorized_top_k/top_1_categorical_accuracy: 0.0089 - factorized_top_k/top_5_categorical_accuracy: 0.0206 - factorized_top_k/top_10_categorical_accuracy: 0.0312 - factorized_top_k/top_50_categorical_accuracy: 0.0935 - factorized_top_k/top_100_categorical_accuracy: 0.1611 - loss: 563.2299 - regularization_loss: 0.0000e+00 - total_loss: 563.2299
Epoch 2/20
40/40 - 10s - factorized_top_k/top_1_categorical_accuracy: 0.0035 - factorized_top_k/top_5_categorical_accuracy: 0.0175 - factorized_top_k/top_10_categorical_accuracy: 0.0333 - factorized_top_k/top_50_categorical_accuracy: 0.1392 - factorized_top_k/top_100_categorical_accuracy: 0.2560 - loss: 556.3237 - regularization_loss: 0.0000e+00 - total_loss: 556.3237
Epoch 3/20
40/40 - 10s - factorized_top_k/top_1_categorical_accuracy: 0.0025 - factorized_top_k/top_5_categorical_accuracy: 0.0195 - factorized_top_k/top_10_categorical_accuracy: 0.0391 - factorized_top_k/top_50_categorical_accuracy: 0.1716 - factorized_top_k/top_100_c

In [11]:

# Score the model trained in the previous cell on the held-out ratings; the
# returned metrics dict is this cell's output.
model.evaluate(x=cached_test, return_dict=True)







{'factorized_top_k/top_1_categorical_accuracy': 0.00039999998989515007,
 'factorized_top_k/top_5_categorical_accuracy': 0.003650000086054206,
 'factorized_top_k/top_10_categorical_accuracy': 0.009999999776482582,
 'factorized_top_k/top_50_categorical_accuracy': 0.10984999686479568,
 'factorized_top_k/top_100_categorical_accuracy': 0.2492000013589859,
 'loss': 29180.42578125,
 'regularization_loss': 0,
 'total_loss': 29180.42578125}

In [12]:
# Deeper variant: a 64-unit ReLU layer followed by the 32-unit output layer.
model = MovielensModel(layer_sizes=[64, 32])
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

two_layer_history = model.fit(x=cached_train, epochs=num_epochs, verbose=2)

Epoch 1/20












40/40 - 11s - factorized_top_k/top_1_categorical_accuracy: 0.0204 - factorized_top_k/top_5_categorical_accuracy: 0.0302 - factorized_top_k/top_10_categorical_accuracy: 0.0393 - factorized_top_k/top_50_categorical_accuracy: 0.0873 - factorized_top_k/top_100_categorical_accuracy: 0.1328 - loss: 597.2026 - regularization_loss: 0.0000e+00 - total_loss: 597.2026
Epoch 2/20
40/40 - 10s - factorized_top_k/top_1_categorical_accuracy: 0.0061 - factorized_top_k/top_5_categorical_accuracy: 0.0166 - factorized_top_k/top_10_categorical_accuracy: 0.0255 - factorized_top_k/top_50_categorical_accuracy: 0.0856 - factorized_top_k/top_100_categorical_accuracy: 0.1573 - loss: 584.0324 - regularization_loss: 0.0000e+00 - total_loss: 584.0324
Epoch 3/20
40/40 - 10s - factorized_top_k/top_1_categorical_accuracy: 0.0026 - factorized_top_k/top_5_categorical_accuracy: 0.0097 - factorized_top_k/top_10_categorical_accuracy: 0.0181 - factorized_top_k/top_50_categorical_accuracy: 0.0809 - factorized_top_k/top_100_c

In [13]:
# Held-out metrics for the model trained in the previous cell.
model.evaluate(x=cached_test, return_dict=True)







{'factorized_top_k/top_1_categorical_accuracy': 0.0003499999875202775,
 'factorized_top_k/top_5_categorical_accuracy': 0.006800000090152025,
 'factorized_top_k/top_10_categorical_accuracy': 0.017249999567866325,
 'factorized_top_k/top_50_categorical_accuracy': 0.11909999698400497,
 'factorized_top_k/top_100_categorical_accuracy': 0.2454500049352646,
 'loss': 28001.8125,
 'regularization_loss': 0,
 'total_loss': 28001.8125}

In [14]:
# Deepest variant: 128- and 64-unit ReLU layers before the 32-unit output layer.
model = MovielensModel(layer_sizes=[128, 64, 32])
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

three_layer_history = model.fit(x=cached_train, epochs=num_epochs, verbose=2)

Epoch 1/20












40/40 - 11s - factorized_top_k/top_1_categorical_accuracy: 0.1176 - factorized_top_k/top_5_categorical_accuracy: 0.1213 - factorized_top_k/top_10_categorical_accuracy: 0.1262 - factorized_top_k/top_50_categorical_accuracy: 0.1358 - factorized_top_k/top_100_categorical_accuracy: 0.1475 - loss: 618.6954 - regularization_loss: 0.0000e+00 - total_loss: 618.6954
Epoch 2/20
40/40 - 10s - factorized_top_k/top_1_categorical_accuracy: 0.0082 - factorized_top_k/top_5_categorical_accuracy: 0.0136 - factorized_top_k/top_10_categorical_accuracy: 0.0188 - factorized_top_k/top_50_categorical_accuracy: 0.0438 - factorized_top_k/top_100_categorical_accuracy: 0.0740 - loss: 603.6727 - regularization_loss: 0.0000e+00 - total_loss: 603.6727
Epoch 3/20
40/40 - 10s - factorized_top_k/top_1_categorical_accuracy: 0.0076 - factorized_top_k/top_5_categorical_accuracy: 0.0189 - factorized_top_k/top_10_categorical_accuracy: 0.0281 - factorized_top_k/top_50_categorical_accuracy: 0.0685 - factorized_top_k/top_100_c

In [15]:
# Held-out metrics for the model trained in the previous cell.
model.evaluate(x=cached_test, return_dict=True)







{'factorized_top_k/top_1_categorical_accuracy': 0.0006500000017695129,
 'factorized_top_k/top_5_categorical_accuracy': 0.006750000175088644,
 'factorized_top_k/top_10_categorical_accuracy': 0.017000000923871994,
 'factorized_top_k/top_50_categorical_accuracy': 0.10904999822378159,
 'factorized_top_k/top_100_categorical_accuracy': 0.22495000064373016,
 'loss': 28020.451171875,
 'regularization_loss': 0,
 'total_loss': 28020.451171875}