Install required packages:
pip install pandas numpy scipy scikit-learn implicit streamlit tqdm matplotlib seaborn
Standard data science libraries for dataframe operations, sparse matrices, and text vectorization, plus the implicit library for ALS (Alternating Least Squares) collaborative filtering.
import pandas as pd
import scipy.sparse as sps
from sklearn.feature_extraction.text import TfidfVectorizer
import implicit
import pickle
from pathlib import Path
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
sns.set_style("whitegrid")
Define where the MovieLens data is stored and create an artifacts/ directory to save the trained model.
data_dir = Path("data/")
artifacts_dir = Path("artifacts")
artifacts_dir.mkdir(exist_ok=True)
Load three datasets from MovieLens-32M:
ratings = pd.read_csv(data_dir / "ml-32m/ratings.csv")
movies = pd.read_csv(data_dir / "ml-32m/movies.csv")
tags = pd.read_csv(data_dir / "ml-32m/tags.csv")
print(f"Loaded {len(ratings)} ratings, {len(movies)} movies, {len(tags)} tags")
Loaded 32000204 ratings, 87585 movies, 2000072 tags
Select the 1000 most-rated movies to reduce training time and focus on popular items with more user interactions.
top_movies = (
ratings["movieId"].value_counts().head(1000).index.tolist()
) # list of movieIds
print(f"Selected top {len(top_movies)} movies")
Selected top 1000 movies
Keep only data related to the top 1000 movies, then downsample the ratings to 50% for faster training. This cuts the dataset from 32 million ratings to roughly 9.4 million while preserving the collaborative signal.
movies = movies[movies["movieId"].isin(top_movies)]
ratings = ratings[ratings["movieId"].isin(top_movies)]
ratings = ratings.sample(frac=0.5, random_state=42)
tags = tags[tags["movieId"].isin(top_movies)]
print(
f"After filtering: {len(ratings)} ratings, {len(movies)} movies, {len(tags)} tags"
)
After filtering: 9439196 ratings, 1000 movies, 704287 tags
Map raw MovieLens movieId to contiguous indices (0..n-1) and back. Only include movies that remain after filtering/downsampling so indices align with the ALS input.
movie_ids_in_ratings = sorted(ratings["movieId"].unique().tolist())
movie_2_index = {mid: idx for idx, mid in enumerate(movie_ids_in_ratings)}
index_2_movie = {idx: mid for mid, idx in movie_2_index.items()}
print(f"DEBUG: movie_ids_in_ratings has {len(movie_ids_in_ratings)} movies")
DEBUG: movie_ids_in_ratings has 1000 movies
Similarly, create dense user indices for the collaborative filtering model.
user_ids_in_ratings = sorted(ratings["userId"].unique().tolist())
user_2_index = {uid: idx for idx, uid in enumerate(user_ids_in_ratings)}
index_2_user = {idx: uid for uid, idx in user_2_index.items()}
print(f"DEBUG: user_ids_in_ratings has {len(user_ids_in_ratings)} users")
DEBUG: user_ids_in_ratings has 200758 users
Construct a sparse user-rating matrix (users × items) in CSR format. This is the core input for the ALS algorithm. Each cell contains the rating value, and sparse storage keeps memory usage low since most user-movie pairs are not rated.
rows = ratings["userId"].map(user_2_index)
cols = ratings["movieId"].map(movie_2_index)
data = ratings["rating"].astype(float)
user_rating_matrix = sps.csr_matrix(
(data, (rows, cols)), shape=(len(user_2_index), len(movie_2_index))
)
print(f"DEBUG: user_rating_matrix shape is {user_rating_matrix.shape}")
print(
f"DEBUG: user_rating_matrix has {user_rating_matrix.nnz} non-zero entries"
)
print(
    f"DEBUG: Sparsity: {(1 - user_rating_matrix.nnz / (user_rating_matrix.shape[0] * user_rating_matrix.shape[1])) * 100:.2f}%"
)
DEBUG: user_rating_matrix shape is (200758, 1000) DEBUG: user_rating_matrix has 9439196 non-zero entries DEBUG: Sparsity: 95.30%
Visualize the distribution of ratings to understand user behavior.
fig, axes = plt.subplots(1, 2, figsize=(14, 4))
axes[0].hist(
ratings["rating"], bins=20, edgecolor="black", alpha=0.7, color="steelblue"
)
axes[0].set_xlabel("Rating")
axes[0].set_ylabel("Frequency")
axes[0].set_title("Distribution of User Ratings")
axes[0].grid(alpha=0.3)
ratings_per_movie = ratings["movieId"].value_counts()
axes[1].hist(
ratings_per_movie, bins=30, edgecolor="black", alpha=0.7, color="coral"
)
axes[1].set_xlabel("Number of Ratings")
axes[1].set_ylabel("Frequency")
axes[1].set_title("Distribution of Ratings per Movie")
axes[1].grid(alpha=0.3)
plt.tight_layout()
plt.show()
Aggregate user-generated tags by movie and combine them with title and genres into a single "content" field. This text will be used for TF-IDF vectorization to compute content-based similarity.
movie_tags = (
tags.groupby("movieId")["tag"]
.apply(lambda ts: " ".join(ts.astype(str)))
.reset_index()
)
movies = movies.merge(movie_tags, how="left", on="movieId")
movies["tag"] = movies["tag"].fillna("")
movies["genres_clean"] = (
movies["genres"].fillna("").str.replace("|", " ", regex=False)
)
movies["content"] = (
movies["title"].fillna("")
+ " "
+ movies["genres_clean"]
+ " "
+ movies["tag"]
)
print(f"Sample content field:\n{movies['content'].iloc[0]}\n")
Sample content field: Toy Story (1995) Adventure Animation Children Comedy Fantasy children Disney animation children Disney Disney pixar animation funny Pixar Pixar Tumey's To See Again Tumey's VHS adventure classic friendship funny animation computer animation pixar toys adventure computer animation Pixar adventure animated animation clever comedy computer animation family fantasy Tom Hanks pixar Tom Hanks animation Pixar 3D animated children comedy computer animation Disney family humorous Pixar time travel Tom Hanks accepting reality emotional friendship funny soundtrack children computer animation Disney family Pixar computer animation pixar ad for toys children clever forced puns internal logic fails mixed soundtrack reflection rivalry technological marvel Pixar kids and family funny friendship Animation Cartoon Pixar adventure animated animation buddy movie comedy computer animation Disney friendship funny humorous Pixar Tom Hanks witty animation Pixar animated animation children classic clever comedy computer animation Disney friendship funny humorous pixar toys witty adventure animation buddy movie classic clever comedy friendship fun funny humorous kids Pixar toys unusual friendship very good want to see again animation family sci-fi computer animation Disney fantasy Pixar toys witty ya boy Pixar almost favorite children computer animation family humorous time travel Tom Hanks witty Tim Allen Tom Hanks animation children toys Cartoon Disney Pixar animation clever friendship funny humorous pixar witty adventure children family funny Engaging animation family good time buddy movie Tom Hanks witty animated animation buddy movie children clever time travel witty animation disney animation pixar bright DARING RESCUES fanciful HEROIC MISSION humorous light rousing TOYS COME TO LIFE UNLIKELY FRIENDSHIPS warm witty animation children computer animation Disney imdb top 250 John Lasseter Pixar Tom Hanks animation Pixar Tim Allen time travel adventure classic clever comedy Disney family friendship funny humorous imdb top 250 Pixar Tom Hanks witty pixar Tom Hanks Pixar witty toys classic Disney friendship pixar friendship funny pixar Tom Hanks adventure clever enemies become friends family Family cartoon family relationships feel-good first of series friends friendship fun fun family movie HEROIC MISSION humorous jealousy kids kids movie loyal friend loyalty redemption reflection rescue mission rivalry selflessness teamwork unlikely friendships unusual friendship witty American Animation 阮一鸣 Disney family Family cartoon Pixar Pixar animation pixar Watched Pixar time travel cute funny story voice acting witty Disney funny Pixar time travel funny Pixar animation friendship witty animation Disney Pixar buddy computer animation directorial debut low fantasy National Film Registry Oscar (Special Achievement) toys action disney kids cgi computer animation Pixar witty adventure clever funny Tom Hanks witty computer animation Pixar adventure friendship kids light hearted whimsica animation clever comedy funny humorous Pixar witty classic clever funny Tom Hanks witty animation cgi animation comedy Disney Pixar pixar fun clever comedy Disney family friendship funny humorous Pixar Tom Hanks witty adventure animation classic clever computer animation Disney kids Pixar redemption Tom Hanks unlikely friendships unusual friendship children cartoon feel-good funny animated buddy movie Cartoon cgi comedy computer animation family friendship kids toy toys Pixar children family Pixar Tom Hanks toys witty 
animation clever Disney pixar animation Pixar Disney favorite Pixar animation animation Tom Hanks friendship pixar toys Pixar Disney children classic Disney fun imdb top 250 kids animation children Disney Pixar animation friendship toys adventure Animation Comedy computer animation family funny Pixar Tom Hanks Disney Pixar adventure classic funny Pixar toys toy toys animation humorous Pixar time travel animation Disney pixar animation pixar Owned computer animation family joss whedon children comedy funny witty animation Disney animated animation children comedy fantasy funny humorous Pixar time travel Pixar animation Disney friendship imdb top 250 Pixar witty 1990s 3 dimensional 3d action figure affection alien animated dog animated fictional tv commercial antenna anthropomorphic toy anthropomorphism apology arcade arcade game arm asking directions astronaut audio flashback baby baby monitor backfire backyard bad guy ballet flats banister battery battleship game bed bedroom binoculars biohazard sign birthday birthday party birthday present blinds blockbuster bo peep character bouncy ball boy boy next door brat brawl brother sister relationship bull terrier bully bully comeuppance bullying burn buzz lightyear character calling someone an idiot candy land the board game car car crash card game cartoon dog catchphrase cgi animation character's point of view camera shot chased by a dog chasing a truck child child antagonist child destroys another's toy child destroys own toy child villain child's bedroom child's birthday child's birthday party christmas christmas day christmas present chrysler automobile chrysler lebaron convertible claw crane clock comeuppance comic hero commercial computer animation computer generated imagery confrontation corkboard cowboy cowboy boot cowboy doll crate cult favorite cult film dachshund dart dartboard dead battery delivery dental braces depression desk dinosaur directorial debut Disney dog dog as gift doll doorbell double prize doubt elementary school encouragement enemies become friends envy escape escape attempt etch a sketch explosion face mask falling from a window falling from height falling out a window false accusation family relationships famous line famous score famous song favoritism fear fight fireplace first of series first of trilogy first part fishing rod fistfight flashlight flying friend friends friendship frustration game of life board game gas station gift girl gliding globe gratitude guilty conscience hasbro hat heartfelt hero hockey puck home house jack in the box jealousy jumping from height jumping through a sunroof karate karate chop laughter learning a lesson leggings lifting a female into the air lifting someone into the air lipstick on face little boy little girl locked in looking out a window love interest loyalty magic 8 ball magnifying glass male antagonist male protagonist male villain martial arts match mattel medical mask meeting microphone mission misunderstanding mockery mother son relationship mouth moving moving van mr potato head mr. 
potato head character mutant national film registry neighbor neighborhood nesting egg new home new toy ohio operation game original story package parachute part of trilogy pet as gift pet dog piggy bank pixar pizza van pizzeria plastic army men playskool pliers porcelain poster product placement pterodactyl pull string doll push button radio controlled rag doll rain ramp recliner chair reconnaissance mission redemption reference to marie antoinette reference to mattel reflection rejection remorse remote controlled toy car rescue rescue mission resourcefulness ringing a doorbell rival rivalry rocket running scene before opening credits scene during opening credits scheming seatbelt self awareness selflessness shark shed shepherdess sheriff shivering showdown signature single mother skateboard sleeping dog sliding down a banister slinky dog slow motion scene soldier space explorer space ranger spaceman toy spacesuit squeeze toy staircase storage shed subjective camera suburb sunlight sunroof surgical mask surprise ending surrounded talking in one's sleep talking to a toy talking toy teamwork television commercial telling someone to shut up threat thunder thunderstorm time lapse photography title spoken by character toolbox torture tough guy toy toy animal toy comes to life toy dinosaur toy dog toy fire truck toy robot toy soldier toy story toy tea set toybox toyota toyota truck troll doll troubled production turbo boost twister the game two word title tyrannosaurus rex urban setting utah teapot van ventilation shaft view through binoculars villain violence visual pun walkie talkie wallpaper water weightlifting wilhelm scream window woody character yelling adventure buddy movie computer animation Pixar Tom Hanks animation pixar Tim Allen Tom Hanks comedy funny Tom Hanks classic clever witty adventure animation computer animation humorous Pixar witty animation classic comedy computer animation Disney funny humorous Pixar time travel Tom Hanks witty Disney Pixar computer animation funny humorous Pixar Tom Hanks witty children jealousy family film friendship toys hero's journey funny computer animation good cartoon chindren pixar Pixar witty animation fun animation reflection first cgi film Pixar animation computer animation friendship funny witty great movie Disney funny witty clever classic pixar CGI classic disney pixar animation cartoon friendship pixar unny funny Pixar witty adventure animated animation children clever computer animation family friendship funny humorous toys witty Pixar toys 2009 reissue in Stereoscopic 3-D 55 movies every kid should see--Entertainment Weekly BD-Video CLV DVD-Video animation imdb top 250 pixar pixar animation cgi Disney family toys animation kids movie animation Cartoon dolls National Film Registry Disney friendship nostalgic computer animation funny Pixar rated-G Pixar Tom Hanks computer animation Pixar Disney Pixar Tim Allen Tom Hanks https://movielens.org/tag/:animation adventure animation children comedy classic comedy fun funny humorous Animation clever comedy funny Pixar Tom Hanks animated fun family movie pixar adventure 3D children animation Pixar toys adventure animation comedy family fantasy John Lasseter USA animation Pixar animated animation buddy movie computer animation funny Pixar Tom Hanks animation Disney funny pixar adventure animation Disney funny pixar animated animation comedy Disney Pixar Pixar animation family Tom Hanks animation classic clever comedy computer animation Disney friendship funny pixar witty childish clever pixar pixar 
buddy movie cartoon children clever family friendship kids Tom Hanks witty classic Tom Hanks adventure animation buddy movie clever comedy cowboy dinosaur dolls friendship funny pixar Tim Allen Tom Hanks toys UNLIKELY FRIENDSHIPS witty jealousy Tom Hanks buddy movie friendship toys classic pixar witty classic funny Pixar CGI classic Tom Hanks children computer animation family funny Pixar Tom Hanks toys animation Disney Pixar toys innovative animation Disney family funny avi buy animation Disney pixar toys family friendship pixar Pixar Pixar Tom Hanks action figure action figures Buzz Lightyear CG animation toy toys Woody Pixar computer animation Disney humorous Pixar Pixar pixar tim allen tom hanks 3D Disney family Pixar Tom Hanks Pixar children children Pixar Tom Hanks cgi unlikely friendships adventure adventure animated animation computer animation Disney funny pixar Tom Hanks toys adventure animation Disney funny pixar adventure animated animation cgi comedy Disney family fantasy friendship imdb top 250 Pixar Tom Hanks witty itaege toys adventure clever friendship humorous Pixar witty adventure children classic computer animation Disney funny Pixar Tim Allen Tom Hanks animated comedy Disney fun funny pixar adventure animated animation buddy movie children classic clever comedy computer animation Disney family fantasy funny humorous imdb top 250 Pixar time travel Tom Hanks toys witty cgi children toys humorous pixar classic friendship Pixar Animation Pixar animation Disney Pixar animation Pixar animation children computer animation kids pixar clever computer animation Disney Tom Hanks clever funny Pixar witty soothing Pixar Tom Hanks animation children comedy Disney friendship funny pixar Tom Hanks buddy movie computer animation friendship Tom Hanks clever witty animation Cartoon children computer animation kids pixar Tom Hanks Pixar adventure animated animation classic Disney fantasy Pixar Tom Hanks toys adventure pixar villian hurts toys Pixar adventure animated animation Cartoon Disney family friendship imdb top 250 pixar toy toys animation comedy funny fantasy pixar Pixar imdb top 250
Filter movies to only those that survived rating downsampling, then reindex them to match the dense item indices (0 to n_items-1). This ensures the movie metadata aligns perfectly with the collaborative filtering model indices.
movies = movies[movies["movieId"].isin(movie_ids_in_ratings)]
print(f"DEBUG: movies dataframe has {len(movies)} movies after filtering")
movies_indexed = (
movies.set_index("movieId").reindex(movie_ids_in_ratings).reset_index()
)
for col in ["title", "genres", "content"]:
if col in movies_indexed.columns:
movies_indexed[col] = movies_indexed[col].fillna("")
print(f"DEBUG: movies_indexed has {len(movies_indexed)} movies")
DEBUG: movies dataframe has 1000 movies after filtering DEBUG: movies_indexed has 1000 movies
TF-IDF (Term Frequency-Inverse Document Frequency) is a statistical measure used to evaluate the importance of a word in a 'document' relative to a collection of documents (corpus). It helps in identifying the most relevant terms in a document while down-weighting common terms.
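As a rough sketch of the weighting (with scikit-learn's defaults: smoothed IDF and L2-normalized rows), the score of term t in document d over a corpus of n documents is:

$$
\text{tf-idf}(t, d) = \text{tf}(t, d) \cdot \left( \ln\frac{1 + n}{1 + \text{df}(t)} + 1 \right)
$$

where tf(t, d) is the raw count of t in d and df(t) is the number of documents containing t; each document vector is then scaled to unit L2 norm.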
Now, we convert movie content (title + genres + tags) into TF-IDF vectors. This represents each movie as a sparse vector in a 5000-dimensional feature space, capturing term importance while removing common stop words.
tfidf = TfidfVectorizer(max_features=5000, stop_words="english")
item_content = tfidf.fit_transform(movies_indexed["content"])
print(f"TF-IDF matrix shape: {item_content.shape}")
print(f"Number of features: {len(tfidf.get_feature_names_out())}")
TF-IDF matrix shape: (1000, 5000) Number of features: 5000
Quick check to see which words the TF‑IDF vectorizer thinks are most important for Star Wars films.
The cell finds movies with "Star Wars" in the title and prints their titles and genres.
It pulls the TF‑IDF rows for those movies and converts them to dense arrays (safe because it is only a few rows).
For each movie it picks the top_n highest-scoring terms, takes the union of those term indices, and filters the TF-IDF matrix down to just those columns.
It then draws a heatmap where each row is a Star Wars movie and each column is a picked term, with TF‑IDF scores annotated.
Interesting to see that "jar" and "binks" (as in Jar Jar Binks) are picked up as important for "The Phantom Menace"!
star_wars_mask = movies_indexed["title"].str.contains(
"Star Wars", case=False, na=False
)
star_wars_movies = movies_indexed[star_wars_mask]
star_wars_indices = star_wars_movies.index.tolist()
star_wars_tfidf = item_content[star_wars_indices].toarray()
feature_names = tfidf.get_feature_names_out()
top_n = 5
all_top_terms = set()
for row in star_wars_tfidf:
top_indices = np.argsort(row)[-top_n:]
all_top_terms.update(top_indices)
all_top_terms = sorted(list(all_top_terms))
star_wars_tfidf_filtered = star_wars_tfidf[:, all_top_terms]
filtered_feature_names = feature_names[all_top_terms]
fig, ax = plt.subplots(figsize=(15, 6))
sns.heatmap(
star_wars_tfidf_filtered,
xticklabels=filtered_feature_names,
yticklabels=[movies_indexed.loc[i, "title"] for i in star_wars_indices],
cmap="YlOrRd",
annot=True,
fmt=".2f",
cbar_kws={"label": "TF-IDF Score"},
ax=ax,
vmin=0,
)
plt.title("TF-IDF Heatmap: Star Wars Movies (Top Terms Only)")
plt.xlabel("Terms")
plt.ylabel("Movies")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()
Now, we train an implicit ALS model on a confidence-weighted rating matrix. Multiplying the explicit ratings by alpha=40 converts them into confidence scores, treating them as implicit feedback. The model learns 64-dimensional latent factors for both users and items, with regularization to limit overfitting.
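For reference, the implicit-feedback ALS formulation (Hu, Koren, and Volinsky), which the implicit library implements, minimizes over user factors x_u and item factors y_i:

$$
\min_{x, y} \sum_{u, i} c_{ui}\left(p_{ui} - x_u^{\top} y_i\right)^2 + \lambda \left(\sum_u \lVert x_u \rVert^2 + \sum_i \lVert y_i \rVert^2\right)
$$

where p_ui = 1 for observed interactions and 0 otherwise, c_ui is the confidence derived from the scaled rating, and λ is the regularization strength.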
alpha = 40.0
conf = (user_rating_matrix * alpha).astype("double")
model = implicit.als.AlternatingLeastSquares(
factors=64, regularization=0.1, iterations=20
)
model.fit(conf)
100%|██████████| 20/20 [00:20<00:00, 1.02s/it]
Extract the learned latent factor matrices from the trained model. These dense vectors represent movies and users in the learned latent space and are used for collaborative filtering recommendations.
item_factors = model.item_factors # shape: n_items x k
user_factors = model.user_factors # shape: n_users x k
print(f"DEBUG: item_factors shape is {item_factors.shape}")
print(f"DEBUG: user_factors shape is {user_factors.shape}")
DEBUG: item_factors shape is (1000, 64) DEBUG: user_factors shape is (200758, 64)
Visualize the distribution of the learned latent factor magnitudes for items and users.
fig, axes = plt.subplots(1, 2, figsize=(14, 4))
item_magnitudes = np.linalg.norm(item_factors, axis=1)
axes[0].hist(
item_magnitudes, bins=30, edgecolor="black", alpha=0.7, color="steelblue"
)
axes[0].set_xlabel("L2 Norm of Item Factors")
axes[0].set_ylabel("Frequency")
axes[0].set_title("Distribution of Item Factor Magnitudes")
axes[0].grid(alpha=0.3)
user_magnitudes = np.linalg.norm(user_factors, axis=1)
axes[1].hist(
user_magnitudes, bins=30, edgecolor="black", alpha=0.7, color="coral"
)
axes[1].set_xlabel("L2 Norm of User Factors")
axes[1].set_ylabel("Frequency")
axes[1].set_title("Distribution of User Factor Magnitudes")
axes[1].grid(alpha=0.3)
plt.tight_layout()
plt.show()
Compute a sparse item-item similarity matrix using cosine similarity of TF-IDF vectors. This captures which movies are similar based on their textual features (title, genres, tags).
item_content = tfidf.transform(movies_indexed["content"]) # already aligned
cosine_sim = cosine_similarity(
item_content, item_content, dense_output=False
) # shape: n_items x n_items
print(f"DEBUG: cosine_sim shape is {cosine_sim.shape}")
DEBUG: cosine_sim shape is (1000, 1000)
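As a quick sanity check of the content similarity matrix, we can look up a movie's nearest neighbours by cosine score. This is a minimal sketch that reuses cosine_sim and movies_indexed from the cells above; the helper name and the query title are purely illustrative.
def similar_by_content(title_query, top_n=5):
    # Return the top_n movies whose TF-IDF content is closest to the first title match
    matches = movies_indexed.index[
        movies_indexed["title"].str.contains(title_query, case=False, na=False)
    ]
    if len(matches) == 0:
        return pd.DataFrame(columns=["title", "score"])
    idx = matches[0]
    sims = np.asarray(cosine_sim[idx].toarray()).ravel()
    order = [i for i in np.argsort(-sims) if i != idx][:top_n]
    return pd.DataFrame(
        {"title": movies_indexed.loc[order, "title"].values, "score": sims[order]}
    )
print(similar_by_content("Toy Story"))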
n_display = min(30, item_factors.shape[0])
blend_weight = 0.5  # weight on the collaborative signal (separate from the ALS confidence alpha)
cf_sim = cosine_similarity(item_factors)  # dense n_items x n_items
cosine_sim_dense = (
    cosine_sim.toarray()
    if hasattr(cosine_sim, "toarray")
    else np.asarray(cosine_sim)
)
def normalize_mat(m):
    # Min-max scale to [0, 1] so the two similarity sources are on a comparable scale
    mn, mx = m.min(), m.max()
    return (m - mn) / (mx - mn) if mx > mn else np.zeros_like(m)
cf_norm = normalize_mat(cf_sim)
content_norm = normalize_mat(cosine_sim_dense)
hybrid_sim = blend_weight * cf_norm + (1.0 - blend_weight) * content_norm
# Ensure all "star wars" movies are included
titles_all = movies_indexed["title"].fillna("").astype(str).tolist()
star_wars_indices = [
i for i, title in enumerate(titles_all) if "star wars" in title.lower()
]
if len(star_wars_indices) >= n_display:
indices = np.random.choice(
star_wars_indices, size=n_display, replace=False
)
else:
remaining_pool = np.setdiff1d(
np.arange(item_factors.shape[0]), star_wars_indices
)
n_remaining = n_display - len(star_wars_indices)
random_indices = np.random.choice(
remaining_pool, size=n_remaining, replace=False
)
indices = np.concatenate([star_wars_indices, random_indices])
hybrid_sub = hybrid_sim[np.ix_(indices, indices)]
labels = [titles_all[i] if titles_all[i] != "" else str(i) for i in indices]
g = sns.clustermap(
hybrid_sub,
method="weighted", # options: single, complete, average, weighted, centroid, median, ward
metric="cosine",
cmap="turbo",
figsize=(12, 12),
xticklabels=labels,
yticklabels=labels,
cbar_kws={"label": "Hybrid similarity"},
)
plt.setp(g.ax_heatmap.get_xticklabels(), rotation=90, fontsize=8)
plt.setp(g.ax_heatmap.get_yticklabels(), rotation=0, fontsize=8)
g.cax.set_position([0.95, 0.2, 0.03, 0.6])
plt.show()
Interesting to see that we have two groups of Star Wars movies clustering together, one for the original trilogy and one for the prequels, indicating that both collaborative filtering and content-based features recognize their similarity.
Verify that all components have matching dimensions. This ensures consistency across collaborative factors, content similarity, movie metadata, and ID mappings. A mismatch would cause errors in the Streamlit app.
n_cf = item_factors.shape[0]
n_content = cosine_sim.shape[0]
n_movies = len(movies_indexed)
n_map = len(index_2_movie)
print(f"\n=== FINAL DIMENSIONS ===")
print(f"item_factors: {n_cf}")
print(f"cosine_sim: {n_content}")
print(f"movies_indexed: {n_movies}")
print(f"i2m: {n_map}")
if n_cf == n_content == n_movies == n_map:
print(f"Sanity check passed: all have {n_cf} items")
else:
print(f"\nMISMATCH")
    print(
        f" item_factors is derived from user_rating_matrix, which has {user_rating_matrix.shape[1]} items"
    )
    print(f" But we're trying to match {len(movie_ids_in_ratings)} movies")
raise ValueError(
f"Dimension mismatch! item_factors={n_cf}, cosine_sim={n_content}, "
f"movies_indexed={n_movies}, i2m={n_map}. All should be equal!"
)
=== FINAL DIMENSIONS === item_factors: 1000 cosine_sim: 1000 movies_indexed: 1000 i2m: 1000 Sanity check passed: all have 1000 items
Evaluate the quality of the trained model using hit rate and NDCG metrics on a held-out test set.
np.random.seed(42)
test_fraction = 0.2
test_indices = np.random.choice(
user_rating_matrix.nnz,
size=int(user_rating_matrix.nnz * test_fraction),
replace=False,
)
test_data = np.zeros(user_rating_matrix.nnz)
train_data = user_rating_matrix.data.copy()
for idx in test_indices:
test_data[idx] = train_data[idx]
train_data[idx] = 0.0
user_rating_matrix_train = sps.csr_matrix(
(train_data, user_rating_matrix.indices, user_rating_matrix.indptr),
shape=user_rating_matrix.shape,
)
user_rating_matrix_test = sps.csr_matrix(
(test_data, user_rating_matrix.indices, user_rating_matrix.indptr),
shape=user_rating_matrix.shape,
)
print(f"Train set: {user_rating_matrix_train.nnz} ratings")
print(f"Test set: {user_rating_matrix_test.nnz} ratings")
Train set: 9439196 ratings Test set: 9439196 ratings
For each test user, generate top-k recommendations and compare against held-out test ratings.
The hit rate is the fraction of test users for whom at least one held-out item appears in the top-k recommendations. For a single user it is 1.0 if any test item is present in the top-k list and 0.0 otherwise; the reported Hit Rate@k is the average across users.
NDCG (Normalized Discounted Cumulative Gain) measures ranking quality by giving more weight to relevant items that appear earlier in the recommendation list: the closer the held-out items sit to the top of the list, the higher the NDCG.
Both metrics are averaged across users to produce the final Hit Rate@k and NDCG@k.
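Concretely, matching the computation in the cell below: a user's hit indicator is 1 if the top-k list intersects the held-out items and 0 otherwise, and with rel_i = 1 when the item at rank i is a held-out item,

$$
\mathrm{DCG@}k = \sum_{i=1}^{k} \frac{\mathrm{rel}_i}{\log_2(i + 1)},
\qquad
\mathrm{NDCG@}k = \frac{\mathrm{DCG@}k}{\mathrm{IDCG@}k},
$$

where IDCG@k is the DCG of an ideal ranking that places all held-out items at the top (capped at k).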
def compute_hit_rate_and_ndcg(
model_item_factors, user_rating_matrix_train, user_rating_matrix_test, k=10
):
"""
Compute hit rate and NDCG for top-k recommendations.
"""
hit_rates = []
ndcgs = []
test_users = np.where(user_rating_matrix_test.getnnz(axis=1) > 0)[
0
] # users with test ratings
for user_idx in tqdm(
test_users[: min(100, len(test_users))], desc="Evaluating"
):
user_trained_items = user_rating_matrix_train[user_idx].nonzero()[1]
if len(user_trained_items) == 0:
continue
user_vec = model_item_factors[user_trained_items].mean(axis=0)
scores = model_item_factors.dot(user_vec)
train_items = set(user_rating_matrix_train[user_idx].nonzero()[1])
ranked = sorted(
(
(i, scores[i])
for i in range(len(scores))
if i not in train_items
),
key=lambda x: -x[1],
)[:k]
top_k_items = [i for i, _ in ranked]
test_items = set(user_rating_matrix_test[user_idx].nonzero()[1])
# any test item in top-k?
hits = len(set(top_k_items) & test_items)
hit_rate = 1.0 if hits > 0 else 0.0
hit_rates.append(hit_rate)
# NDCG
dcg = sum(
[
1.0 / np.log2(i + 2)
for i, item in enumerate(top_k_items)
if item in test_items
]
)
idcg = sum(
[1.0 / np.log2(i + 2) for i in range(min(len(test_items), k))]
)
ndcg = dcg / idcg if idcg > 0 else 0.0
ndcgs.append(ndcg)
return np.mean(hit_rates), np.mean(ndcgs)
hit_rate, ndcg = compute_hit_rate_and_ndcg(
item_factors, user_rating_matrix_train, user_rating_matrix_test, k=10
)
print(f"\n=== EVALUATION METRICS (k=10) ===")
print(f"Hit Rate@10: {hit_rate:.4f}")
print(f"NDCG@10: {ndcg:.4f}")
Evaluating: 100%|██████████| 100/100 [00:00<00:00, 1012.24it/s]
=== EVALUATION METRICS (k=10) === Hit Rate@10: 0.3400 NDCG@10: 0.0738
Next, we assess how the recommender performs across different values of k (the number of top recommendations per user) by plotting Hit Rate@k and NDCG@k with per-user standard errors.
compute_hit_rate_and_ndcg(
item_factors, user_rating_matrix_train, user_rating_matrix_test, k=10
)
Evaluating: 100%|██████████| 100/100 [00:00<00:00, 1035.48it/s]
(np.float64(0.34), np.float64(0.07378929909770875))
def per_user_metrics(item_factors, URM_train, URM_test, k=10, max_users=500):
test_users = np.where(URM_test.getnnz(axis=1) > 0)[0]
if len(test_users) == 0:
return np.array([]), np.array([])
test_users = test_users[: min(max_users, len(test_users))]
hits_list = []
ndcgs = []
for u in test_users:
train_items = URM_train[u].nonzero()[1]
test_items = set(URM_test[u].nonzero()[1])
if len(train_items) == 0 or len(test_items) == 0:
continue
user_vec = item_factors[train_items].mean(axis=0)
scores = item_factors.dot(user_vec)
scores[list(train_items)] = -np.inf
top_k = np.argsort(-scores)[:k]
hit = 1.0 if any(i in test_items for i in top_k) else 0.0
hits_list.append(hit)
dcg = sum(
1.0 / np.log2(idx + 2)
for idx, item in enumerate(top_k)
if item in test_items
)
idcg = sum(
1.0 / np.log2(i + 2) for i in range(min(len(test_items), k))
)
ndcg = dcg / idcg if idcg > 0 else 0.0
ndcgs.append(ndcg)
return np.array(hits_list), np.array(ndcgs)
k_values = [5, 10, 15, 20, 30, 40]
hit_means = []
hit_sems = []
ndcg_means = []
ndcg_sems = []
for k in k_values:
hits, ndcgs = per_user_metrics(
item_factors,
user_rating_matrix_train,
user_rating_matrix_test,
k=k,
max_users=500,
)
if len(hits) == 0:
hit_means.append(0.0)
hit_sems.append(0.0)
ndcg_means.append(0.0)
ndcg_sems.append(0.0)
continue
hit_means.append(hits.mean())
hit_sems.append(
hits.std(ddof=1) / np.sqrt(len(hits)) if len(hits) > 1 else 0.0
)
ndcg_means.append(ndcgs.mean())
ndcg_sems.append(
ndcgs.std(ddof=1) / np.sqrt(len(ndcgs)) if len(ndcgs) > 1 else 0.0
)
fig, axes = plt.subplots(1, 2, figsize=(14, 4))
axes[0].errorbar(
k_values,
hit_means,
yerr=hit_sems,
marker="o",
linewidth=2,
markersize=8,
color="steelblue",
capsize=5,
)
axes[0].set_xlabel("k (number of recommendations)")
axes[0].set_ylabel("Hit Rate")
axes[0].set_title("Hit Rate@k Performance (with SEM)")
axes[0].grid(alpha=0.3)
axes[1].errorbar(
k_values,
ndcg_means,
yerr=ndcg_sems,
marker="s",
linewidth=2,
markersize=8,
color="coral",
capsize=5,
)
axes[1].set_xlabel("k (number of recommendations)")
axes[1].set_ylabel("NDCG")
axes[1].set_title("NDCG@k Performance (with SEM)")
axes[1].grid(alpha=0.3)
plt.tight_layout()
plt.show()
This is a pretty good model. Let's save it for later use in the Streamlit app!
Bundle all trained components into a single pickle file. This includes the ALS model, the user and movie ID mappings (in both directions), the learned user and item factor matrices, the content-based cosine similarity matrix, the aligned movie metadata (movies_indexed), and the fitted TF-IDF vectorizer.
The Streamlit app loads this artifact and uses it to generate hybrid recommendations combining both collaborative and content-based signals.
model_data = {
"model": model,
"u2i": user_2_index,
"i2u": index_2_user,
"m2i": movie_2_index,
"i2m": index_2_movie,
"item_factors": item_factors,
"user_factors": user_factors,
"cosine_sim": cosine_sim,
"movies_indexed": movies_indexed, # aligned to m2i ordering
"tfidf": tfidf,
}
with open(artifacts_dir / "model.pkl", "wb") as f:
pickle.dump(model_data, f)
print("Model artifacts saved to artifacts/model.pkl")
print("Saved keys:", sorted(model_data.keys()))
Model artifacts saved to artifacts/model.pkl Saved keys: ['cosine_sim', 'i2m', 'i2u', 'item_factors', 'm2i', 'model', 'movies_indexed', 'tfidf', 'u2i', 'user_factors']
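As a minimal sketch of how the Streamlit app might consume this artifact: the file path matches the save above, while the hybrid_recommend helper, the blend weight, and the example movieId are illustrative choices, not part of the saved bundle. A real app would wrap this logic in Streamlit widgets.
import pickle
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
with open("artifacts/model.pkl", "rb") as f:
    bundle = pickle.load(f)
item_factors = bundle["item_factors"]
content_sim = bundle["cosine_sim"]
movies_indexed = bundle["movies_indexed"]
def hybrid_recommend(liked_movie_ids, blend=0.5, top_n=10):
    # Blend CF factor similarity with content similarity to score all items
    liked_idx = [bundle["m2i"][m] for m in liked_movie_ids if m in bundle["m2i"]]
    if not liked_idx:
        return movies_indexed.head(0)
    # Collaborative signal: cosine similarity between liked items' factors and all item factors
    cf_scores = cosine_similarity(item_factors[liked_idx], item_factors).mean(axis=0)
    # Content signal: average content similarity to the liked items
    content_scores = np.asarray(content_sim[liked_idx].mean(axis=0)).ravel()
    scores = blend * cf_scores + (1.0 - blend) * content_scores
    scores[liked_idx] = -np.inf  # don't recommend what was already liked
    top = np.argsort(-scores)[:top_n]
    return movies_indexed.loc[top, ["title", "genres"]].assign(score=scores[top])
print(hybrid_recommend([1], blend=0.5, top_n=5))  # e.g. movieId 1 = Toy Story (1995)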