Install required packages:
pip install pandas numpy scipy scikit-learn implicit streamlit tqdm matplotlib seaborn
Standard data science libraries for dataframe operations, sparse matrices, and text vectorization, plus the implicit library for ALS (Alternating Least Squares) collaborative filtering.
import pandas as pd
import scipy.sparse as sps
from sklearn.feature_extraction.text import TfidfVectorizer
import implicit
import pickle
from pathlib import Path
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
sns.set_style("whitegrid")
Define where the MovieLens data is stored and create an artifacts/ directory to save the trained model.
data_dir = Path("data/")
artifacts_dir = Path("artifacts")
artifacts_dir.mkdir(exist_ok=True)
Load three datasets from MovieLens-32M:
ratings = pd.read_csv(data_dir / "ml-32m/ratings.csv")
movies = pd.read_csv(data_dir / "ml-32m/movies.csv")
tags = pd.read_csv(data_dir / "ml-32m/tags.csv")
print(f"Loaded {len(ratings)} ratings, {len(movies)} movies, {len(tags)} tags")
Loaded 32000204 ratings, 87585 movies, 2000072 tags
Select the 1000 most-rated movies to reduce training time and focus on popular items with more user interactions.
top_movies = (
ratings["movieId"].value_counts().head(1000).index.tolist()
) # list of movieIds
print(f"Selected top {len(top_movies)} movies")
Selected top 1000 movies
Keep only data related to the top 1000 movies, then downsample the ratings to 50% for faster training. This cuts the dataset from 32 million ratings to roughly 9.4 million while preserving the collaborative signal.
movies = movies[movies["movieId"].isin(top_movies)]
ratings = ratings[ratings["movieId"].isin(top_movies)]
ratings = ratings.sample(frac=0.5, random_state=42)
tags = tags[tags["movieId"].isin(top_movies)]
print(
f"After filtering: {len(ratings)} ratings, {len(movies)} movies, {len(tags)} tags"
)
After filtering: 9439196 ratings, 1000 movies, 704287 tags
Map raw MovieLens movieId to contiguous indices (0..n-1) and back. Only include movies that remain after filtering/downsampling so indices align with the ALS input.
movie_ids_in_ratings = sorted(ratings["movieId"].unique().tolist())
movie_2_index = {mid: idx for idx, mid in enumerate(movie_ids_in_ratings)}
index_2_movie = {idx: mid for mid, idx in movie_2_index.items()}
print(f"DEBUG: movie_ids_in_ratings has {len(movie_ids_in_ratings)} movies")
DEBUG: movie_ids_in_ratings has 1000 movies
Similarly, create dense user indices for the collaborative filtering model.
user_ids_in_ratings = sorted(ratings["userId"].unique().tolist())
user_2_index = {uid: idx for idx, uid in enumerate(user_ids_in_ratings)}
index_2_user = {idx: uid for uid, idx in user_2_index.items()}
print(f"DEBUG: user_ids_in_ratings has {len(user_ids_in_ratings)} users")
DEBUG: user_ids_in_ratings has 200758 users
Construct a sparse user-rating matrix (users × items) in CSR format. This is the core input for the ALS algorithm. Each cell contains the rating value, and sparse storage keeps memory usage low since most user-movie pairs are not rated.
rows = ratings["userId"].map(user_2_index)
cols = ratings["movieId"].map(movie_2_index)
data = ratings["rating"].astype(float)
user_rating_matrix = sps.csr_matrix(
(data, (rows, cols)), shape=(len(user_2_index), len(movie_2_index))
)
print(f"DEBUG: user_rating_matrix shape is {user_rating_matrix.shape}")
print(
f"DEBUG: user_rating_matrix has {user_rating_matrix.nnz} non-zero entries"
)
print(
    f"DEBUG: Sparsity: {(1 - user_rating_matrix.nnz / (user_rating_matrix.shape[0] * user_rating_matrix.shape[1])) * 100:.2f}%"
)
DEBUG: user_rating_matrix shape is (200758, 1000) DEBUG: user_rating_matrix has 9439196 non-zero entries DEBUG: Sparsity: 95.30%
Visualize the distribution of ratings to understand user behavior.
fig, axes = plt.subplots(1, 2, figsize=(14, 4))
axes[0].hist(
ratings["rating"], bins=20, edgecolor="black", alpha=0.7, color="steelblue"
)
axes[0].set_xlabel("Rating")
axes[0].set_ylabel("Frequency")
axes[0].set_title("Distribution of User Ratings")
axes[0].grid(alpha=0.3)
ratings_per_movie = ratings["movieId"].value_counts()
axes[1].hist(
ratings_per_movie, bins=30, edgecolor="black", alpha=0.7, color="coral"
)
axes[1].set_xlabel("Number of Ratings")
axes[1].set_ylabel("Frequency")
axes[1].set_title("Distribution of Ratings per Movie")
axes[1].grid(alpha=0.3)
plt.tight_layout()
plt.show()
Aggregate user-generated tags by movie and combine them with title and genres into a single "content" field. This text will be used for TF-IDF vectorization to compute content-based similarity.
movie_tags = (
tags.groupby("movieId")["tag"]
.apply(lambda ts: " ".join(ts.astype(str)))
.reset_index()
)
movies = movies.merge(movie_tags, how="left", on="movieId")
movies["tag"] = movies["tag"].fillna("")
movies["genres_clean"] = (
movies["genres"].fillna("").str.replace("|", " ", regex=False)
)
movies["content"] = (
movies["title"].fillna("")
+ " "
+ movies["genres_clean"]
+ " "
+ movies["tag"]
)
print(f"Sample content field:\n{movies['content'].iloc[0]}\n")
Sample content field: Toy Story (1995) Adventure Animation Children Comedy Fantasy children Disney animation children Disney Disney pixar animation funny Pixar Pixar Tumey's To See Again Tumey's VHS adventure classic friendship funny animation computer animation pixar toys adventure computer animation Pixar adventure animated animation clever comedy computer animation family fantasy Tom Hanks pixar Tom Hanks animation Pixar 3D animated children comedy computer animation Disney family humorous Pixar time travel Tom Hanks accepting reality emotional friendship funny soundtrack children computer animation Disney family Pixar computer animation pixar ad for toys children clever forced puns internal logic fails mixed soundtrack reflection rivalry technological marvel Pixar kids and family funny friendship Animation Cartoon Pixar adventure animated animation buddy movie comedy computer animation Disney friendship funny humorous Pixar Tom Hanks witty animation Pixar animated animation children classic clever comedy computer animation Disney friendship funny humorous pixar toys witty adventure animation buddy movie classic clever comedy friendship fun funny humorous kids Pixar toys unusual friendship very good want to see again animation family sci-fi computer animation Disney fantasy Pixar toys witty ya boy Pixar almost favorite children computer animation family humorous time travel Tom Hanks witty Tim Allen Tom Hanks animation children toys Cartoon Disney Pixar animation clever friendship funny humorous pixar witty adventure children family funny Engaging animation family good time buddy movie Tom Hanks witty animated animation buddy movie children clever time travel witty animation disney animation pixar bright DARING RESCUES fanciful HEROIC MISSION humorous light rousing TOYS COME TO LIFE UNLIKELY FRIENDSHIPS warm witty animation children computer animation Disney imdb top 250 John Lasseter Pixar Tom Hanks animation Pixar Tim Allen time travel adventure classic clever comedy Disney family friendship funny humorous imdb top 250 Pixar Tom Hanks witty pixar Tom Hanks Pixar witty toys classic Disney friendship pixar friendship funny pixar Tom Hanks adventure clever enemies become friends family Family cartoon family relationships feel-good first of series friends friendship fun fun family movie HEROIC MISSION humorous jealousy kids kids movie loyal friend loyalty redemption reflection rescue mission rivalry selflessness teamwork unlikely friendships unusual friendship witty American Animation 阮一鸣 Disney family Family cartoon Pixar Pixar animation pixar Watched Pixar time travel cute funny story voice acting witty Disney funny Pixar time travel funny Pixar animation friendship witty animation Disney Pixar buddy computer animation directorial debut low fantasy National Film Registry Oscar (Special Achievement) toys action disney kids cgi computer animation Pixar witty adventure clever funny Tom Hanks witty computer animation Pixar adventure friendship kids light hearted whimsica animation clever comedy funny humorous Pixar witty classic clever funny Tom Hanks witty animation cgi animation comedy Disney Pixar pixar fun clever comedy Disney family friendship funny humorous Pixar Tom Hanks witty adventure animation classic clever computer animation Disney kids Pixar redemption Tom Hanks unlikely friendships unusual friendship children cartoon feel-good funny animated buddy movie Cartoon cgi comedy computer animation family friendship kids toy toys Pixar children family Pixar Tom Hanks toys witty 
animation clever Disney pixar animation Pixar Disney favorite Pixar animation animation Tom Hanks friendship pixar toys Pixar Disney children classic Disney fun imdb top 250 kids animation children Disney Pixar animation friendship toys adventure Animation Comedy computer animation family funny Pixar Tom Hanks Disney Pixar adventure classic funny Pixar toys toy toys animation humorous Pixar time travel animation Disney pixar animation pixar Owned computer animation family joss whedon children comedy funny witty animation Disney animated animation children comedy fantasy funny humorous Pixar time travel Pixar animation Disney friendship imdb top 250 Pixar witty 1990s 3 dimensional 3d action figure affection alien animated dog animated fictional tv commercial antenna anthropomorphic toy anthropomorphism apology arcade arcade game arm asking directions astronaut audio flashback baby baby monitor backfire backyard bad guy ballet flats banister battery battleship game bed bedroom binoculars biohazard sign birthday birthday party birthday present blinds blockbuster bo peep character bouncy ball boy boy next door brat brawl brother sister relationship bull terrier bully bully comeuppance bullying burn buzz lightyear character calling someone an idiot candy land the board game car car crash card game cartoon dog catchphrase cgi animation character's point of view camera shot chased by a dog chasing a truck child child antagonist child destroys another's toy child destroys own toy child villain child's bedroom child's birthday child's birthday party christmas christmas day christmas present chrysler automobile chrysler lebaron convertible claw crane clock comeuppance comic hero commercial computer animation computer generated imagery confrontation corkboard cowboy cowboy boot cowboy doll crate cult favorite cult film dachshund dart dartboard dead battery delivery dental braces depression desk dinosaur directorial debut Disney dog dog as gift doll doorbell double prize doubt elementary school encouragement enemies become friends envy escape escape attempt etch a sketch explosion face mask falling from a window falling from height falling out a window false accusation family relationships famous line famous score famous song favoritism fear fight fireplace first of series first of trilogy first part fishing rod fistfight flashlight flying friend friends friendship frustration game of life board game gas station gift girl gliding globe gratitude guilty conscience hasbro hat heartfelt hero hockey puck home house jack in the box jealousy jumping from height jumping through a sunroof karate karate chop laughter learning a lesson leggings lifting a female into the air lifting someone into the air lipstick on face little boy little girl locked in looking out a window love interest loyalty magic 8 ball magnifying glass male antagonist male protagonist male villain martial arts match mattel medical mask meeting microphone mission misunderstanding mockery mother son relationship mouth moving moving van mr potato head mr. 
potato head character mutant national film registry neighbor neighborhood nesting egg new home new toy ohio operation game original story package parachute part of trilogy pet as gift pet dog piggy bank pixar pizza van pizzeria plastic army men playskool pliers porcelain poster product placement pterodactyl pull string doll push button radio controlled rag doll rain ramp recliner chair reconnaissance mission redemption reference to marie antoinette reference to mattel reflection rejection remorse remote controlled toy car rescue rescue mission resourcefulness ringing a doorbell rival rivalry rocket running scene before opening credits scene during opening credits scheming seatbelt self awareness selflessness shark shed shepherdess sheriff shivering showdown signature single mother skateboard sleeping dog sliding down a banister slinky dog slow motion scene soldier space explorer space ranger spaceman toy spacesuit squeeze toy staircase storage shed subjective camera suburb sunlight sunroof surgical mask surprise ending surrounded talking in one's sleep talking to a toy talking toy teamwork television commercial telling someone to shut up threat thunder thunderstorm time lapse photography title spoken by character toolbox torture tough guy toy toy animal toy comes to life toy dinosaur toy dog toy fire truck toy robot toy soldier toy story toy tea set toybox toyota toyota truck troll doll troubled production turbo boost twister the game two word title tyrannosaurus rex urban setting utah teapot van ventilation shaft view through binoculars villain violence visual pun walkie talkie wallpaper water weightlifting wilhelm scream window woody character yelling adventure buddy movie computer animation Pixar Tom Hanks animation pixar Tim Allen Tom Hanks comedy funny Tom Hanks classic clever witty adventure animation computer animation humorous Pixar witty animation classic comedy computer animation Disney funny humorous Pixar time travel Tom Hanks witty Disney Pixar computer animation funny humorous Pixar Tom Hanks witty children jealousy family film friendship toys hero's journey funny computer animation good cartoon chindren pixar Pixar witty animation fun animation reflection first cgi film Pixar animation computer animation friendship funny witty great movie Disney funny witty clever classic pixar CGI classic disney pixar animation cartoon friendship pixar unny funny Pixar witty adventure animated animation children clever computer animation family friendship funny humorous toys witty Pixar toys 2009 reissue in Stereoscopic 3-D 55 movies every kid should see--Entertainment Weekly BD-Video CLV DVD-Video animation imdb top 250 pixar pixar animation cgi Disney family toys animation kids movie animation Cartoon dolls National Film Registry Disney friendship nostalgic computer animation funny Pixar rated-G Pixar Tom Hanks computer animation Pixar Disney Pixar Tim Allen Tom Hanks https://movielens.org/tag/:animation adventure animation children comedy classic comedy fun funny humorous Animation clever comedy funny Pixar Tom Hanks animated fun family movie pixar adventure 3D children animation Pixar toys adventure animation comedy family fantasy John Lasseter USA animation Pixar animated animation buddy movie computer animation funny Pixar Tom Hanks animation Disney funny pixar adventure animation Disney funny pixar animated animation comedy Disney Pixar Pixar animation family Tom Hanks animation classic clever comedy computer animation Disney friendship funny pixar witty childish clever pixar pixar 
buddy movie cartoon children clever family friendship kids Tom Hanks witty classic Tom Hanks adventure animation buddy movie clever comedy cowboy dinosaur dolls friendship funny pixar Tim Allen Tom Hanks toys UNLIKELY FRIENDSHIPS witty jealousy Tom Hanks buddy movie friendship toys classic pixar witty classic funny Pixar CGI classic Tom Hanks children computer animation family funny Pixar Tom Hanks toys animation Disney Pixar toys innovative animation Disney family funny avi buy animation Disney pixar toys family friendship pixar Pixar Pixar Tom Hanks action figure action figures Buzz Lightyear CG animation toy toys Woody Pixar computer animation Disney humorous Pixar Pixar pixar tim allen tom hanks 3D Disney family Pixar Tom Hanks Pixar children children Pixar Tom Hanks cgi unlikely friendships adventure adventure animated animation computer animation Disney funny pixar Tom Hanks toys adventure animation Disney funny pixar adventure animated animation cgi comedy Disney family fantasy friendship imdb top 250 Pixar Tom Hanks witty itaege toys adventure clever friendship humorous Pixar witty adventure children classic computer animation Disney funny Pixar Tim Allen Tom Hanks animated comedy Disney fun funny pixar adventure animated animation buddy movie children classic clever comedy computer animation Disney family fantasy funny humorous imdb top 250 Pixar time travel Tom Hanks toys witty cgi children toys humorous pixar classic friendship Pixar Animation Pixar animation Disney Pixar animation Pixar animation children computer animation kids pixar clever computer animation Disney Tom Hanks clever funny Pixar witty soothing Pixar Tom Hanks animation children comedy Disney friendship funny pixar Tom Hanks buddy movie computer animation friendship Tom Hanks clever witty animation Cartoon children computer animation kids pixar Tom Hanks Pixar adventure animated animation classic Disney fantasy Pixar Tom Hanks toys adventure pixar villian hurts toys Pixar adventure animated animation Cartoon Disney family friendship imdb top 250 pixar toy toys animation comedy funny fantasy pixar Pixar imdb top 250
Filter movies to only those that survived rating downsampling, then reindex them to match the dense item indices (0 to n_items-1). This ensures the movie metadata aligns perfectly with the collaborative filtering model indices.
movies = movies[movies["movieId"].isin(movie_ids_in_ratings)]
print(f"DEBUG: movies dataframe has {len(movies)} movies after filtering")
movies_indexed = (
movies.set_index("movieId").reindex(movie_ids_in_ratings).reset_index()
)
for col in ["title", "genres", "content"]:
if col in movies_indexed.columns:
movies_indexed[col] = movies_indexed[col].fillna("")
print(f"DEBUG: movies_indexed has {len(movies_indexed)} movies")
DEBUG: movies dataframe has 1000 movies after filtering DEBUG: movies_indexed has 1000 movies
TF-IDF (Term Frequency-Inverse Document Frequency) is a statistical measure used to evaluate the importance of a word in a 'document' relative to a collection of documents (corpus). It helps in identifying the most relevant terms in a document while down-weighting common terms.
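As a rough sketch of the weighting (with scikit-learn's defaults: smoothed IDF and L2-normalized rows), the score of term t in document d over a corpus of n documents is:

$$
\text{tf-idf}(t, d) = \text{tf}(t, d) \cdot \left( \ln\frac{1 + n}{1 + \text{df}(t)} + 1 \right)
$$

where tf(t, d) is the raw count of t in d and df(t) is the number of documents containing t; each document vector is then scaled to unit L2 norm.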
Now, we convert movie content (title + genres + tags) into TF-IDF vectors. This represents each movie as a sparse vector in a 5000-dimensional feature space, capturing term importance while removing common stop words.
tfidf = TfidfVectorizer(max_features=5000, stop_words="english")
item_content = tfidf.fit_transform(movies_indexed["content"])
print(f"TF-IDF matrix shape: {item_content.shape}")
print(f"Number of features: {len(tfidf.get_feature_names_out())}")
TF-IDF matrix shape: (1000, 5000) Number of features: 5000
Quick check to see which words the TF‑IDF vectorizer thinks are most important for Star Wars films.
The cell finds movies with "Star Wars" in the title and prints their titles and genres.
It pulls the TF‑IDF rows for those movies and converts them to dense arrays (safe because it is only a few rows).
For each movie it picks the top_n highest-scoring terms, takes the union of those term indices, and filters the TF-IDF matrix down to just those columns.
It then draws a heatmap where each row is a Star Wars movie and each column is a picked term, with TF‑IDF scores annotated.
Interesting to see that "jar" and "binks" (as in Jar Jar Binks) are picked up as important for "The Phantom Menace"!
star_wars_mask = movies_indexed["title"].str.contains(
"Star Wars", case=False, na=False
)
star_wars_movies = movies_indexed[star_wars_mask]
star_wars_indices = star_wars_movies.index.tolist()
star_wars_tfidf = item_content[star_wars_indices].toarray()
feature_names = tfidf.get_feature_names_out()
top_n = 5
all_top_terms = set()
for row in star_wars_tfidf:
top_indices = np.argsort(row)[-top_n:]
all_top_terms.update(top_indices)
all_top_terms = sorted(list(all_top_terms))
star_wars_tfidf_filtered = star_wars_tfidf[:, all_top_terms]
filtered_feature_names = feature_names[all_top_terms]
fig, ax = plt.subplots(figsize=(15, 6))
sns.heatmap(
star_wars_tfidf_filtered,
xticklabels=filtered_feature_names,
yticklabels=[movies_indexed.loc[i, "title"] for i in star_wars_indices],
cmap="YlOrRd",
annot=True,
fmt=".2f",
cbar_kws={"label": "TF-IDF Score"},
ax=ax,
vmin=0,
)
plt.title("TF-IDF Heatmap: Star Wars Movies (Top Terms Only)")
plt.xlabel("Terms")
plt.ylabel("Movies")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()
Now, we train an implicit ALS model on a confidence-weighted rating matrix. Multiplying the explicit ratings by alpha=40 converts them into confidence scores, treating them as implicit feedback. The model learns 64-dimensional latent factors for both users and items, with regularization to limit overfitting.
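For reference, the implicit-feedback ALS formulation (Hu, Koren, and Volinsky), which the implicit library implements, minimizes over user factors x_u and item factors y_i:

$$
\min_{x, y} \sum_{u, i} c_{ui}\left(p_{ui} - x_u^{\top} y_i\right)^2 + \lambda \left(\sum_u \lVert x_u \rVert^2 + \sum_i \lVert y_i \rVert^2\right)
$$

where p_ui = 1 for observed interactions and 0 otherwise, c_ui is the confidence derived from the scaled rating, and λ is the regularization strength.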
alpha = 40.0
conf = (user_rating_matrix * alpha).astype("double")
model = implicit.als.AlternatingLeastSquares(
factors=64, regularization=0.1, iterations=20
)
model.fit(conf)
100%|██████████| 20/20 [00:20<00:00, 1.02s/it]
Extract the learned latent factor matrices from the trained model. These dense vectors represent movies and users in the learned latent space and are used for collaborative filtering recommendations.
item_factors = model.item_factors # shape: n_items x k
user_factors = model.user_factors # shape: n_users x k
print(f"DEBUG: item_factors shape is {item_factors.shape}")
print(f"DEBUG: user_factors shape is {user_factors.shape}")
DEBUG: item_factors shape is (1000, 64) DEBUG: user_factors shape is (200758, 64)
Visualize the distribution of the learned latent factor magnitudes for items and users.
fig, axes = plt.subplots(1, 2, figsize=(14, 4))
item_magnitudes = np.linalg.norm(item_factors, axis=1)
axes[0].hist(
item_magnitudes, bins=30, edgecolor="black", alpha=0.7, color="steelblue"
)
axes[0].set_xlabel("L2 Norm of Item Factors")
axes[0].set_ylabel("Frequency")
axes[0].set_title("Distribution of Item Factor Magnitudes")
axes[0].grid(alpha=0.3)
user_magnitudes = np.linalg.norm(user_factors, axis=1)
axes[1].hist(
user_magnitudes, bins=30, edgecolor="black", alpha=0.7, color="coral"
)
axes[1].set_xlabel("L2 Norm of User Factors")
axes[1].set_ylabel("Frequency")
axes[1].set_title("Distribution of User Factor Magnitudes")
axes[1].grid(alpha=0.3)
plt.tight_layout()
plt.show()
Compute a sparse item-item similarity matrix using cosine similarity of TF-IDF vectors. This captures which movies are similar based on their textual features (title, genres, tags).
item_content = tfidf.transform(movies_indexed["content"]) # already aligned
cosine_sim = cosine_similarity(
item_content, item_content, dense_output=False
) # shape: n_items x n_items
print(f"DEBUG: cosine_sim shape is {cosine_sim.shape}")
DEBUG: cosine_sim shape is (1000, 1000)
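As a quick sanity check of the content similarity matrix, we can look up a movie's nearest neighbours by cosine score. This is a minimal sketch that reuses cosine_sim and movies_indexed from the cells above; the helper name and the query title are purely illustrative.
def similar_by_content(title_query, top_n=5):
    # Return the top_n movies whose TF-IDF content is closest to the first title match
    matches = movies_indexed.index[
        movies_indexed["title"].str.contains(title_query, case=False, na=False)
    ]
    if len(matches) == 0:
        return pd.DataFrame(columns=["title", "score"])
    idx = matches[0]
    sims = np.asarray(cosine_sim[idx].toarray()).ravel()
    order = [i for i in np.argsort(-sims) if i != idx][:top_n]
    return pd.DataFrame(
        {"title": movies_indexed.loc[order, "title"].values, "score": sims[order]}
    )
print(similar_by_content("Toy Story"))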
n_display = min(30, item_factors.shape[0])
blend_weight = 0.5  # weight on the collaborative signal (separate from the ALS confidence alpha)
cf_sim = cosine_similarity(item_factors)  # dense n_items x n_items
cosine_sim_dense = (
    cosine_sim.toarray()
    if hasattr(cosine_sim, "toarray")
    else np.asarray(cosine_sim)
)
def normalize_mat(m):
    # Min-max scale to [0, 1] so the two similarity sources are on a comparable scale
    mn, mx = m.min(), m.max()
    return (m - mn) / (mx - mn) if mx > mn else np.zeros_like(m)
cf_norm = normalize_mat(cf_sim)
content_norm = normalize_mat(cosine_sim_dense)
hybrid_sim = blend_weight * cf_norm + (1.0 - blend_weight) * content_norm
# Ensure all "star wars" movies are included
titles_all = movies_indexed["title"].fillna("").astype(str).tolist()
star_wars_indices = [
i for i, title in enumerate(titles_all) if "star wars" in title.lower()
]
if len(star_wars_indices) >= n_display:
indices = np.random.choice(
star_wars_indices, size=n_display, replace=False
)
else:
remaining_pool = np.setdiff1d(
np.arange(item_factors.shape[0]), star_wars_indices
)
n_remaining = n_display - len(star_wars_indices)
random_indices = np.random.choice(
remaining_pool, size=n_remaining, replace=False
)
indices = np.concatenate([star_wars_indices, random_indices])
hybrid_sub = hybrid_sim[np.ix_(indices, indices)]
labels = [titles_all[i] if titles_all[i] != "" else str(i) for i in indices]
g = sns.clustermap(
hybrid_sub,
method="weighted", # options: single, complete, average, weighted, centroid, median, ward
metric="cosine",
cmap="turbo",
figsize=(12, 12),
xticklabels=labels,
yticklabels=labels,
cbar_kws={"label": "Hybrid similarity"},
)
plt.setp(g.ax_heatmap.get_xticklabels(), rotation=90, fontsize=8)
plt.setp(g.ax_heatmap.get_yticklabels(), rotation=0, fontsize=8)
g.cax.set_position([0.95, 0.2, 0.03, 0.6])
plt.show()
Interesting to see that we have two groups of Star Wars movies clustering together, one for the original trilogy and one for the prequels, indicating that both collaborative filtering and content-based features recognize their similarity.
Verify that all components have matching dimensions. This ensures consistency across collaborative factors, content similarity, movie metadata, and ID mappings. A mismatch would cause errors in the Streamlit app.
n_cf = item_factors.shape[0]
n_content = cosine_sim.shape[0]
n_movies = len(movies_indexed)
n_map = len(index_2_movie)
print(f"\n=== FINAL DIMENSIONS ===")
print(f"item_factors: {n_cf}")
print(f"cosine_sim: {n_content}")
print(f"movies_indexed: {n_movies}")
print(f"i2m: {n_map}")
if n_cf == n_content == n_movies == n_map:
print(f"Sanity check passed: all have {n_cf} items")
else:
print(f"\nMISMATCH")
    print(
        f" item_factors is derived from user_rating_matrix, which has {user_rating_matrix.shape[1]} items"
    )
    print(f" But we're trying to match {len(movie_ids_in_ratings)} movies")
raise ValueError(
f"Dimension mismatch! item_factors={n_cf}, cosine_sim={n_content}, "
f"movies_indexed={n_movies}, i2m={n_map}. All should be equal!"
)
=== FINAL DIMENSIONS === item_factors: 1000 cosine_sim: 1000 movies_indexed: 1000 i2m: 1000 Sanity check passed: all have 1000 items
Evaluate the quality of the trained model using hit rate and NDCG metrics on a held-out test set.
np.random.seed(42)
test_fraction = 0.2
test_indices = np.random.choice(
user_rating_matrix.nnz,
size=int(user_rating_matrix.nnz * test_fraction),
replace=False,
)
test_data = np.zeros(user_rating_matrix.nnz)
train_data = user_rating_matrix.data.copy()
for idx in test_indices:
test_data[idx] = train_data[idx]
train_data[idx] = 0.0
user_rating_matrix_train = sps.csr_matrix(
(train_data, user_rating_matrix.indices, user_rating_matrix.indptr),
shape=user_rating_matrix.shape,
)
user_rating_matrix_test = sps.csr_matrix(
(test_data, user_rating_matrix.indices, user_rating_matrix.indptr),
shape=user_rating_matrix.shape,
)
print(f"Train set: {user_rating_matrix_train.nnz} ratings")
print(f"Test set: {user_rating_matrix_test.nnz} ratings")
Train set: 9439196 ratings Test set: 9439196 ratings
For each test user, generate top-k recommendations and compare against held-out test ratings.
The hit rate is the fraction of test users for whom at least one held-out item appears in the top-k recommendations. For a single user it is 1.0 if any test item is present in the top-k list and 0.0 otherwise; the reported Hit Rate@k is the average across users.
NDCG (Normalized Discounted Cumulative Gain) measures ranking quality by giving more weight to relevant items that appear earlier in the recommendation list: the closer the held-out items sit to the top of the list, the higher the NDCG.
Both metrics are averaged across users to produce the final Hit Rate@k and NDCG@k.
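Concretely, matching the computation in the cell below: a user's hit indicator is 1 if the top-k list intersects the held-out items and 0 otherwise, and with rel_i = 1 when the item at rank i is a held-out item,

$$
\mathrm{DCG@}k = \sum_{i=1}^{k} \frac{\mathrm{rel}_i}{\log_2(i + 1)},
\qquad
\mathrm{NDCG@}k = \frac{\mathrm{DCG@}k}{\mathrm{IDCG@}k},
$$

where IDCG@k is the DCG of an ideal ranking that places all held-out items at the top (capped at k).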
def compute_hit_rate_and_ndcg(
model_item_factors, user_rating_matrix_train, user_rating_matrix_test, k=10
):
"""
Compute hit rate and NDCG for top-k recommendations.
"""
hit_rates = []
ndcgs = []
test_users = np.where(user_rating_matrix_test.getnnz(axis=1) > 0)[
0
] # users with test ratings
for user_idx in tqdm(
test_users[: min(100, len(test_users))], desc="Evaluating"
):
user_trained_items = user_rating_matrix_train[user_idx].nonzero()[1]
if len(user_trained_items) == 0:
continue
user_vec = model_item_factors[user_trained_items].mean(axis=0)
scores = model_item_factors.dot(user_vec)
train_items = set(user_rating_matrix_train[user_idx].nonzero()[1])
ranked = sorted(
(
(i, scores[i])
for i in range(len(scores))
if i not in train_items
),
key=lambda x: -x[1],
)[:k]
top_k_items = [i for i, _ in ranked]
test_items = set(user_rating_matrix_test[user_idx].nonzero()[1])
# any test item in top-k?
hits = len(set(top_k_items) & test_items)
hit_rate = 1.0 if hits > 0 else 0.0
hit_rates.append(hit_rate)
# NDCG
dcg = sum(
[
1.0 / np.log2(i + 2)
for i, item in enumerate(top_k_items)
if item in test_items
]
)
idcg = sum(
[1.0 / np.log2(i + 2) for i in range(min(len(test_items), k))]
)
ndcg = dcg / idcg if idcg > 0 else 0.0
ndcgs.append(ndcg)
return np.mean(hit_rates), np.mean(ndcgs)
hit_rate, ndcg = compute_hit_rate_and_ndcg(
item_factors, user_rating_matrix_train, user_rating_matrix_test, k=10
)
print(f"\n=== EVALUATION METRICS (k=10) ===")
print(f"Hit Rate@10: {hit_rate:.4f}")
print(f"NDCG@10: {ndcg:.4f}")
Evaluating: 100%|██████████| 100/100 [00:00<00:00, 1012.24it/s]
=== EVALUATION METRICS (k=10) === Hit Rate@10: 0.3400 NDCG@10: 0.0738
Next, we assess how the recommender performs across different values of k (the number of top recommendations per user) by plotting Hit Rate@k and NDCG@k with per-user standard errors.
compute_hit_rate_and_ndcg(
item_factors, user_rating_matrix_train, user_rating_matrix_test, k=10
)
Evaluating: 100%|██████████| 100/100 [00:00<00:00, 1035.48it/s]
(np.float64(0.34), np.float64(0.07378929909770875))
def per_user_metrics(item_factors, URM_train, URM_test, k=10, max_users=500):
test_users = np.where(URM_test.getnnz(axis=1) > 0)[0]
if len(test_users) == 0:
return np.array([]), np.array([])
test_users = test_users[: min(max_users, len(test_users))]
hits_list = []
ndcgs = []
for u in test_users:
train_items = URM_train[u].nonzero()[1]
test_items = set(URM_test[u].nonzero()[1])
if len(train_items) == 0 or len(test_items) == 0:
continue
user_vec = item_factors[train_items].mean(axis=0)
scores = item_factors.dot(user_vec)
scores[list(train_items)] = -np.inf
top_k = np.argsort(-scores)[:k]
hit = 1.0 if any(i in test_items for i in top_k) else 0.0
hits_list.append(hit)
dcg = sum(
1.0 / np.log2(idx + 2)
for idx, item in enumerate(top_k)
if item in test_items
)
idcg = sum(
1.0 / np.log2(i + 2) for i in range(min(len(test_items), k))
)
ndcg = dcg / idcg if idcg > 0 else 0.0
ndcgs.append(ndcg)
return np.array(hits_list), np.array(ndcgs)
k_values = [5, 10, 15, 20, 30, 40]
hit_means = []
hit_sems = []
ndcg_means = []
ndcg_sems = []
for k in k_values:
hits, ndcgs = per_user_metrics(
item_factors,
user_rating_matrix_train,
user_rating_matrix_test,
k=k,
max_users=500,
)
if len(hits) == 0:
hit_means.append(0.0)
hit_sems.append(0.0)
ndcg_means.append(0.0)
ndcg_sems.append(0.0)
continue
hit_means.append(hits.mean())
hit_sems.append(
hits.std(ddof=1) / np.sqrt(len(hits)) if len(hits) > 1 else 0.0
)
ndcg_means.append(ndcgs.mean())
ndcg_sems.append(
ndcgs.std(ddof=1) / np.sqrt(len(ndcgs)) if len(ndcgs) > 1 else 0.0
)
fig, axes = plt.subplots(1, 2, figsize=(14, 4))
axes[0].errorbar(
k_values,
hit_means,
yerr=hit_sems,
marker="o",
linewidth=2,
markersize=8,
color="steelblue",
capsize=5,
)
axes[0].set_xlabel("k (number of recommendations)")
axes[0].set_ylabel("Hit Rate")
axes[0].set_title("Hit Rate@k Performance (with SEM)")
axes[0].grid(alpha=0.3)
axes[1].errorbar(
k_values,
ndcg_means,
yerr=ndcg_sems,
marker="s",
linewidth=2,
markersize=8,
color="coral",
capsize=5,
)
axes[1].set_xlabel("k (number of recommendations)")
axes[1].set_ylabel("NDCG")
axes[1].set_title("NDCG@k Performance (with SEM)")
axes[1].grid(alpha=0.3)
plt.tight_layout()
plt.show()
This is a pretty good model. Let's save it for later use in the Streamlit app!
Bundle all trained components into a single pickle file. This includes the ALS model, the user and movie ID mappings (in both directions), the learned user and item factor matrices, the content-based cosine similarity matrix, the aligned movie metadata (movies_indexed), and the fitted TF-IDF vectorizer.
The Streamlit app loads this artifact and uses it to generate hybrid recommendations combining both collaborative and content-based signals.
model_data = {
"model": model,
"u2i": user_2_index,
"i2u": index_2_user,
"m2i": movie_2_index,
"i2m": index_2_movie,
"item_factors": item_factors,
"user_factors": user_factors,
"cosine_sim": cosine_sim,
"movies_indexed": movies_indexed, # aligned to m2i ordering
"tfidf": tfidf,
}
with open(artifacts_dir / "model.pkl", "wb") as f:
pickle.dump(model_data, f)
print("Model artifacts saved to artifacts/model.pkl")
print("Saved keys:", sorted(model_data.keys()))
Model artifacts saved to artifacts/model.pkl Saved keys: ['cosine_sim', 'i2m', 'i2u', 'item_factors', 'm2i', 'model', 'movies_indexed', 'tfidf', 'u2i', 'user_factors']
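As a minimal sketch of how the Streamlit app might consume this artifact: the file path matches the save above, while the hybrid_recommend helper, the blend weight, and the example movieId are illustrative choices, not part of the saved bundle. A real app would wrap this logic in Streamlit widgets.
import pickle
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
with open("artifacts/model.pkl", "rb") as f:
    bundle = pickle.load(f)
item_factors = bundle["item_factors"]
content_sim = bundle["cosine_sim"]
movies_indexed = bundle["movies_indexed"]
def hybrid_recommend(liked_movie_ids, blend=0.5, top_n=10):
    # Blend CF factor similarity with content similarity to score all items
    liked_idx = [bundle["m2i"][m] for m in liked_movie_ids if m in bundle["m2i"]]
    if not liked_idx:
        return movies_indexed.head(0)
    # Collaborative signal: cosine similarity between liked items' factors and all item factors
    cf_scores = cosine_similarity(item_factors[liked_idx], item_factors).mean(axis=0)
    # Content signal: average content similarity to the liked items
    content_scores = np.asarray(content_sim[liked_idx].mean(axis=0)).ravel()
    scores = blend * cf_scores + (1.0 - blend) * content_scores
    scores[liked_idx] = -np.inf  # don't recommend what was already liked
    top = np.argsort(-scores)[:top_n]
    return movies_indexed.loc[top, ["title", "genres"]].assign(score=scores[top])
print(hybrid_recommend([1], blend=0.5, top_n=5))  # e.g. movieId 1 = Toy Story (1995)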