-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #100 from ploomber/recommender.py
Recommender.py for FastAPI
- Loading branch information
Showing
10 changed files
with
1,043 additions
and
1,587 deletions.
There are no files selected for viewing
Empty file.
55 changes: 55 additions & 0 deletions
55
mini-projects/movie-rec-system/movie_rec_system/app/app.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
from fastapi import FastAPI, HTTPException | ||
from pydantic import BaseModel, field_validator | ||
from .recommender import get_recommendation | ||
from fastapi.responses import JSONResponse | ||
import json | ||
|
||
# Application instance; the route decorators in this module
# (@app.get / @app.post) register their handlers on it.
app = FastAPI()
|
||
|
||
class RecommendationRequest(BaseModel):
    # Request body for the recommendations endpoint.
    # movie   -- title of the movie to base the recommendations on.
    # num_rec -- how many recommendations to return (10 when omitted).
    movie: str
    num_rec: int = 10

    @field_validator("movie")
    @classmethod
    def format_movie_name(cls, movie_name: str) -> str:
        """Return the movie name in title case (each word capitalized)."""
        return movie_name.title()  # Convert to title case

    @field_validator("num_rec")
    @classmethod
    def check_num_rec(cls, num_rec: int) -> int:
        """Reject counts below 1.

        The count is later used as the upper bound of a slice, so zero would
        yield no recommendations and a negative value would be interpreted
        relative to the end of the ranking. Raising ``ValueError`` here lets
        pydantic report the problem as a validation error instead.
        """
        if num_rec < 1:
            raise ValueError("num_rec must be a positive integer")
        return num_rec
|
||
|
||
@app.get("/")
async def root():
    # Landing route: greet the caller and point them at the /docs page.
    welcome = (
        "Welcome! You can use this API to get movie recommendations "
        "based on viewers' votes. Visit /docs for more information "
        "and to try it out!"
    )
    return {"message": welcome}
|
||
|
||
@app.post("/recommendations/")
def get_movie_recommendations(recommendation_request: RecommendationRequest):
    """
    Get movie recommendations for a given movie.
    Parameters:
    - movie: The name of the movie for which you want recommendations.
    - num_rec: The number of movie recommendations you want. Default is 10.
    Returns:
    JSON containing recommended movies and metrics.
    """
    # NOTE: the docstring wording above is left as-is because FastAPI uses a
    # handler's docstring as the endpoint description in the generated docs.

    # Ask the recommender for suggestions, always with English stop words.
    payload = get_recommendation(
        recommendation_request.movie,
        recommendation_request.num_rec,
        "english",
    )

    # A string result is JSON text; decode it so the response is built from
    # plain Python objects. Any other result is passed through untouched.
    payload = json.loads(payload) if isinstance(payload, str) else payload

    # Anything truthy is a usable result and is returned straight away.
    if payload:
        return JSONResponse(content=payload)

    # Empty / missing result: report it as "not found".
    raise HTTPException(
        status_code=404,
        detail="Movie not found or no recommendations available",
    )
227 changes: 227 additions & 0 deletions
227
mini-projects/movie-rec-system/movie_rec_system/app/recommender.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,227 @@ | ||
import json | ||
import pandas as pd | ||
import duckdb | ||
from functools import lru_cache | ||
from sklearn.feature_extraction.text import TfidfVectorizer | ||
from sklearn.metrics.pairwise import cosine_similarity | ||
from .recommenderhelper import ( | ||
content_movie_recommender, | ||
get_popularity_rmse, | ||
get_vote_avg_rmse, | ||
get_vote_count_rmse, | ||
) | ||
|
||
|
||
@lru_cache(maxsize=None)
def get_data() -> pd.DataFrame:
    """
    Load the movie table from the local DuckDB file.

    Opens ``./movies_data.duckdb``, reads every row of the
    ``movie_genre_data`` table into a DataFrame and closes the connection.
    The function is wrapped in ``lru_cache``, so after the first successful
    call every later call returns the same DataFrame object without
    touching the database again.

    Returns
    -------
    pd.DataFrame
        All rows and columns of ``movie_genre_data``.
    """
    con = duckdb.connect("./movies_data.duckdb")
    try:
        df = con.execute("SELECT * FROM movie_genre_data").fetchdf()
    finally:
        # Always release the database handle, including when the query
        # itself raises (e.g. the table does not exist).
        con.close()
    return df
|
||
|
||
def create_combined(df: pd.DataFrame, weight=2) -> pd.DataFrame:
    """
    Add a "combined" text column built from "overview" and "genre_names".

    Each row's value is the overview, a space, and then the genre names
    followed by ", " repeated ``weight`` times. Missing overviews or genre
    names are treated as empty strings so that "combined" never contains
    NaN.

    The column is written onto ``df`` itself, and that same object is
    returned.

    Parameters
    ----------
    df : pd.DataFrame
        The input DataFrame which must contain
        both "overview" and "genre_names" columns.
    weight : int, default=2
        The number of times "genre_names" should be
        repeated in the "combined" column.

    Returns
    -------
    pd.DataFrame
        ``df`` with the additional "combined" column.

    Examples
    --------
    >>> df = pd.DataFrame({
    ...     'overview': ['A story about...'],
    ...     'genre_names': ['Action']
    ... })
    >>> create_combined(df)["combined"][0]
    'A story about... Action, Action, '
    """
    # Null text would otherwise propagate through the concatenation and
    # leave NaN in "combined".
    overview = df["overview"].fillna("")
    genres = df["genre_names"].fillna("")
    df["combined"] = overview + " " + (genres + ", ").str.repeat(weight)
    return df
|
||
|
||
def retrieve_and_transform_data() -> pd.DataFrame:
    """
    Retrieve the movie data and prepare it for generating recommendations.

    The titles are lowercased and a "combined" text column is added via
    ``create_combined``.

    Returns
    -------
    pd.DataFrame
        A transformed copy of the data with lowercase titles and an
        additional "combined" column.
    """
    # get_data() is memoized and hands back the same DataFrame object on
    # every call, so transform a copy rather than mutating the cached frame.
    df = get_data().copy()
    df["title"] = df["title"].str.lower()
    return create_combined(df)
|
||
|
||
def compute_tfidf_vectorization(df, stop_words="english"):
    """
    Fit a TF-IDF vectorizer on the "combined" column and return the result.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame providing the "combined" text column to vectorize.
    stop_words : str, optional
        Passed to ``TfidfVectorizer`` as its ``stop_words`` argument.
        Default is "english".

    Returns
    -------
    tfidf_matrix
        The output of ``fit_transform`` on the "combined" column.
    """
    vectorizer = TfidfVectorizer(stop_words=stop_words)
    return vectorizer.fit_transform(df["combined"])
|
||
|
||
def compute_metrics(df, movie, recommendations):
    """
    Collect the three RMSE metrics for a movie and its recommendations.

    Parameters
    ----------
    df : pd.DataFrame
        The movie data handed to each metric helper.
    movie : str
        The title of the movie the recommendations were generated for.
    recommendations : list
        A list of recommended movies.

    Returns
    -------
    popularity_rmse : float
        The RMSE for popularity.
    vote_avg_rmse : float
        The RMSE for vote average.
    vote_count_rmse : float
        The RMSE for vote count.
    """
    # Tuple elements are evaluated left to right, so the helpers still run
    # in the order popularity -> vote average -> vote count.
    return (
        get_popularity_rmse(df, movie, recommendations),
        get_vote_avg_rmse(df, movie, recommendations),
        get_vote_count_rmse(df, movie, recommendations),
    )
|
||
|
||
def get_recommendation(movie: str, num_rec: int = 10, stop_words="english"):
    """
    Generate content-based movie recommendations and associated metrics.

    The movie data is retrieved, its "combined" text column is TF-IDF
    vectorized, and the cosine similarity between the requested movie and
    every movie is used to pick the recommendations. RMSE metrics
    (popularity, vote average, vote count) are then computed for them.

    Parameters
    ----------
    movie : str
        The title of the movie for which recommendations are to be
        generated. It is lowercased before being looked up.
    num_rec : int, optional
        The number of movie recommendations to generate. Default is 10.
    stop_words : str, optional
        The language of stop words to be used when vectorizing the
        "combined" column. Default is "english".

    Returns
    -------
    str or None
        A JSON-formatted string containing the (lowercased) movie title,
        the list of recommendations and the metrics, or ``None`` when the
        movie is unknown or no recommendations were produced. A metric
        that is NaN is emitted as ``null``.

    Examples
    --------
    >>> result = get_recommendation("Inception", num_rec=5)
    >>> print(json.loads(result))
    {
        "movie": "inception",
        "recommendations": [...],
        "metrics": {
            "popularity": ...,
            "vote_avg": ...,
            "vote_count": ...
        }
    }
    """
    movie = movie.lower()
    df = retrieve_and_transform_data()

    # Find the requested title before doing any vectorization: an unknown
    # movie can never yield recommendations.
    is_match = (df["title"] == movie).to_numpy()
    if not is_match.any():
        return None
    row = int(is_match.argmax())  # position of the first matching title

    tfidf_matrix = compute_tfidf_vectorization(df, stop_words)

    # Only the requested movie's similarities are used, so compare that one
    # row against all rows instead of building the full N x N matrix.
    similarity = cosine_similarity(tfidf_matrix[row : row + 1], tfidf_matrix)

    similarity_df = pd.DataFrame(
        similarity, index=[movie], columns=df.title.values
    )
    movie_list = similarity_df.columns.values
    recommendations = content_movie_recommender(
        movie, similarity_df, movie_list, num_rec
    )

    if not recommendations:
        return None

    popularity_rmse, vote_avg_rmse, vote_count_rmse = compute_metrics(
        df, movie, recommendations
    )

    def _json_safe(value):
        # json.dumps would write NaN as the bare token NaN, which is not
        # valid JSON; emit null instead.
        return None if pd.isna(value) else value

    result = {
        "movie": movie,
        "recommendations": recommendations,
        "metrics": {
            "popularity": _json_safe(popularity_rmse),
            "vote_avg": _json_safe(vote_avg_rmse),
            "vote_count": _json_safe(vote_count_rmse),
        },
    }

    return json.dumps(result)
99 changes: 99 additions & 0 deletions
99
mini-projects/movie-rec-system/movie_rec_system/app/recommenderhelper.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
import numpy as np | ||
import pandas as pd | ||
|
||
|
||
def content_movie_recommender(
    input_movie: str,
    similarity_database: pd.DataFrame,
    movie_database_list: list,
    top_n=10,
) -> list:
    """
    Use a similarity matrix to find the movies most similar to one movie.

    The row of ``similarity_database`` whose index label equals
    ``input_movie`` is ranked from most to least similar. The top-ranked
    entry is skipped (it is expected to be the movie itself) and the next
    ``top_n`` titles are returned.

    Parameters
    ----------
    input_movie : str
        reference movie to find similarities
    similarity_database : pandas.DataFrame
        similarity matrix of movies, indexed by movie title
    movie_database_list : numpy.ndarray or list
        movie titles, in the same order as the columns of the
        similarity matrix
    top_n : int
        number of similar movies to output

    Returns
    -------
    list
        Up to ``top_n`` titles, most similar first. Empty when
        ``input_movie`` is not in the index or ``top_n`` is not positive.
    """
    # A non-positive count would turn the slice below into an empty or
    # end-relative one, so answer "nothing" explicitly.
    if top_n <= 0:
        return []

    matching_rows = similarity_database[
        similarity_database.index == input_movie
    ]
    if matching_rows.empty:
        # Unknown movie: no recommendations.
        return []

    # get movie similarity records (first match when titles repeat)
    movie_sim = matching_rows.values[0]

    # get movies sorted by similarity, best first
    sorted_movie_ids = np.argsort(movie_sim)[::-1]

    # np.asarray lets a plain Python list be indexed by an index array too.
    titles = np.asarray(movie_database_list)
    return titles[sorted_movie_ids[1 : top_n + 1]].tolist()  # noqa E203
|
||
|
||
def get_popularity_rmse(
    df: pd.DataFrame, sample_movie: str, recommendations: list
) -> float:
    """
    RMSE between a movie's popularity and that of its recommendations.

    Titles are compared case-insensitively. ``df`` is not modified.

    Parameters
    ----------
    df : pd.DataFrame
        Movie data with "title" and "popularity" columns.
    sample_movie : str
        Title of the reference movie.
    recommendations : list
        Titles of the recommended movies.

    Returns
    -------
    float
        The RMSE rounded to 3 decimals, or NaN when the reference movie or
        all of the recommendations are absent from ``df``.
    """
    # Lowercase into a separate Series so the caller's frame is untouched.
    titles = df["title"].str.lower()

    sample_rows = df[titles == sample_movie.lower()]
    if sample_rows.empty:
        return float("nan")

    wanted = [
        rec.lower() if isinstance(rec, str) else rec
        for rec in recommendations
    ]
    recommendations_popularity = df.loc[
        titles.isin(wanted), "popularity"
    ].values
    if recommendations_popularity.size == 0:
        # Nothing to compare against; avoid taking the mean of an empty array.
        return float("nan")

    # First match wins when the reference title appears more than once.
    sample_movie_popularity = sample_rows["popularity"].iloc[0]
    squared_diffs = (
        sample_movie_popularity - recommendations_popularity
    ) ** 2
    rmse = np.sqrt(squared_diffs.mean())

    return round(float(rmse), 3)
|
||
|
||
def get_vote_avg_rmse(
    df: pd.DataFrame, sample_movie: str, recommendations: list
) -> float:
    """
    RMSE between a movie's vote average and that of its recommendations.

    Titles are matched exactly as given.

    Parameters
    ----------
    df : pd.DataFrame
        Movie data with "title" and "vote_average" columns.
    sample_movie : str
        Title of the reference movie.
    recommendations : list
        Titles of the recommended movies.

    Returns
    -------
    float
        The RMSE rounded to 3 decimals, or NaN when the reference movie or
        all of the recommendations are absent from ``df``.
    """
    sample_rows = df[df["title"] == sample_movie]
    if sample_rows.empty:
        # Mirror get_popularity_rmse: an unknown movie yields NaN rather
        # than an IndexError from .iloc[0].
        return float("nan")

    recommendations_vote_average = df.loc[
        df["title"].isin(recommendations), "vote_average"
    ].values
    if recommendations_vote_average.size == 0:
        # Nothing to compare against; avoid taking the mean of an empty array.
        return float("nan")

    # First match wins when the reference title appears more than once.
    sample_movie_vote_average = sample_rows["vote_average"].iloc[0]
    squared_diffs = (
        sample_movie_vote_average - recommendations_vote_average
    ) ** 2
    rmse = np.sqrt(squared_diffs.mean())

    return round(float(rmse), 3)
|
||
|
||
def get_vote_count_rmse(
    df: pd.DataFrame, sample_movie: str, recommendations: list
) -> float:
    """
    RMSE between a movie's vote count and that of its recommendations.

    Titles are matched exactly as given.

    Parameters
    ----------
    df : pd.DataFrame
        Movie data with "title" and "vote_count" columns.
    sample_movie : str
        Title of the reference movie.
    recommendations : list
        Titles of the recommended movies.

    Returns
    -------
    float
        The RMSE rounded to 3 decimals, or NaN when the reference movie or
        all of the recommendations are absent from ``df``.
    """
    sample_rows = df[df["title"] == sample_movie]
    if sample_rows.empty:
        # Mirror get_popularity_rmse: an unknown movie yields NaN rather
        # than an IndexError from .iloc[0].
        return float("nan")

    # Work in floating point: squaring differences of a narrow integer
    # column (e.g. int32) can silently wrap around.
    vote_counts = df["vote_count"].astype(float)

    recommendations_vote_count = vote_counts[
        df["title"].isin(recommendations)
    ].values
    if recommendations_vote_count.size == 0:
        # Nothing to compare against; avoid taking the mean of an empty array.
        return float("nan")

    # First match wins when the reference title appears more than once.
    sample_movie_vote_count = vote_counts[df["title"] == sample_movie].iloc[0]
    squared_diffs = (
        recommendations_vote_count - sample_movie_vote_count
    ) ** 2
    rmse = np.sqrt(squared_diffs.mean())

    return round(float(rmse), 3)
Oops, something went wrong.