Skip to content

Commit

Permalink
Merge pull request #100 from ploomber/recommender.py
Browse files Browse the repository at this point in the history
Recommender.py for FastAPI
  • Loading branch information
lfunderburk authored Sep 1, 2023
2 parents 734a40f + 91b87a4 commit 6f82c60
Show file tree
Hide file tree
Showing 10 changed files with 1,043 additions and 1,587 deletions.
Empty file.
55 changes: 55 additions & 0 deletions mini-projects/movie-rec-system/movie_rec_system/app/app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, field_validator
from .recommender import get_recommendation
from fastapi.responses import JSONResponse
import json

# Application instance; the route decorators in this module register on it.
app = FastAPI()


class RecommendationRequest(BaseModel):
    # Request body accepted by the POST /recommendations/ endpoint.
    # (Documented with comments rather than a class docstring so that the
    # generated schema description stays exactly as it was.)

    # Title of the movie to base the recommendations on; converted to
    # title case by ``format_movie_name`` during validation.
    movie: str
    # Number of recommendations wanted; must be >= 1 (see ``check_num_rec``).
    num_rec: int = 10

    @field_validator("movie")
    @classmethod
    def format_movie_name(cls, movie_name: str) -> str:
        """Ensure the movie name is formatted with the
        first letter of every word capitalized."""
        return movie_name.title()  # Convert to title case

    @field_validator("num_rec")
    @classmethod
    def check_num_rec(cls, num_rec: int) -> int:
        """Reject zero or negative recommendation counts.

        A non-positive count cannot produce a meaningful list, so it is
        refused at the API boundary as a validation error.
        """
        if num_rec < 1:
            raise ValueError("num_rec must be a positive integer")
        return num_rec


@app.get("/")
async def root():
    # Landing endpoint: answers with a single greeting that points the
    # visitor to the interactive documentation under /docs.
    welcome = "Welcome! You can use this API to get movie recommendations based on viewers' votes. Visit /docs for more information and to try it out!"  # noqa E501
    return {"message": welcome}


@app.post("/recommendations/")
def get_movie_recommendations(recommendation_request: RecommendationRequest):
    """
    Get movie recommendations for a given movie.
    Parameters:
    - movie: The name of the movie for which you want recommendations.
    - num_rec: The number of movie recommendations you want. Default is 10.
    Returns:
    JSON containing recommended movies and metrics.
    """
    # The recommender is always queried with English stop words.
    payload = get_recommendation(
        recommendation_request.movie,
        recommendation_request.num_rec,
        "english",
    )

    # A string result is JSON text and is decoded so it is not encoded
    # twice in the response; any other value is passed through untouched.
    decoded = json.loads(payload) if isinstance(payload, str) else payload

    if decoded:
        return JSONResponse(content=decoded)

    # Nothing usable came back (None or an empty payload).
    raise HTTPException(
        status_code=404,
        detail="Movie not found or no recommendations available",  # noqa E501
    )
227 changes: 227 additions & 0 deletions mini-projects/movie-rec-system/movie_rec_system/app/recommender.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,227 @@
import json
import pandas as pd
import duckdb
from functools import lru_cache
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from .recommenderhelper import (
content_movie_recommender,
get_popularity_rmse,
get_vote_avg_rmse,
get_vote_count_rmse,
)


@lru_cache(maxsize=None)
def get_data() -> pd.DataFrame:
    """
    Load the ``movie_genre_data`` table from the local DuckDB file.

    The database at ``./movies_data.duckdb`` is opened, the whole table is
    fetched into a DataFrame, and the connection is closed again, even if
    the query fails.

    Because of ``lru_cache`` the query runs only on the first successful
    call; every later call returns the very same DataFrame object, so
    callers that need to modify the data should work on a copy.

    Returns
    -------
    pd.DataFrame
        All rows and columns of ``movie_genre_data``.
    """
    con = duckdb.connect("./movies_data.duckdb")
    try:
        return con.execute("SELECT * FROM movie_genre_data").fetchdf()
    finally:
        # Release the database handle whether or not the query succeeded.
        con.close()


def create_combined(df: pd.DataFrame, weight=2) -> pd.DataFrame:
    """
    Add a "combined" text column built from "overview" and "genre_names".

    The genre text (followed by ", ") is repeated ``weight`` times and
    appended to the overview, so genres count more heavily than a single
    mention would. Missing overview or genre values are treated as empty
    text, which keeps every "combined" entry a string instead of NaN.

    Parameters
    ----------
    df : pd.DataFrame
        The input DataFrame which must contain
        both "overview" and "genre_names" columns.
        It is modified in place: the "combined" column is added
        (or overwritten); the two source columns are left untouched.
    weight : int, default=2
        The number of times "genre_names" should be
        repeated in the "combined" column.

    Returns
    -------
    pd.DataFrame
        The same DataFrame object, now with the "combined" column.

    Examples
    --------
    >>> df = pd.DataFrame({
    ...     'overview': ['A story about...'],
    ...     'genre_names': ['Action']
    ... })
    >>> create_combined(df)["combined"][0]
    'A story about... Action, Action, '
    """
    # Work on NaN-free views so a missing value does not turn the whole
    # concatenation into NaN.
    overview_text = df["overview"].fillna("")
    genre_text = df["genre_names"].fillna("")

    df["combined"] = overview_text + " " + (genre_text + ", ") * weight
    return df


def retrieve_and_transform_data() -> pd.DataFrame:
    """
    Retrieve data from duckdb and transform it
    into a format that can be used for generating
    movie recommendations.

    The titles are lower-cased and a "combined" text column is added.
    The transformation is applied to a copy, because ``get_data`` is
    memoised and returns the same DataFrame object on every call;
    modifying it directly would alter the cached data for all callers.

    Returns
    -------
    pd.DataFrame
        A new DataFrame with lower-cased "title" values
        and an additional "combined" column.
    """
    df = get_data().copy()
    df["title"] = df["title"].str.lower()
    return create_combined(df)


def compute_tfidf_vectorization(df, stop_words="english"):
    """
    Fit a TF-IDF vectorizer on the "combined" column and
    return the resulting document-term matrix.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame holding the text to vectorize
        in its "combined" column.
    stop_words : str, optional
        Passed straight to ``TfidfVectorizer(stop_words=...)``.
        Default is "english".

    Returns
    -------
    tfidf_matrix
        The matrix returned by ``TfidfVectorizer.fit_transform``
        for the "combined" column.
    """
    vectorizer = TfidfVectorizer(stop_words=stop_words)
    return vectorizer.fit_transform(df["combined"])


def compute_metrics(df, movie, recommendations):
    """
    Collect the three RMSE metrics that compare a movie
    with its recommendations.

    Parameters
    ----------
    df : pd.DataFrame
        Movie data; handed unchanged to the three RMSE helpers.
    movie : str
        The title of the reference movie.
    recommendations : list
        Titles of the recommended movies.

    Returns
    -------
    popularity_rmse : float
        The RMSE for popularity.
    vote_avg_rmse : float
        The RMSE for vote average.
    vote_count_rmse : float
        The RMSE for vote count.
    """
    # The helpers are called in this fixed order: popularity first, then
    # vote average, then vote count.
    return (
        get_popularity_rmse(df, movie, recommendations),
        get_vote_avg_rmse(df, movie, recommendations),
        get_vote_count_rmse(df, movie, recommendations),
    )


def get_recommendation(movie: str, num_rec: int = 10, stop_words="english"):
    """
    Recommend movies similar to ``movie`` and report how close
    the recommendations are on three numeric attributes.

    The movie table is loaded and transformed, its "combined" text is
    TF-IDF vectorized, and pairwise cosine similarity between all movies
    is computed. The most similar titles are picked by
    ``content_movie_recommender`` and RMSE metrics are added by
    ``compute_metrics``.

    Parameters
    ----------
    movie : str
        Title of the reference movie; it is lower-cased before lookup.
    num_rec : int, optional
        The number of movie recommendations
        to generate. Default is 10.
    stop_words : str, optional
        Stop-word setting forwarded to the TF-IDF step.
        Default is "english".

    Returns
    -------
    str or None
        A JSON-formatted string with the keys "movie" (lower-cased title),
        "recommendations" and "metrics" ("popularity", "vote_avg",
        "vote_count"), or ``None`` when no recommendations were produced.

    Examples
    --------
    >>> result = get_recommendation("Inception", num_rec=5)
    >>> print(json.loads(result))
    {
        "movie": "inception",
        "recommendations": [...],
        "metrics": {
            "popularity": ...,
            "vote_avg": ...,
            "vote_count": ...
        }
    }
    """
    movie = movie.lower()
    df = retrieve_and_transform_data()

    # Square movie-by-movie similarity table, labelled by title on both axes.
    similarity = cosine_similarity(compute_tfidf_vectorization(df, stop_words))
    titles = df.title.values
    similarity_df = pd.DataFrame(similarity, index=titles, columns=titles)

    recommendations = content_movie_recommender(
        movie, similarity_df, similarity_df.columns.values, num_rec
    )
    if not recommendations:
        return None

    metric_names = ("popularity", "vote_avg", "vote_count")
    metric_values = compute_metrics(df, movie, recommendations)

    return json.dumps(
        {
            "movie": movie,
            "recommendations": recommendations,
            "metrics": dict(zip(metric_names, metric_values)),
        }
    )
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
import numpy as np
import pandas as pd


def content_movie_recommender(
    input_movie: str,
    similarity_database: pd.DataFrame,
    movie_database_list: list,
    top_n=10,
) -> list:
    """
    Function that uses a similarity matrix to find similar movies

    The row of ``input_movie`` is looked up in the similarity matrix, the
    other movies are ranked by descending similarity, and the best
    ``top_n`` titles are returned. The queried row itself is removed by
    position, so the movie is never recommended to itself, even when
    another movie ties with it or its self-similarity is not the maximum.

    Parameters
    ----------
    input_movie : str
        reference movie to find similarities
    similarity_database : pandas.DataFrame
        similarity matrix of movies, indexed by title
    movie_database_list : list or numpy.ndarray
        movie titles in the same order as the matrix columns
    top_n : int
        number of similar movies to output

    Returns
    -------
    list
        Up to ``top_n`` titles, most similar first. Empty when the movie
        is not in the matrix or ``top_n`` is not positive.

    Raises
    ------
    IndexError
        If ``movie_database_list`` is shorter than the matrix width,
        i.e. the two inputs do not describe the same set of movies.
    """
    # Positions of all rows whose label equals the requested title.
    matches = np.flatnonzero(similarity_database.index == input_movie)
    if matches.size == 0 or top_n <= 0:
        return []

    # When several rows share the title, the first one is used.
    query_pos = matches[0]
    movie_sim = similarity_database.values[query_pos]

    # get movies sorted by similarity, best first, without the query row
    ranked = np.argsort(movie_sim)[::-1]
    ranked = ranked[ranked != query_pos][:top_n]

    # np.asarray lets plain lists be fancy-indexed just like arrays.
    return np.asarray(movie_database_list)[ranked].tolist()


def get_popularity_rmse(
    df: pd.DataFrame, sample_movie: str, recommendations: list
) -> float:
    """
    Root-mean-square difference in "popularity" between a movie
    and its recommendations.

    Titles are compared case-insensitively on all three sides (the
    DataFrame, the sample movie and the recommendations). The comparison
    uses a lower-cased copy of the titles; ``df`` itself is not modified.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain "title" and "popularity" columns.
    sample_movie : str
        Title of the reference movie.
    recommendations : list
        Titles of the recommended movies.

    Returns
    -------
    float
        The RMSE rounded to 3 decimals, or NaN when the sample movie or
        all of the recommendations are missing from ``df``.
    """
    titles = df["title"].str.lower()
    wanted = {str(name).lower() for name in recommendations}

    sample_rows = df.loc[titles == sample_movie.lower(), "popularity"]
    rec_values = df.loc[titles.isin(wanted), "popularity"].to_numpy()
    if sample_rows.empty or rec_values.size == 0:
        return float("nan")

    # When several rows share the title, the first one is the reference.
    squared_diffs = (sample_rows.iloc[0] - rec_values) ** 2
    rmse = np.sqrt(squared_diffs.mean())
    return round(float(rmse), 3)


def get_vote_avg_rmse(
    df: pd.DataFrame, sample_movie: str, recommendations: list
) -> float:
    """
    Root-mean-square difference in "vote_average" between a movie
    and its recommendations.

    Titles are compared case-insensitively and a movie that cannot be
    found yields NaN instead of an exception, the same contract as
    ``get_popularity_rmse``. ``df`` is not modified.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain "title" and "vote_average" columns.
    sample_movie : str
        Title of the reference movie.
    recommendations : list
        Titles of the recommended movies.

    Returns
    -------
    float
        The RMSE rounded to 3 decimals, or NaN when the sample movie or
        all of the recommendations are missing from ``df``.
    """
    titles = df["title"].str.lower()
    wanted = {str(name).lower() for name in recommendations}

    sample_rows = df.loc[titles == sample_movie.lower(), "vote_average"]
    rec_values = df.loc[titles.isin(wanted), "vote_average"].to_numpy()
    if sample_rows.empty or rec_values.size == 0:
        return float("nan")

    # When several rows share the title, the first one is the reference.
    squared_diffs = (sample_rows.iloc[0] - rec_values) ** 2
    rmse = np.sqrt(squared_diffs.mean())
    return round(float(rmse), 3)


def get_vote_count_rmse(
    df: pd.DataFrame, sample_movie: str, recommendations: list
) -> float:
    """
    Root-mean-square difference in "vote_count" between a movie
    and its recommendations.

    Titles are compared case-insensitively and a movie that cannot be
    found yields NaN instead of an exception, the same contract as
    ``get_popularity_rmse``. ``df`` is not modified.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain "title" and "vote_count" columns.
    sample_movie : str
        Title of the reference movie.
    recommendations : list
        Titles of the recommended movies.

    Returns
    -------
    float
        The RMSE rounded to 3 decimals, or NaN when the sample movie or
        all of the recommendations are missing from ``df``.
    """
    titles = df["title"].str.lower()
    wanted = {str(name).lower() for name in recommendations}

    sample_rows = df.loc[titles == sample_movie.lower(), "vote_count"]
    rec_counts = df.loc[titles.isin(wanted), "vote_count"].to_numpy()
    if sample_rows.empty or rec_counts.size == 0:
        return float("nan")

    # When several rows share the title, the first one is the reference.
    squared_diffs = (rec_counts - sample_rows.iloc[0]) ** 2
    rmse = np.sqrt(squared_diffs.mean())
    return round(float(rmse), 3)
Loading

0 comments on commit 6f82c60

Please sign in to comment.