Merge pull request #100 from ploomber/recommender.py

Recommender.py for FastAPI
ploomber · Sep 1, 2023 · 6f82c60 · 6f82c60
2 parents 734a40f + 91b87a4
commit 6f82c60
Show file tree

Hide file tree

Showing 10 changed files with 1,043 additions and 1,587 deletions.
diff --git a/mini-projects/movie-rec-system/movie_rec_system/app/__init__.py b/mini-projects/movie-rec-system/movie_rec_system/app/__init__.py
diff --git a/mini-projects/movie-rec-system/movie_rec_system/app/app.py b/mini-projects/movie-rec-system/movie_rec_system/app/app.py
@@ -0,0 +1,55 @@
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel, field_validator
+from .recommender import get_recommendation
+from fastapi.responses import JSONResponse
+import json
+
+# Application object; the route decorators below register their handlers on it.
+app = FastAPI()
+
+
+class RecommendationRequest(BaseModel):
+    movie: str
+    num_rec: int = 10
+
+    @field_validator("movie")
+    def format_movie_name(cls, movie_name):
+        """Ensure the movie name is formatted with the
+        first letter capitalized."""
+        return movie_name.title()  # Convert to title case
+
+
+@app.get("/")
+async def root():
+    return {
+        "message": "Welcome! You can use this API to get movie recommendations based on viewers' votes. Visit /docs for more information and to try it out!"  # noqa E501
+    }
+
+
+@app.post("/recommendations/")
+def get_movie_recommendations(recommendation_request: RecommendationRequest):
+    """
+    Get movie recommendations for a given movie.
+
+    Parameters:
+    - movie: The name of the movie for which you want recommendations.
+    - num_rec: The number of movie recommendations you want. Default is 10.
+
+    Returns:
+    JSON containing recommended movies and metrics.
+    """
+    recommendations = get_recommendation(
+        recommendation_request.movie,
+        recommendation_request.num_rec,
+        "english",
+    )
+
+    if isinstance(recommendations, str):
+        recommendations = json.loads(recommendations)
+
+    if not recommendations:
+        raise HTTPException(
+            status_code=404,
+            detail="Movie not found or no recommendations available",  # noqa E501
+        )
+
+    return JSONResponse(content=recommendations)
diff --git a/mini-projects/movie-rec-system/movie_rec_system/app/recommender.py b/mini-projects/movie-rec-system/movie_rec_system/app/recommender.py
@@ -0,0 +1,227 @@
+import json
+import pandas as pd
+import duckdb
+from functools import lru_cache
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+from .recommenderhelper import (
+    content_movie_recommender,
+    get_popularity_rmse,
+    get_vote_avg_rmse,
+    get_vote_count_rmse,
+)
+
+
+@lru_cache(maxsize=None)
+def get_data() -> pd.DataFrame:
+    """
+    Function that automatically connects
+    to duckdb as a GET call upon launch
+    of FastAPI
+    """
+    con = duckdb.connect("./movies_data.duckdb")
+    query = "SELECT * FROM movie_genre_data"
+    df = con.execute(query).fetchdf()
+    con.close()
+    return df
+
+
+def create_combined(df: pd.DataFrame, weight=2) -> pd.DataFrame:
+    """
+    Generates a "combined" column by combining the
+    "overview" and "genre_names" columns.
+
+    The "genre_names" column will be multiplied by the
+    provided weight, essentially repeating the genre names
+    the specified number of times.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        The input DataFrame which must contain
+        both "overview" and "genre_names" columns.
+
+    weight : int, default=2
+        The number of times "genre_names" should be
+        repeated in the "combined" column.
+
+    Returns
+    -------
+    pd.DataFrame
+        The modified DataFrame with an additional "combined" column.
+
+    Examples
+    --------
+    >>> df = pd.DataFrame({
+    ...     'overview': ['A story about...'],
+    ...     'genre_names': ['Action']
+    ... })
+    >>> create_combined(df)
+         overview        genre_names         combined
+    0  A story about...    Action  A story about... Action, Action,
+
+    """
+    df["combined"] = df["overview"] + " " + (df["genre_names"] + ", ") * weight
+    return df
+
+
+def retrieve_and_transform_data() -> pd.DataFrame:
+    """
+    Retrieve data from duckdb and transform it
+    into a format that can be used for generating
+    movie recommendations.
+
+    Returns
+    -------
+    pd.DataFrame
+        The transformed DataFrame with an additional "combined" column.
+    """
+    df = get_data()
+    df["title"] = df["title"].str.lower()
+    df = create_combined(df)
+    return df
+
+
+def compute_tfidf_vectorization(df, stop_words="english"):
+    """
+    Compute TF-IDF vectorization of the "combined" column
+    in the provided DataFrame.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        The input DataFrame which must contain
+        a "combined" column.
+
+    stop_words : str, optional
+        The language of stop words to be
+        used when vectorizing the "combined" column.
+        Default is "english".
+
+    Returns
+    -------
+    tfidf_matrix:    scipy.sparse.csr.csr_matrix
+        The TF-IDF vectorization of the "combined" column."""
+    tfidf = TfidfVectorizer(stop_words=stop_words)
+    tfidf_matrix = tfidf.fit_transform(df["combined"])
+    return tfidf_matrix
+
+
+def compute_metrics(df, movie, recommendations):
+    """
+    Compute RMSE for popularity, vote average, and vote count
+    for the provided movie and recommendations.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        The input DataFrame which must contain
+        a "combined" column.
+
+    movie : str
+        The title of the movie for which
+        recommendations are to be generated.
+
+    recommendations : list
+        A list of recommended movies.
+
+    Returns
+    -------
+    popularity_rmse : float
+        The RMSE for popularity.
+
+    vote_avg_rmse : float
+        The RMSE for vote average.
+
+        ote_count_rmse : float
+        The RMSE for vote count.
+    """
+    popularity_rmse = get_popularity_rmse(df, movie, recommendations)
+    vote_avg_rmse = get_vote_avg_rmse(df, movie, recommendations)
+    vote_count_rmse = get_vote_count_rmse(df, movie, recommendations)
+    return popularity_rmse, vote_avg_rmse, vote_count_rmse
+
+
+def get_recommendation(movie: str, num_rec: int = 10, stop_words="english"):
+    """
+    Generate movie recommendations based on
+    content similarity and computes associated metrics.
+
+    This function retrieves movie data,
+    calculates cosine similarity between movies using
+    TF-IDF vectorization of their combined overview
+    and genre, and returns a list of recommended
+    movies along with certain metrics
+    (popularity, vote average, and vote count RMSE).
+
+    Parameters
+    ----------
+    movie : str
+        The title of the movie for which
+        recommendations are to be generated.
+
+    num_rec : int, optional
+        The number of movie recommendations
+        to generate. Default is 10.
+
+    stop_words : str, optional
+        The language of stop words to be
+        used when vectorizing the "combined" column.
+        Default is "english".
+
+    Returns
+    -------
+    str
+        A JSON-formatted string containing
+        the original movie, a list of recommendations,
+        and associated metrics
+        (popularity, vote average, and vote count RMSE).
+
+    Examples
+    --------
+    >>> result = get_recommendation("Inception", num_rec=5)
+    >>> print(json.loads(result))
+    {
+        "movie": "Inception",
+        "recommendations": [...],
+        "metrics": {
+            "popularity": ...,
+            "vote_avg": ...,
+            "vote_count": ...
+        }
+    }
+
+    """
+    movie = movie.lower()
+    df = retrieve_and_transform_data()
+
+    tfidf_matrix = compute_tfidf_vectorization(df, stop_words)
+    similarity = cosine_similarity(tfidf_matrix)
+
+    similarity_df = pd.DataFrame(
+        similarity, index=df.title.values, columns=df.title.values
+    )
+    movie_list = similarity_df.columns.values
+    recommendations = content_movie_recommender(
+        movie, similarity_df, movie_list, num_rec
+    )
+
+    if not recommendations:
+        return None
+
+    popularity_rmse, vote_avg_rmse, vote_count_rmse = compute_metrics(
+        df, movie, recommendations
+    )
+
+    result = {
+        "movie": movie,
+        "recommendations": recommendations,
+        "metrics": {
+            "popularity": popularity_rmse,
+            "vote_avg": vote_avg_rmse,
+            "vote_count": vote_count_rmse,
+        },
+    }
+
+    result_json = json.dumps(result)
+    return result_json
diff --git a/mini-projects/movie-rec-system/movie_rec_system/app/recommenderhelper.py b/mini-projects/movie-rec-system/movie_rec_system/app/recommenderhelper.py
@@ -0,0 +1,99 @@
+import numpy as np
+import pandas as pd
+
+
+def content_movie_recommender(
+    input_movie: str,
+    similarity_database: pd.DataFrame,
+    movie_database_list: list,
+    top_n=10,
+) -> list:
+    """
+    Function that uses a similarity matrix to find similar movies
+
+    Parameters
+    ----------
+    input_movie : str
+        reference movie to find similarities
+    similarity_database : pandas.DataFrame
+        similarity matrix of movies
+    movie_database_list : numpy.ndarray
+        movies in our similarity matrix
+    top_n : int
+        number of similar movies to output
+    """
+    try:
+        # get movie similarity records
+        movie_sim = similarity_database[
+            similarity_database.index == input_movie
+        ].values[0]
+
+        # get movies sorted by similarity
+        sorted_movie_ids = np.argsort(movie_sim)[::-1]
+        recommended_movies = movie_database_list[
+            sorted_movie_ids[1 : top_n + 1]  # noqa E203
+        ]  # noqa E501
+        return list(recommended_movies)
+    except IndexError:
+        return []
+
+
+def get_popularity_rmse(
+    df: pd.DataFrame, sample_movie: str, recommendations: list
+) -> float:
+    # Convert titles in dataframe and sample_movie to lowercase
+    df["title"] = df["title"].str.lower()
+    sample_movie = sample_movie.lower()
+
+    filtered_df = df[df["title"] == sample_movie]
+
+    if not filtered_df.empty:
+        sample_movie_popularity = filtered_df.popularity.iloc[0]
+        recommendations_popularity = df[
+            df["title"].isin(recommendations)
+        ].popularity.values
+
+        squared_diffs = (
+            sample_movie_popularity - recommendations_popularity
+        ) ** 2  # noqa E501
+        rmse = np.sqrt(squared_diffs.mean())
+
+        return round(float(rmse), 3)
+    else:
+        return float("nan")
+
+
+def get_vote_avg_rmse(
+    df: pd.DataFrame, sample_movie: str, recommendations: list
+) -> float:
+    sample_movie_vote_average = df[
+        df["title"] == sample_movie
+    ].vote_average.iloc[  # noqa E501
+        0
+    ]
+    recommendations_vote_average = df[
+        df["title"].isin(recommendations)
+    ].vote_average.values
+
+    squared_diffs = (
+        sample_movie_vote_average - recommendations_vote_average
+    ) ** 2  # noqa E501
+    rmse = np.sqrt(squared_diffs.mean())
+
+    return round(float(rmse), 3)
+
+
+def get_vote_count_rmse(
+    df: pd.DataFrame, sample_movie: str, recommendations: list
+) -> float:
+    sample_movie_popularity = df[df["title"] == sample_movie].vote_count.iloc[
+        0
+    ]  # noqa E501
+    recommendations_popularity = df[
+        df["title"].isin(recommendations)
+    ].vote_count.values  # noqa E501
+
+    squared_diffs = (recommendations_popularity - sample_movie_popularity) ** 2
+    rmse = np.sqrt(squared_diffs.mean())
+
+    return round(float(rmse), 3)