From 08b6dc7029271bb404a339894963330ca609e57c Mon Sep 17 00:00:00 2001 From: Nathan Moore Date: Mon, 22 Jan 2024 13:05:32 -0500 Subject: [PATCH] feat(analytics): Adding LinkedIn, TwitterX, and Reddit to analytics report and refactoring pivot gen as a builder (#127) * Pivot builder library WIP * Refactoring all of the pivot table logic to use a builder implementation. Helpful for preventing me from sticking forks in my eyes. * No munch * Fixing table * Fixing column indices * Providing some default behaviour for missing values * Documenting and CR comments --- gdrive/analytics_api.py | 7 +- gdrive/idva/flow_analytics.py | 395 +++++++++++++++++++++ gdrive/idva/pivot_director.py | 274 +++++++++++++++ gdrive/idva_flow_analytics.py | 622 ---------------------------------- gdrive/sheets/builders.py | 149 ++++++++ gdrive/sheets/types.py | 144 ++++++++ 6 files changed, 966 insertions(+), 625 deletions(-) create mode 100644 gdrive/idva/flow_analytics.py create mode 100644 gdrive/idva/pivot_director.py delete mode 100644 gdrive/idva_flow_analytics.py create mode 100644 gdrive/sheets/builders.py create mode 100644 gdrive/sheets/types.py diff --git a/gdrive/analytics_api.py b/gdrive/analytics_api.py index cd814ad..c5a3fe2 100644 --- a/gdrive/analytics_api.py +++ b/gdrive/analytics_api.py @@ -9,7 +9,8 @@ import fastapi from pydantic import BaseModel from fastapi import responses -from gdrive import analytics_client, idva_flow_analytics, error +from gdrive import analytics_client, error +from gdrive.idva import flow_analytics log = logging.getLogger(__name__) router = fastapi.APIRouter() @@ -63,9 +64,9 @@ async def list_accounts(): def run_analytics(start_date: datetime, end_date: datetime): try: - idva_flow_analytics.create_report(start_date, end_date) + flow_analytics.create_report(start_date, end_date) except Exception as e: - log.error(e) + log.exception(e) raise error("Report generation failed") diff --git a/gdrive/idva/flow_analytics.py b/gdrive/idva/flow_analytics.py new 
file mode 100644 index 0000000..acf2570 --- /dev/null +++ b/gdrive/idva/flow_analytics.py @@ -0,0 +1,395 @@ +import datetime +from enum import Enum +import pandas as pd +import logging + +from gdrive import settings, sheets_client, drive_client, analytics_client +from gdrive.idva.pivot_director import IDVAPivotDirector +from gdrive.sheets.builders import FormulaBuilder +from gdrive.sheets.types import FormulaEnum, Range, StringLiteral + +log = logging.getLogger(__name__) +idva = IDVAPivotDirector() + + +class SheetsEnum(str, Enum): + REKREWT = "Rekrewt Pivot Tables" + GSA = "GSA Use Pivot Table" + + +def create_report(start_date: datetime, end_date: datetime): + response = analytics_client.download( + settings.ANALYTICS_PROPERTY_ID, start_date, end_date + ) + + analytics_df = analytics_client.create_df_from_analytics_response(response) + analytics_df = preprocess_report(analytics_df) + + sheets_id = export(analytics_df, start_date, end_date) + names_to_id = create_pages(sheets_id) + create_pivot_tables(analytics_df, names_to_id, sheets_id) + + +def export( + df: pd.DataFrame, date_of_report: datetime, end_date: datetime = None +) -> str: + """ + Transform the downloaded response from the google analytics API into a + Google Sheets Object. + + This function first touches a Google Sheets object with the drive API, then + writes the analytics data to that object. As of right now there is no way to do + this in one API transaction. + + Args: + df (pandas.DataFrame): Tabular data to export to Google Sheets object + date_of_report (datetime): Date the report was run + Returns: + str: Google Sheets ID of the new Sheets object + """ + filename_str = generate_filename(date_of_report, end_date) + analytics_folder_id = drive_client.create_folder( + "Google Analytics", parent_id=settings.ANALYTICS_ROOT + ) + + # We have to do this in multiple steps with more than one client because the Sheets API + # doesnt support opening a file in a given directory. 
+ sheets_id = drive_client.create_empty_spreadsheet(filename_str, analytics_folder_id) + log.info("Uploading to folder %s (%s)" % ("Google Analytics", analytics_folder_id)) + result = sheets_client.export_df_to_gdrive_speadsheet(df, sheets_id) + log.info( + "Successfully created %s (%s)" % (filename_str, result.get("spreadsheetId")) + ) + return sheets_id + + +def preprocess_report(df: pd.DataFrame) -> pd.DataFrame: + tracked_events = ["first_visit", "session_start"] + tracked_sources = [ + "m.facebook.com", + "fb.com", + "([a-zA-Z].+)craigslist.org", + "reddit.com", + "redd.it", + "t.co", + "x.com", + "twitter", + "linked.com", + "lnkd.in", + ] + tracked_mediums = ["fb", "cl", "rd", "tx", "ln"] + + for event in tracked_events: + tracked_df = df[df[0] == event] + for source in tracked_sources: + if tracked_df[tracked_df[3].str.match(source)].empty: + add_source_placeholder_row(df, event, source) + + for medium in tracked_mediums: + if tracked_df[tracked_df[2].str.match(medium)].empty: + add_medium_placeholder_row(df, event, medium) + + return df + + +def add_source_placeholder_row(df: pd.DataFrame, event: str, source: str) -> None: + logging.info(f"Adding placeholder {event} for {source}") + df.loc[len(df.index)] = [event, "---", "---", source, "---", "---", 0, 0, 0, 0, 0] + + +def add_medium_placeholder_row(df: pd.DataFrame, event: str, medium: str) -> None: + logging.info(f"Adding placeholder {event} for {medium}") + df.loc[len(df.index)] = [event, "---", medium, "---", "---", "---", 0, 0, 0, 0, 0] + + +def create_pages(sheets_id: str) -> dict: + """ + Add new pages and pivot tables. + + This function is fairly naive and inefficient. If we ever want to make Google Sheets + more often than once a day, we should refactor this to limit the number of API transactions. 
+ + Args: + df (pandas.DataFrame): Tabular data in the spreadsheet + sheets_id (str): Google Sheets object ID + Returns: + names_to_id (dict): A Dictionary mapping string sheet names to IDs + """ + new_sheet_name_to_id = sheets_client.add_new_pages( + [SheetsEnum.REKREWT.value, SheetsEnum.GSA.value], sheets_id + ) + log.info("Added %s pages to %s" % (len(new_sheet_name_to_id.keys()), sheets_id)) + return new_sheet_name_to_id + + +def create_pivot_tables(df: pd.DataFrame, names_to_id: dict, sheets_id: str): + # Make a dictionary mapping the name of the column to its index, useful for the pivot tables. + col_dict = {} + for idx, val in enumerate(df.iloc[0]): + col_dict[val] = idx + + facebook_pivot(sheets_id, names_to_id, col_dict) + craigslist_pivot(sheets_id, names_to_id, col_dict) + reddit_pivot(sheets_id, names_to_id, col_dict) + twitter_x_pivot(sheets_id, names_to_id, col_dict) + linkedin_pivot(sheets_id, names_to_id, col_dict) + + sheets_client.add_pivot_tables( + sheets_id, names_to_id[SheetsEnum.GSA.value], idva.clicks(col_dict) + ) + + # Add formulas for some totals here + session_sum = FormulaBuilder(FormulaEnum.SUM, params=[Range("C2", "G2")]) + first_visit_sum = FormulaBuilder(FormulaEnum.SUM, params=[Range("C3", "G3")]) + + sheets_client.update_cell_value( + sheets_id, SheetsEnum.REKREWT.value, "A2", "Sessions" + ) # Sessions for each source label + sheets_client.update_cell_value( + sheets_id, SheetsEnum.REKREWT.value, "A3", "First Visits" + ) # First visits for each source label + sheets_client.update_cell_value( + sheets_id, SheetsEnum.REKREWT.value, "B1", "Total" + ) # Total of each event label + + sheets_client.update_cell_value( + sheets_id, SheetsEnum.REKREWT.value, "B2", session_sum.render() + ) # total value + sheets_client.update_cell_value( + sheets_id, SheetsEnum.REKREWT.value, "B3", first_visit_sum.render() + ) # total value + + +def facebook_pivot(sheets_id, names_to_id, col_dict): + sheets_client.update_cell_value( + sheets_id, 
SheetsEnum.REKREWT.value, "A5", "FACEBOOK" + ) # Pivot table Label + sheets_client.update_cell_value( + sheets_id, SheetsEnum.REKREWT.value, "C1", "FACEBOOK" + ) # Totals label + + sheets_client.add_pivot_tables( + sheets_id, + names_to_id[SheetsEnum.REKREWT.value], + idva.facebook(col_dict), + row_idx=5, + col_idx=0, + ) + + facebook_sessions = FormulaBuilder( + FormulaEnum.GET_PIVOT_DATA, + params=[ + StringLiteral("SUM of eventCount"), + "A6", + StringLiteral("eventName"), + StringLiteral("session_start"), + ], + ) + + facebook_visit = FormulaBuilder( + FormulaEnum.GET_PIVOT_DATA, + params=[ + StringLiteral("SUM of eventCount"), + "A6", + StringLiteral("eventName"), + StringLiteral("first_visit"), + ], + ) + + sheets_client.update_cell_value( + sheets_id, SheetsEnum.REKREWT.value, "C2", facebook_sessions.render() + ) + sheets_client.update_cell_value( + sheets_id, SheetsEnum.REKREWT.value, "C3", facebook_visit.render() + ) + + +def craigslist_pivot(sheets_id, names_to_id, col_dict): + sheets_client.update_cell_value( + sheets_id, SheetsEnum.REKREWT.value, "G5", "CRAIGSLIST" + ) # Pivot table Label + sheets_client.update_cell_value( + sheets_id, SheetsEnum.REKREWT.value, "D1", "CRAIGSLIST" + ) # Totals label + + sheets_client.add_pivot_tables( + sheets_id, + names_to_id[SheetsEnum.REKREWT.value], + idva.craigslist(col_dict), + row_idx=5, + col_idx=6, + ) + + craigslist_sessions = FormulaBuilder( + FormulaEnum.GET_PIVOT_DATA, + params=[ + StringLiteral("SUM of eventCount"), + "G6", + StringLiteral("eventName"), + StringLiteral("session_start"), + ], + ) + + craigslist_visit = FormulaBuilder( + FormulaEnum.GET_PIVOT_DATA, + params=[ + StringLiteral("SUM of eventCount"), + "G6", + StringLiteral("eventName"), + StringLiteral("first_visit"), + ], + ) + + sheets_client.update_cell_value( + sheets_id, SheetsEnum.REKREWT.value, "D2", craigslist_sessions.render() + ) + sheets_client.update_cell_value( + sheets_id, SheetsEnum.REKREWT.value, "D3", craigslist_visit.render() 
+ ) + + +def reddit_pivot(sheets_id, names_to_id, col_dict): + sheets_client.update_cell_value( + sheets_id, SheetsEnum.REKREWT.value, "L5", "REDDIT" + ) # Pivot table Label + sheets_client.update_cell_value( + sheets_id, SheetsEnum.REKREWT.value, "E1", "REDDIT" + ) # Totals label + + sheets_client.add_pivot_tables( + sheets_id, + names_to_id[SheetsEnum.REKREWT.value], + idva.reddit(col_dict), + row_idx=5, + col_idx=11, + ) + + reddit_sessions = FormulaBuilder( + FormulaEnum.GET_PIVOT_DATA, + params=[ + StringLiteral("SUM of eventCount"), + "L6", + StringLiteral("eventName"), + StringLiteral("session_start"), + ], + ) + + reddit_visit = FormulaBuilder( + FormulaEnum.GET_PIVOT_DATA, + params=[ + StringLiteral("SUM of eventCount"), + "L6", + StringLiteral("eventName"), + StringLiteral("first_visit"), + ], + ) + + sheets_client.update_cell_value( + sheets_id, SheetsEnum.REKREWT.value, "E2", reddit_sessions.render() + ) + sheets_client.update_cell_value( + sheets_id, SheetsEnum.REKREWT.value, "E3", reddit_visit.render() + ) + + +def twitter_x_pivot(sheets_id, names_to_id, col_dict): + sheets_client.update_cell_value( + sheets_id, SheetsEnum.REKREWT.value, "Q5", "TWITTER/X" + ) # Pivot table Label + sheets_client.update_cell_value( + sheets_id, SheetsEnum.REKREWT.value, "F1", "TWITTER/X" + ) # Totals label + + sheets_client.add_pivot_tables( + sheets_id, + names_to_id[SheetsEnum.REKREWT.value], + idva.twitter_x(col_dict), + row_idx=5, + col_idx=16, + ) + + twitter_x_sessions = FormulaBuilder( + FormulaEnum.GET_PIVOT_DATA, + params=[ + StringLiteral("SUM of eventCount"), + "Q6", + StringLiteral("eventName"), + StringLiteral("session_start"), + ], + ) + + twitter_x_visit = FormulaBuilder( + FormulaEnum.GET_PIVOT_DATA, + params=[ + StringLiteral("SUM of eventCount"), + "Q6", + StringLiteral("eventName"), + StringLiteral("first_visit"), + ], + ) + + sheets_client.update_cell_value( + sheets_id, SheetsEnum.REKREWT.value, "F2", twitter_x_sessions.render() + ) + 
sheets_client.update_cell_value( + sheets_id, SheetsEnum.REKREWT.value, "F3", twitter_x_visit.render() + ) + + +def linkedin_pivot(sheets_id, names_to_id, col_dict): + sheets_client.update_cell_value( + sheets_id, SheetsEnum.REKREWT.value, "V5", "LINKEDIN" + ) # Pivot table Label + sheets_client.update_cell_value( + sheets_id, SheetsEnum.REKREWT.value, "G1", "LINKEDIN" + ) # Totals label + + sheets_client.add_pivot_tables( + sheets_id, + names_to_id[SheetsEnum.REKREWT.value], + idva.linkedin(col_dict), + row_idx=5, + col_idx=21, + ) + + linkedin_sessions = FormulaBuilder( + FormulaEnum.GET_PIVOT_DATA, + params=[ + StringLiteral("SUM of eventCount"), + "V6", + StringLiteral("eventName"), + StringLiteral("session_start"), + ], + ) + + linkedin_visit = FormulaBuilder( + FormulaEnum.GET_PIVOT_DATA, + params=[ + StringLiteral("SUM of eventCount"), + "V6", + StringLiteral("eventName"), + StringLiteral("first_visit"), + ], + ) + + sheets_client.update_cell_value( + sheets_id, SheetsEnum.REKREWT.value, "G2", linkedin_sessions.render() + ) + sheets_client.update_cell_value( + sheets_id, SheetsEnum.REKREWT.value, "G3", linkedin_visit.render() + ) + + +def generate_filename(date: datetime, end_date: datetime = None): + """ + Return filename for the new spreadsheet to be saved as + + Args: + date (datetime): date to format + Return: + str: Formatted Date + """ + ret = date.strftime("%Y%m%d") + if end_date is not None and end_date != date: + ret += "-%s" % (end_date.strftime("%Y%m%d")) + return ret diff --git a/gdrive/idva/pivot_director.py b/gdrive/idva/pivot_director.py new file mode 100644 index 0000000..1e65ef7 --- /dev/null +++ b/gdrive/idva/pivot_director.py @@ -0,0 +1,274 @@ +from gdrive.sheets.builders import FormulaBuilder, PivotTableBuilder +from gdrive.sheets.types import ( + SortOrderEnum, + FilterTypeEnum, + FormulaEnum, + UserEnteredValue, + SummarizeFunctionEnum, + StringLiteral, +) + + +class IDVAPivotDirector: + def clicks(self, col_dict: dict) -> dict: + 
builder = PivotTableBuilder(0, col_dict) + builder.add_row("eventName", sortOrder=SortOrderEnum.ASCENDING) + builder.add_row("eventCount", sortOrder=SortOrderEnum.ASCENDING) + builder.add_value("eventCount", SummarizeFunctionEnum.SUM) + return builder.render() + + def facebook(self, col_dict: dict) -> dict: + builder = PivotTableBuilder(0, col_dict) + builder.add_row("eventName", SortOrderEnum.ASCENDING, show_totals=False) + builder.add_row("firstUserMedium", SortOrderEnum.ASCENDING) + builder.add_row("firstUserSource", SortOrderEnum.ASCENDING) + builder.add_row( + "firstUserCampaignName", SortOrderEnum.ASCENDING, show_totals=False + ) + + builder.add_value("eventCount", SummarizeFunctionEnum.SUM) + + # =OR(regexmatch(firstUserSource,"facebook"),regexmatch(firstUserSource,"fb.com"), regexmatch(firstUserMedium,"fb")) + facebook = FormulaBuilder( + FormulaEnum.OR, + [ + FormulaBuilder( + FormulaEnum.REGEX_MATCH, + ["firstUserSource", StringLiteral("facebook")], + ), + FormulaBuilder( + FormulaEnum.REGEX_MATCH, + ["firstUserSource", StringLiteral("fb.com")], + ), + FormulaBuilder( + FormulaEnum.REGEX_MATCH, ["firstUserMedium", StringLiteral("fb")] + ), + ], + ) + + # =OR(regexmatch(eventName,"session_start"),regexmatch(eventName,"first_visit")) + sessions = FormulaBuilder( + FormulaEnum.OR, + [ + FormulaBuilder( + FormulaEnum.REGEX_MATCH, + ["eventName", StringLiteral("session_start")], + ), + FormulaBuilder( + FormulaEnum.REGEX_MATCH, ["eventName", StringLiteral("first_visit")] + ), + ], + ) + + builder.add_filter( + "firstUserSource", + FilterTypeEnum.CUSTOM, + [UserEnteredValue(facebook.render())], + ) + builder.add_filter( + "eventName", FilterTypeEnum.CUSTOM, [UserEnteredValue(sessions.render())] + ) + + return builder.render() + + def craigslist(self, col_dict: dict) -> dict: + builder = PivotTableBuilder(0, col_dict) + builder.add_row("eventName", SortOrderEnum.ASCENDING, show_totals=False) + builder.add_row("firstUserMedium", SortOrderEnum.ASCENDING) + 
builder.add_row("firstUserSource", SortOrderEnum.ASCENDING) + builder.add_value("eventCount", SummarizeFunctionEnum.SUM) + + # =OR(regexmatch(firstUserSource,"craigslist"), regexmatch(firstUserMedium,"cl")) + craigslist = FormulaBuilder( + FormulaEnum.OR, + [ + FormulaBuilder( + FormulaEnum.REGEX_MATCH, + ["firstUserSource", StringLiteral("craigslist")], + ), + FormulaBuilder( + FormulaEnum.REGEX_MATCH, ["firstUserMedium", StringLiteral("cl")] + ), + ], + ) + + # =OR(regexmatch(eventName,"session_start"),regexmatch(eventName,"first_visit")) + sessions = FormulaBuilder( + FormulaEnum.OR, + [ + FormulaBuilder( + FormulaEnum.REGEX_MATCH, + ["eventName", StringLiteral("session_start")], + ), + FormulaBuilder( + FormulaEnum.REGEX_MATCH, ["eventName", StringLiteral("first_visit")] + ), + ], + ) + + builder.add_filter( + "firstUserSource", + FilterTypeEnum.CUSTOM, + values=[UserEnteredValue(craigslist.render())], + ) + builder.add_filter( + "eventName", + FilterTypeEnum.CUSTOM, + values=[UserEnteredValue(sessions.render())], + ) + + return builder.render() + + def reddit(self, col_dict: dict) -> dict: + builder = PivotTableBuilder(0, col_dict) + builder.add_row("eventName", SortOrderEnum.ASCENDING, show_totals=False) + builder.add_row("firstUserMedium", SortOrderEnum.ASCENDING) + builder.add_row("firstUserSource", SortOrderEnum.ASCENDING) + builder.add_value("eventCount", SummarizeFunctionEnum.SUM) + + # =OR(regexmatch(firstUserSource,"reddit"),regexmatch(firstUserSource,"redd.it"), regexmatch(firstUserMedium,"rd")) + reddit = FormulaBuilder( + FormulaEnum.OR, + [ + FormulaBuilder( + FormulaEnum.REGEX_MATCH, + ["firstUserSource", StringLiteral("reddit")], + ), + FormulaBuilder( + FormulaEnum.REGEX_MATCH, + ["firstUserSource", StringLiteral("redd.it")], + ), + FormulaBuilder( + FormulaEnum.REGEX_MATCH, ["firstUserMedium", StringLiteral("rd")] + ), + ], + ) + + # =OR(regexmatch(eventName,"session_start"),regexmatch(eventName,"first_visit")) + sessions = FormulaBuilder( + 
FormulaEnum.OR, + [ + FormulaBuilder( + FormulaEnum.REGEX_MATCH, + ["eventName", StringLiteral("session_start")], + ), + FormulaBuilder( + FormulaEnum.REGEX_MATCH, ["eventName", StringLiteral("first_visit")] + ), + ], + ) + + builder.add_filter( + "firstUserSource", + FilterTypeEnum.CUSTOM, + values=[UserEnteredValue(reddit.render())], + ) + builder.add_filter( + "eventName", + FilterTypeEnum.CUSTOM, + values=[UserEnteredValue(sessions.render())], + ) + + return builder.render() + + def twitter_x(self, col_dict: dict) -> dict: + builder = PivotTableBuilder(0, col_dict) + builder.add_row("eventName", SortOrderEnum.ASCENDING, show_totals=False) + builder.add_row("firstUserMedium", SortOrderEnum.ASCENDING) + builder.add_row("firstUserSource", SortOrderEnum.ASCENDING) + builder.add_value("eventCount", SummarizeFunctionEnum.SUM) + + # =OR(regexmatch(firstUserSource,"twitter"),regexmatch(firstUserSource,"x.com"), regexmatch(firstUserMedium,"tx")) + + twitter = FormulaBuilder( + FormulaEnum.OR, + [ + FormulaBuilder( + FormulaEnum.REGEX_MATCH, + ["firstUserSource", StringLiteral("twitter")], + ), + FormulaBuilder( + FormulaEnum.REGEX_MATCH, ["firstUserSource", StringLiteral("x.com")] + ), + FormulaBuilder( + FormulaEnum.REGEX_MATCH, ["firstUserMedium", StringLiteral("tx")] + ), + ], + ) + + sessions = FormulaBuilder( + FormulaEnum.OR, + [ + FormulaBuilder( + FormulaEnum.REGEX_MATCH, + ["eventName", StringLiteral("session_start")], + ), + FormulaBuilder( + FormulaEnum.REGEX_MATCH, ["eventName", StringLiteral("first_visit")] + ), + ], + ) + + builder.add_filter( + "firstUserSource", + FilterTypeEnum.CUSTOM, + values=[UserEnteredValue(twitter.render())], + ) + builder.add_filter( + "eventName", + FilterTypeEnum.CUSTOM, + values=[UserEnteredValue(sessions.render())], + ) + + return builder.render() + + def linkedin(self, col_dict: dict) -> dict: + builder = PivotTableBuilder(0, col_dict) + builder.add_row("eventName", SortOrderEnum.ASCENDING, show_totals=False) + 
builder.add_row("firstUserMedium", SortOrderEnum.ASCENDING) + builder.add_row("firstUserSource", SortOrderEnum.ASCENDING) + builder.add_value("eventCount", SummarizeFunctionEnum.SUM) + + # =OR(regexmatch(firstUserSource,"linkedin.com"),regexmatch(firstUserSource,"lnkd.in"), regexmatch(firstUserMedium,"ln")) + linkedin = FormulaBuilder( + FormulaEnum.OR, + [ + FormulaBuilder( + FormulaEnum.REGEX_MATCH, + ["firstUserSource", StringLiteral("linkedin.com")], + ), + FormulaBuilder( + FormulaEnum.REGEX_MATCH, + ["firstUserSource", StringLiteral("lnkd.in")], + ), + FormulaBuilder( + FormulaEnum.REGEX_MATCH, ["firstUserMedium", StringLiteral("ln")] + ), + ], + ) + + sessions = FormulaBuilder( + FormulaEnum.OR, + [ + FormulaBuilder( + FormulaEnum.REGEX_MATCH, + ["eventName", StringLiteral("session_start")], + ), + FormulaBuilder( + FormulaEnum.REGEX_MATCH, ["eventName", StringLiteral("first_visit")] + ), + ], + ) + + builder.add_filter( + "firstUserSource", + FilterTypeEnum.CUSTOM, + values=[UserEnteredValue(linkedin.render())], + ) + builder.add_filter( + "eventName", + FilterTypeEnum.CUSTOM, + values=[UserEnteredValue(sessions.render())], + ) + + return builder.render() diff --git a/gdrive/idva_flow_analytics.py b/gdrive/idva_flow_analytics.py deleted file mode 100644 index 2b513a5..0000000 --- a/gdrive/idva_flow_analytics.py +++ /dev/null @@ -1,622 +0,0 @@ -import datetime -import pandas as pd -import logging - -from gdrive import settings, sheets_client, drive_client, analytics_client - -log = logging.getLogger(__name__) - - -def create_report(start_date: datetime, end_date: datetime): - response = analytics_client.download( - settings.ANALYTICS_PROPERTY_ID, start_date, end_date - ) - - analytics_df = analytics_client.create_df_from_analytics_response(response) - sheets_id = export(analytics_df, start_date, end_date) - create_pages_and_pivot_tables(analytics_df, sheets_id=sheets_id) - - -def export( - df: pd.DataFrame, date_of_report: datetime, end_date: datetime = None 
-) -> str: - """ - Transform the downloaded response from the google analytics API into a - Google Sheets Object. - - This function first touches a Google Sheets object with the drive API, then - writes the analytics data to that object. As of right now there is no way to do - this in one API transaction. - - Args: - df (pandas.DataFrame): Tabular data to export to Google Sheets object - date_of_report (datetime): Date the report was run - Returns: - str: Google Sheets ID of the new Sheets object - """ - filename_str = generate_filename(date_of_report, end_date) - analytics_folder_id = drive_client.create_folder( - "Google Analytics", parent_id=settings.ANALYTICS_ROOT - ) - - # We have to do this in multiple steps with more than one client because the Sheets API - # doesnt support opening a file in a given directory. - sheets_id = drive_client.create_empty_spreadsheet(filename_str, analytics_folder_id) - log.info("Uploading to folder %s (%s)" % ("Google Analytics", analytics_folder_id)) - result = sheets_client.export_df_to_gdrive_speadsheet(df, sheets_id) - log.info( - "Successfully created %s (%s)" % (filename_str, result.get("spreadsheetId")) - ) - return sheets_id - - -def create_pages_and_pivot_tables(df: pd.DataFrame, sheets_id: str): - """ - Add new pages and pivot tables. - - This function is fairly naive and inefficient. If we ever want to make Google Sheets - more often than once a day, we should refactor this to limit the number of API transactions. 
- - Args: - df (pandas.DataFrame): Tabular data in the spreadsheet - sheets_id (str): Google Sheets object ID - """ - - page1 = "Rekrewt Pivot Table - First Visit" - page2 = "Rekrewt Pivot Table - Sessions" - page3 = "GSA Use Pivot Table" - page4 = "Conversions" - - new_sheet_name_to_id = sheets_client.add_new_pages( - [page1, page2, page3, page4], sheets_id - ) - log.info("Added %s pages to %s" % (len(new_sheet_name_to_id.keys()), sheets_id)) - create_pivot_tables( - df, (page1, page2, page3, page4), new_sheet_name_to_id, sheets_id - ) - - -def create_pivot_tables( - df: pd.DataFrame, page_names: (str, str, str), names_to_id: dict, sheets_id: str -): - # Make a dictionary mapping the name of the column to its index, useful for the pivot tables. - col_dict = {} - for idx, val in enumerate(df.iloc[0]): - col_dict[val] = idx - - create_first_visit_pt(sheets_id, names_to_id[page_names[0]], col_dict) - log.info( - "Added 3 pivot tables to %s (%s)" % (page_names[0], names_to_id[page_names[0]]) - ) - - create_session_start_pt(sheets_id, names_to_id[page_names[1]], col_dict) - log.info( - "Added 3 pivot tables to %s (%s)" % (page_names[1], names_to_id[page_names[1]]) - ) - - create_clicks_pt(sheets_id, names_to_id[page_names[2]], col_dict) - log.info( - "Added pivot table to %s (%s)" % (page_names[2], names_to_id[page_names[2]]) - ) - - create_feedback_pt(sheets_id, names_to_id[page_names[3]], col_dict) - log.info( - "Added pivot table to %s (%s)" % (page_names[3], names_to_id[page_names[3]]) - ) - - sheets_client.update_cell_value( - sheets_id, page_names[0], "A17", "Total First Visits" - ) - sheets_client.update_cell_value( - sheets_id, - page_names[0], - "A18", - '=GETPIVOTDATA("SUM of eventCount",A1, "eventName", "first_visit") + GETPIVOTDATA("SUM of eventCount",F1, "eventName", "first_visit") + GETPIVOTDATA("SUM of eventCount",K1, "eventName", "first_visit")', - ) - log.info("Wrote totals to %s" % (page_names[0])) - - sheets_client.update_cell_value(sheets_id, 
page_names[1], "A17", "Total Sessions") - sheets_client.update_cell_value( - sheets_id, - page_names[1], - "A18", - '=GETPIVOTDATA("SUM of eventCount",A1, "eventName", "session_start") + GETPIVOTDATA("SUM of eventCount",F1, "eventName", "session_start") + GETPIVOTDATA("SUM of eventCount",K1, "eventName", "session_start")', - ) - log.info("Wrote totals to %s" % (page_names[1])) - - -def create_first_visit_pt(sheets_id, page_id, col_dict): - first_visit_facebook_pt_def = { - "pivotTable": { - "source": { - # First Sheet (Sheet1) is always ID 0 - "sheetId": 0, - }, - "rows": [ - { - "sourceColumnOffset": col_dict["eventName"], - "showTotals": True, - "sortOrder": "ASCENDING", - }, - { - "sourceColumnOffset": col_dict["firstUserSource"], - "showTotals": True, - "sortOrder": "ASCENDING", - }, - { - "sourceColumnOffset": col_dict["eventCount"], - "showTotals": True, - "sortOrder": "ASCENDING", - }, - ], - "filterSpecs": [ - { - "filterCriteria": { - "condition": { - "type": "TEXT_CONTAINS", - "values": [ - { - "userEnteredValue": "first_visit", - } - ], - }, - "visibleByDefault": True, - }, - "columnOffsetIndex": col_dict["eventName"], - }, - { - "filterCriteria": { - "condition": { - "type": "TEXT_CONTAINS", - "values": [ - { - "userEnteredValue": "facebook", - }, - ], - }, - "visibleByDefault": True, - }, - "columnOffsetIndex": col_dict["firstUserSource"], - }, - ], - "values": [ - { - "summarizeFunction": "SUM", - "sourceColumnOffset": col_dict["eventCount"], - } - ], - "valueLayout": "HORIZONTAL", - } - } - first_visit_rt_pt_def = { - "pivotTable": { - "source": { - # First Sheet (Sheet1) is always ID 0 - "sheetId": 0, - }, - "rows": [ - { - "sourceColumnOffset": col_dict["eventName"], - "showTotals": True, - "sortOrder": "ASCENDING", - }, - { - "sourceColumnOffset": col_dict["firstUserSource"], - "showTotals": True, - "sortOrder": "ASCENDING", - }, - { - "sourceColumnOffset": col_dict["eventCount"], - "showTotals": True, - "sortOrder": "ASCENDING", - }, - ], - 
"filterSpecs": [ - { - "filterCriteria": { - "condition": { - "type": "TEXT_CONTAINS", - "values": [ - { - "userEnteredValue": "first_visit", - } - ], - }, - "visibleByDefault": True, - }, - "columnOffsetIndex": col_dict["eventName"], - }, - { - "filterCriteria": { - "condition": { - "type": "TEXT_EQ", - "values": [ - { - "userEnteredValue": "rt", - }, - ], - }, - "visibleByDefault": True, - }, - "columnOffsetIndex": col_dict["firstUserSource"], - }, - ], - "values": [ - { - "summarizeFunction": "SUM", - "sourceColumnOffset": col_dict["eventCount"], - } - ], - "valueLayout": "HORIZONTAL", - } - } - first_visit_craigslist_pt_def = { - "pivotTable": { - "source": { - # First Sheet (Sheet1) is always ID 0 - "sheetId": 0, - }, - "rows": [ - { - "sourceColumnOffset": col_dict["eventName"], - "showTotals": True, - "sortOrder": "ASCENDING", - }, - { - "sourceColumnOffset": col_dict["firstUserSource"], - "showTotals": True, - "sortOrder": "ASCENDING", - }, - { - "sourceColumnOffset": col_dict["eventCount"], - "showTotals": True, - "sortOrder": "ASCENDING", - }, - ], - "filterSpecs": [ - { - "filterCriteria": { - "condition": { - "type": "TEXT_CONTAINS", - "values": [ - { - "userEnteredValue": "first_visit", - } - ], - }, - "visibleByDefault": True, - }, - "columnOffsetIndex": col_dict["eventName"], - }, - { - "filterCriteria": { - "condition": { - "type": "TEXT_CONTAINS", - "values": [ - { - "userEnteredValue": "craigslist", - }, - ], - }, - "visibleByDefault": True, - }, - "columnOffsetIndex": col_dict["firstUserSource"], - }, - ], - "values": [ - { - "summarizeFunction": "SUM", - "sourceColumnOffset": col_dict["eventCount"], - } - ], - "valueLayout": "HORIZONTAL", - } - } - - sheets_client.add_pivot_tables( - sheets_id, - page_id, - first_visit_facebook_pt_def, - ) - sheets_client.add_pivot_tables( - sheets_id, - page_id, - first_visit_rt_pt_def, - row_idx=0, - col_idx=5, - ) - sheets_client.add_pivot_tables( - sheets_id, page_id, first_visit_craigslist_pt_def, 
row_idx=0, col_idx=10 - ) - - -def create_session_start_pt(sheets_id, page_id, col_dict): - # Add sessions pivot table, facebook - sessions_facebook_pt_def = { - "pivotTable": { - "source": { - # First Sheet (Sheet1) is always ID 0 - "sheetId": 0, - }, - "rows": [ - { - "sourceColumnOffset": col_dict["eventName"], - "showTotals": True, - "sortOrder": "ASCENDING", - }, - { - "sourceColumnOffset": col_dict["firstUserSource"], - "showTotals": True, - "sortOrder": "ASCENDING", - }, - { - "sourceColumnOffset": col_dict["eventCount"], - "showTotals": True, - "sortOrder": "ASCENDING", - }, - ], - "filterSpecs": [ - { - "filterCriteria": { - "condition": { - "type": "TEXT_CONTAINS", - "values": [ - { - "userEnteredValue": "session_start", - } - ], - }, - "visibleByDefault": True, - }, - "columnOffsetIndex": col_dict["eventName"], - }, - { - "filterCriteria": { - "condition": { - "type": "TEXT_CONTAINS", - "values": [ - { - "userEnteredValue": "facebook", - }, - ], - }, - "visibleByDefault": True, - }, - "columnOffsetIndex": col_dict["firstUserSource"], - }, - ], - "values": [ - { - "summarizeFunction": "SUM", - "sourceColumnOffset": col_dict["eventCount"], - } - ], - "valueLayout": "HORIZONTAL", - } - } - sessions_rt_pt_def = { - "pivotTable": { - "source": { - # First Sheet (Sheet1) is always ID 0 - "sheetId": 0, - }, - "rows": [ - { - "sourceColumnOffset": col_dict["eventName"], - "showTotals": True, - "sortOrder": "ASCENDING", - }, - { - "sourceColumnOffset": col_dict["firstUserSource"], - "showTotals": True, - "sortOrder": "ASCENDING", - }, - { - "sourceColumnOffset": col_dict["eventCount"], - "showTotals": True, - "sortOrder": "ASCENDING", - }, - ], - "filterSpecs": [ - { - "filterCriteria": { - "condition": { - "type": "TEXT_CONTAINS", - "values": [ - { - "userEnteredValue": "session_start", - } - ], - }, - "visibleByDefault": True, - }, - "columnOffsetIndex": col_dict["eventName"], - }, - { - "filterCriteria": { - "condition": { - "type": "TEXT_EQ", - "values": [ - 
{ - "userEnteredValue": "rt", - }, - ], - }, - "visibleByDefault": True, - }, - "columnOffsetIndex": col_dict["firstUserSource"], - }, - ], - "values": [ - { - "summarizeFunction": "SUM", - "sourceColumnOffset": col_dict["eventCount"], - } - ], - "valueLayout": "HORIZONTAL", - } - } - sessions_craigslist_pt_def = { - "pivotTable": { - "source": { - # First Sheet (Sheet1) is always ID 0 - "sheetId": 0, - }, - "rows": [ - { - "sourceColumnOffset": col_dict["eventName"], - "showTotals": True, - "sortOrder": "ASCENDING", - }, - { - "sourceColumnOffset": col_dict["firstUserSource"], - "showTotals": True, - "sortOrder": "ASCENDING", - }, - { - "sourceColumnOffset": col_dict["eventCount"], - "showTotals": True, - "sortOrder": "ASCENDING", - }, - ], - "filterSpecs": [ - { - "filterCriteria": { - "condition": { - "type": "TEXT_CONTAINS", - "values": [ - { - "userEnteredValue": "session_start", - } - ], - }, - "visibleByDefault": True, - }, - "columnOffsetIndex": col_dict["eventName"], - }, - { - "filterCriteria": { - "condition": { - "type": "TEXT_CONTAINS", - "values": [ - { - "userEnteredValue": "craigslist", - }, - ], - }, - "visibleByDefault": True, - }, - "columnOffsetIndex": col_dict["firstUserSource"], - }, - ], - "values": [ - { - "summarizeFunction": "SUM", - "sourceColumnOffset": col_dict["eventCount"], - } - ], - "valueLayout": "HORIZONTAL", - } - } - - sheets_client.add_pivot_tables(sheets_id, page_id, sessions_facebook_pt_def) - sheets_client.add_pivot_tables( - sheets_id, page_id, sessions_rt_pt_def, row_idx=0, col_idx=5 - ) - sheets_client.add_pivot_tables( - sheets_id, page_id, sessions_craigslist_pt_def, row_idx=0, col_idx=10 - ) - - -def create_clicks_pt(sheets_id, page_id, col_dict): - clicks_pt_def = { - "pivotTable": { - "source": { - # First Sheet (Sheet1) is always ID 0 - "sheetId": 0, - }, - "rows": [ - { - "sourceColumnOffset": col_dict["eventName"], - "showTotals": True, - "sortOrder": "ASCENDING", - }, - { - "sourceColumnOffset": 
from abc import ABC, abstractmethod
from typing import Any, Optional

from gdrive.sheets.types import (
    FormulaEnum,
    SortOrderEnum,
    SummarizeFunctionEnum,
    ValueLayoutEnum,
    FilterTypeEnum,
    AbstractScaffold,
)


class AbstractBuilder(ABC):
    """
    Base interface for builders that produce Google Sheets API payloads.

    Subclasses render either a formula string (FormulaBuilder) or a
    request-scaffold dict (PivotTableBuilder).
    """

    @abstractmethod
    def render(self):
        """Return the built artifact (a formula string or a request dict)."""


class FormulaBuilder(AbstractBuilder):
    """
    Builds a spreadsheet formula of the form ``=OP(p1,p2,...)``.

    Parameters may themselves be FormulaBuilder instances (formulas nest),
    StringLiteral/Range helpers, or any object with a meaningful __str__.
    """

    def __init__(self, op: FormulaEnum, params: list):
        """
        Args:
            op (FormulaEnum): formula/function to apply
            params (list): positional arguments of the formula; each one is
                rendered with str()
        """
        self.params = params
        self.op = op

    def __str__(self):
        # "OP(a,b,c)" without the leading "=" so formulas can be nested
        return "%s(%s)" % (self.op.value, ",".join([str(x) for x in self.params]))

    def render(self) -> str:
        # Only the outermost formula carries the "=" prefix
        return "=%s" % str(self)


class PivotTableBuilder(AbstractBuilder):
    """
    Incrementally builds a Google Sheets ``pivotTable`` request scaffold.

    Source columns can be referenced by name (through the ``col_lookup``
    mapping of column name -> zero-based column offset) or directly by
    offset via the ``*_from_offset`` variants.
    """

    def __init__(self, source_sheet_id: int = 0, col_lookup: Optional[dict] = None) -> None:
        """
        Args:
            source_sheet_id (int): sheet the pivot table reads from.
                First Sheet (Sheet1) is always ID 0.
            col_lookup (dict | None): mapping of column name to source column
                offset; required only for the name-based add_* methods
        """
        self.source_sheet_id = source_sheet_id
        self.columns = col_lookup
        self.__pivot_scaffold = {
            "pivotTable": {
                "source": {
                    # First Sheet (Sheet1) is always ID 0
                    "sheetId": source_sheet_id,
                },
            }
        }

    def __get_pivot_value(self, field: str) -> Optional[Any]:
        # None signals "field not set yet"; callers lazily create lists.
        # (Fixed annotation: the old `dict or None` evaluated to plain `dict`.)
        if field not in self.__pivot_scaffold["pivotTable"]:
            return None

        return self.__pivot_scaffold["pivotTable"][field]

    def __set_pivot_value(self, field: str, value: Any) -> None:
        self.__pivot_scaffold["pivotTable"][field] = value

    def __get_column_id(self, column_name: str) -> int:
        # Fail loudly with a helpful message instead of the bare TypeError
        # that `column_name in None` would raise when no lookup was given.
        if not self.columns:
            raise ValueError(
                "No column lookup configured; use the *_from_offset methods"
            )
        if column_name not in self.columns:
            raise ValueError("Column name %s does not exist" % (column_name))

        return self.columns[column_name]

    def add_row(
        self,
        source_col: str,
        sortOrder: SortOrderEnum,
        show_totals: bool = True,
    ) -> None:
        """Append a row grouping keyed by column name."""
        col_idx = self.__get_column_id(source_col)
        self.add_row_from_offset(col_idx, sortOrder, show_totals)

    def add_row_from_offset(
        self,
        source_col_offset: int,
        sortOrder: SortOrderEnum,
        show_totals: bool = True,
    ) -> None:
        """Append a row grouping keyed by zero-based column offset."""
        if self.__get_pivot_value("rows") is None:
            self.__set_pivot_value("rows", [])

        self.__get_pivot_value("rows").append(
            {
                "sourceColumnOffset": source_col_offset,
                "showTotals": show_totals,
                "sortOrder": sortOrder.value,
            }
        )

    def add_value(
        self,
        source_col: str,
        summarize_func: SummarizeFunctionEnum,
    ) -> None:
        """Append a summarized value column, keyed by column name."""
        col_idx = self.__get_column_id(source_col)
        self.add_value_from_offset(col_idx, summarize_func)

    def add_value_from_offset(
        self,
        source_col_offset: int,
        summarize_func: SummarizeFunctionEnum,
    ) -> None:
        """Append a summarized value column, keyed by column offset."""
        if self.__get_pivot_value("values") is None:
            self.__set_pivot_value("values", [])

        self.__get_pivot_value("values").append(
            {
                "summarizeFunction": summarize_func.value,
                "sourceColumnOffset": source_col_offset,
            }
        )

    def add_filter(
        self,
        source_col: str,
        filter_type: FilterTypeEnum,
        values: [AbstractScaffold],
        visible_by_default: bool = True,
    ) -> None:
        """Append a filter criterion, keyed by column name."""
        col_idx = self.__get_column_id(source_col)
        self.add_filter_from_offset(col_idx, filter_type, values, visible_by_default)

    def add_filter_from_offset(
        self,
        source_col_offset: int,  # fixed: was mis-annotated as str
        filter_type: FilterTypeEnum,
        values: [AbstractScaffold],
        visible_by_default: bool = True,
    ) -> None:
        """Append a filter criterion, keyed by column offset."""
        if self.__get_pivot_value("filterSpecs") is None:
            self.__set_pivot_value("filterSpecs", [])

        self.__get_pivot_value("filterSpecs").append(
            {
                "filterCriteria": {
                    "condition": {
                        "type": filter_type.value,
                        "values": [val.get_scaffold() for val in values],
                    },
                    "visibleByDefault": visible_by_default,
                },
                "columnOffsetIndex": source_col_offset,
            }
        )

    def set_value_layout(
        self, value_layout: ValueLayoutEnum = ValueLayoutEnum.HORIZONTAL
    ) -> None:
        """Set how summarized values are laid out in the pivot table."""
        self.__set_pivot_value("valueLayout", value_layout.value)

    def render(self) -> dict:
        """Return the Google Sheets API conforming pivot-table scaffold."""
        return self.__pivot_scaffold

    def reset(self) -> None:
        # Re-initialize in place. Fixed: the column lookup is preserved;
        # previously it was silently dropped, breaking name-based add_* calls
        # after a reset.
        self.__init__(self.source_sheet_id, self.columns)
"""
This types module provides intellisense and strong(er) typing
when using the Pivot and Formula builder interfaces. If a function is
needed and the enums below do not support it, please add it to keep this
module up to date and useful.
"""
# NOTE: the docstring is placed before the imports so it is bound to
# __doc__; previously it was a no-op bare string after the import block.

from abc import ABC, abstractmethod
from enum import Enum


class FilterTypeEnum(str, Enum):
    """
    FilterType provides filter function names for use with the Pivot and
    Formula builders
    """

    TEXT_CONTAINS = "TEXT_CONTAINS"
    TEXT_EQUALS = "TEXT_EQ"
    CUSTOM = "CUSTOM_FORMULA"


class SortOrderEnum(str, Enum):
    """
    SortOrderEnum provides sort order names for use with the Pivot and
    Formula builders
    """

    ASCENDING = "ASCENDING"
    DESCENDING = "DESCENDING"


class SummarizeFunctionEnum(str, Enum):
    """
    Summarize provides summary functions for use with the
    Pivot and Formula builders
    """

    SUM = "SUM"


class ValueLayoutEnum(str, Enum):
    """
    ValueLayout provides value layout names for use with the Pivot and
    Formula builders
    """

    HORIZONTAL = "HORIZONTAL"


class FormulaEnum(str, Enum):
    """
    FormulaEnum provides formula names for use with the Pivot and Formula
    builders
    """

    NOOP = ""
    OR = "OR"
    SUM = "sum"
    REGEX_MATCH = "regexmatch"
    GET_PIVOT_DATA = "GETPIVOTDATA"


class StringLiteral:
    """
    StringLiteral allows string values to be rendered as literal strings in
    the resulting Function or Pivot scaffold. i.e. "Hello world!" is a
    string literal.

    Use string literals where the use of a token (eventName) is not desired
    in the Function or Pivot scaffold.
    """

    def __init__(self, value) -> None:
        self.value = value

    def __str__(self) -> str:
        # Wrap in double quotes so the sheets formula parser treats the
        # value as literal text rather than a token/reference.
        return '"%s"' % (self.value)


class Range:
    """
    Range provides a scaffold for a range string between two string
    values. (i.e. A2:G5)
    """

    def __init__(self, a, b) -> None:
        self.a = a
        self.b = b

    def __str__(self) -> str:
        return "%s:%s" % (self.a, self.b)


class AbstractScaffold(ABC):
    """
    Client code may use classes like UserEnteredValue to add scaffolds to
    filter criterion builders. This supports cases where filter criterion
    may have large numbers of acceptable scaffolds that may be arbitrarily
    combined. We use AbstractScaffold to ensure type safety in the
    arbitrary lists.

    Args:
        ABC (AbstractBaseClass): ABC provided base class for Abstract classes
    """

    @abstractmethod
    def __init__(self):
        pass

    @abstractmethod
    def get_scaffold(self) -> dict:
        """
        Returns internal scaffold

        Returns:
            dict: this dict represents Google Sheets API conforming JSON
        """
        pass


class UserEnteredValue(AbstractScaffold):
    """
    User Entered Values are parsed by the Google Sheets API as if
    the end user typed the value in from the keyboard, and are subject to
    the same pre-processing as in such cases. This can be helpful when
    trying to emulate the side effects of manually entering in values, as
    one might when attempting to recreate a pivot table programmatically.

    Args:
        AbstractScaffold (ABC): Sets interface for AbstractScaffold classes
    """

    def __init__(self, value):
        """
        Scaffolds a value, as if the user typed it into a spreadsheet

        Args:
            value (ANY): Desired Value. toString is called to encode
            this value into the internal scaffold
        """
        self.__scaffold = {"userEnteredValue": str(value)}

    def get_scaffold(self) -> dict:
        return self.__scaffold