From 0c93518a75f7d738f4e84f1b240022252fe232d0 Mon Sep 17 00:00:00 2001 From: Nathan Moore Date: Wed, 11 Oct 2023 14:10:58 -0400 Subject: [PATCH 01/17] Adding Analytics API to gdrive --- gdrive/analytics_api.py | 179 ++++++++ gdrive/analytics_client.py | 109 +++++ gdrive/api.py | 16 +- gdrive/{client.py => drive_client.py} | 69 +-- gdrive/export_api.py | 15 +- gdrive/export_client.py | 155 +------ gdrive/main.py | 3 +- gdrive/settings.py | 5 + gdrive/sheets_client.py | 601 ++++++++++++++++++++++++++ requirements.txt | 91 +++- 10 files changed, 1017 insertions(+), 226 deletions(-) create mode 100644 gdrive/analytics_api.py create mode 100644 gdrive/analytics_client.py rename gdrive/{client.py => drive_client.py} (80%) create mode 100644 gdrive/sheets_client.py diff --git a/gdrive/analytics_api.py b/gdrive/analytics_api.py new file mode 100644 index 0000000..4542d30 --- /dev/null +++ b/gdrive/analytics_api.py @@ -0,0 +1,179 @@ +""" +Google Analytics Rest API +""" + +from datetime import datetime +import logging + +import fastapi +from pydantic import BaseModel +from fastapi import BackgroundTasks, responses +import pandas as pd + +from gdrive import error, settings, analytics_client, sheets_client, drive_client + +log = logging.getLogger(__name__) +router = fastapi.APIRouter() + + +class AnalyticsRequest(BaseModel): + startDate: str = None + endDate: str = None + + +@router.post("/analytics") +async def run_analytics(background_tasks: BackgroundTasks): + if settings.ANALYTICS: + background_tasks.add_task(run_analytics_task, datetime.today(), None) + return responses.JSONResponse( + status_code=202, + content="Analytics request for %s is being processed." + % (datetime.date(datetime.today())), + ) + + else: + return responses.JSONResponse( + status_code=409, + content="Request is good, however the client has requested a resource that is unavailable at this time.", + ) + + +@router.post("/analytics/daterange") +async def run_analytics(background_tasks: BackgroundTasks, req: AnalyticsRequest): + try: + date_format = "%Y-%m-%d" + start_date = datetime.strptime(req.startDate, date_format) + end_date = datetime.strptime(req.endDate, date_format) + except ValueError as _: + return responses.JSONResponse( + status_code=400, + content="Failed (invalid date parameters): %s, %s" + % (req.start_date, req.end_date), + ) + + if settings.ANALYTICS: + background_tasks.add_task(run_analytics_task, start_date, end_date) + return responses.JSONResponse( + status_code=202, + content="Analytics request for %s - %s is being processed." + % (datetime.date(start_date), datetime.date(end_date)), + ) + else: + return responses.JSONResponse( + status_code=409, + content="Request is good, however the client has requested a resource that is unavailable at this time.", + ) + + +@router.post("/analytics/list") +async def list_accounts(backgroud_tasks: BackgroundTasks): + if settings.ANALYTICS: + backgroud_tasks.add_task(list_accounts_task) + return responses.JSONResponse( + status_code=202, content="List request is being processed." 
+ ) + else: + return responses.JSONResponse( + status_code=409, + content="Request is good, however the client has requested a resource that is unavailable at this time.", + ) + + +async def run_analytics_task(start_date: datetime, end_date: datetime): + try: + analytics_df = analytics_client.download( + settings.ANALYTICS_PROPERTY_ID, start_date, end_date + ) + sheets_id = export(analytics_df, start_date, end_date) + do_analytics_export_post_processing(analytics_df, sheets_id=sheets_id) + except Exception as e: + log.error(e.args) + + +async def list_accounts_task(): + try: + list_response = analytics_client.list() + if list_response is not None: + log.info("-------------------------------") + for act in list_response.accounts: + log.info("Name:\t\t%s" % (act.name)) + log.info("Display name:\t%s" % (act.display_name)) + log.info("-------------------------------") + else: + log.warn( + "List response was none. Ensure credentials are set correctly" + + " and you have access to the cloud property." + ) + except Exception as e: + log.error(e.args) + + +def export( + df: pd.DataFrame, date_of_report: datetime, end_date: datetime = None +) -> str: + """ + Transform the downloaded response from the google analytics API into a + Google Sheets Object. + + This function first touches a Google Sheets object with the drive API, then + writes the analytics data to that object. As of right now there is no way to do + this in one API transaction. + + Args: + df (pandas.DataFrame): Tabular data to export to Google Sheets object + date_of_report (datetime): Date the report was run + Returns: + str: Google Sheets ID of the new Sheets object + """ + filename_str = get_filename(date_of_report, end_date) + analytics_folder_id = drive_client.create_folder( + "Google Analytics", parent_id=settings.ROOT_DIRECTORY + ) + + # We have to do this in multiple steps with more than one client because the Sheets API + # doesnt support opening a file in a given directory. + sheets_id = drive_client.create_empty_spreadsheet(filename_str, analytics_folder_id) + log.info("Uploading to folder %s (%s)" % ("Google Analytics", analytics_folder_id)) + result = sheets_client.export_df_to_gdrive_speadsheet(df, sheets_id) + log.info( + "Successfully created %s (%s)" % (filename_str, result.get("spreadsheetId")) + ) + return sheets_id + + +def do_analytics_export_post_processing(df: pd.DataFrame, sheets_id: str): + """ + Add new pages and pivot tables. + + This function is fairly naive and inefficient. If we ever want to make Google Sheets + more often than once a day, we should refactor this to limit the number of API transactions. 
+ + Args: + df (pandas.DataFrame): Tabular data in the spreadsheet + sheets_id (str): Google Sheets object ID + """ + + page1 = "Rekrewt Pivot Table - First Visit" + page2 = "Rekrewt Pivot Table - Sessions" + page3 = "GSA Use Pivot Table" + + new_sheet_name_to_id = sheets_client.add_new_pages([page1, page2, page3], sheets_id) + log.info("Added %s pages to %s" % (len(new_sheet_name_to_id.keys()), sheets_id)) + sheets_client.do_create_pivot_tables( + df, (page1, page2, page3), new_sheet_name_to_id, sheets_id + ) + + +def get_filename(date: datetime, end_date: datetime = None): + """ + Return filename for the new spreadsheet to be saved as + + Args: + date (datetime): date to format + Return: + str: Formatted Date + """ + ret = date.strftime("%Y%m%d") + if end_date is not None and end_date != date: + ret += "-%s" % (end_date.strftime("%Y%m%d")) + return ret diff --git a/gdrive/analytics_client.py b/gdrive/analytics_client.py new file mode 100644 index 0000000..5a4c6f0 --- /dev/null +++ b/gdrive/analytics_client.py @@ -0,0 +1,109 @@ +import datetime + +from google.oauth2 import service_account +from google.analytics.admin import AnalyticsAdminServiceClient +from google.analytics.data_v1beta import BetaAnalyticsDataClient +from google.analytics.data_v1beta.types import ( + DateRange, + Dimension, + Metric, + RunReportRequest, +) + +import logging +import pandas as pd + +from gdrive import settings + +log = logging.getLogger(__name__) + +creds = service_account.Credentials.from_service_account_info(settings.CREDENTIALS) + +""" +Client for the Google Analytics (GA4) API + +This class contains functions relating to downloading analytics data +for the IDVA flow. +""" + + +def download( + property_id, target_date: datetime, end_date: datetime = None +) -> pd.DataFrame: + """ + Access Google Analytics (GA4) api and download desired analytics report. + """ + if end_date is None: + end_date = target_date + + request = RunReportRequest( + property=f"properties/{property_id}", + limit="250", + # https://developers.google.com/analytics/devguides/reporting/data/v1/api-schema + dimensions=[ + Dimension(name="eventName"), + Dimension(name="firstUserCampaignName"), + Dimension(name="firstUserMedium"), + Dimension(name="firstUserSource"), + Dimension(name="isConversionEvent"), + Dimension(name="linkUrl"), + ], + metrics=[ + Metric(name="eventCount"), + Metric(name="sessions"), + Metric(name="totalUsers"), + Metric(name="eventCountPerUser"), + Metric(name="conversions"), + ], + date_ranges=[ + DateRange( + start_date=format_date_for_api(target_date), + end_date=format_date_for_api(end_date), + ) + ], + ) + + return create_df_from_analytics_response( + BetaAnalyticsDataClient(credentials=creds).run_report(request) + ) + + +def list(): + """ + List the available properties the user has access to. Can be run to + verify setup of the enviornment is correct. + """ + client = AnalyticsAdminServiceClient(credentials=creds) + return client.list_accounts() + + +def format_date_for_api(date: datetime): + """ + Formats datetime object for Google Analytics Api (GA4) input + """ + return date.strftime("%Y-%m-%d") + + +def create_df_from_analytics_response(response): + """ + Extracts values from Google Analytics API response and transforms + them into pandas DataFrame for ease of use. This enables the analytics + client to do any processing of the data desired, if something comes up in + the future we want to do but isnt supported in GA4. 
+ """ + all_headers = [] + for _, header in enumerate(response.dimension_headers): + all_headers += [header.name] + for _, header in enumerate(response.metric_headers): + all_headers += [header.name] + + arr = [all_headers] + for _, row in enumerate(response.rows): + row_li = [] + for _, val in enumerate(row.dimension_values): + row_li += [val.value] + for _, val in enumerate(row.metric_values): + row_li += [val.value] + arr += [row_li] + + return pd.DataFrame(arr) diff --git a/gdrive/api.py b/gdrive/api.py index c481ca0..2e19aa8 100644 --- a/gdrive/api.py +++ b/gdrive/api.py @@ -12,13 +12,13 @@ from googleapiclient.http import HttpError from starlette.requests import Request -from . import client, settings +from . import drive_client, settings log = logging.getLogger(__name__) router = fastapi.APIRouter() -client.init() +drive_client.init() # Patch zip decodeExtra to ignore invalid extra data @@ -50,16 +50,18 @@ async def upload_file( stream = io.BytesIO(body) - parent = client.create_folder(id, settings.ROOT_DIRECTORY) + parent = drive_client.create_folder(id, settings.ROOT_DIRECTORY) if zip: with zipfile.ZipFile(stream) as archive: files = archive.filelist for file in files: image = io.BytesIO(archive.read(file)) - client.upload_basic(f"{filename}_{file.filename}", parent, image) + drive_client.upload_basic( + f"{filename}_{file.filename}", parent, image + ) else: - client.upload_basic(filename, parent, stream) + drive_client.upload_basic(filename, parent, stream) except HttpError as error: log.error(f"An error occurred: {error}") @@ -73,10 +75,10 @@ async def delete_file(filename, response: Response): """ try: - files = client.get_files(filename) + files = drive_client.get_files(filename) if files: for file in files: - client.delete_file(file["id"]) + drive_client.delete_file(file["id"]) else: response.status_code = status.HTTP_404_NOT_FOUND diff --git a/gdrive/client.py b/gdrive/drive_client.py similarity index 80% rename from gdrive/client.py rename to gdrive/drive_client.py index 75c3f53..d148e2b 100644 --- a/gdrive/client.py +++ b/gdrive/drive_client.py @@ -14,8 +14,8 @@ creds = service_account.Credentials.from_service_account_info( settings.CREDENTIALS, scopes=settings.SCOPES ) + service = build("drive", "v3", credentials=creds) -sheets_service = build("sheets", "v4", credentials=creds) def init(): @@ -62,6 +62,22 @@ def list(count: int = 10, shared: bool = True) -> None: log.info(f"No such key: {error} in {item}") +def create_empty_spreadsheet(filename: str, parent_id: str) -> str: + file_metadata = { + "name": filename, + "parents": [parent_id], + "mimeType": "application/vnd.google-apps.spreadsheet", + } + + file = ( + service.files() + .create(body=file_metadata, fields="id", supportsAllDrives=True) + .execute() + ) + + return file.get("id") + + def drives_list(): """ List available shared drives @@ -164,54 +180,3 @@ def delete_file(id: str) -> None: """ service.files().delete(fileId=id, supportsAllDrives=True).execute() - - -def upload_participant( - first, - last, - email, - responseId, - time, - date, - ethnicity, - race, - gender, - age, - income, - skin_tone, -): - """ - Append participant data to spreadsheet - """ - values = [ - [ - first, - last, - first + " " + last, - email, - responseId, - time, - date, - ethnicity, - race, - gender, - income, - skin_tone, - ] - ] - - body = {"values": values} - result = ( - sheets_service.spreadsheets() - .values() - .append( - spreadsheetId=settings.SHEETS_ID, - range="Sheet1!A1", - valueInputOption="RAW", - body=body, - ) - 
.execute() - ) - if "error" in result: - raise error.ExportError(result["error"]["message"]) - return result diff --git a/gdrive/export_api.py b/gdrive/export_api.py index 350d31c..3f74ecd 100644 --- a/gdrive/export_api.py +++ b/gdrive/export_api.py @@ -10,7 +10,7 @@ from pydantic import BaseModel from fastapi import BackgroundTasks, responses -from gdrive import export_client, client, settings, error +from gdrive import export_client, drive_client, settings, error log = logging.getLogger(__name__) @@ -23,8 +23,8 @@ async def upload_file(interactionId): export_bytes = io.BytesIO( export_client.codename(json.dumps(export_data, indent=2)).encode() ) - parent = client.create_folder(interactionId, settings.ROOT_DIRECTORY) - client.upload_basic("analytics.json", parent, export_bytes) + parent = drive_client.create_folder(interactionId, settings.ROOT_DIRECTORY) + drive_client.upload_basic("analytics.json", parent, export_bytes) class ParticipantModel(BaseModel): @@ -73,11 +73,6 @@ async def survey_upload_response_task(request): log.info("Response found, beginning export.") - if response["status"] != "Complete": - raise error.ExportError( - f"Cannot upload incomplete survery response to raw completions spreadsheet: {request.responseId}" - ) - # By the time we get here, we can count on the response containing the demographic data # as it is included in the Completed flow responses. Responses without complete status # throws exception in get_qualtrics_response @@ -85,7 +80,7 @@ async def survey_upload_response_task(request): if request.participant: participant = request.participant - client.upload_participant( + drive_client.upload_participant( participant.first, participant.last, participant.email, @@ -122,7 +117,7 @@ class FindModel(BaseModel): responseId: str field: str values: list[str] - result_field: str | None = None + result_field: str | None @router.post("/find") diff --git a/gdrive/export_client.py b/gdrive/export_client.py index 726cb57..9874b8c 100644 --- a/gdrive/export_client.py +++ b/gdrive/export_client.py @@ -170,156 +170,11 @@ def get_qualtrics_response(surveyId: str, responseId: str): f"No survey response found for responseId: {responseId}" ) - return r.json() + resp = r.json() - -def get_all_InteractionIds(responseId): - es = OpenSearch( - hosts=[{"host": settings.ES_HOST, "port": settings.ES_PORT}], timeout=300 - ) - - # query for all parent flow intraction ids for a given response id - query_interactionId = { - "size": 500, - "query": { - "bool": { - "must": [ - {"match_phrase": {"properties.outcomeType.value": "survey_data"}}, - {"match": {"properties.outcomeDescription.value": f"{responseId}"}}, - ] - } - }, - "_source": ["interactionId"], - } - - results_interacitonId = es.search( - body=json.dumps(query_interactionId), index="_all" - ) - - if results_interacitonId["hits"]["total"]["value"] == 0: - return [] - - interactionIds_match = list( - map( - lambda res: res["_source"]["interactionId"], - results_interacitonId["hits"]["hits"], - ) - ) - - subflow_query_1 = list( - map( - lambda res: { - "bool": { - "must": [ - { - "match_phrase": { - "parentInteractionProps.parentInteractionId": f'{res["_source"]["interactionId"]}' - } - }, - {"exists": {"field": "interactionId"}}, - ] - } - }, - results_interacitonId["hits"]["hits"], - ) - ) - - subflow_query_2 = list( - map( - lambda res: { - "bool": { - "must": [ - { - "match_phrase": { - "properties.outcomeDescription.value": f'{res["_source"]["interactionId"]}' - } - }, - {"match_phrase": {"properties.outcomeType.value": 
"parent_id"}}, - ] - } - }, - results_interacitonId["hits"]["hits"], - ) - ) - - subflowquery = { - "size": 500, - "query": {"bool": {"should": subflow_query_1 + subflow_query_2}}, - "_source": ["interactionId"], - } - - subs = es.search(body=json.dumps(subflowquery), index="_all") - - sub_interactionIds_match = list( - map( - lambda res: res["_source"]["interactionId"], - subs["hits"]["hits"], - ) - ) - - return interactionIds_match + sub_interactionIds_match - - -def find(responseId, field, values, result): - # find values in find for all flow for a given responseId - # field and result should be one of: - # properties.outcomeDescription.value - # properties.outcomeStatus.value - # properties.outcomeType.value - # properties.outcomeDetail.value - - es = OpenSearch( - hosts=[{"host": settings.ES_HOST, "port": settings.ES_PORT}], timeout=300 - ) - - all_interactionIds = get_all_InteractionIds(responseId) - - all_interactionIds_match = list( - map(lambda res: {"match": {"interactionId": f"{res}"}}, all_interactionIds) - ) - - values_match = list( - map( - lambda res: {"match_phrase": {field: res}}, - values, - ) - ) - - query_found = { - "size": 500, - "query": { - "bool": { - "must": [ - { - "bool": { - "should": values_match, - } - }, - { - "bool": { - "should": all_interactionIds_match, - } - }, - ] - } - }, - "_source": [result], - } - - found_result = es.search(body=json.dumps(query_found), index="_all") - - list_found = list( - map( - lambda x: recursive_decent(x["_source"], result.split(".")), - found_result["hits"]["hits"], + if resp["status"] != "Complete": + raise error.ExportError( + f"Cannot upload incomplete survery response to raw completions spreadsheet: {responseId}" ) - ) - - return {"found": list_found} - -def recursive_decent(obj: dict | str, query: list[str]): - # given dict and a dot notated key name, return value of key - if query == [] or not isinstance(obj, dict): - return obj - return recursive_decent(obj.get(query[0], ""), query[1:]) + return resp diff --git a/gdrive/main.py b/gdrive/main.py index 4a05883..185877c 100644 --- a/gdrive/main.py +++ b/gdrive/main.py @@ -6,7 +6,7 @@ import fastapi import starlette_prometheus -from . import api, export_api, settings +from . 
import api, export_api, settings, analytics_api logging.basicConfig(level=settings.LOG_LEVEL) @@ -17,3 +17,4 @@ app.include_router(api.router) app.include_router(export_api.router) +app.include_router(analytics_api.router) diff --git a/gdrive/settings.py b/gdrive/settings.py index b904d0a..0eb4db0 100644 --- a/gdrive/settings.py +++ b/gdrive/settings.py @@ -14,10 +14,15 @@ LOG_LEVEL = os.getenv("LOG_LEVEL", logging.getLevelName(logging.INFO)) +ANALYTICS = os.getenv("ANALYTICS", False) +ANALYTICS_PROPERTY_ID = os.getenv("ANALYTICS_PROPERTY_ID", 377091467) + SCOPES = [ + "https://www.googleapis.com/auth/analytics", "https://www.googleapis.com/auth/drive", "https://www.googleapis.com/auth/spreadsheets", ] + SERVICE_ACCOUNT_FILE = "credentials.json" ROOT_DIRECTORY = "" CODE_NAMES = None diff --git a/gdrive/sheets_client.py b/gdrive/sheets_client.py new file mode 100644 index 0000000..24e69a0 --- /dev/null +++ b/gdrive/sheets_client.py @@ -0,0 +1,601 @@ +import logging +import pandas as pd +from typing import List + +from google.oauth2 import service_account +from googleapiclient.discovery import build + +from gdrive import settings, error + +log = logging.getLogger(__name__) + +creds = service_account.Credentials.from_service_account_info( + settings.CREDENTIALS, scopes=settings.SCOPES +) + +sheets_service = build("sheets", "v4", credentials=creds) + +""" +At present, every function call in this library represents its own API +transaction. If a lot of operations were to be made at once, this would hinder speed +quite a bit. + +Some batching in the future if the use case for this library gets heavier is necessary. +""" + +# Generic functions + + +def update_cell_value( + sheet_id: str, page_name: str, range_str: str, value: str, vio="USER_ENTERED" +): + """ + Write the specifed value to specified range + + Args: + sheet_id (str): Google sheets object ID + page_name (str): page target to edit + range_str (str): range to write the values to + value (str): value to write to the specified location + vio (str): + default (str): "USER_ENTERED" User entered values get resolved by googles parsing function. + Functions, Integers and strings can all be entered this way. + + Returns: + Google API Raw Result + """ + body = { + "values": [ + # Cell values + [ + value, + ] + # Other values + ] + } + + result = ( + sheets_service.spreadsheets() + .values() + .update( + spreadsheetId=sheet_id, + range="%s!%s" % (page_name, range_str), + valueInputOption=vio, + body=body, + ) + .execute() + ) + + return result + + +def add_pivot_tables( + sheets_id: str, + target_page_id: str, + pivot_table_definition: object, + row_idx: int = 0, + col_idx: int = 0, +): + """ + Writes the pivot table definition to the specified location. + + Args: + sheets_id (str): ID for the sheets object + target_page_id (str): ID for the target page of the sheets object, (Sheet1 is always 0) + pivot_table_definition (object): JSON encoded dict + row_idx (int): Index of the row to write the start of the table + default: 0 + col_idx (int): Index of the column to write the start of the table + default: 0 + + Returns: + Google Sheets API Response: RAW response to the write operation + """ + requests = [] + requests.append( + { + "updateCells": { + "rows": { + # I would need to write a whole library to parameterize this well so + # Client Code will just need to pass the JSON definitions in. 
+ "values": pivot_table_definition + }, + "start": { + "sheetId": target_page_id, + "rowIndex": row_idx, + "columnIndex": col_idx, + }, + "fields": "pivotTable", + } + } + ) + + body = {"requests": requests} + + response = ( + sheets_service.spreadsheets() + .batchUpdate(spreadsheetId=sheets_id, body=body) + .execute() + ) + + return response + + +def add_new_pages(page_names: [str], sheets_id: str): + new_sheets_reqs = [] + for label in page_names: + req = { + "addSheet": { + "properties": { + "title": label, + } + } + } + + new_sheets_reqs.append(req) + + body = {"requests": new_sheets_reqs} + + result = ( + sheets_service.spreadsheets() + .batchUpdate( + spreadsheetId=sheets_id, + body=body, + ) + .execute() + ) + + sheet_title_to_id = {} + for reply in result.get("replies"): + props = reply.get("addSheet").get("properties") + sheet_title_to_id[props.get("title")] = props.get("sheetId") + + return sheet_title_to_id + + +def export_df_to_gdrive_speadsheet(df: pd.DataFrame, sheets_id: str, title="Sheet1"): + """ + Exports an entire pandas dataframe to a Google Sheets Object. + + Args: + df (pandas.DataFrame): Tabular data to be exported to a spreadsheet + title (str): Title for the target spreadsheet to write the data to. + default: "Sheet1" default value for new Google Sheets sheets object + + Returns: + Google Sheets API Response: RAW response to the write operation + """ + body = {"values": df.values.tolist()} + result = ( + sheets_service.spreadsheets() + .values() + .append( + spreadsheetId=sheets_id, + range="%s!A1" % (title), + valueInputOption="USER_ENTERED", + body=body, + ) + .execute() + ) + if "error" in result: + raise error.ExportError(result["error"]["message"]) + + return result + + +# Project specific functions + + +def do_create_pivot_tables( + df: pd.DataFrame, page_names: (str, str, str), names_to_id: dict, sheets_id: str +): + # Make a dictionary mapping the name of the column to its index, useful for the pivot tables. 
+ col_dict = {} + for idx, val in enumerate(df.iloc[0]): + col_dict[val] = idx + + create_first_visit_pt(sheets_id, names_to_id[page_names[0]], col_dict) + log.info( + "Added 2 pivot tables to %s (%s)" % (page_names[0], names_to_id[page_names[0]]) + ) + + create_session_start_pt(sheets_id, names_to_id[page_names[1]], col_dict) + log.info( + "Added 2 pivot tables to %s (%s)" % (page_names[1], names_to_id[page_names[1]]) + ) + + create_clicks_pt(sheets_id, names_to_id[page_names[2]], col_dict) + log.info( + "Added pivot table to %s (%s)" % (page_names[2], names_to_id[page_names[2]]) + ) + + update_cell_value(sheets_id, page_names[0], "A17", "Total First Visits") + update_cell_value( + sheets_id, + page_names[0], + "A18", + '=GETPIVOTDATA("SUM of eventCount",A1, "eventName", "first_visit") + GETPIVOTDATA("SUM of eventCount",F1, "eventName", "first_visit")', + ) + log.info("Wrote totals to %s" % (page_names[0])) + + update_cell_value(sheets_id, page_names[1], "A17", "Total Sessions") + update_cell_value( + sheets_id, + page_names[1], + "A18", + '=GETPIVOTDATA("SUM of eventCount",A1, "eventName", "session_start") + GETPIVOTDATA("SUM of eventCount",F1, "eventName", "session_start")', + ) + log.info("Wrote totals to %s" % (page_names[1])) + + +def create_first_visit_pt(sheets_id, page_id, col_dict): + # Add first visit pivot table, Facebook + add_pivot_tables( + sheets_id, + page_id, + ( + { + "pivotTable": { + "source": { + # First Sheet (Sheet1) is always ID 0 + "sheetId": 0, + }, + "rows": [ + { + "sourceColumnOffset": col_dict["eventName"], + "showTotals": True, + "sortOrder": "ASCENDING", + }, + { + "sourceColumnOffset": col_dict["firstUserSource"], + "showTotals": True, + "sortOrder": "ASCENDING", + }, + { + "sourceColumnOffset": col_dict["eventCount"], + "showTotals": True, + "sortOrder": "ASCENDING", + }, + ], + "filterSpecs": [ + { + "filterCriteria": { + "condition": { + "type": "TEXT_CONTAINS", + "values": [ + { + "userEnteredValue": "first_visit", + } + ], + }, + "visibleByDefault": True, + }, + "columnOffsetIndex": col_dict["eventName"], + }, + { + "filterCriteria": { + "condition": { + "type": "TEXT_CONTAINS", + "values": [ + { + "userEnteredValue": "facebook", + }, + ], + }, + "visibleByDefault": True, + }, + "columnOffsetIndex": col_dict["firstUserSource"], + }, + ], + "values": [ + { + "summarizeFunction": "SUM", + "sourceColumnOffset": col_dict["eventCount"], + } + ], + "valueLayout": "HORIZONTAL", + } + } + ), + ) + # Add first visit pivot table, RT + add_pivot_tables( + sheets_id, + page_id, + ( + { + "pivotTable": { + "source": { + # First Sheet (Sheet1) is always ID 0 + "sheetId": 0, + }, + "rows": [ + { + "sourceColumnOffset": col_dict["eventName"], + "showTotals": True, + "sortOrder": "ASCENDING", + }, + { + "sourceColumnOffset": col_dict["firstUserSource"], + "showTotals": True, + "sortOrder": "ASCENDING", + }, + { + "sourceColumnOffset": col_dict["eventCount"], + "showTotals": True, + "sortOrder": "ASCENDING", + }, + ], + "filterSpecs": [ + { + "filterCriteria": { + "condition": { + "type": "TEXT_CONTAINS", + "values": [ + { + "userEnteredValue": "first_visit", + } + ], + }, + "visibleByDefault": True, + }, + "columnOffsetIndex": col_dict["eventName"], + }, + { + "filterCriteria": { + "condition": { + "type": "TEXT_CONTAINS", + "values": [ + { + "userEnteredValue": "rt", + }, + ], + }, + "visibleByDefault": True, + }, + "columnOffsetIndex": col_dict["firstUserSource"], + }, + ], + "values": [ + { + "summarizeFunction": "SUM", + "sourceColumnOffset": 
col_dict["eventCount"], + } + ], + "valueLayout": "HORIZONTAL", + } + } + ), + row_idx=0, + col_idx=5, + ) + + +def create_session_start_pt(sheets_id, page_id, col_dict): + # Add sessions pivot table, facebook + add_pivot_tables( + sheets_id, + page_id, + ( + { + "pivotTable": { + "source": { + # First Sheet (Sheet1) is always ID 0 + "sheetId": 0, + }, + "rows": [ + { + "sourceColumnOffset": col_dict["eventName"], + "showTotals": True, + "sortOrder": "ASCENDING", + }, + { + "sourceColumnOffset": col_dict["firstUserSource"], + "showTotals": True, + "sortOrder": "ASCENDING", + }, + { + "sourceColumnOffset": col_dict["eventCount"], + "showTotals": True, + "sortOrder": "ASCENDING", + }, + ], + "filterSpecs": [ + { + "filterCriteria": { + "condition": { + "type": "TEXT_CONTAINS", + "values": [ + { + "userEnteredValue": "session_start", + } + ], + }, + "visibleByDefault": True, + }, + "columnOffsetIndex": col_dict["eventName"], + }, + { + "filterCriteria": { + "condition": { + "type": "TEXT_CONTAINS", + "values": [ + { + "userEnteredValue": "facebook", + }, + ], + }, + "visibleByDefault": True, + }, + "columnOffsetIndex": col_dict["firstUserSource"], + }, + ], + "values": [ + { + "summarizeFunction": "SUM", + "sourceColumnOffset": col_dict["eventCount"], + } + ], + "valueLayout": "HORIZONTAL", + } + } + ), + ) + # Add sessions pivot table, rt + add_pivot_tables( + sheets_id, + page_id, + ( + { + "pivotTable": { + "source": { + # First Sheet (Sheet1) is always ID 0 + "sheetId": 0, + }, + "rows": [ + { + "sourceColumnOffset": col_dict["eventName"], + "showTotals": True, + "sortOrder": "ASCENDING", + }, + { + "sourceColumnOffset": col_dict["firstUserSource"], + "showTotals": True, + "sortOrder": "ASCENDING", + }, + { + "sourceColumnOffset": col_dict["eventCount"], + "showTotals": True, + "sortOrder": "ASCENDING", + }, + ], + "filterSpecs": [ + { + "filterCriteria": { + "condition": { + "type": "TEXT_CONTAINS", + "values": [ + { + "userEnteredValue": "session_start", + } + ], + }, + "visibleByDefault": True, + }, + "columnOffsetIndex": col_dict["eventName"], + }, + { + "filterCriteria": { + "condition": { + "type": "TEXT_CONTAINS", + "values": [ + { + "userEnteredValue": "rt", + }, + ], + }, + "visibleByDefault": True, + }, + "columnOffsetIndex": col_dict["firstUserSource"], + }, + ], + "values": [ + { + "summarizeFunction": "SUM", + "sourceColumnOffset": col_dict["eventCount"], + } + ], + "valueLayout": "HORIZONTAL", + } + } + ), + row_idx=0, + col_idx=5, + ) + + +def create_clicks_pt(sheets_id, page_id, col_dict): + add_pivot_tables( + sheets_id, + page_id, + ( + { + "pivotTable": { + "source": { + # First Sheet (Sheet1) is always ID 0 + "sheetId": 0, + }, + "rows": [ + { + "sourceColumnOffset": col_dict["eventName"], + "showTotals": True, + "sortOrder": "ASCENDING", + }, + { + "sourceColumnOffset": col_dict["eventCount"], + "showTotals": True, + "sortOrder": "ASCENDING", + }, + ], + "values": [ + { + "summarizeFunction": "SUM", + "sourceColumnOffset": col_dict["eventCount"], + } + ], + "valueLayout": "HORIZONTAL", + } + } + ), + ) + + +def upload_participant( + first, + last, + email, + responseId, + time, + date, + ethnicity, + race, + gender, + age, + income, + skin_tone, +): + """ + Append participant data to the rekrewt raw completions spreadsheet + """ + values = [ + [ + first, + last, + first + " " + last, + email, + responseId, + time, + date, + ethnicity, + race, + gender, + income, + skin_tone, + ] + ] + + body = {"values": values} + result = ( + sheets_service.spreadsheets() + .values() 
+ .append( + spreadsheetId=settings.SHEETS_ID, + range="Sheet1!A1", + valueInputOption="RAW", + body=body, + ) + .execute() + ) + if "error" in result: + raise error.ExportError(result["error"]["message"]) + return result diff --git a/requirements.txt b/requirements.txt index cbc9ce9..c842fc6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,86 @@ -fastapi==0.103.2 -uvicorn==0.23.2 -starlette-prometheus==0.9.0 -google-api-python-client==2.102.0 -google-auth-httplib2==0.1.1 -google-auth-oauthlib==1.1.0 +annotated-types==0.5.0 +anyio==4.0.0 +astroid==2.15.6 +bandit==1.7.5 +black==23.9.1 +cachetools==5.3.1 +certifi==2023.7.22 +cfgv==3.4.0 +charset-normalizer==3.2.0 +click==8.1.7 +dill==0.3.7 +distlib==0.3.7 +et-xmlfile==1.1.0 +exceptiongroup==1.1.3 +fastapi==0.101.1 +filelock==3.12.4 +gitdb==4.0.10 +GitPython==3.1.37 +google-analytics-admin==0.20.0 +google-analytics-data==0.17.1 +google-api-core==2.11.1 +google-api-python-client==2.97.0 +google-auth==2.23.0 +google-auth-httplib2==0.1.0 +google-auth-oauthlib==1.0.0 +googleapis-common-protos==1.60.0 +grpcio==1.58.0 +grpcio-status==1.58.0 +h11==0.14.0 +httpcore==0.18.0 +httplib2==0.22.0 +httpx==0.25.0 +identify==2.5.29 +idna==3.4 +iniconfig==2.0.0 +isort==5.12.0 +lazy-object-proxy==1.9.0 +markdown-it-py==3.0.0 +mccabe==0.7.0 +mdurl==0.1.2 +mypy-extensions==1.0.0 +nodeenv==1.8.0 +numpy==1.26.0 +oauthlib==3.2.2 +openpyxl==3.1.2 opensearch-py==2.3.1 +packaging==23.1 +pandas==2.1.1 +pathspec==0.11.2 +pbr==5.11.1 +platformdirs==3.10.0 +pluggy==1.3.0 +pre-commit==3.4.0 +prometheus-client==0.12.0 +proto-plus==1.22.3 +protobuf==4.24.3 +pyasn1==0.5.0 +pyasn1-modules==0.3.0 +pydantic==2.3.0 +pydantic_core==2.6.3 +Pygments==2.16.1 +pylint==2.17.5 +pyparsing==3.1.1 +pytest==7.4.2 +python-dateutil==2.8.2 +pytz==2023.3.post1 +PyYAML==6.0.1 +requests==2.31.0 +requests-oauthlib==1.3.1 +rich==13.5.3 +rsa==4.9 +six==1.16.0 +smmap==5.0.1 +sniffio==1.3.0 +starlette==0.27.0 +starlette-prometheus==0.9.0 +stevedore==5.1.0 +tomli==2.0.1 +tomlkit==0.12.1 +typing_extensions==4.8.0 +tzdata==2023.3 +uritemplate==4.1.1 +urllib3==1.26.16 +uvicorn==0.23.2 +virtualenv==20.24.5 +wrapt==1.15.0 From c8f93125115910ab6998c4bcc9bb24d4155d6629 Mon Sep 17 00:00:00 2001 From: Nathan Moore Date: Thu, 12 Oct 2023 15:18:42 -0400 Subject: [PATCH 02/17] Added completions pivot table for data prior to Oct 4 change --- gdrive/analytics_api.py | 11 +++++--- gdrive/settings.py | 5 +++- gdrive/sheets_client.py | 56 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 67 insertions(+), 5 deletions(-) diff --git a/gdrive/analytics_api.py b/gdrive/analytics_api.py index 4542d30..f4b15c5 100644 --- a/gdrive/analytics_api.py +++ b/gdrive/analytics_api.py @@ -87,7 +87,7 @@ async def run_analytics_task(start_date: datetime, end_date: datetime): sheets_id = export(analytics_df, start_date, end_date) do_analytics_export_post_processing(analytics_df, sheets_id=sheets_id) except Exception as e: - log.error(e.args) + log.error(e) async def list_accounts_task(): @@ -127,7 +127,7 @@ def export( """ filename_str = get_filename(date_of_report, end_date) analytics_folder_id = drive_client.create_folder( - "Google Analytics", parent_id=settings.ROOT_DIRECTORY + "Google Analytics", parent_id=settings.ANALYTICS_ROOT ) # We have to do this in multiple steps with more than one client because the Sheets API @@ -156,11 +156,14 @@ def do_analytics_export_post_processing(df: pd.DataFrame, sheets_id: str): page1 = "Rekrewt Pivot Table - First Visit" page2 = "Rekrewt Pivot Table - Sessions" page3 = "GSA 
Use Pivot Table" + page4 = "Completions" - new_sheet_name_to_id = sheets_client.add_new_pages([page1, page2, page3], sheets_id) + new_sheet_name_to_id = sheets_client.add_new_pages( + [page1, page2, page3, page4], sheets_id + ) log.info("Added %s pages to %s" % (len(new_sheet_name_to_id.keys()), sheets_id)) sheets_client.do_create_pivot_tables( - df, (page1, page2, page3), new_sheet_name_to_id, sheets_id + df, (page1, page2, page3, page4), new_sheet_name_to_id, sheets_id ) diff --git a/gdrive/settings.py b/gdrive/settings.py index 0eb4db0..ebe2aa1 100644 --- a/gdrive/settings.py +++ b/gdrive/settings.py @@ -15,7 +15,6 @@ LOG_LEVEL = os.getenv("LOG_LEVEL", logging.getLevelName(logging.INFO)) ANALYTICS = os.getenv("ANALYTICS", False) -ANALYTICS_PROPERTY_ID = os.getenv("ANALYTICS_PROPERTY_ID", 377091467) SCOPES = [ "https://www.googleapis.com/auth/analytics", @@ -27,6 +26,8 @@ ROOT_DIRECTORY = "" CODE_NAMES = None CREDENTIALS = None +ANALYTICS_ROOT = None +ANALYTICS_PROPERTY_ID = None ES_HOST = os.getenv("ES_HOST") ES_PORT = os.getenv("ES_PORT") @@ -49,6 +50,8 @@ log.info("Loading credentials from creds file") config = json.load(file) CREDENTIALS = config["credentials"] + ANALYTICS_ROOT = config["analytics_root"] + ANALYTICS_PROPERTY_ID = config["analytics_property_id"] ROOT_DIRECTORY = config["root_directory"] CODE_NAMES = config["code_names"] SHEETS_ID = config["sheets_id"] diff --git a/gdrive/sheets_client.py b/gdrive/sheets_client.py index 24e69a0..4b70d6a 100644 --- a/gdrive/sheets_client.py +++ b/gdrive/sheets_client.py @@ -209,6 +209,11 @@ def do_create_pivot_tables( "Added pivot table to %s (%s)" % (page_names[2], names_to_id[page_names[2]]) ) + create_feedback_pt(sheets_id, names_to_id[page_names[3]], col_dict) + log.info( + "Added pivot table to %s (%s)" % (page_names[3], names_to_id[page_names[3]]) + ) + update_cell_value(sheets_id, page_names[0], "A17", "Total First Visits") update_cell_value( sheets_id, @@ -550,6 +555,57 @@ def create_clicks_pt(sheets_id, page_id, col_dict): ) +def create_feedback_pt(sheets_id, page_id, col_dict): + add_pivot_tables( + sheets_id, + page_id, + ( + { + "pivotTable": { + "source": { + "sheetId": 0, + }, + "rows": [ + { + "sourceColumnOffset": col_dict["eventName"], + "showTotals": True, + "sortOrder": "ASCENDING", + }, + { + "sourceColumnOffset": col_dict["eventCount"], + "showTotals": True, + "sortOrder": "ASCENDING", + }, + ], + "filterSpecs": [ + { + "filterCriteria": { + "condition": { + "type": "TEXT_CONTAINS", + "values": [ + { + "userEnteredValue": "feedback", + } + ], + }, + "visibleByDefault": True, + }, + "columnOffsetIndex": col_dict["linkUrl"], + }, + ], + "values": [ + { + "summarizeFunction": "SUM", + "sourceColumnOffset": col_dict["eventCount"], + } + ], + "valueLayout": "HORIZONTAL", + } + } + ), + ) + + def upload_participant( first, last, From 529d2963afd8d60338291a32b75aaf575ec0ce82 Mon Sep 17 00:00:00 2001 From: Nathan Moore Date: Tue, 17 Oct 2023 13:41:02 -0400 Subject: [PATCH 03/17] gdrive cr changes --- gdrive/analytics_api.py | 56 ++++++++++++++--------------------------- gdrive/settings.py | 2 -- gdrive/sheets_client.py | 5 ++-- 3 files changed, 21 insertions(+), 42 deletions(-) diff --git a/gdrive/analytics_api.py b/gdrive/analytics_api.py index f4b15c5..9c8ccf2 100644 --- a/gdrive/analytics_api.py +++ b/gdrive/analytics_api.py @@ -23,19 +23,12 @@ class AnalyticsRequest(BaseModel): @router.post("/analytics") async def run_analytics(background_tasks: BackgroundTasks): - if settings.ANALYTICS: - 
background_tasks.add_task(run_analytics_task, datetime.today(), None) - return responses.JSONResponse( - status_code=202, - content="Analytics request for %s is being processed." - % (datetime.date(datetime.today())), - ) - - else: - return responses.JSONResponse( - status_code=409, - content="Request is good, however the client has requested a resource that is unavailable at this time.", - ) + background_tasks.add_task(run_analytics_task, datetime.today(), None) + return responses.JSONResponse( + status_code=202, + content="Analytics request for %s is being processed." + % (datetime.date(datetime.today())), + ) @router.post("/analytics/daterange") @@ -44,39 +37,28 @@ async def run_analytics(background_tasks: BackgroundTasks, req: AnalyticsRequest date_format = "%Y-%m-%d" start_date = datetime.strptime(req.startDate, date_format) end_date = datetime.strptime(req.endDate, date_format) - except ValueError as _: - return responses.JSONResponse( - status_code=400, - content="Failed (invalid date parameters): %s, %s" - % (req.start_date, req.end_date), - ) - if settings.ANALYTICS: background_tasks.add_task(run_analytics_task, start_date, end_date) return responses.JSONResponse( status_code=202, content="Analytics request for %s - %s is being processed." % (datetime.date(start_date), datetime.date(end_date)), ) - else: + + except ValueError as err: return responses.JSONResponse( - status_code=409, - content="Request is good, however the client has requested a resource that is unavailable at this time.", + status_code=422, + content="Failed (invalid date parameters): [%s, %s] %s" + % (req.startDate, req.endDate, err), ) @router.post("/analytics/list") async def list_accounts(backgroud_tasks: BackgroundTasks): - if settings.ANALYTICS: - backgroud_tasks.add_task(list_accounts_task) - return responses.JSONResponse( - status_code=202, content="List request is being processed." - ) - else: - return responses.JSONResponse( - status_code=409, - content="Request is good, however the client has requested a resource that is unavailable at this time.", - ) + backgroud_tasks.add_task(list_accounts_task) + return responses.JSONResponse( + status_code=202, content="List request is being processed." + ) async def run_analytics_task(start_date: datetime, end_date: datetime): @@ -85,7 +67,7 @@ async def run_analytics_task(start_date: datetime, end_date: datetime): settings.ANALYTICS_PROPERTY_ID, start_date, end_date ) sheets_id = export(analytics_df, start_date, end_date) - do_analytics_export_post_processing(analytics_df, sheets_id=sheets_id) + analytics_export_post_processing(analytics_df, sheets_id=sheets_id) except Exception as e: log.error(e) @@ -125,7 +107,7 @@ def export( Returns: str: Google Sheets ID of the new Sheets object """ - filename_str = get_filename(date_of_report, end_date) + filename_str = generate_filename(date_of_report, end_date) analytics_folder_id = drive_client.create_folder( "Google Analytics", parent_id=settings.ANALYTICS_ROOT ) @@ -141,7 +123,7 @@ def export( return sheets_id -def do_analytics_export_post_processing(df: pd.DataFrame, sheets_id: str): +def analytics_export_post_processing(df: pd.DataFrame, sheets_id: str): """ Add new pages and pivot tables. 
@@ -167,7 +149,7 @@ def do_analytics_export_post_processing(df: pd.DataFrame, sheets_id: str): ) -def get_filename(date: datetime, end_date: datetime = None): +def generate_filename(date: datetime, end_date: datetime = None): """ Return filename for the new spreadsheet to be saved as diff --git a/gdrive/settings.py b/gdrive/settings.py index ebe2aa1..4bae7bb 100644 --- a/gdrive/settings.py +++ b/gdrive/settings.py @@ -14,8 +14,6 @@ LOG_LEVEL = os.getenv("LOG_LEVEL", logging.getLevelName(logging.INFO)) -ANALYTICS = os.getenv("ANALYTICS", False) - SCOPES = [ "https://www.googleapis.com/auth/analytics", "https://www.googleapis.com/auth/drive", diff --git a/gdrive/sheets_client.py b/gdrive/sheets_client.py index 4b70d6a..9a6b483 100644 --- a/gdrive/sheets_client.py +++ b/gdrive/sheets_client.py @@ -91,8 +91,7 @@ def add_pivot_tables( Returns: Google Sheets API Response: RAW response to the write operation """ - requests = [] - requests.append( + requests = [ { "updateCells": { "rows": { @@ -108,7 +107,7 @@ def add_pivot_tables( "fields": "pivotTable", } } - ) + ] body = {"requests": requests} From ca7a01c796ac03a817f65eb2159461017c565fa6 Mon Sep 17 00:00:00 2001 From: Nathan Moore Date: Tue, 17 Oct 2023 15:56:20 -0400 Subject: [PATCH 04/17] Fixing dep regressions, downgrading anyio to resolve conflict --- requirements.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/requirements.txt b/requirements.txt index c842fc6..862ae6f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ annotated-types==0.5.0 -anyio==4.0.0 +anyio==3.7.1 astroid==2.15.6 bandit==1.7.5 black==23.9.1 @@ -12,17 +12,17 @@ dill==0.3.7 distlib==0.3.7 et-xmlfile==1.1.0 exceptiongroup==1.1.3 -fastapi==0.101.1 +fastapi==0.103.2 filelock==3.12.4 gitdb==4.0.10 GitPython==3.1.37 google-analytics-admin==0.20.0 google-analytics-data==0.17.1 google-api-core==2.11.1 -google-api-python-client==2.97.0 +google-api-python-client==2.102.0 google-auth==2.23.0 -google-auth-httplib2==0.1.0 -google-auth-oauthlib==1.0.0 +google-auth-httplib2==0.1.1 +google-auth-oauthlib==1.1.0 googleapis-common-protos==1.60.0 grpcio==1.58.0 grpcio-status==1.58.0 From 256d2b477f99d114f00f089bdaeb7007cf7d498e Mon Sep 17 00:00:00 2001 From: Nathan Moore Date: Tue, 17 Oct 2023 16:18:18 -0400 Subject: [PATCH 05/17] Resolve unintended conflicts --- gdrive/export_api.py | 2 +- gdrive/export_client.py | 158 ++++++++++++++++++++++++++++++++++++++-- 2 files changed, 154 insertions(+), 6 deletions(-) diff --git a/gdrive/export_api.py b/gdrive/export_api.py index 3f74ecd..bd3d277 100644 --- a/gdrive/export_api.py +++ b/gdrive/export_api.py @@ -117,7 +117,7 @@ class FindModel(BaseModel): responseId: str field: str values: list[str] - result_field: str | None + result_field: str | None = None @router.post("/find") diff --git a/gdrive/export_client.py b/gdrive/export_client.py index 9874b8c..6e1ef36 100644 --- a/gdrive/export_client.py +++ b/gdrive/export_client.py @@ -170,11 +170,159 @@ def get_qualtrics_response(surveyId: str, responseId: str): f"No survey response found for responseId: {responseId}" ) - resp = r.json() + return r.json() - if resp["status"] != "Complete": - raise error.ExportError( - f"Cannot upload incomplete survery response to raw completions spreadsheet: {responseId}" + +def get_all_InteractionIds(responseId): + es = OpenSearch( + hosts=[{"host": settings.ES_HOST, "port": settings.ES_PORT}], timeout=300 + ) + + # query for all parent flow intraction ids for a given response id + query_interactionId = { + 
"size": 500, + "query": { + "bool": { + "must": [ + {"match_phrase": {"properties.outcomeType.value": "survey_data"}}, + {"match": {"properties.outcomeDescription.value": f"{responseId}"}}, + ] + } + }, + "_source": ["interactionId"], + } + + results_interacitonId = es.search( + body=json.dumps(query_interactionId), index="_all" + ) + + if results_interacitonId["hits"]["total"]["value"] == 0: + return [] + + interactionIds_match = list( + map( + lambda res: res["_source"]["interactionId"], + results_interacitonId["hits"]["hits"], + ) + ) + + subflow_query_1 = list( + map( + lambda res: { + "bool": { + "must": [ + { + "match_phrase": { + "parentInteractionProps.parentInteractionId": f'{res["_source"]["interactionId"]}' + } + }, + {"exists": {"field": "interactionId"}}, + ] + } + }, + results_interacitonId["hits"]["hits"], + ) + ) + + subflow_query_2 = list( + map( + lambda res: { + "bool": { + "must": [ + { + "match_phrase": { + "properties.outcomeDescription.value": f'{res["_source"]["interactionId"]}' + } + }, + {"match_phrase": {"properties.outcomeType.value": "parent_id"}}, + ] + } + }, + results_interacitonId["hits"]["hits"], + ) + ) + + subflowquery = { + "size": 500, + "query": {"bool": {"should": subflow_query_1 + subflow_query_2}}, + "_source": ["interactionId"], + } + + subs = es.search(body=json.dumps(subflowquery), index="_all") + + sub_interactionIds_match = list( + map( + lambda res: res["_source"]["interactionId"], + subs["hits"]["hits"], + ) + ) + + return interactionIds_match + sub_interactionIds_match + + +def find(responseId, field, values, result): + # find values in find for all flow for a given responseId + # field and result should be one of: + # properties.outcomeDescription.value + # properties.outcomeStatus.value + # properties.outcomeType.value + # properties.outcomeDetail.value + + es = OpenSearch( + hosts=[{"host": settings.ES_HOST, "port": settings.ES_PORT}], timeout=300 + ) + + all_interactionIds = get_all_InteractionIds(responseId) + + if len(all_interactionIds) == 0: + return {"found": []} + + all_interactionIds_match = list( + map(lambda res: {"match": {"interactionId": f"{res}"}}, all_interactionIds) + ) + + values_match = list( + map( + lambda res: {"match_phrase": {field: res}}, + values, ) + ) + + query_found = { + "size": 500, + "query": { + "bool": { + "must": [ + { + "bool": { + "should": values_match, + } + }, + { + "bool": { + "should": all_interactionIds_match, + } + }, + ] + } + }, + "_source": [result], + } + + found_result = es.search(body=json.dumps(query_found), index="_all") + + list_found = list( + map( + lambda x: recursive_decent(x["_source"], result.split(".")), + found_result["hits"]["hits"], + ) + ) + + return {"found": list_found} + - return resp +def recursive_decent(obj: dict | str, query: list[str]): + # given dict and a dot notated key name, return value of key + if query == [] or not isinstance(obj, dict): + return obj + return recursive_decent(obj.get(query[0], ""), query[1:]) From af9fb9d81544c00ae84ee1b47013e8a20e3c6c8f Mon Sep 17 00:00:00 2001 From: Nathan Moore Date: Tue, 17 Oct 2023 16:41:18 -0400 Subject: [PATCH 06/17] Updating req to not be the output of pip freeze, but to instead simply pull in explicit dependencies --- requirements.txt | 77 ++---------------------------------------------- 1 file changed, 2 insertions(+), 75 deletions(-) diff --git a/requirements.txt b/requirements.txt index 862ae6f..10ae035 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,21 +1,6 @@ -annotated-types==0.5.0 -anyio==3.7.1 
-astroid==2.15.6 -bandit==1.7.5 -black==23.9.1 -cachetools==5.3.1 -certifi==2023.7.22 -cfgv==3.4.0 -charset-normalizer==3.2.0 -click==8.1.7 -dill==0.3.7 -distlib==0.3.7 -et-xmlfile==1.1.0 -exceptiongroup==1.1.3 fastapi==0.103.2 -filelock==3.12.4 -gitdb==4.0.10 -GitPython==3.1.37 +uvicorn==0.23.2 +starlette-prometheus==0.9.0 google-analytics-admin==0.20.0 google-analytics-data==0.17.1 google-api-core==2.11.1 @@ -24,63 +9,5 @@ google-auth==2.23.0 google-auth-httplib2==0.1.1 google-auth-oauthlib==1.1.0 googleapis-common-protos==1.60.0 -grpcio==1.58.0 -grpcio-status==1.58.0 -h11==0.14.0 -httpcore==0.18.0 -httplib2==0.22.0 -httpx==0.25.0 -identify==2.5.29 -idna==3.4 -iniconfig==2.0.0 -isort==5.12.0 -lazy-object-proxy==1.9.0 -markdown-it-py==3.0.0 -mccabe==0.7.0 -mdurl==0.1.2 -mypy-extensions==1.0.0 -nodeenv==1.8.0 -numpy==1.26.0 -oauthlib==3.2.2 -openpyxl==3.1.2 opensearch-py==2.3.1 -packaging==23.1 pandas==2.1.1 -pathspec==0.11.2 -pbr==5.11.1 -platformdirs==3.10.0 -pluggy==1.3.0 -pre-commit==3.4.0 -prometheus-client==0.12.0 -proto-plus==1.22.3 -protobuf==4.24.3 -pyasn1==0.5.0 -pyasn1-modules==0.3.0 -pydantic==2.3.0 -pydantic_core==2.6.3 -Pygments==2.16.1 -pylint==2.17.5 -pyparsing==3.1.1 -pytest==7.4.2 -python-dateutil==2.8.2 -pytz==2023.3.post1 -PyYAML==6.0.1 -requests==2.31.0 -requests-oauthlib==1.3.1 -rich==13.5.3 -rsa==4.9 -six==1.16.0 -smmap==5.0.1 -sniffio==1.3.0 -starlette==0.27.0 -starlette-prometheus==0.9.0 -stevedore==5.1.0 -tomli==2.0.1 -tomlkit==0.12.1 -typing_extensions==4.8.0 -tzdata==2023.3 -uritemplate==4.1.1 -urllib3==1.26.16 -uvicorn==0.23.2 -virtualenv==20.24.5 -wrapt==1.15.0 From 7069cc30b00338beae115069111593b98ced9f5a Mon Sep 17 00:00:00 2001 From: Nathan Moore Date: Tue, 17 Oct 2023 16:43:33 -0400 Subject: [PATCH 07/17] Removing more unintended code --- gdrive/export_client.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/gdrive/export_client.py b/gdrive/export_client.py index 6e1ef36..726cb57 100644 --- a/gdrive/export_client.py +++ b/gdrive/export_client.py @@ -274,9 +274,6 @@ def find(responseId, field, values, result): all_interactionIds = get_all_InteractionIds(responseId) - if len(all_interactionIds) == 0: - return {"found": []} - all_interactionIds_match = list( map(lambda res: {"match": {"interactionId": f"{res}"}}, all_interactionIds) ) From f3edb8b158775c065a8fcc5bfa6e1e796a09c828 Mon Sep 17 00:00:00 2001 From: Nathan Moore Date: Tue, 17 Oct 2023 16:53:24 -0400 Subject: [PATCH 08/17] Fixing git warning --- gdrive/analytics_api.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gdrive/analytics_api.py b/gdrive/analytics_api.py index 9c8ccf2..95c36ee 100644 --- a/gdrive/analytics_api.py +++ b/gdrive/analytics_api.py @@ -46,10 +46,11 @@ async def run_analytics(background_tasks: BackgroundTasks, req: AnalyticsRequest ) except ValueError as err: + message = str(err) return responses.JSONResponse( status_code=422, content="Failed (invalid date parameters): [%s, %s] %s" - % (req.startDate, req.endDate, err), + % (req.startDate, req.endDate, message), ) From 4aa4d22a06063f419df1c4af8a48648c7d39bf7b Mon Sep 17 00:00:00 2001 From: Nathan Moore Date: Wed, 18 Oct 2023 10:48:36 -0400 Subject: [PATCH 09/17] Fixing unit test and removing unessessary explicit toString call --- gdrive/analytics_api.py | 4 +--- tests/test_api.py | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/gdrive/analytics_api.py b/gdrive/analytics_api.py index 95c36ee..4b1189c 100644 --- a/gdrive/analytics_api.py +++ b/gdrive/analytics_api.py @@ -46,11 +46,9 @@ 
async def run_analytics(background_tasks: BackgroundTasks, req: AnalyticsRequest ) except ValueError as err: - message = str(err) return responses.JSONResponse( status_code=422, - content="Failed (invalid date parameters): [%s, %s] %s" - % (req.startDate, req.endDate, message), + content="Failed (invalid date parameters): %s" % (err), ) diff --git a/tests/test_api.py b/tests/test_api.py index 3359e2c..74cadcf 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -7,7 +7,7 @@ from fastapi import testclient # pylint: disable=wrong-import-position -sys.modules["gdrive.client"] = MagicMock() +sys.modules["gdrive.drive_client"] = MagicMock() from gdrive import main client = testclient.TestClient(main.app) From 7e2651216c226a33c3e0318f8224e8dfb0b97a03 Mon Sep 17 00:00:00 2001 From: Nathan Moore Date: Wed, 18 Oct 2023 11:48:58 -0400 Subject: [PATCH 10/17] Attempting to suppress a warning --- gdrive/analytics_api.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gdrive/analytics_api.py b/gdrive/analytics_api.py index 4b1189c..6231d82 100644 --- a/gdrive/analytics_api.py +++ b/gdrive/analytics_api.py @@ -46,6 +46,7 @@ async def run_analytics(background_tasks: BackgroundTasks, req: AnalyticsRequest ) except ValueError as err: + # @suppress("py/stack-trace-exposure") return responses.JSONResponse( status_code=422, content="Failed (invalid date parameters): %s" % (err), From 38e2cc3203189c569a980fe7d3cea8c941b26ea7 Mon Sep 17 00:00:00 2001 From: Nathan Moore Date: Wed, 18 Oct 2023 16:36:28 -0400 Subject: [PATCH 11/17] Mocking clients to fix unit test --- tests/test_api.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_api.py b/tests/test_api.py index 74cadcf..a5223fc 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -8,6 +8,8 @@ # pylint: disable=wrong-import-position sys.modules["gdrive.drive_client"] = MagicMock() +sys.modules["gdrive.sheets_client"] = MagicMock() +sys.modules["gdrive.analytics_client"] = MagicMock() from gdrive import main client = testclient.TestClient(main.app) From d4540420ecd50e7d210b4113da86e268dd1cad9e Mon Sep 17 00:00:00 2001 From: nathan-moore-97 Date: Mon, 23 Oct 2023 11:27:30 -0400 Subject: [PATCH 12/17] Code review comments, adding back some more accidental deletions --- gdrive/analytics_api.py | 9 +++++---- gdrive/analytics_client.py | 7 +++---- gdrive/export_api.py | 5 +++++ gdrive/sheets_client.py | 2 +- 4 files changed, 14 insertions(+), 9 deletions(-) diff --git a/gdrive/analytics_api.py b/gdrive/analytics_api.py index 6231d82..ec939c8 100644 --- a/gdrive/analytics_api.py +++ b/gdrive/analytics_api.py @@ -34,9 +34,8 @@ async def run_analytics(background_tasks: BackgroundTasks): @router.post("/analytics/daterange") async def run_analytics(background_tasks: BackgroundTasks, req: AnalyticsRequest): try: - date_format = "%Y-%m-%d" - start_date = datetime.strptime(req.startDate, date_format) - end_date = datetime.strptime(req.endDate, date_format) + start_date = datetime.strptime(req.startDate, analytics_client.API_DATE_FORMAT) + end_date = datetime.strptime(req.endDate, analytics_client.API_DATE_FORMAT) background_tasks.add_task(run_analytics_task, start_date, end_date) return responses.JSONResponse( @@ -63,9 +62,11 @@ async def list_accounts(backgroud_tasks: BackgroundTasks): async def run_analytics_task(start_date: datetime, end_date: datetime): try: - analytics_df = analytics_client.download( + response = analytics_client.download( settings.ANALYTICS_PROPERTY_ID, start_date, end_date ) + + analytics_df = 
analytics_client.create_df_from_analytics_response(response) sheets_id = export(analytics_df, start_date, end_date) analytics_export_post_processing(analytics_df, sheets_id=sheets_id) except Exception as e: diff --git a/gdrive/analytics_client.py b/gdrive/analytics_client.py index 5a4c6f0..e22d3fd 100644 --- a/gdrive/analytics_client.py +++ b/gdrive/analytics_client.py @@ -18,6 +18,7 @@ log = logging.getLogger(__name__) creds = service_account.Credentials.from_service_account_info(settings.CREDENTIALS) +API_DATE_FORMAT = "%Y-%m-%d" """ Client for the Google Analytics (GA4) API @@ -63,9 +64,7 @@ def download( ], ) - return create_df_from_analytics_response( - BetaAnalyticsDataClient(credentials=creds).run_report(request) - ) + return BetaAnalyticsDataClient(credentials=creds).run_report(request) def list(): @@ -81,7 +80,7 @@ def format_date_for_api(date: datetime): """ Formats datetime object for Google Analytics Api (GA4) input """ - return date.strftime("%Y-%m-%d") + return date.strftime(API_DATE_FORMAT) def create_df_from_analytics_response(response): diff --git a/gdrive/export_api.py b/gdrive/export_api.py index bd3d277..f4fad7f 100644 --- a/gdrive/export_api.py +++ b/gdrive/export_api.py @@ -73,6 +73,11 @@ async def survey_upload_response_task(request): log.info("Response found, beginning export.") + if response["status"] != "Complete": + raise error.ExportError( + f"Cannot upload incomplete survery response to raw completions spreadsheet: {request.responseId}" + ) + # By the time we get here, we can count on the response containing the demographic data # as it is included in the Completed flow responses. Responses without complete status # throws exception in get_qualtrics_response diff --git a/gdrive/sheets_client.py b/gdrive/sheets_client.py index 9a6b483..a9bdfb6 100644 --- a/gdrive/sheets_client.py +++ b/gdrive/sheets_client.py @@ -185,7 +185,7 @@ def export_df_to_gdrive_speadsheet(df: pd.DataFrame, sheets_id: str, title="Shee # Project specific functions -def do_create_pivot_tables( +def create_pivot_tables( df: pd.DataFrame, page_names: (str, str, str), names_to_id: dict, sheets_id: str ): # Make a dictionary mapping the name of the column to its index, useful for the pivot tables. From 08076617d7b1cd7812b7529f88f368b89783b3c5 Mon Sep 17 00:00:00 2001 From: nathan-moore-97 Date: Mon, 23 Oct 2023 17:24:26 -0400 Subject: [PATCH 13/17] Doing Analytics routine synchronously so the client can use retval --- gdrive/analytics_api.py | 18 +++++++++++------- gdrive/analytics_client.py | 5 +++-- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/gdrive/analytics_api.py b/gdrive/analytics_api.py index ec939c8..b3aa962 100644 --- a/gdrive/analytics_api.py +++ b/gdrive/analytics_api.py @@ -2,7 +2,7 @@ Google Analytics Rest API """ -from datetime import datetime +from datetime import datetime, timedelta import logging import fastapi @@ -22,8 +22,10 @@ class AnalyticsRequest(BaseModel): @router.post("/analytics") -async def run_analytics(background_tasks: BackgroundTasks): - background_tasks.add_task(run_analytics_task, datetime.today(), None) +async def run_analytics_default(background_tasks: BackgroundTasks): + # Default behaviour for the system is run a report from previous day + target_date = datetime.today() - timedelta(days=1) + run_analytics(target_date, None) return responses.JSONResponse( status_code=202, content="Analytics request for %s is being processed." 
@@ -32,12 +34,14 @@ async def run_analytics(background_tasks: BackgroundTasks): @router.post("/analytics/daterange") -async def run_analytics(background_tasks: BackgroundTasks, req: AnalyticsRequest): +async def run_analytics_daterange( + background_tasks: BackgroundTasks, req: AnalyticsRequest +): try: start_date = datetime.strptime(req.startDate, analytics_client.API_DATE_FORMAT) end_date = datetime.strptime(req.endDate, analytics_client.API_DATE_FORMAT) - background_tasks.add_task(run_analytics_task, start_date, end_date) + run_analytics(start_date, end_date) return responses.JSONResponse( status_code=202, content="Analytics request for %s - %s is being processed." @@ -60,7 +64,7 @@ async def list_accounts(backgroud_tasks: BackgroundTasks): ) -async def run_analytics_task(start_date: datetime, end_date: datetime): +def run_analytics(start_date: datetime, end_date: datetime): try: response = analytics_client.download( settings.ANALYTICS_PROPERTY_ID, start_date, end_date @@ -145,7 +149,7 @@ def analytics_export_post_processing(df: pd.DataFrame, sheets_id: str): [page1, page2, page3, page4], sheets_id ) log.info("Added %s pages to %s" % (len(new_sheet_name_to_id.keys()), sheets_id)) - sheets_client.do_create_pivot_tables( + sheets_client.create_pivot_tables( df, (page1, page2, page3, page4), new_sheet_name_to_id, sheets_id ) diff --git a/gdrive/analytics_client.py b/gdrive/analytics_client.py index e22d3fd..9accc06 100644 --- a/gdrive/analytics_client.py +++ b/gdrive/analytics_client.py @@ -8,6 +8,7 @@ Dimension, Metric, RunReportRequest, + RunReportResponse, ) import logging @@ -30,7 +31,7 @@ def download( property_id, target_date: datetime, end_date: datetime = None -) -> pd.DataFrame: +) -> RunReportResponse: """ Access Google Analytics (GA4) api and download desired analytics report. """ @@ -83,7 +84,7 @@ def format_date_for_api(date: datetime): return date.strftime(API_DATE_FORMAT) -def create_df_from_analytics_response(response): +def create_df_from_analytics_response(response: RunReportResponse): """ Extracts values from Google Analytics API response and transforms them into pandas DataFrame for ease of use. This enables the analytics From 27357cbe2119645692af4e198ba40dc87b13aded Mon Sep 17 00:00:00 2001 From: nathan-moore-97 Date: Mon, 23 Oct 2023 17:30:46 -0400 Subject: [PATCH 14/17] Updating response messages and removing dep on background tasks --- gdrive/analytics_api.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/gdrive/analytics_api.py b/gdrive/analytics_api.py index b3aa962..7d161c1 100644 --- a/gdrive/analytics_api.py +++ b/gdrive/analytics_api.py @@ -22,21 +22,18 @@ class AnalyticsRequest(BaseModel): @router.post("/analytics") -async def run_analytics_default(background_tasks: BackgroundTasks): +async def run_analytics_default(): # Default behaviour for the system is run a report from previous day target_date = datetime.today() - timedelta(days=1) run_analytics(target_date, None) return responses.JSONResponse( status_code=202, - content="Analytics request for %s is being processed." - % (datetime.date(datetime.today())), + content="Analytics report for %s complete." 
% (datetime.date(datetime.today())), ) @router.post("/analytics/daterange") -async def run_analytics_daterange( - background_tasks: BackgroundTasks, req: AnalyticsRequest -): +async def run_analytics_daterange(req: AnalyticsRequest): try: start_date = datetime.strptime(req.startDate, analytics_client.API_DATE_FORMAT) end_date = datetime.strptime(req.endDate, analytics_client.API_DATE_FORMAT) @@ -44,7 +41,7 @@ async def run_analytics_daterange( run_analytics(start_date, end_date) return responses.JSONResponse( status_code=202, - content="Analytics request for %s - %s is being processed." + content="Analytics report for %s - %s complete." % (datetime.date(start_date), datetime.date(end_date)), ) From 73e83c464d74e249b56b01d07d908543878077f0 Mon Sep 17 00:00:00 2001 From: nathan-moore-97 Date: Thu, 26 Oct 2023 12:10:18 -0400 Subject: [PATCH 15/17] Updating README with new API information --- README.md | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/README.md b/README.md index 39a46c5..3273f13 100644 --- a/README.md +++ b/README.md @@ -101,6 +101,26 @@ Request body: } ``` +#### Product Analytics Bulk Upload +Exports Google Analytics data gathered from the IDVA flow to Google Drive, as a google sheets object. Routine then builds pivot tables to enable user to read data easily. Default behaviour for the API `/analytics` writes data for the previous day. + +The ID of the Google Drive folder is configurable in `ANALYTICS_ROOT`. (`settings`) + +Optionally, the user can pass in a date range to be uploaded. The data is collated into a single document, and the same pivot tables are written on the collated data. + +`POST /analytics` +``` +Query parameters: None +``` +`POST /analytics/daterange` +```JSON +// Request body +{ + "startDate": "YYYY-MM-DD", + "endDate": "YYYY-MM-DD" +} +``` + ### Deploying to Cloud.gov during development All deployments require having the correct Cloud.gov credentials in place. If From 4b776c14cb385d103b48f1321c47fd9375ec4733 Mon Sep 17 00:00:00 2001 From: nathan-moore-97 Date: Fri, 27 Oct 2023 17:53:59 -0400 Subject: [PATCH 16/17] Combining API endpoints, improving readability --- gdrive/analytics_api.py | 65 ++-- gdrive/sheets_client.py | 655 ++++++++++++++++++-------------------- gdrive/sheets_entities.py | 0 3 files changed, 348 insertions(+), 372 deletions(-) create mode 100644 gdrive/sheets_entities.py diff --git a/gdrive/analytics_api.py b/gdrive/analytics_api.py index 7d161c1..89880ec 100644 --- a/gdrive/analytics_api.py +++ b/gdrive/analytics_api.py @@ -4,6 +4,7 @@ from datetime import datetime, timedelta import logging +from typing import Optional import fastapi from pydantic import BaseModel @@ -17,45 +18,43 @@ class AnalyticsRequest(BaseModel): - startDate: str = None - endDate: str = None + startDate: str + endDate: str @router.post("/analytics") -async def run_analytics_default(): - # Default behaviour for the system is run a report from previous day - target_date = datetime.today() - timedelta(days=1) - run_analytics(target_date, None) +async def run_analytics_default(req: Optional[AnalyticsRequest] = None): + start = None + end = None + message = None + if req is None: + start = datetime.today() - timedelta(days=1) + message = "Analytics report for %s complete." % (datetime.date(start)) + else: + try: + start = datetime.strptime(req.startDate, analytics_client.API_DATE_FORMAT) + end = datetime.strptime(req.endDate, analytics_client.API_DATE_FORMAT) + message = "Analytics report for %s - %s complete." 
% ( + datetime.date(start), + datetime.date(end), + ) + except ValueError as err: + # @suppress("py/stack-trace-exposure") + return responses.JSONResponse( + status_code=422, + content="Failed (invalid date parameters): %s" % (err), + ) + + run_analytics(start, end) return responses.JSONResponse( status_code=202, - content="Analytics report for %s complete." % (datetime.date(datetime.today())), + content=message, ) -@router.post("/analytics/daterange") -async def run_analytics_daterange(req: AnalyticsRequest): - try: - start_date = datetime.strptime(req.startDate, analytics_client.API_DATE_FORMAT) - end_date = datetime.strptime(req.endDate, analytics_client.API_DATE_FORMAT) - - run_analytics(start_date, end_date) - return responses.JSONResponse( - status_code=202, - content="Analytics report for %s - %s complete." - % (datetime.date(start_date), datetime.date(end_date)), - ) - - except ValueError as err: - # @suppress("py/stack-trace-exposure") - return responses.JSONResponse( - status_code=422, - content="Failed (invalid date parameters): %s" % (err), - ) - - @router.post("/analytics/list") -async def list_accounts(backgroud_tasks: BackgroundTasks): - backgroud_tasks.add_task(list_accounts_task) +async def list_accounts(): + list_accounts_task() return responses.JSONResponse( status_code=202, content="List request is being processed." ) @@ -69,12 +68,12 @@ def run_analytics(start_date: datetime, end_date: datetime): analytics_df = analytics_client.create_df_from_analytics_response(response) sheets_id = export(analytics_df, start_date, end_date) - analytics_export_post_processing(analytics_df, sheets_id=sheets_id) + create_pages_and_pivot_tables(analytics_df, sheets_id=sheets_id) except Exception as e: log.error(e) -async def list_accounts_task(): +def list_accounts_task(): try: list_response = analytics_client.list() if list_response is not None: @@ -125,7 +124,7 @@ def export( return sheets_id -def analytics_export_post_processing(df: pd.DataFrame, sheets_id: str): +def create_pages_and_pivot_tables(df: pd.DataFrame, sheets_id: str): """ Add new pages and pivot tables. diff --git a/gdrive/sheets_client.py b/gdrive/sheets_client.py index a9bdfb6..770b211 100644 --- a/gdrive/sheets_client.py +++ b/gdrive/sheets_client.py @@ -72,7 +72,7 @@ def update_cell_value( def add_pivot_tables( sheets_id: str, target_page_id: str, - pivot_table_definition: object, + pt_def: object, row_idx: int = 0, col_idx: int = 0, ): @@ -82,7 +82,7 @@ def add_pivot_tables( Args: sheets_id (str): ID for the sheets object target_page_id (str): ID for the target page of the sheets object, (Sheet1 is always 0) - pivot_table_definition (object): JSON encoded dict + pt_def (object): JSON encoded dict row_idx (int): Index of the row to write the start of the table default: 0 col_idx (int): Index of the column to write the start of the table @@ -97,7 +97,7 @@ def add_pivot_tables( "rows": { # I would need to write a whole library to parameterize this well so # Client Code will just need to pass the JSON definitions in. 
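With the two endpoints combined here, POST /analytics with no body reports on the previous day and an optional JSON body selects an explicit range, as the README describes. A minimal sketch exercising it with FastAPI's TestClient, assuming Google credentials are configured (the unit tests instead stub the client modules with MagicMock before importing gdrive.main; the dates are illustrative):

from fastapi import testclient

from gdrive import main

client = testclient.TestClient(main.app)

# No body: run the report for the previous day.
resp = client.post("/analytics")
print(resp.status_code, resp.json())

# Optional body: explicit date range in the YYYY-MM-DD form expected by
# analytics_client.API_DATE_FORMAT; malformed dates come back as a 422.
resp = client.post(
    "/analytics",
    json={"startDate": "2023-10-01", "endDate": "2023-10-07"},
)
print(resp.status_code, resp.json())
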
- "values": pivot_table_definition + "values": pt_def }, "start": { "sheetId": target_page_id, @@ -233,143 +233,140 @@ def create_pivot_tables( def create_first_visit_pt(sheets_id, page_id, col_dict): - # Add first visit pivot table, Facebook - add_pivot_tables( - sheets_id, - page_id, - ( - { - "pivotTable": { - "source": { - # First Sheet (Sheet1) is always ID 0 - "sheetId": 0, - }, - "rows": [ - { - "sourceColumnOffset": col_dict["eventName"], - "showTotals": True, - "sortOrder": "ASCENDING", - }, - { - "sourceColumnOffset": col_dict["firstUserSource"], - "showTotals": True, - "sortOrder": "ASCENDING", - }, - { - "sourceColumnOffset": col_dict["eventCount"], - "showTotals": True, - "sortOrder": "ASCENDING", + first_visit_facebook_pt_def = { + "pivotTable": { + "source": { + # First Sheet (Sheet1) is always ID 0 + "sheetId": 0, + }, + "rows": [ + { + "sourceColumnOffset": col_dict["eventName"], + "showTotals": True, + "sortOrder": "ASCENDING", + }, + { + "sourceColumnOffset": col_dict["firstUserSource"], + "showTotals": True, + "sortOrder": "ASCENDING", + }, + { + "sourceColumnOffset": col_dict["eventCount"], + "showTotals": True, + "sortOrder": "ASCENDING", + }, + ], + "filterSpecs": [ + { + "filterCriteria": { + "condition": { + "type": "TEXT_CONTAINS", + "values": [ + { + "userEnteredValue": "first_visit", + } + ], }, - ], - "filterSpecs": [ - { - "filterCriteria": { - "condition": { - "type": "TEXT_CONTAINS", - "values": [ - { - "userEnteredValue": "first_visit", - } - ], + "visibleByDefault": True, + }, + "columnOffsetIndex": col_dict["eventName"], + }, + { + "filterCriteria": { + "condition": { + "type": "TEXT_CONTAINS", + "values": [ + { + "userEnteredValue": "facebook", }, - "visibleByDefault": True, - }, - "columnOffsetIndex": col_dict["eventName"], + ], + }, + "visibleByDefault": True, + }, + "columnOffsetIndex": col_dict["firstUserSource"], + }, + ], + "values": [ + { + "summarizeFunction": "SUM", + "sourceColumnOffset": col_dict["eventCount"], + } + ], + "valueLayout": "HORIZONTAL", + } + } + first_visit_rt_pt_def = { + "pivotTable": { + "source": { + # First Sheet (Sheet1) is always ID 0 + "sheetId": 0, + }, + "rows": [ + { + "sourceColumnOffset": col_dict["eventName"], + "showTotals": True, + "sortOrder": "ASCENDING", + }, + { + "sourceColumnOffset": col_dict["firstUserSource"], + "showTotals": True, + "sortOrder": "ASCENDING", + }, + { + "sourceColumnOffset": col_dict["eventCount"], + "showTotals": True, + "sortOrder": "ASCENDING", + }, + ], + "filterSpecs": [ + { + "filterCriteria": { + "condition": { + "type": "TEXT_CONTAINS", + "values": [ + { + "userEnteredValue": "first_visit", + } + ], }, - { - "filterCriteria": { - "condition": { - "type": "TEXT_CONTAINS", - "values": [ - { - "userEnteredValue": "facebook", - }, - ], + "visibleByDefault": True, + }, + "columnOffsetIndex": col_dict["eventName"], + }, + { + "filterCriteria": { + "condition": { + "type": "TEXT_CONTAINS", + "values": [ + { + "userEnteredValue": "rt", }, - "visibleByDefault": True, - }, - "columnOffsetIndex": col_dict["firstUserSource"], + ], }, - ], - "values": [ - { - "summarizeFunction": "SUM", - "sourceColumnOffset": col_dict["eventCount"], - } - ], - "valueLayout": "HORIZONTAL", + "visibleByDefault": True, + }, + "columnOffsetIndex": col_dict["firstUserSource"], + }, + ], + "values": [ + { + "summarizeFunction": "SUM", + "sourceColumnOffset": col_dict["eventCount"], } - } - ), + ], + "valueLayout": "HORIZONTAL", + } + } + + add_pivot_tables( + sheets_id, + page_id, + first_visit_facebook_pt_def, ) - 
# Add first visit pivot table, RT add_pivot_tables( sheets_id, page_id, - ( - { - "pivotTable": { - "source": { - # First Sheet (Sheet1) is always ID 0 - "sheetId": 0, - }, - "rows": [ - { - "sourceColumnOffset": col_dict["eventName"], - "showTotals": True, - "sortOrder": "ASCENDING", - }, - { - "sourceColumnOffset": col_dict["firstUserSource"], - "showTotals": True, - "sortOrder": "ASCENDING", - }, - { - "sourceColumnOffset": col_dict["eventCount"], - "showTotals": True, - "sortOrder": "ASCENDING", - }, - ], - "filterSpecs": [ - { - "filterCriteria": { - "condition": { - "type": "TEXT_CONTAINS", - "values": [ - { - "userEnteredValue": "first_visit", - } - ], - }, - "visibleByDefault": True, - }, - "columnOffsetIndex": col_dict["eventName"], - }, - { - "filterCriteria": { - "condition": { - "type": "TEXT_CONTAINS", - "values": [ - { - "userEnteredValue": "rt", - }, - ], - }, - "visibleByDefault": True, - }, - "columnOffsetIndex": col_dict["firstUserSource"], - }, - ], - "values": [ - { - "summarizeFunction": "SUM", - "sourceColumnOffset": col_dict["eventCount"], - } - ], - "valueLayout": "HORIZONTAL", - } - } - ), + first_visit_rt_pt_def, row_idx=0, col_idx=5, ) @@ -377,232 +374,212 @@ def create_first_visit_pt(sheets_id, page_id, col_dict): def create_session_start_pt(sheets_id, page_id, col_dict): # Add sessions pivot table, facebook - add_pivot_tables( - sheets_id, - page_id, - ( - { - "pivotTable": { - "source": { - # First Sheet (Sheet1) is always ID 0 - "sheetId": 0, - }, - "rows": [ - { - "sourceColumnOffset": col_dict["eventName"], - "showTotals": True, - "sortOrder": "ASCENDING", - }, - { - "sourceColumnOffset": col_dict["firstUserSource"], - "showTotals": True, - "sortOrder": "ASCENDING", - }, - { - "sourceColumnOffset": col_dict["eventCount"], - "showTotals": True, - "sortOrder": "ASCENDING", - }, - ], - "filterSpecs": [ - { - "filterCriteria": { - "condition": { - "type": "TEXT_CONTAINS", - "values": [ - { - "userEnteredValue": "session_start", - } - ], - }, - "visibleByDefault": True, - }, - "columnOffsetIndex": col_dict["eventName"], + sessions_facebook_pt_def = { + "pivotTable": { + "source": { + # First Sheet (Sheet1) is always ID 0 + "sheetId": 0, + }, + "rows": [ + { + "sourceColumnOffset": col_dict["eventName"], + "showTotals": True, + "sortOrder": "ASCENDING", + }, + { + "sourceColumnOffset": col_dict["firstUserSource"], + "showTotals": True, + "sortOrder": "ASCENDING", + }, + { + "sourceColumnOffset": col_dict["eventCount"], + "showTotals": True, + "sortOrder": "ASCENDING", + }, + ], + "filterSpecs": [ + { + "filterCriteria": { + "condition": { + "type": "TEXT_CONTAINS", + "values": [ + { + "userEnteredValue": "session_start", + } + ], }, - { - "filterCriteria": { - "condition": { - "type": "TEXT_CONTAINS", - "values": [ - { - "userEnteredValue": "facebook", - }, - ], + "visibleByDefault": True, + }, + "columnOffsetIndex": col_dict["eventName"], + }, + { + "filterCriteria": { + "condition": { + "type": "TEXT_CONTAINS", + "values": [ + { + "userEnteredValue": "facebook", }, - "visibleByDefault": True, - }, - "columnOffsetIndex": col_dict["firstUserSource"], + ], }, - ], - "values": [ - { - "summarizeFunction": "SUM", - "sourceColumnOffset": col_dict["eventCount"], - } - ], - "valueLayout": "HORIZONTAL", - } - } - ), - ) - # Add sessions pivot table, rt - add_pivot_tables( - sheets_id, - page_id, - ( - { - "pivotTable": { - "source": { - # First Sheet (Sheet1) is always ID 0 - "sheetId": 0, + "visibleByDefault": True, }, - "rows": [ - { - "sourceColumnOffset": 
col_dict["eventName"], - "showTotals": True, - "sortOrder": "ASCENDING", - }, - { - "sourceColumnOffset": col_dict["firstUserSource"], - "showTotals": True, - "sortOrder": "ASCENDING", - }, - { - "sourceColumnOffset": col_dict["eventCount"], - "showTotals": True, - "sortOrder": "ASCENDING", - }, - ], - "filterSpecs": [ - { - "filterCriteria": { - "condition": { - "type": "TEXT_CONTAINS", - "values": [ - { - "userEnteredValue": "session_start", - } - ], - }, - "visibleByDefault": True, - }, - "columnOffsetIndex": col_dict["eventName"], + "columnOffsetIndex": col_dict["firstUserSource"], + }, + ], + "values": [ + { + "summarizeFunction": "SUM", + "sourceColumnOffset": col_dict["eventCount"], + } + ], + "valueLayout": "HORIZONTAL", + } + } + sessions_rt_pt_def = { + "pivotTable": { + "source": { + # First Sheet (Sheet1) is always ID 0 + "sheetId": 0, + }, + "rows": [ + { + "sourceColumnOffset": col_dict["eventName"], + "showTotals": True, + "sortOrder": "ASCENDING", + }, + { + "sourceColumnOffset": col_dict["firstUserSource"], + "showTotals": True, + "sortOrder": "ASCENDING", + }, + { + "sourceColumnOffset": col_dict["eventCount"], + "showTotals": True, + "sortOrder": "ASCENDING", + }, + ], + "filterSpecs": [ + { + "filterCriteria": { + "condition": { + "type": "TEXT_CONTAINS", + "values": [ + { + "userEnteredValue": "session_start", + } + ], }, - { - "filterCriteria": { - "condition": { - "type": "TEXT_CONTAINS", - "values": [ - { - "userEnteredValue": "rt", - }, - ], + "visibleByDefault": True, + }, + "columnOffsetIndex": col_dict["eventName"], + }, + { + "filterCriteria": { + "condition": { + "type": "TEXT_CONTAINS", + "values": [ + { + "userEnteredValue": "rt", }, - "visibleByDefault": True, - }, - "columnOffsetIndex": col_dict["firstUserSource"], + ], }, - ], - "values": [ - { - "summarizeFunction": "SUM", - "sourceColumnOffset": col_dict["eventCount"], - } - ], - "valueLayout": "HORIZONTAL", + "visibleByDefault": True, + }, + "columnOffsetIndex": col_dict["firstUserSource"], + }, + ], + "values": [ + { + "summarizeFunction": "SUM", + "sourceColumnOffset": col_dict["eventCount"], } - } - ), - row_idx=0, - col_idx=5, - ) + ], + "valueLayout": "HORIZONTAL", + } + } + + add_pivot_tables(sheets_id, page_id, sessions_facebook_pt_def) + add_pivot_tables(sheets_id, page_id, sessions_rt_pt_def, row_idx=0, col_idx=5) def create_clicks_pt(sheets_id, page_id, col_dict): - add_pivot_tables( - sheets_id, - page_id, - ( - { - "pivotTable": { - "source": { - # First Sheet (Sheet1) is always ID 0 - "sheetId": 0, - }, - "rows": [ - { - "sourceColumnOffset": col_dict["eventName"], - "showTotals": True, - "sortOrder": "ASCENDING", - }, - { - "sourceColumnOffset": col_dict["eventCount"], - "showTotals": True, - "sortOrder": "ASCENDING", - }, - ], - "values": [ - { - "summarizeFunction": "SUM", - "sourceColumnOffset": col_dict["eventCount"], - } - ], - "valueLayout": "HORIZONTAL", + clicks_pt_def = { + "pivotTable": { + "source": { + # First Sheet (Sheet1) is always ID 0 + "sheetId": 0, + }, + "rows": [ + { + "sourceColumnOffset": col_dict["eventName"], + "showTotals": True, + "sortOrder": "ASCENDING", + }, + { + "sourceColumnOffset": col_dict["eventCount"], + "showTotals": True, + "sortOrder": "ASCENDING", + }, + ], + "values": [ + { + "summarizeFunction": "SUM", + "sourceColumnOffset": col_dict["eventCount"], } - } - ), - ) + ], + "valueLayout": "HORIZONTAL", + } + } + + add_pivot_tables(sheets_id, page_id, clicks_pt_def) def create_feedback_pt(sheets_id, page_id, col_dict): - add_pivot_tables( - 
sheets_id, - page_id, - ( - { - "pivotTable": { - "source": { - "sheetId": 0, - }, - "rows": [ - { - "sourceColumnOffset": col_dict["eventName"], - "showTotals": True, - "sortOrder": "ASCENDING", - }, - { - "sourceColumnOffset": col_dict["eventCount"], - "showTotals": True, - "sortOrder": "ASCENDING", - }, - ], - "filterSpecs": [ - { - "filterCriteria": { - "condition": { - "type": "TEXT_CONTAINS", - "values": [ - { - "userEnteredValue": "feedback", - } - ], - }, - "visibleByDefault": True, - }, - "columnOffsetIndex": col_dict["linkUrl"], + feedback_pt_def = { + "pivotTable": { + "source": { + "sheetId": 0, + }, + "rows": [ + { + "sourceColumnOffset": col_dict["eventName"], + "showTotals": True, + "sortOrder": "ASCENDING", + }, + { + "sourceColumnOffset": col_dict["eventCount"], + "showTotals": True, + "sortOrder": "ASCENDING", + }, + ], + "filterSpecs": [ + { + "filterCriteria": { + "condition": { + "type": "TEXT_CONTAINS", + "values": [ + { + "userEnteredValue": "feedback", + } + ], }, - ], - "values": [ - { - "summarizeFunction": "SUM", - "sourceColumnOffset": col_dict["eventCount"], - } - ], - "valueLayout": "HORIZONTAL", + "visibleByDefault": True, + }, + "columnOffsetIndex": col_dict["linkUrl"], + }, + ], + "values": [ + { + "summarizeFunction": "SUM", + "sourceColumnOffset": col_dict["eventCount"], } - } - ), - ) + ], + "valueLayout": "HORIZONTAL", + } + } + + add_pivot_tables(sheets_id, page_id, feedback_pt_def) def upload_participant( diff --git a/gdrive/sheets_entities.py b/gdrive/sheets_entities.py new file mode 100644 index 0000000..e69de29 From c64b322843ae267837338c519eed05a39e687082 Mon Sep 17 00:00:00 2001 From: nathan-moore-97 Date: Mon, 30 Oct 2023 00:31:06 -0400 Subject: [PATCH 17/17] Moving product specific code out of analytics api and clients --- gdrive/analytics_api.py | 92 +------ gdrive/idva_flow_analytics.py | 492 ++++++++++++++++++++++++++++++++++ gdrive/sheets_client.py | 402 --------------------------- gdrive/sheets_entities.py | 0 4 files changed, 497 insertions(+), 489 deletions(-) create mode 100644 gdrive/idva_flow_analytics.py delete mode 100644 gdrive/sheets_entities.py diff --git a/gdrive/analytics_api.py b/gdrive/analytics_api.py index 89880ec..e58668e 100644 --- a/gdrive/analytics_api.py +++ b/gdrive/analytics_api.py @@ -8,10 +8,8 @@ import fastapi from pydantic import BaseModel -from fastapi import BackgroundTasks, responses -import pandas as pd - -from gdrive import error, settings, analytics_client, sheets_client, drive_client +from fastapi import responses +from gdrive import analytics_client, idva_flow_analytics log = logging.getLogger(__name__) router = fastapi.APIRouter() @@ -54,7 +52,7 @@ async def run_analytics_default(req: Optional[AnalyticsRequest] = None): @router.post("/analytics/list") async def list_accounts(): - list_accounts_task() + list_accounts() return responses.JSONResponse( status_code=202, content="List request is being processed." 
) @@ -62,18 +60,12 @@ async def list_accounts(): def run_analytics(start_date: datetime, end_date: datetime): try: - response = analytics_client.download( - settings.ANALYTICS_PROPERTY_ID, start_date, end_date - ) - - analytics_df = analytics_client.create_df_from_analytics_response(response) - sheets_id = export(analytics_df, start_date, end_date) - create_pages_and_pivot_tables(analytics_df, sheets_id=sheets_id) + idva_flow_analytics.create_report(start_date, end_date) except Exception as e: log.error(e) -def list_accounts_task(): +def list_accounts(): try: list_response = analytics_client.list() if list_response is not None: @@ -89,77 +81,3 @@ def list_accounts_task(): ) except Exception as e: log.error(e.args) - - -def export( - df: pd.DataFrame, date_of_report: datetime, end_date: datetime = None -) -> str: - """ - Transform the downloaded response from the google analytics API into a - Google Sheets Object. - - This function first touches a Google Sheets object with the drive API, then - writes the analytics data to that object. As of right now there is no way to do - this in one API transaction. - - Args: - df (pandas.DataFrame): Tabular data to export to Google Sheets object - date_of_report (datetime): Date the report was run - Returns: - str: Google Sheets ID of the new Sheets object - """ - filename_str = generate_filename(date_of_report, end_date) - analytics_folder_id = drive_client.create_folder( - "Google Analytics", parent_id=settings.ANALYTICS_ROOT - ) - - # We have to do this in multiple steps with more than one client because the Sheets API - # doesnt support opening a file in a given directory. - sheets_id = drive_client.create_empty_spreadsheet(filename_str, analytics_folder_id) - log.info("Uploading to folder %s (%s)" % ("Google Analytics", analytics_folder_id)) - result = sheets_client.export_df_to_gdrive_speadsheet(df, sheets_id) - log.info( - "Successfully created %s (%s)" % (filename_str, result.get("spreadsheetId")) - ) - return sheets_id - - -def create_pages_and_pivot_tables(df: pd.DataFrame, sheets_id: str): - """ - Add new pages and pivot tables. - - This function is fairly naive and inefficient. If we ever want to make Google Sheets - more often than once a day, we should refactor this to limit the number of API transactions. 
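With the product-specific routine pulled out of the API module, idva_flow_analytics.create_report is the single entry point that downloads the GA4 report, exports it to a spreadsheet under the configured Drive folder, and adds the pivot-table pages; run_analytics above is now just a thin wrapper around it. A minimal sketch of calling it directly, for example from a one-off script, assuming credentials plus ANALYTICS_PROPERTY_ID and ANALYTICS_ROOT are configured in gdrive.settings (the range is illustrative):

from datetime import datetime, timedelta

from gdrive import idva_flow_analytics

# Report on the previous seven days; create_report downloads the GA4 data,
# writes it to a new spreadsheet, and builds the pivot-table pages.
end = datetime.today()
start = end - timedelta(days=7)
idva_flow_analytics.create_report(start, end)
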
- - Args: - df (pandas.DataFrame): Tabular data in the spreadsheet - sheets_id (str): Google Sheets object ID - """ - - page1 = "Rekrewt Pivot Table - First Visit" - page2 = "Rekrewt Pivot Table - Sessions" - page3 = "GSA Use Pivot Table" - page4 = "Completions" - - new_sheet_name_to_id = sheets_client.add_new_pages( - [page1, page2, page3, page4], sheets_id - ) - log.info("Added %s pages to %s" % (len(new_sheet_name_to_id.keys()), sheets_id)) - sheets_client.create_pivot_tables( - df, (page1, page2, page3, page4), new_sheet_name_to_id, sheets_id - ) - - -def generate_filename(date: datetime, end_date: datetime = None): - """ - Return filename for the new spreadsheet to be saved as - - Args: - date (datetime): date to format - Return: - str: Formatted Date - """ - ret = date.strftime("%Y%m%d") - if end_date is not None and end_date != date: - ret += "-%s" % (end_date.strftime("%Y%m%d")) - return ret diff --git a/gdrive/idva_flow_analytics.py b/gdrive/idva_flow_analytics.py new file mode 100644 index 0000000..db7e020 --- /dev/null +++ b/gdrive/idva_flow_analytics.py @@ -0,0 +1,492 @@ +import datetime +import pandas as pd +import logging + +from gdrive import settings, sheets_client, drive_client, analytics_client + +log = logging.getLogger(__name__) + + +def create_report(start_date: datetime, end_date: datetime): + response = analytics_client.download( + settings.ANALYTICS_PROPERTY_ID, start_date, end_date + ) + + analytics_df = analytics_client.create_df_from_analytics_response(response) + sheets_id = export(analytics_df, start_date, end_date) + create_pages_and_pivot_tables(analytics_df, sheets_id=sheets_id) + + +def export( + df: pd.DataFrame, date_of_report: datetime, end_date: datetime = None +) -> str: + """ + Transform the downloaded response from the google analytics API into a + Google Sheets Object. + + This function first touches a Google Sheets object with the drive API, then + writes the analytics data to that object. As of right now there is no way to do + this in one API transaction. + + Args: + df (pandas.DataFrame): Tabular data to export to Google Sheets object + date_of_report (datetime): Date the report was run + Returns: + str: Google Sheets ID of the new Sheets object + """ + filename_str = generate_filename(date_of_report, end_date) + analytics_folder_id = drive_client.create_folder( + "Google Analytics", parent_id=settings.ANALYTICS_ROOT + ) + + # We have to do this in multiple steps with more than one client because the Sheets API + # doesnt support opening a file in a given directory. + sheets_id = drive_client.create_empty_spreadsheet(filename_str, analytics_folder_id) + log.info("Uploading to folder %s (%s)" % ("Google Analytics", analytics_folder_id)) + result = sheets_client.export_df_to_gdrive_speadsheet(df, sheets_id) + log.info( + "Successfully created %s (%s)" % (filename_str, result.get("spreadsheetId")) + ) + return sheets_id + + +def create_pages_and_pivot_tables(df: pd.DataFrame, sheets_id: str): + """ + Add new pages and pivot tables. + + This function is fairly naive and inefficient. If we ever want to make Google Sheets + more often than once a day, we should refactor this to limit the number of API transactions. 
+ + Args: + df (pandas.DataFrame): Tabular data in the spreadsheet + sheets_id (str): Google Sheets object ID + """ + + page1 = "Rekrewt Pivot Table - First Visit" + page2 = "Rekrewt Pivot Table - Sessions" + page3 = "GSA Use Pivot Table" + page4 = "Completions" + + new_sheet_name_to_id = sheets_client.add_new_pages( + [page1, page2, page3, page4], sheets_id + ) + log.info("Added %s pages to %s" % (len(new_sheet_name_to_id.keys()), sheets_id)) + create_pivot_tables( + df, (page1, page2, page3, page4), new_sheet_name_to_id, sheets_id + ) + + +def create_pivot_tables( + df: pd.DataFrame, page_names: (str, str, str), names_to_id: dict, sheets_id: str +): + # Make a dictionary mapping the name of the column to its index, useful for the pivot tables. + col_dict = {} + for idx, val in enumerate(df.iloc[0]): + col_dict[val] = idx + + create_first_visit_pt(sheets_id, names_to_id[page_names[0]], col_dict) + log.info( + "Added 2 pivot tables to %s (%s)" % (page_names[0], names_to_id[page_names[0]]) + ) + + create_session_start_pt(sheets_id, names_to_id[page_names[1]], col_dict) + log.info( + "Added 2 pivot tables to %s (%s)" % (page_names[1], names_to_id[page_names[1]]) + ) + + create_clicks_pt(sheets_id, names_to_id[page_names[2]], col_dict) + log.info( + "Added pivot table to %s (%s)" % (page_names[2], names_to_id[page_names[2]]) + ) + + create_feedback_pt(sheets_id, names_to_id[page_names[3]], col_dict) + log.info( + "Added pivot table to %s (%s)" % (page_names[3], names_to_id[page_names[3]]) + ) + + sheets_client.update_cell_value( + sheets_id, page_names[0], "A17", "Total First Visits" + ) + sheets_client.update_cell_value( + sheets_id, + page_names[0], + "A18", + '=GETPIVOTDATA("SUM of eventCount",A1, "eventName", "first_visit") + GETPIVOTDATA("SUM of eventCount",F1, "eventName", "first_visit")', + ) + log.info("Wrote totals to %s" % (page_names[0])) + + sheets_client.update_cell_value(sheets_id, page_names[1], "A17", "Total Sessions") + sheets_client.update_cell_value( + sheets_id, + page_names[1], + "A18", + '=GETPIVOTDATA("SUM of eventCount",A1, "eventName", "session_start") + GETPIVOTDATA("SUM of eventCount",F1, "eventName", "session_start")', + ) + log.info("Wrote totals to %s" % (page_names[1])) + + +def create_first_visit_pt(sheets_id, page_id, col_dict): + first_visit_facebook_pt_def = { + "pivotTable": { + "source": { + # First Sheet (Sheet1) is always ID 0 + "sheetId": 0, + }, + "rows": [ + { + "sourceColumnOffset": col_dict["eventName"], + "showTotals": True, + "sortOrder": "ASCENDING", + }, + { + "sourceColumnOffset": col_dict["firstUserSource"], + "showTotals": True, + "sortOrder": "ASCENDING", + }, + { + "sourceColumnOffset": col_dict["eventCount"], + "showTotals": True, + "sortOrder": "ASCENDING", + }, + ], + "filterSpecs": [ + { + "filterCriteria": { + "condition": { + "type": "TEXT_CONTAINS", + "values": [ + { + "userEnteredValue": "first_visit", + } + ], + }, + "visibleByDefault": True, + }, + "columnOffsetIndex": col_dict["eventName"], + }, + { + "filterCriteria": { + "condition": { + "type": "TEXT_CONTAINS", + "values": [ + { + "userEnteredValue": "facebook", + }, + ], + }, + "visibleByDefault": True, + }, + "columnOffsetIndex": col_dict["firstUserSource"], + }, + ], + "values": [ + { + "summarizeFunction": "SUM", + "sourceColumnOffset": col_dict["eventCount"], + } + ], + "valueLayout": "HORIZONTAL", + } + } + first_visit_rt_pt_def = { + "pivotTable": { + "source": { + # First Sheet (Sheet1) is always ID 0 + "sheetId": 0, + }, + "rows": [ + { + "sourceColumnOffset": 
col_dict["eventName"], + "showTotals": True, + "sortOrder": "ASCENDING", + }, + { + "sourceColumnOffset": col_dict["firstUserSource"], + "showTotals": True, + "sortOrder": "ASCENDING", + }, + { + "sourceColumnOffset": col_dict["eventCount"], + "showTotals": True, + "sortOrder": "ASCENDING", + }, + ], + "filterSpecs": [ + { + "filterCriteria": { + "condition": { + "type": "TEXT_CONTAINS", + "values": [ + { + "userEnteredValue": "first_visit", + } + ], + }, + "visibleByDefault": True, + }, + "columnOffsetIndex": col_dict["eventName"], + }, + { + "filterCriteria": { + "condition": { + "type": "TEXT_CONTAINS", + "values": [ + { + "userEnteredValue": "rt", + }, + ], + }, + "visibleByDefault": True, + }, + "columnOffsetIndex": col_dict["firstUserSource"], + }, + ], + "values": [ + { + "summarizeFunction": "SUM", + "sourceColumnOffset": col_dict["eventCount"], + } + ], + "valueLayout": "HORIZONTAL", + } + } + + sheets_client.add_pivot_tables( + sheets_id, + page_id, + first_visit_facebook_pt_def, + ) + sheets_client.add_pivot_tables( + sheets_id, + page_id, + first_visit_rt_pt_def, + row_idx=0, + col_idx=5, + ) + + +def create_session_start_pt(sheets_id, page_id, col_dict): + # Add sessions pivot table, facebook + sessions_facebook_pt_def = { + "pivotTable": { + "source": { + # First Sheet (Sheet1) is always ID 0 + "sheetId": 0, + }, + "rows": [ + { + "sourceColumnOffset": col_dict["eventName"], + "showTotals": True, + "sortOrder": "ASCENDING", + }, + { + "sourceColumnOffset": col_dict["firstUserSource"], + "showTotals": True, + "sortOrder": "ASCENDING", + }, + { + "sourceColumnOffset": col_dict["eventCount"], + "showTotals": True, + "sortOrder": "ASCENDING", + }, + ], + "filterSpecs": [ + { + "filterCriteria": { + "condition": { + "type": "TEXT_CONTAINS", + "values": [ + { + "userEnteredValue": "session_start", + } + ], + }, + "visibleByDefault": True, + }, + "columnOffsetIndex": col_dict["eventName"], + }, + { + "filterCriteria": { + "condition": { + "type": "TEXT_CONTAINS", + "values": [ + { + "userEnteredValue": "facebook", + }, + ], + }, + "visibleByDefault": True, + }, + "columnOffsetIndex": col_dict["firstUserSource"], + }, + ], + "values": [ + { + "summarizeFunction": "SUM", + "sourceColumnOffset": col_dict["eventCount"], + } + ], + "valueLayout": "HORIZONTAL", + } + } + sessions_rt_pt_def = { + "pivotTable": { + "source": { + # First Sheet (Sheet1) is always ID 0 + "sheetId": 0, + }, + "rows": [ + { + "sourceColumnOffset": col_dict["eventName"], + "showTotals": True, + "sortOrder": "ASCENDING", + }, + { + "sourceColumnOffset": col_dict["firstUserSource"], + "showTotals": True, + "sortOrder": "ASCENDING", + }, + { + "sourceColumnOffset": col_dict["eventCount"], + "showTotals": True, + "sortOrder": "ASCENDING", + }, + ], + "filterSpecs": [ + { + "filterCriteria": { + "condition": { + "type": "TEXT_CONTAINS", + "values": [ + { + "userEnteredValue": "session_start", + } + ], + }, + "visibleByDefault": True, + }, + "columnOffsetIndex": col_dict["eventName"], + }, + { + "filterCriteria": { + "condition": { + "type": "TEXT_CONTAINS", + "values": [ + { + "userEnteredValue": "rt", + }, + ], + }, + "visibleByDefault": True, + }, + "columnOffsetIndex": col_dict["firstUserSource"], + }, + ], + "values": [ + { + "summarizeFunction": "SUM", + "sourceColumnOffset": col_dict["eventCount"], + } + ], + "valueLayout": "HORIZONTAL", + } + } + + sheets_client.add_pivot_tables(sheets_id, page_id, sessions_facebook_pt_def) + sheets_client.add_pivot_tables( + sheets_id, page_id, sessions_rt_pt_def, row_idx=0, 
col_idx=5 + ) + + +def create_clicks_pt(sheets_id, page_id, col_dict): + clicks_pt_def = { + "pivotTable": { + "source": { + # First Sheet (Sheet1) is always ID 0 + "sheetId": 0, + }, + "rows": [ + { + "sourceColumnOffset": col_dict["eventName"], + "showTotals": True, + "sortOrder": "ASCENDING", + }, + { + "sourceColumnOffset": col_dict["eventCount"], + "showTotals": True, + "sortOrder": "ASCENDING", + }, + ], + "values": [ + { + "summarizeFunction": "SUM", + "sourceColumnOffset": col_dict["eventCount"], + } + ], + "valueLayout": "HORIZONTAL", + } + } + + sheets_client.add_pivot_tables(sheets_id, page_id, clicks_pt_def) + + +def create_feedback_pt(sheets_id, page_id, col_dict): + feedback_pt_def = { + "pivotTable": { + "source": { + "sheetId": 0, + }, + "rows": [ + { + "sourceColumnOffset": col_dict["eventName"], + "showTotals": True, + "sortOrder": "ASCENDING", + }, + { + "sourceColumnOffset": col_dict["eventCount"], + "showTotals": True, + "sortOrder": "ASCENDING", + }, + ], + "filterSpecs": [ + { + "filterCriteria": { + "condition": { + "type": "TEXT_CONTAINS", + "values": [ + { + "userEnteredValue": "feedback", + } + ], + }, + "visibleByDefault": True, + }, + "columnOffsetIndex": col_dict["linkUrl"], + }, + ], + "values": [ + { + "summarizeFunction": "SUM", + "sourceColumnOffset": col_dict["eventCount"], + } + ], + "valueLayout": "HORIZONTAL", + } + } + + sheets_client.add_pivot_tables(sheets_id, page_id, feedback_pt_def) + + +def generate_filename(date: datetime, end_date: datetime = None): + """ + Return filename for the new spreadsheet to be saved as + + Args: + date (datetime): date to format + Return: + str: Formatted Date + """ + ret = date.strftime("%Y%m%d") + if end_date is not None and end_date != date: + ret += "-%s" % (end_date.strftime("%Y%m%d")) + return ret diff --git a/gdrive/sheets_client.py b/gdrive/sheets_client.py index 770b211..2c766a5 100644 --- a/gdrive/sheets_client.py +++ b/gdrive/sheets_client.py @@ -23,8 +23,6 @@ Some batching in the future if the use case for this library gets heavier is necessary. """ -# Generic functions - def update_cell_value( sheet_id: str, page_name: str, range_str: str, value: str, vio="USER_ENTERED" @@ -182,406 +180,6 @@ def export_df_to_gdrive_speadsheet(df: pd.DataFrame, sheets_id: str, title="Shee return result -# Project specific functions - - -def create_pivot_tables( - df: pd.DataFrame, page_names: (str, str, str), names_to_id: dict, sheets_id: str -): - # Make a dictionary mapping the name of the column to its index, useful for the pivot tables. 
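For reference, the filename convention produced by the generate_filename helper in the new module, for single-day and ranged reports (the dates are illustrative, and importing idva_flow_analytics pulls in the Google clients, so credentials are assumed to be configured):

from datetime import datetime

from gdrive import idva_flow_analytics

# Single-day report: just the start date.
print(idva_flow_analytics.generate_filename(datetime(2023, 10, 1)))
# -> 20231001

# Ranged report: start and end joined with a hyphen.
print(idva_flow_analytics.generate_filename(datetime(2023, 10, 1), datetime(2023, 10, 7)))
# -> 20231001-20231007
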
- col_dict = {} - for idx, val in enumerate(df.iloc[0]): - col_dict[val] = idx - - create_first_visit_pt(sheets_id, names_to_id[page_names[0]], col_dict) - log.info( - "Added 2 pivot tables to %s (%s)" % (page_names[0], names_to_id[page_names[0]]) - ) - - create_session_start_pt(sheets_id, names_to_id[page_names[1]], col_dict) - log.info( - "Added 2 pivot tables to %s (%s)" % (page_names[1], names_to_id[page_names[1]]) - ) - - create_clicks_pt(sheets_id, names_to_id[page_names[2]], col_dict) - log.info( - "Added pivot table to %s (%s)" % (page_names[2], names_to_id[page_names[2]]) - ) - - create_feedback_pt(sheets_id, names_to_id[page_names[3]], col_dict) - log.info( - "Added pivot table to %s (%s)" % (page_names[3], names_to_id[page_names[3]]) - ) - - update_cell_value(sheets_id, page_names[0], "A17", "Total First Visits") - update_cell_value( - sheets_id, - page_names[0], - "A18", - '=GETPIVOTDATA("SUM of eventCount",A1, "eventName", "first_visit") + GETPIVOTDATA("SUM of eventCount",F1, "eventName", "first_visit")', - ) - log.info("Wrote totals to %s" % (page_names[0])) - - update_cell_value(sheets_id, page_names[1], "A17", "Total Sessions") - update_cell_value( - sheets_id, - page_names[1], - "A18", - '=GETPIVOTDATA("SUM of eventCount",A1, "eventName", "session_start") + GETPIVOTDATA("SUM of eventCount",F1, "eventName", "session_start")', - ) - log.info("Wrote totals to %s" % (page_names[1])) - - -def create_first_visit_pt(sheets_id, page_id, col_dict): - first_visit_facebook_pt_def = { - "pivotTable": { - "source": { - # First Sheet (Sheet1) is always ID 0 - "sheetId": 0, - }, - "rows": [ - { - "sourceColumnOffset": col_dict["eventName"], - "showTotals": True, - "sortOrder": "ASCENDING", - }, - { - "sourceColumnOffset": col_dict["firstUserSource"], - "showTotals": True, - "sortOrder": "ASCENDING", - }, - { - "sourceColumnOffset": col_dict["eventCount"], - "showTotals": True, - "sortOrder": "ASCENDING", - }, - ], - "filterSpecs": [ - { - "filterCriteria": { - "condition": { - "type": "TEXT_CONTAINS", - "values": [ - { - "userEnteredValue": "first_visit", - } - ], - }, - "visibleByDefault": True, - }, - "columnOffsetIndex": col_dict["eventName"], - }, - { - "filterCriteria": { - "condition": { - "type": "TEXT_CONTAINS", - "values": [ - { - "userEnteredValue": "facebook", - }, - ], - }, - "visibleByDefault": True, - }, - "columnOffsetIndex": col_dict["firstUserSource"], - }, - ], - "values": [ - { - "summarizeFunction": "SUM", - "sourceColumnOffset": col_dict["eventCount"], - } - ], - "valueLayout": "HORIZONTAL", - } - } - first_visit_rt_pt_def = { - "pivotTable": { - "source": { - # First Sheet (Sheet1) is always ID 0 - "sheetId": 0, - }, - "rows": [ - { - "sourceColumnOffset": col_dict["eventName"], - "showTotals": True, - "sortOrder": "ASCENDING", - }, - { - "sourceColumnOffset": col_dict["firstUserSource"], - "showTotals": True, - "sortOrder": "ASCENDING", - }, - { - "sourceColumnOffset": col_dict["eventCount"], - "showTotals": True, - "sortOrder": "ASCENDING", - }, - ], - "filterSpecs": [ - { - "filterCriteria": { - "condition": { - "type": "TEXT_CONTAINS", - "values": [ - { - "userEnteredValue": "first_visit", - } - ], - }, - "visibleByDefault": True, - }, - "columnOffsetIndex": col_dict["eventName"], - }, - { - "filterCriteria": { - "condition": { - "type": "TEXT_CONTAINS", - "values": [ - { - "userEnteredValue": "rt", - }, - ], - }, - "visibleByDefault": True, - }, - "columnOffsetIndex": col_dict["firstUserSource"], - }, - ], - "values": [ - { - "summarizeFunction": "SUM", - 
"sourceColumnOffset": col_dict["eventCount"], - } - ], - "valueLayout": "HORIZONTAL", - } - } - - add_pivot_tables( - sheets_id, - page_id, - first_visit_facebook_pt_def, - ) - add_pivot_tables( - sheets_id, - page_id, - first_visit_rt_pt_def, - row_idx=0, - col_idx=5, - ) - - -def create_session_start_pt(sheets_id, page_id, col_dict): - # Add sessions pivot table, facebook - sessions_facebook_pt_def = { - "pivotTable": { - "source": { - # First Sheet (Sheet1) is always ID 0 - "sheetId": 0, - }, - "rows": [ - { - "sourceColumnOffset": col_dict["eventName"], - "showTotals": True, - "sortOrder": "ASCENDING", - }, - { - "sourceColumnOffset": col_dict["firstUserSource"], - "showTotals": True, - "sortOrder": "ASCENDING", - }, - { - "sourceColumnOffset": col_dict["eventCount"], - "showTotals": True, - "sortOrder": "ASCENDING", - }, - ], - "filterSpecs": [ - { - "filterCriteria": { - "condition": { - "type": "TEXT_CONTAINS", - "values": [ - { - "userEnteredValue": "session_start", - } - ], - }, - "visibleByDefault": True, - }, - "columnOffsetIndex": col_dict["eventName"], - }, - { - "filterCriteria": { - "condition": { - "type": "TEXT_CONTAINS", - "values": [ - { - "userEnteredValue": "facebook", - }, - ], - }, - "visibleByDefault": True, - }, - "columnOffsetIndex": col_dict["firstUserSource"], - }, - ], - "values": [ - { - "summarizeFunction": "SUM", - "sourceColumnOffset": col_dict["eventCount"], - } - ], - "valueLayout": "HORIZONTAL", - } - } - sessions_rt_pt_def = { - "pivotTable": { - "source": { - # First Sheet (Sheet1) is always ID 0 - "sheetId": 0, - }, - "rows": [ - { - "sourceColumnOffset": col_dict["eventName"], - "showTotals": True, - "sortOrder": "ASCENDING", - }, - { - "sourceColumnOffset": col_dict["firstUserSource"], - "showTotals": True, - "sortOrder": "ASCENDING", - }, - { - "sourceColumnOffset": col_dict["eventCount"], - "showTotals": True, - "sortOrder": "ASCENDING", - }, - ], - "filterSpecs": [ - { - "filterCriteria": { - "condition": { - "type": "TEXT_CONTAINS", - "values": [ - { - "userEnteredValue": "session_start", - } - ], - }, - "visibleByDefault": True, - }, - "columnOffsetIndex": col_dict["eventName"], - }, - { - "filterCriteria": { - "condition": { - "type": "TEXT_CONTAINS", - "values": [ - { - "userEnteredValue": "rt", - }, - ], - }, - "visibleByDefault": True, - }, - "columnOffsetIndex": col_dict["firstUserSource"], - }, - ], - "values": [ - { - "summarizeFunction": "SUM", - "sourceColumnOffset": col_dict["eventCount"], - } - ], - "valueLayout": "HORIZONTAL", - } - } - - add_pivot_tables(sheets_id, page_id, sessions_facebook_pt_def) - add_pivot_tables(sheets_id, page_id, sessions_rt_pt_def, row_idx=0, col_idx=5) - - -def create_clicks_pt(sheets_id, page_id, col_dict): - clicks_pt_def = { - "pivotTable": { - "source": { - # First Sheet (Sheet1) is always ID 0 - "sheetId": 0, - }, - "rows": [ - { - "sourceColumnOffset": col_dict["eventName"], - "showTotals": True, - "sortOrder": "ASCENDING", - }, - { - "sourceColumnOffset": col_dict["eventCount"], - "showTotals": True, - "sortOrder": "ASCENDING", - }, - ], - "values": [ - { - "summarizeFunction": "SUM", - "sourceColumnOffset": col_dict["eventCount"], - } - ], - "valueLayout": "HORIZONTAL", - } - } - - add_pivot_tables(sheets_id, page_id, clicks_pt_def) - - -def create_feedback_pt(sheets_id, page_id, col_dict): - feedback_pt_def = { - "pivotTable": { - "source": { - "sheetId": 0, - }, - "rows": [ - { - "sourceColumnOffset": col_dict["eventName"], - "showTotals": True, - "sortOrder": "ASCENDING", - }, - { - 
"sourceColumnOffset": col_dict["eventCount"], - "showTotals": True, - "sortOrder": "ASCENDING", - }, - ], - "filterSpecs": [ - { - "filterCriteria": { - "condition": { - "type": "TEXT_CONTAINS", - "values": [ - { - "userEnteredValue": "feedback", - } - ], - }, - "visibleByDefault": True, - }, - "columnOffsetIndex": col_dict["linkUrl"], - }, - ], - "values": [ - { - "summarizeFunction": "SUM", - "sourceColumnOffset": col_dict["eventCount"], - } - ], - "valueLayout": "HORIZONTAL", - } - } - - add_pivot_tables(sheets_id, page_id, feedback_pt_def) - - def upload_participant( first, last, diff --git a/gdrive/sheets_entities.py b/gdrive/sheets_entities.py deleted file mode 100644 index e69de29..0000000