Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Performance Improvements in User Stats Processing #124

Merged
20 commits merged on Sep 18, 2024
Merged
12 changes: 7 additions & 5 deletions app_sidebar_collapsible.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,17 +210,19 @@ def make_controls():
'flex-direction': 'column'}
)

page_content = dcc.Loading(
type='default',
fullscreen=True,
children=html.Div(dash.page_container, style={
# The dcc.Loading wrapper was removed so that the Data page can lazy-load its content.
# TODO: Figure out how to enable dcc.Loading on everything except the Data page's UUIDs tab.
page_content = html.Div(
dash.page_container,
style={
TeachMeTW marked this conversation as resolved.
Show resolved Hide resolved
"margin-left": "5rem",
"margin-right": "2rem",
"padding": "2rem 1rem",
})
}
)



TeachMeTW marked this conversation as resolved.
Show resolved Hide resolved
def make_home_page(): return [
sidebar,
html.Div([make_controls(), page_content])
Expand Down
143 changes: 98 additions & 45 deletions pages/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,13 @@
Since the dcc.Location component is not in the layout when navigating to this page, it triggers the callback.
The workaround is to check if the input value is None.
"""
from dash import dcc, html, Input, Output, callback, register_page, dash_table, State
from dash import dcc, html, Input, Output, callback, register_page, dash_table, State, callback_context, Patch
# Etc
import logging
import time
import pandas as pd
from dash.exceptions import PreventUpdate

from concurrent.futures import ThreadPoolExecutor, as_completed
from utils import constants
from utils import permissions as perm_utils
from utils import db_utils
Expand All @@ -28,10 +29,14 @@
dcc.Tab(label='Trajectories', value='tab-trajectories-datatable'),
]),
html.Div(id='tabs-content'),
dcc.Interval(id='interval-load-more', interval=6000, n_intervals=0),
dcc.Store(id='store-uuids', data=[]), # Store to hold the original UUIDs data
dcc.Store(id='store-loaded-uuids', data={'data': [], 'loaded': False}) # Store to track loaded data
]
)



def clean_location_data(df):
if 'data.start_loc.coordinates' in df.columns:
df['data.start_loc.coordinates'] = df['data.start_loc.coordinates'].apply(lambda x: f'({x[0]}, {x[1]})')
Expand All @@ -51,6 +56,8 @@ def update_store_trajectories(start_date: str, end_date: str, tz: str, excluded_

@callback(
Output('tabs-content', 'children'),
Output('store-loaded-uuids', 'data'),
Output('interval-load-more', 'disabled'), # Disable interval when all data is loaded
Input('tabs-datatable', 'value'),
Input('store-uuids', 'data'),
Input('store-excluded-uuids', 'data'),
Expand All @@ -60,66 +67,110 @@ def update_store_trajectories(start_date: str, end_date: str, tz: str, excluded_
Input('date-picker', 'start_date'),
Input('date-picker', 'end_date'),
Input('date-picker-timezone', 'value'),
Input('interval-load-more', 'n_intervals'),# Interval to trigger the loading of more data
State('store-loaded-uuids', 'data'), # Use State to track already loaded data
State('store-loaded-uuids', 'loaded'), # Keep track if we have finished loading all data
)
def render_content(tab, store_uuids, store_excluded_uuids, store_trips, store_demographics, store_trajectories, start_date, end_date, timezone):
data, columns, has_perm = None, [], False
if tab == 'tab-uuids-datatable':
data = store_uuids["data"]
data = db_utils.add_user_stats(data)
columns = perm_utils.get_uuids_columns()
has_perm = perm_utils.has_permission('data_uuids')
def render_content(tab, store_uuids, store_excluded_uuids, store_trips, store_demographics, store_trajectories, start_date, end_date, timezone, n_intervals, loaded_uuids_store, all_data_loaded):
initial_batch_size = 10 # Define the batch size for loading UUIDs

# Ensure store_uuids contains the key 'data' which is a list of dictionaries
if not isinstance(store_uuids, dict) or 'data' not in store_uuids:
logging.error(f"Expected store_uuids to be a dict with a 'data' key, but got {type(store_uuids)}")
return html.Div([html.P("Data structure error.")]), loaded_uuids_store, True

# Extract the list of UUIDs from the dict
uuids_list = store_uuids['data']

# Ensure uuids_list is a list for slicing
if not isinstance(uuids_list, list):
logging.error(f"Expected store_uuids['data'] to be a list but got {type(uuids_list)}")
return html.Div([html.P("Data structure error.")]), loaded_uuids_store, True

# Retrieve already loaded data from the store
loaded_data = loaded_uuids_store.get('data', [])
total_loaded = len(loaded_data)

# Handle the UUIDs tab with lazy loading
if tab == 'tab-uuids-datatable' and not loaded_uuids_store.get('loaded', False):
total_to_load = total_loaded + initial_batch_size
total_to_load = min(total_to_load, len(uuids_list)) # Avoid loading more than available

logging.debug(f"Loading next batch of UUIDs: {total_loaded} to {total_to_load}")

# Slice the list of UUIDs from the dict
new_data = uuids_list[total_loaded:total_to_load]

if new_data:
# Process and append the new data to the loaded store
processed_data = db_utils.add_user_stats(new_data, initial_batch_size)
loaded_data.extend(processed_data)

# Create a Patch object to append data progressively
patched_data = Patch()
patched_data['data'] = processed_data

Comment on lines +109 to +112
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand how this is used. I see that the patched_data object is created here, but I don't see it used anywhere else in this PR. I even see that line 129 references this Patch object in a comment, but I don't see any of the Patch object methods, such as append.

Are we actually using patch? If not, what are we doing for lazy loading?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe this was a relic of a prior iteration I made; I can remove/clean this section in another PR.

# Update the store with the new data
loaded_uuids_store['data'] = loaded_data
loaded_uuids_store['loaded'] = len(loaded_data) >= len(uuids_list) # Mark all data as loaded if done

logging.debug(f"New batch loaded. Total loaded: {len(loaded_data)}")

# Prepare the data to be displayed
columns = perm_utils.get_uuids_columns() # Get the relevant columns
df = pd.DataFrame(loaded_data)

if df.empty or not perm_utils.has_permission('data_uuids'):
logging.debug("No data or permission issues.")
return html.Div([html.P("No data available or you don't have permission.")]), loaded_uuids_store, False

df = df.drop(columns=[col for col in df.columns if col not in columns])

# NOTE: the table is rebuilt from every row loaded so far on each batch; no Patch() partial update is applied here.
logging.debug("Returning patched data to update the UI.")
return html.Div([populate_datatable(df)]), loaded_uuids_store, False if not loaded_uuids_store['loaded'] else True


# Handle other tabs normally
elif tab == 'tab-trips-datatable':
data = store_trips["data"]
columns = perm_utils.get_allowed_trip_columns()
columns.update(
col['label'] for col in perm_utils.get_allowed_named_trip_columns()
)
columns.update(col['label'] for col in perm_utils.get_allowed_named_trip_columns())
columns.update(store_trips["userinputcols"])
has_perm = perm_utils.has_permission('data_trips')

df = pd.DataFrame(data)
if df.empty or not has_perm:
return None
return None, loaded_uuids_store, True

logging.debug(f"Final list of retained cols {columns=}")
logging.debug(f"Before dropping, {df.columns=}")
df = df.drop(columns=[col for col in df.columns if col not in columns])
logging.debug(f"After dropping, {df.columns=}")
df = clean_location_data(df)

trips_table = populate_datatable(df,'trips-table')
#Return an HTML Div containing a button (button-clicked) and the populated datatable
trips_table = populate_datatable(df, 'trips-table')
logging.debug(f"Returning 3 values: {trips_table}, {loaded_uuids_store}, True")
return html.Div([
html.Button(
'Display columns with raw units',
id='button-clicked', #identifier for the button
n_clicks=0, #initialize number of clicks to 0
style={'marginLeft':'5px'}
),
trips_table, #populated trips table component
])

html.Button('Display columns with raw units', id='button-clicked', n_clicks=0, style={'marginLeft': '5px'}),
trips_table
]), loaded_uuids_store, True

elif tab == 'tab-demographics-datatable':
data = store_demographics["data"]
has_perm = perm_utils.has_permission('data_demographics')
# if only one survey is available, process it without creating a subtab
if len(data) == 1:
# here data is a dictionary

if len(data) == 1:
data = list(data.values())[0]
columns = list(data[0].keys())
# for multiple survey, create subtabs for unique surveys
elif len(data) > 1:
#returns subtab only if has_perm is True
if not has_perm:
return None
return None, loaded_uuids_store
return html.Div([
dcc.Tabs(id='subtabs-demographics', value=list(data.keys())[0], children=[
dcc.Tab(label= key, value= key) for key in data
]),
dcc.Tab(label=key, value=key) for key in data
]),
html.Div(id='subtabs-demographics-content')
])
]), loaded_uuids_store, True

elif tab == 'tab-trajectories-datatable':
# Currently store_trajectories data is loaded only when the respective tab is selected
#Here we query for trajectory data once "Trajectories" tab is selected
(start_date, end_date) = iso_to_date_only(start_date, end_date)
if store_trajectories == {}:
store_trajectories = update_store_trajectories(start_date, end_date, timezone, store_excluded_uuids)
Expand All @@ -128,14 +179,17 @@ def render_content(tab, store_uuids, store_excluded_uuids, store_trips, store_de
columns = list(data[0].keys())
columns = perm_utils.get_trajectories_columns(columns)
has_perm = perm_utils.has_permission('data_trajectories')

df = pd.DataFrame(data)
if df.empty or not has_perm:
return None

df = df.drop(columns=[col for col in df.columns if col not in columns])
df = pd.DataFrame(data)
if df.empty or not has_perm:
return None, loaded_uuids_store

df = df.drop(columns=[col for col in df.columns if col not in columns])
return populate_datatable(df), loaded_uuids_store, True

# Default case: if no data is loaded or the tab is not handled
return None, loaded_uuids_store, True

return populate_datatable(df)

# handle subtabs for demographic table when there are multiple surveys
@callback(
Expand Down Expand Up @@ -177,7 +231,6 @@ def update_dropdowns_trips(n_clicks, button_label):
#return the list of hidden columns and the updated button label
return hidden_col, button_label


def populate_datatable(df, table_id=''):
if not isinstance(df, pd.DataFrame):
raise PreventUpdate
Expand Down
Loading