Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
ktmanley23 authored Dec 10, 2024
1 parent 466341c commit 7552232
Showing 1 changed file with 347 additions and 0 deletions.
347 changes: 347 additions & 0 deletions VisitationModelDataPrep.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,347 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "28c1cdfa-b828-40fc-b5b5-dc7688c619a1",
"metadata": {},
"outputs": [],
"source": [
"# Imports and raw-data loading for the trail visitation model prep.\n",
"# NOTE(review): all file paths below are absolute, machine-specific paths --\n",
"# consider a configurable DATA_DIR so the notebook runs on other machines.\n",
"import pandas as pd\n",
"import geopandas as gpd\n",
"from shapely.geometry import Point\n",
"from datetime import timedelta\n",
"import requests\n",
"from geopy.distance import geodesic\n",
"import holidays\n",
"\n",
"# Load the point datasets (social sensing data)\n",
"# NOTE(review): eBird and iNaturalist are read tab-separated but Flickr uses\n",
"# the default comma separator -- confirm the Flickr export really is comma-delimited.\n",
"points1 = pd.read_csv('/Users/kyma3609/Documents/Recreation_Wildfire/MORPHO/ProxyData/filtered_eBird2024_1km.csv', sep='\\t')\n",
"points2 = pd.read_csv('/Users/kyma3609/Documents/Recreation_Wildfire/MORPHO/ProxyData/filtered_COiNaturalist_1km.csv', sep='\\t')\n",
"points3 = pd.read_csv('/Users/kyma3609/Documents/Recreation_Wildfire/MORPHO/ProxyData/filtered_COFlickr_1km.csv')\n",
"\n",
"# Convert date columns to datetime format and to geodataframes\n",
"# (errors='coerce' turns unparseable dates into NaT rather than raising)\n",
"points1['eventDate'] = pd.to_datetime(points1['eventDate'], errors='coerce')\n",
"points2['eventDate'] = pd.to_datetime(points2['eventDate'], errors='coerce')\n",
"points3['eventDate'] = pd.to_datetime(points3['eventDate'], errors='coerce')\n",
"\n",
"# Build point geometries from the lon/lat columns; coordinates are WGS84 (EPSG:4326)\n",
"geometry1 = [Point(xy) for xy in zip(points1['decimalLongitude'], points1['decimalLatitude'])]\n",
"points_gdf1 = gpd.GeoDataFrame(points1, geometry=geometry1, crs='EPSG:4326')\n",
"\n",
"geometry2 = [Point(xy) for xy in zip(points2['decimalLongitude'], points2['decimalLatitude'])]\n",
"points_gdf2 = gpd.GeoDataFrame(points2, geometry=geometry2, crs='EPSG:4326')\n",
"\n",
"geometry3 = [Point(xy) for xy in zip(points3['decimalLongitude'], points3['decimalLatitude'])]\n",
"points_gdf3 = gpd.GeoDataFrame(points3, geometry=geometry3, crs='EPSG:4326')\n",
"\n",
"# Load the trails shapefile containing trails with training data\n",
"# NOTE(review): later cells assume this layer is projected (EPSG:26913) -- verify trails.crs\n",
"trails = gpd.read_file('/Users/kyma3609/Downloads/OSMP_Trails/OSMP_Trails.shp')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4b233d7b-4008-4cf4-8448-5a5ab3bb390d",
"metadata": {},
"outputs": [],
"source": [
"# Per-trail daily visitation counts from the three social-sensing datasets.\n",
"# For each selected trail: buffer the trail geometry, count points inside the\n",
"# buffer per day per source, zero-fill missing dates, and stack all trails\n",
"# into one long DataFrame.\n",
"\n",
"# Dictionary to map alternate trail names to dataset names where there are discrepancies\n",
"trail_name_mapping = {\n",
"    'Boulder Valley Ranch - Sage Trail': 'Boulder Valley - Sage Trail',\n",
"}\n",
"\n",
"# List of trails by name that have on-site data from OSMP to train/validate on\n",
"selected_trail_names = ['Doudy Draw', 'South Mesa Trail', 'Flatirons Vista', 'Boulder Valley Ranch - Sage Trail', 'Chautauqua Trail', 'Dakota Ridge', 'Eagle Trailhead', 'Foothills South', 'Sanitas Valley Trail', 'South Boulder Creek', 'South Boulder Creek West', 'Coal Seam TH Access', 'Sawhill Ponds', 'East Boulder - Teller Farm', 'East Boulder - White Rocks', 'Lehigh Connector - North', 'East Boulder - Gunbarrel', 'Joder Ranch TH', 'Green Mountain West Ridge', 'South Boulder Creek Marshall', 'Fowler Trail']\n",
"\n",
"# BUGFIX(review): gpd.sjoin does not reproject automatically. The point\n",
"# GeoDataFrames are EPSG:4326 (degrees) while the trails layer appears to be\n",
"# projected (EPSG:26913 is assumed for it later in this notebook), so joining\n",
"# them directly can silently match nothing. Reproject the points once, up front.\n",
"points_by_source = {\n",
"    'ebird': points_gdf1.to_crs(trails.crs),\n",
"    'inaturalist': points_gdf2.to_crs(trails.crs),\n",
"    'flickr': points_gdf3.to_crs(trails.crs),\n",
"}\n",
"\n",
"# Fixed reporting window: one row per day per trail, zero-filled below.\n",
"visitation_start_date = '2020-01-01'\n",
"visitation_end_date = '2024-12-31'\n",
"all_dates = pd.DataFrame({'date': pd.date_range(start=visitation_start_date, end=visitation_end_date)})\n",
"\n",
"# Collect one frame per trail and concatenate once at the end -- growing a\n",
"# DataFrame with pd.concat inside the loop is quadratic in the number of trails.\n",
"per_trail_frames = []\n",
"\n",
"for trail_name in selected_trail_names:\n",
"    print(f\"Processing {trail_name}...\")\n",
"    dataset_trail_name = trail_name_mapping.get(trail_name, trail_name)\n",
"    selected_trail = trails[trails['GISPROD313'] == dataset_trail_name]\n",
"\n",
"    if selected_trail.empty:\n",
"        print(f\"No data for trail: {trail_name}\")\n",
"        continue\n",
"\n",
"    # 30.48 m = 100 ft buffer around the trail (assumes a metric trails CRS).\n",
"    buffered_trail = selected_trail.buffer(30.48)\n",
"    if buffered_trail.is_empty.any():\n",
"        print(f\"Empty buffer for trail: {trail_name}\")\n",
"        continue\n",
"    buffer_gdf = gpd.GeoDataFrame(geometry=buffered_trail, crs=buffered_trail.crs)\n",
"\n",
"    # Spatial join to get the social sensing data within the buffer for each dataset\n",
"    points_within = {\n",
"        source: gpd.sjoin(points_gdf, buffer_gdf, how=\"inner\", predicate=\"within\")\n",
"        for source, points_gdf in points_by_source.items()\n",
"    }\n",
"    print(f\"{trail_name}: {len(points_within['ebird'])} eBird points, {len(points_within['inaturalist'])} iNaturalist points, {len(points_within['flickr'])} Flickr points\")\n",
"\n",
"    if all(len(joined) == 0 for joined in points_within.values()):\n",
"        print(f\"No points found within buffer for trail: {trail_name}\")\n",
"        continue\n",
"\n",
"    # Count visitation per day for each dataset; merge onto the full date range\n",
"    # so days without observations get an explicit zero.\n",
"    full_visitation = all_dates.copy()\n",
"    for source, joined in points_within.items():\n",
"        col = f'visitation_{source}'\n",
"        daily = (joined.assign(eventDate=joined['eventDate'].dt.normalize())\n",
"                 .groupby('eventDate').size().reset_index(name=col))\n",
"        full_visitation = (full_visitation\n",
"                           .merge(daily, left_on='date', right_on='eventDate', how='left')\n",
"                           .drop(columns=['eventDate'])\n",
"                           .fillna({col: 0}))\n",
"\n",
"    # Calculate the total visitation across the three sources\n",
"    full_visitation['visitation_count'] = full_visitation[['visitation_ebird', 'visitation_inaturalist', 'visitation_flickr']].sum(axis=1).astype(int)\n",
"    full_visitation['trail_name'] = trail_name\n",
"    per_trail_frames.append(full_visitation)\n",
"\n",
"all_trails_visitation = pd.concat(per_trail_frames, ignore_index=True) if per_trail_frames else pd.DataFrame()\n",
"all_trails_visitation.to_csv('/Users/kyma3609/Documents/Recreation_Wildfire/MORPHO/VisitationModelData_AllTrails.csv', index=False)\n",
"\n",
"print(\"Processing complete\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4c4967c8-5521-47e9-9394-30e3bc2a04c1",
"metadata": {},
"outputs": [],
"source": [
"# Append NOAA GHCND precipitation (PRCP) and max temperature (TMAX) to the\n",
"# daily visitation data, using the stations closest to the combined trails centroid.\n",
"import os\n",
"\n",
"# Load visitation data\n",
"visitation_data = pd.read_csv('/Users/kyma3609/Documents/Recreation_Wildfire/MORPHO/VisitationModelData_AllTrails.csv')\n",
"\n",
"# Load NOAA weather station data\n",
"weather_stations = pd.read_csv('/Users/kyma3609/Documents/Recreation_Wildfire/MORPHO/ghcnd_stations_final.csv')\n",
"\n",
"def get_mapped_trail_name(trail_name, mapping):\n",
"    \"\"\"Return the dataset's spelling of a trail name, falling back to the input.\"\"\"\n",
"    return mapping.get(trail_name, trail_name)\n",
"\n",
"# Select all trails based on the mapped names\n",
"selected_trails = trails[trails['GISPROD313'].isin([get_mapped_trail_name(name, trail_name_mapping) for name in selected_trail_names])]\n",
"if selected_trails.empty:\n",
"    raise ValueError(\"None of the selected trails were found in the shapefile.\")\n",
"# Single representative point for all trails: centroid of the union of the\n",
"# per-trail centroids. NOTE(review): unary_union is deprecated in newer\n",
"# shapely/geopandas in favor of union_all() -- switch when upgrading.\n",
"all_trails_centroid = selected_trails.geometry.centroid.unary_union.centroid\n",
"# NOTE(review): assumes the trails layer is in EPSG:26913 -- confirm trails.crs.\n",
"all_trails_centroid_geo = gpd.GeoSeries([all_trails_centroid], crs=\"EPSG:26913\").to_crs(epsg=4326).geometry.iloc[0]\n",
"print(f\"All trails centroid coordinates (latitude/longitude): {all_trails_centroid_geo.y}, {all_trails_centroid_geo.x}\")\n",
"\n",
"def find_closest_stations(trail_centroid, weather_stations, num_stations=25):\n",
"    \"\"\"Return the num_stations stations nearest (geodesic km) to trail_centroid.\n",
"\n",
"    trail_centroid is a lon/lat shapely point (.x = lon, .y = lat). Works on a\n",
"    copy so the caller's DataFrame is not mutated by the added 'distance' column.\n",
"    \"\"\"\n",
"    stations = weather_stations.copy()\n",
"    stations['distance'] = stations.apply(\n",
"        lambda row: geodesic((trail_centroid.y, trail_centroid.x), (row['Latitude'], row['Longitude'])).km, axis=1)\n",
"    return stations.sort_values('distance').head(num_stations)\n",
"\n",
"# NOAA API token -- never hardcode credentials in a committed notebook.\n",
"# Request one at https://www.ncdc.noaa.gov/cdo-web/token and export it as\n",
"# the NOAA_CDO_TOKEN environment variable before starting Jupyter.\n",
"token = os.environ.get('NOAA_CDO_TOKEN')\n",
"if not token:\n",
"    raise RuntimeError('Set the NOAA_CDO_TOKEN environment variable before running this cell.')\n",
"\n",
"def get_weather_data(token, station_ids, start_date, end_date, datatype):\n",
"    \"\"\"Fetch daily GHCND observations of one datatype for a list of stations.\n",
"\n",
"    The CDO API limits each request to one year, so every station is pulled in\n",
"    <=365-day windows. station_ids is expected in ascending-distance order;\n",
"    drop_duplicates(keep='first') below keeps the closest station's value for\n",
"    each date. Returns a DataFrame with one row per date (possibly empty).\n",
"    \"\"\"\n",
"    url = \"https://www.ncdc.noaa.gov/cdo-web/api/v2/data\"\n",
"    headers = {'token': token}\n",
"    all_data = pd.DataFrame()\n",
"\n",
"    for station_id in station_ids:\n",
"        current_start_date = pd.to_datetime(start_date)\n",
"        current_end_date = min(current_start_date + timedelta(days=364), pd.to_datetime(end_date))\n",
"        station_data = pd.DataFrame()\n",
"\n",
"        while current_start_date <= pd.to_datetime(end_date):\n",
"            params = {\n",
"                'datasetid': 'GHCND',\n",
"                'stationid': f'GHCND:{station_id}',\n",
"                'startdate': current_start_date.strftime('%Y-%m-%d'),\n",
"                'enddate': current_end_date.strftime('%Y-%m-%d'),\n",
"                'datatypeid': datatype,\n",
"                'limit': 1000,\n",
"                'units': 'metric'\n",
"            }\n",
"\n",
"            response = requests.get(url, params=params, headers=headers)\n",
"            if response.status_code == 200:\n",
"                data = response.json().get('results', [])\n",
"                if data:\n",
"                    df = pd.DataFrame(data)\n",
"                    df['station_id'] = station_id\n",
"                    station_data = pd.concat([station_data, df], ignore_index=True)\n",
"                else:\n",
"                    print(f\"No data available for {datatype} at station {station_id} from {current_start_date} to {current_end_date}.\")\n",
"            else:\n",
"                # give up on this station after an API error; move to the next\n",
"                print(f\"Error fetching data: {response.status_code} {response.text}\")\n",
"                break\n",
"\n",
"            current_start_date = current_end_date + timedelta(days=1)\n",
"            current_end_date = min(current_start_date + timedelta(days=364), pd.to_datetime(end_date))\n",
"\n",
"        if not station_data.empty:\n",
"            # keep='first' means earlier (closer) stations win for duplicate dates\n",
"            all_data = pd.concat([all_data, station_data], ignore_index=True).drop_duplicates(subset=['date'], keep='first')\n",
"\n",
"    return all_data\n",
"\n",
"# Find the closest weather stations and define start/end dates\n",
"closest_stations = find_closest_stations(all_trails_centroid_geo, weather_stations)\n",
"start_date = visitation_data['date'].min()\n",
"end_date = visitation_data['date'].max()\n",
"\n",
"# Pull precipitation and temperature data from the closest stations for the timeframe\n",
"precipitation_data = get_weather_data(token, closest_stations['Station_ID'], start_date, end_date, 'PRCP')\n",
"temperature_max_data = get_weather_data(token, closest_stations['Station_ID'], start_date, end_date, 'TMAX')\n",
"# Normalize all date columns to plain dates so the merges align\n",
"visitation_data['date'] = pd.to_datetime(visitation_data['date']).dt.date\n",
"precipitation_data['date'] = pd.to_datetime(precipitation_data['date']).dt.date\n",
"temperature_max_data['date'] = pd.to_datetime(temperature_max_data['date']).dt.date\n",
"\n",
"# Merge weather data with visitation data (all trails share the same stations)\n",
"visitation_with_weather = pd.merge(visitation_data, precipitation_data[['date', 'value']], on='date', how='left', suffixes=('', '_precip'))\n",
"visitation_with_weather = pd.merge(visitation_with_weather, temperature_max_data[['date', 'value']], on='date', how='left', suffixes=('', '_tmax'))\n",
"visitation_with_weather.rename(columns={'value': 'precipitation', 'value_tmax': 'max_temperature'}, inplace=True)\n",
"visitation_with_weather.to_csv('/Users/kyma3609/Documents/Recreation_Wildfire/MORPHO/VisitationModelData_WithWeather.csv', index=False)\n",
"\n",
"print(\"Weather data has been successfully appended to the visitation dataset.\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3e2c0136-2173-4483-870b-a2f6ae1db306",
"metadata": {},
"outputs": [],
"source": [
"# Aggregate the daily visitation + weather records to weekly records per trail.\n",
"# Load the visitation data with weather included\n",
"all_trails_visitation = pd.read_csv('/Users/kyma3609/Documents/Recreation_Wildfire/MORPHO/VisitationModelData_WithWeather.csv')\n",
"all_trails_visitation['date'] = pd.to_datetime(all_trails_visitation['date'])\n",
"visitation_start_date = all_trails_visitation['date'].min()\n",
"visitation_end_date = all_trails_visitation['date'].max()\n",
"\n",
"# Generate holidays column for weekly holiday binary variable\n",
"# (range() excludes its stop value, hence the +1 to include the final year)\n",
"us_holidays = holidays.US(years=range(visitation_start_date.year, visitation_end_date.year + 1))\n",
"all_trails_visitation['holiday'] = all_trails_visitation['date'].apply(lambda x: 1 if x in us_holidays else 0)\n",
"\n",
"# Add the 'week_start_date' and 'week_of_year' to maintain weekly records\n",
"# dayofweek is 0 for Monday, so weeks start on Mondays (consistent with ISO weeks)\n",
"all_trails_visitation['week_start_date'] = all_trails_visitation['date'] - pd.to_timedelta(all_trails_visitation['date'].dt.dayofweek, unit='d')\n",
"all_trails_visitation['week_of_year'] = all_trails_visitation['date'].dt.isocalendar().week\n",
"\n",
"# Group by both trail name and week to maintain unique trail data and aggregate weekly.\n",
"# Counts and precipitation are summed; max temperature is averaged; holiday is\n",
"# 1 if any day in that week was a US holiday.\n",
"weekly_data = all_trails_visitation.groupby(['trail_name', 'week_start_date', 'week_of_year']).agg({\n",
" 'visitation_ebird': 'sum',\n",
" 'visitation_inaturalist': 'sum',\n",
" 'visitation_flickr': 'sum',\n",
" 'visitation_count': 'sum',\n",
" 'precipitation': 'sum', \n",
" 'holiday': lambda x: 1 if x.sum() > 0 else 0,\n",
" 'max_temperature': 'mean'\n",
"}).reset_index()\n",
"\n",
"# Reorder columns for clarity\n",
"weekly_data = weekly_data[['trail_name', 'week_start_date', 'week_of_year', 'visitation_ebird',\n",
" 'visitation_inaturalist', 'visitation_flickr', 'visitation_count', \n",
" 'precipitation', 'holiday', 'max_temperature']]\n",
"\n",
"# Save the final weekly aggregated dataset\n",
"weekly_data.to_csv('/Users/kyma3609/Documents/Recreation_Wildfire/MORPHO/WeeklyVisitationModelData_Trails.csv', index=False)\n",
"print(\"Processing complete. Data saved.\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2ff1bbe0-ac0f-4f77-947a-dba9223f91a8",
"metadata": {},
"outputs": [],
"source": [
"# Clean the on-site OSMP counter data and aggregate it to weekly bins anchored\n",
"# at Monday 2019-12-30 (so the bins line up with Monday-start model weeks).\n",
"\n",
"# Load the OSMP visitation data\n",
"osmp_visitation_data = pd.read_csv('/Users/kyma3609/Documents/Recreation_Wildfire/OSMP_daily_counts_continuous.csv')\n",
"osmp_visitation_data['date'] = pd.to_datetime(osmp_visitation_data['date'])\n",
"start_date = pd.Timestamp('2019-12-30')  # Monday anchor for the weekly bins\n",
"# Drop missing / non-positive daily counts and anything dated in the future\n",
"osmp_visitation_data = osmp_visitation_data.dropna(subset=['daily_visit'])\n",
"osmp_visitation_data = osmp_visitation_data[osmp_visitation_data['daily_visit'] > 0]\n",
"current_date = pd.Timestamp.today().normalize()\n",
"osmp_visitation_data = osmp_visitation_data[osmp_visitation_data['date'] <= current_date]\n",
"\n",
"# Calculate the week start date for each entry. Vectorized (floor the day\n",
"# offset from the anchor to a multiple of 7) instead of a row-wise apply.\n",
"days_from_anchor = (osmp_visitation_data['date'] - start_date).dt.days\n",
"osmp_visitation_data['week_start_date'] = start_date + pd.to_timedelta((days_from_anchor // 7) * 7, unit='D')\n",
"\n",
"# Aggregate OSMP data weekly, summing the daily visits for each trail\n",
"osmp_weekly = osmp_visitation_data.groupby(['trail_name', 'week_start_date']).agg(weekly_count=('daily_visit', 'sum')).reset_index()\n",
"osmp_weekly = osmp_weekly[osmp_weekly['weekly_count'] > 0]\n",
"osmp_weekly.to_csv('/Users/kyma3609/Documents/Recreation_Wildfire/MORPHO/OSMP_VisitationWeekly_Cleaned.csv', index=False)\n",
"\n",
"print(\"OSMP weekly visitation data (cleaned) saved successfully.\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f90e5789-2ded-4d07-a156-4c41a192cd06",
"metadata": {},
"outputs": [],
"source": [
"# Join the weekly model features to the OSMP ground-truth weekly counts,\n",
"# keeping only trail-weeks with a positive observed count for train/validate.\n",
"# Load the model dataset and cleaned OSMP dataset\n",
"trail_data = pd.read_csv('/Users/kyma3609/Documents/Recreation_Wildfire/MORPHO/WeeklyVisitationModelData_Trails.csv')\n",
"osmp_weekly = pd.read_csv('/Users/kyma3609/Documents/Recreation_Wildfire/MORPHO/OSMP_VisitationWeekly_Cleaned.csv')\n",
"trail_data['week_start_date'] = pd.to_datetime(trail_data['week_start_date'])\n",
"osmp_weekly['week_start_date'] = pd.to_datetime(osmp_weekly['week_start_date'])\n",
"\n",
"# Uncomment and adjust the below code if there are discrepancies in trail names\n",
"# osmp_weekly['trail_name'] = osmp_weekly['trail_name'].replace({\n",
"# 'Doudy Draw Trailhead': 'Doudy Draw',\n",
"# 'Marshall Mesa Trailhead': 'Marshall Mesa',\n",
"# 'Sanitas Valley Trail': 'Sanitas Valley'\n",
"# })\n",
"\n",
"# Merge the OSMP weekly data with the trail data\n",
"# NOTE(review): the join requires exact matches on both trail_name and\n",
"# week_start_date; name mismatches silently become NaN and are dropped below.\n",
"combined_data = pd.merge(trail_data, osmp_weekly, on=['trail_name', 'week_start_date'], how='left')\n",
"combined_data = combined_data.dropna(subset=['weekly_count'])\n",
"combined_data = combined_data[combined_data['weekly_count'] > 0]\n",
"combined_data.to_csv('/Users/kyma3609/Documents/Recreation_Wildfire/MORPHO/CombinedWeeklyVisitationData.csv', index=False)\n",
"\n",
"print(\"Data successfully merged, cleaned, and saved.\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "77952fc6-d1f0-48e8-853c-4363d95a3c35",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

0 comments on commit 7552232

Please sign in to comment.