generated from CU-ESIIL/Postdoc_OASIS
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
466341c
commit 7552232
Showing
1 changed file
with
347 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,347 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "28c1cdfa-b828-40fc-b5b5-dc7688c619a1", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from datetime import timedelta
import requests
from geopy.distance import geodesic
import holidays

# NOTE(review): absolute local paths — consider moving these to a configurable
# DATA_DIR so the notebook runs on other machines.
PROXY_DATA_DIR = '/Users/kyma3609/Documents/Recreation_Wildfire/MORPHO/ProxyData'


def _load_points(path, sep=','):
    """Read one social-sensing CSV and return it as a WGS84 GeoDataFrame.

    Parameters
    ----------
    path : str
        CSV with 'eventDate', 'decimalLatitude', 'decimalLongitude' columns.
    sep : str
        Field separator ('\t' for the eBird/iNaturalist exports).

    Unparseable dates become NaT (errors='coerce') instead of raising.
    """
    pts = pd.read_csv(path, sep=sep)
    pts['eventDate'] = pd.to_datetime(pts['eventDate'], errors='coerce')
    geometry = [Point(xy) for xy in zip(pts['decimalLongitude'], pts['decimalLatitude'])]
    return gpd.GeoDataFrame(pts, geometry=geometry, crs='EPSG:4326')


# Load the point datasets (social sensing data).
# eBird and iNaturalist exports are tab-separated; the Flickr export is comma-separated.
points_gdf1 = _load_points(f'{PROXY_DATA_DIR}/filtered_eBird2024_1km.csv', sep='\t')
points_gdf2 = _load_points(f'{PROXY_DATA_DIR}/filtered_COiNaturalist_1km.csv', sep='\t')
points_gdf3 = _load_points(f'{PROXY_DATA_DIR}/filtered_COFlickr_1km.csv')

# Load the trails shapefile containing trails with training data
trails = gpd.read_file('/Users/kyma3609/Downloads/OSMP_Trails/OSMP_Trails.shp')
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "4b233d7b-4008-4cf4-8448-5a5ab3bb390d", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
# Map alternate trail names onto the names used in the trails shapefile where
# there are discrepancies between datasets.
trail_name_mapping = {
    'Boulder Valley Ranch - Sage Trail': 'Boulder Valley - Sage Trail',
}

# Trails (by name) that have on-site data from OSMP to train/validate on.
selected_trail_names = [
    'Doudy Draw', 'South Mesa Trail', 'Flatirons Vista',
    'Boulder Valley Ranch - Sage Trail', 'Chautauqua Trail', 'Dakota Ridge',
    'Eagle Trailhead', 'Foothills South', 'Sanitas Valley Trail',
    'South Boulder Creek', 'South Boulder Creek West', 'Coal Seam TH Access',
    'Sawhill Ponds', 'East Boulder - Teller Farm', 'East Boulder - White Rocks',
    'Lehigh Connector - North', 'East Boulder - Gunbarrel', 'Joder Ranch TH',
    'Green Mountain West Ridge', 'South Boulder Creek Marshall', 'Fowler Trail',
]

# 30.48 in the trails CRS units — presumably 100 ft in meters; TODO confirm trails.crs
BUFFER_DISTANCE = 30.48
visitation_start_date = '2020-01-01'
visitation_end_date = '2024-12-31'

# BUGFIX: reproject the (WGS84) point datasets into the trails CRS once, up
# front. gpd.sjoin requires both frames to share a CRS, and the buffer below is
# built in the trails CRS.
points_ebird_proj = points_gdf1.to_crs(trails.crs)
points_inaturalist_proj = points_gdf2.to_crs(trails.crs)
points_flickr_proj = points_gdf3.to_crs(trails.crs)


def _daily_visitation(points, buffer_gdf, count_col, all_dates):
    """Count points per day that fall inside buffer_gdf.

    Returns (n_points_in_buffer, DataFrame) where the frame has one row per
    date in all_dates with zeros for days with no observations. The .copy()
    after the join avoids SettingWithCopyWarning on the eventDate assignment.
    """
    within = gpd.sjoin(points, buffer_gdf, how="inner", predicate="within").copy()
    n_points = len(within)
    # Normalize timestamps to plain dates so grouping is per calendar day
    within['eventDate'] = within['eventDate'].dt.date
    counts = within.groupby('eventDate').size().reset_index(name=count_col)
    counts['eventDate'] = pd.to_datetime(counts['eventDate'])
    full = all_dates.merge(counts, left_on='date', right_on='eventDate', how='left').fillna(0)
    return n_points, full.drop(columns=['eventDate'])


# One row per date in the study range (shared by every trail)
all_dates = pd.DataFrame({'date': pd.date_range(start=visitation_start_date, end=visitation_end_date)})

# Collect per-trail frames and concat once at the end (avoids quadratic
# growth from pd.concat inside the loop).
per_trail_frames = []

for trail_name in selected_trail_names:
    print(f"Processing {trail_name}...")
    dataset_trail_name = trail_name_mapping.get(trail_name, trail_name)
    selected_trail = trails[trails['GISPROD313'] == dataset_trail_name]

    if selected_trail.empty:
        print(f"No data for trail: {trail_name}")
        continue

    buffered_trail = selected_trail.buffer(BUFFER_DISTANCE)
    if buffered_trail.is_empty.any():
        print(f"Empty buffer for trail: {trail_name}")
        continue
    buffer_gdf = gpd.GeoDataFrame(geometry=buffered_trail, crs=buffered_trail.crs)

    # Daily counts per source inside the trail buffer
    n_ebird, full_visitation_ebird = _daily_visitation(points_ebird_proj, buffer_gdf, 'visitation_ebird', all_dates)
    n_inat, full_visitation_inaturalist = _daily_visitation(points_inaturalist_proj, buffer_gdf, 'visitation_inaturalist', all_dates)
    n_flickr, full_visitation_flickr = _daily_visitation(points_flickr_proj, buffer_gdf, 'visitation_flickr', all_dates)
    print(f"{trail_name}: {n_ebird} eBird points, {n_inat} iNaturalist points, {n_flickr} Flickr points")

    if n_ebird == 0 and n_inat == 0 and n_flickr == 0:
        print(f"No points found within buffer for trail: {trail_name}")
        continue

    # Combine the three sources into one frame and total them per day
    full_visitation = (
        full_visitation_ebird
        .merge(full_visitation_inaturalist, on='date', how='left')
        .merge(full_visitation_flickr, on='date', how='left')
    )
    full_visitation['visitation_count'] = (
        full_visitation[['visitation_ebird', 'visitation_inaturalist', 'visitation_flickr']]
        .sum(axis=1)
        .astype(int)
    )
    full_visitation['trail_name'] = trail_name
    per_trail_frames.append(full_visitation)

# Handle the edge case where no trail produced data
all_trails_visitation = pd.concat(per_trail_frames) if per_trail_frames else pd.DataFrame()
all_trails_visitation.to_csv('/Users/kyma3609/Documents/Recreation_Wildfire/MORPHO/VisitationModelData_AllTrails.csv', index=False)

print("Processing complete")
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "4c4967c8-5521-47e9-9394-30e3bc2a04c1", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
import os  # for reading the API token from the environment

# Load visitation data produced by the previous cell
visitation_data = pd.read_csv('/Users/kyma3609/Documents/Recreation_Wildfire/MORPHO/VisitationModelData_AllTrails.csv')

# Load NOAA GHCND weather station inventory (Station_ID, Latitude, Longitude, ...)
weather_stations = pd.read_csv('/Users/kyma3609/Documents/Recreation_Wildfire/MORPHO/ghcnd_stations_final.csv')


def get_mapped_trail_name(trail_name, mapping):
    """Return the shapefile's name for a trail, falling back to the name itself."""
    return mapping.get(trail_name, trail_name)


# Select all trails based on the mapped names
selected_trails = trails[trails['GISPROD313'].isin(
    [get_mapped_trail_name(name, trail_name_mapping) for name in selected_trail_names])]
if selected_trails.empty:
    raise ValueError("None of the selected trails were found in the shapefile.")

# One representative point for the study area: centroid of the trail centroids.
# BUGFIX: use the shapefile's own CRS rather than hardcoding EPSG:26913.
# NOTE(review): unary_union is deprecated in newer geopandas; switch to
# union_all() when the environment is upgraded.
all_trails_centroid = selected_trails.geometry.centroid.unary_union.centroid
all_trails_centroid_geo = gpd.GeoSeries([all_trails_centroid], crs=selected_trails.crs).to_crs(epsg=4326).geometry.iloc[0]
print(f"All trails centroid coordinates (latitude/longitude): {all_trails_centroid_geo.y}, {all_trails_centroid_geo.x}")


def find_closest_stations(trail_centroid, weather_stations, num_stations=25):
    """Return the num_stations stations nearest (geodesic km) to trail_centroid.

    Works on a copy so the caller's station table is not mutated in place.
    """
    stations = weather_stations.copy()
    stations['distance'] = stations.apply(
        lambda row: geodesic((trail_centroid.y, trail_centroid.x),
                             (row['Latitude'], row['Longitude'])).km, axis=1)
    return stations.sort_values('distance').head(num_stations)


# SECURITY FIX: the NOAA token was hardcoded in the notebook (and committed).
# Read it from the environment instead; request one at
# https://www.ncdc.noaa.gov/cdo-web/token and revoke the leaked token.
token = os.environ.get('NOAA_CDO_TOKEN')
if not token:
    raise RuntimeError("Set the NOAA_CDO_TOKEN environment variable to your NOAA CDO API token.")


def get_weather_data(token, station_ids, start_date, end_date, datatype):
    """Fetch daily GHCND values (e.g. 'PRCP', 'TMAX') for a list of stations.

    The CDO API limits each request to one year, so every station's range is
    walked in <=365-day windows. station_ids is assumed to be sorted by
    distance, so drop_duplicates(keep='first') prefers the closest station's
    value for each date. Returns a DataFrame (possibly empty) with the API's
    columns plus 'station_id'.
    """
    url = "https://www.ncdc.noaa.gov/cdo-web/api/v2/data"  # fixed: stray f-string prefix removed
    headers = {'token': token}
    all_data = pd.DataFrame()

    for station_id in station_ids:
        current_start_date = pd.to_datetime(start_date)
        current_end_date = min(current_start_date + timedelta(days=364), pd.to_datetime(end_date))
        station_data = pd.DataFrame()

        while current_start_date <= pd.to_datetime(end_date):
            params = {
                'datasetid': 'GHCND',
                'stationid': f'GHCND:{station_id}',
                'startdate': current_start_date.strftime('%Y-%m-%d'),
                'enddate': current_end_date.strftime('%Y-%m-%d'),
                'datatypeid': datatype,
                'limit': 1000,
                'units': 'metric'
            }

            # timeout added so a hung connection cannot stall the notebook forever
            response = requests.get(url, params=params, headers=headers, timeout=60)
            if response.status_code == 200:
                data = response.json().get('results', [])
                if data:
                    df = pd.DataFrame(data)
                    df['station_id'] = station_id
                    station_data = pd.concat([station_data, df], ignore_index=True)
                else:
                    print(f"No data available for {datatype} at station {station_id} from {current_start_date} to {current_end_date}.")
            else:
                print(f"Error fetching data: {response.status_code} {response.text}")
                break  # give up on this station; move on to the next one

            current_start_date = current_end_date + timedelta(days=1)
            current_end_date = min(current_start_date + timedelta(days=364), pd.to_datetime(end_date))

        if not station_data.empty:
            all_data = pd.concat([all_data, station_data], ignore_index=True).drop_duplicates(subset=['date'], keep='first')

    return all_data


# Find the closest weather stations and the visitation date range to cover
closest_stations = find_closest_stations(all_trails_centroid_geo, weather_stations)
start_date = visitation_data['date'].min()
end_date = visitation_data['date'].max()

# Pull precipitation (PRCP) and max temperature (TMAX) for the timeframe
precipitation_data = get_weather_data(token, closest_stations['Station_ID'], start_date, end_date, 'PRCP')
temperature_max_data = get_weather_data(token, closest_stations['Station_ID'], start_date, end_date, 'TMAX')

# Normalize every date column to plain date objects before merging
visitation_data['date'] = pd.to_datetime(visitation_data['date']).dt.date
precipitation_data['date'] = pd.to_datetime(precipitation_data['date']).dt.date
temperature_max_data['date'] = pd.to_datetime(temperature_max_data['date']).dt.date

# Merge weather onto visitation (left joins keep every visitation row)
visitation_with_weather = pd.merge(visitation_data, precipitation_data[['date', 'value']], on='date', how='left', suffixes=('', '_precip'))
visitation_with_weather = pd.merge(visitation_with_weather, temperature_max_data[['date', 'value']], on='date', how='left', suffixes=('', '_tmax'))
visitation_with_weather.rename(columns={'value': 'precipitation', 'value_tmax': 'max_temperature'}, inplace=True)
visitation_with_weather.to_csv('/Users/kyma3609/Documents/Recreation_Wildfire/MORPHO/VisitationModelData_WithWeather.csv', index=False)

print("Weather data has been successfully appended to the visitation dataset.")
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "3e2c0136-2173-4483-870b-a2f6ae1db306", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
# Load the visitation data with weather columns appended
all_trails_visitation = pd.read_csv('/Users/kyma3609/Documents/Recreation_Wildfire/MORPHO/VisitationModelData_WithWeather.csv')
all_trails_visitation['date'] = pd.to_datetime(all_trails_visitation['date'])

visitation_start_date = all_trails_visitation['date'].min()
visitation_end_date = all_trails_visitation['date'].max()

# Binary holiday flag: 1 when the date falls on a US public holiday
us_holidays = holidays.US(years=range(visitation_start_date.year, visitation_end_date.year + 1))
all_trails_visitation['holiday'] = [1 if day in us_holidays else 0 for day in all_trails_visitation['date']]

# Weekly keys: the Monday that starts each date's week, plus the ISO week number
dow_offset = pd.to_timedelta(all_trails_visitation['date'].dt.dayofweek, unit='d')
all_trails_visitation['week_start_date'] = all_trails_visitation['date'] - dow_offset
all_trails_visitation['week_of_year'] = all_trails_visitation['date'].dt.isocalendar().week

# Aggregate per trail and week: counts and precipitation sum, temperature
# averages, and holiday becomes 1 if any day that week was a holiday.
weekly_data = (
    all_trails_visitation
    .groupby(['trail_name', 'week_start_date', 'week_of_year'])
    .agg(
        visitation_ebird=('visitation_ebird', 'sum'),
        visitation_inaturalist=('visitation_inaturalist', 'sum'),
        visitation_flickr=('visitation_flickr', 'sum'),
        visitation_count=('visitation_count', 'sum'),
        precipitation=('precipitation', 'sum'),
        holiday=('holiday', lambda wk: 1 if wk.sum() > 0 else 0),
        max_temperature=('max_temperature', 'mean'),
    )
    .reset_index()
)

# Fix the column order for readability
column_order = ['trail_name', 'week_start_date', 'week_of_year', 'visitation_ebird',
                'visitation_inaturalist', 'visitation_flickr', 'visitation_count',
                'precipitation', 'holiday', 'max_temperature']
weekly_data = weekly_data[column_order]

# Save the final weekly aggregated dataset
weekly_data.to_csv('/Users/kyma3609/Documents/Recreation_Wildfire/MORPHO/WeeklyVisitationModelData_Trails.csv', index=False)
print("Processing complete. Data saved.")
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "2ff1bbe0-ac0f-4f77-947a-dba9223f91a8", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
from datetime import timedelta  # retained for parity with the original cell

# Load the OSMP on-site daily visitation counts
osmp_visitation_data = pd.read_csv('/Users/kyma3609/Documents/Recreation_Wildfire/OSMP_daily_counts_continuous.csv')
osmp_visitation_data['date'] = pd.to_datetime(osmp_visitation_data['date'])

# Anchor for the custom weekly bins (a Monday)
start_date = pd.Timestamp('2019-12-30')

# Keep only rows with a positive recorded count, no later than today
osmp_visitation_data = osmp_visitation_data.dropna(subset=['daily_visit'])
osmp_visitation_data = osmp_visitation_data[osmp_visitation_data['daily_visit'] > 0]
current_date = pd.Timestamp.today().normalize()
osmp_visitation_data = osmp_visitation_data[osmp_visitation_data['date'] <= current_date]

# Week start = anchor + (whole weeks elapsed since anchor) * 7 days,
# computed vectorized instead of a per-row apply.
days_since_anchor = (osmp_visitation_data['date'] - start_date).dt.days
osmp_visitation_data['week_start_date'] = start_date + pd.to_timedelta((days_since_anchor // 7) * 7, unit='d')

# Weekly totals of 'daily_visit' per trail; drop empty weeks
osmp_weekly = (
    osmp_visitation_data
    .groupby(['trail_name', 'week_start_date'])
    .agg(weekly_count=('daily_visit', 'sum'))
    .reset_index()
)
osmp_weekly = osmp_weekly[osmp_weekly['weekly_count'] > 0]
osmp_weekly.to_csv('/Users/kyma3609/Documents/Recreation_Wildfire/MORPHO/OSMP_VisitationWeekly_Cleaned.csv', index=False)

print("OSMP weekly visitation data (cleaned) saved successfully.")
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "f90e5789-2ded-4d07-a156-4c41a192cd06", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Load the model dataset and cleaned OSMP dataset\n", | ||
"trail_data = pd.read_csv('/Users/kyma3609/Documents/Recreation_Wildfire/MORPHO/WeeklyVisitationModelData_Trails.csv')\n", | ||
"osmp_weekly = pd.read_csv('/Users/kyma3609/Documents/Recreation_Wildfire/MORPHO/OSMP_VisitationWeekly_Cleaned.csv')\n", | ||
"trail_data['week_start_date'] = pd.to_datetime(trail_data['week_start_date'])\n", | ||
"osmp_weekly['week_start_date'] = pd.to_datetime(osmp_weekly['week_start_date'])\n", | ||
"\n", | ||
"# Uncomment and adjust the below code if there are discrepancies in trail names\n", | ||
"# osmp_weekly['trail_name'] = osmp_weekly['trail_name'].replace({\n", | ||
"# 'Doudy Draw Trailhead': 'Doudy Draw',\n", | ||
"# 'Marshall Mesa Trailhead': 'Marshall Mesa',\n", | ||
"# 'Sanitas Valley Trail': 'Sanitas Valley'\n", | ||
"# })\n", | ||
"\n", | ||
"# Merge the OSMP weekly data with the trail data\n", | ||
"combined_data = pd.merge(trail_data, osmp_weekly, on=['trail_name', 'week_start_date'], how='left')\n", | ||
"combined_data = combined_data.dropna(subset=['weekly_count'])\n", | ||
"combined_data = combined_data[combined_data['weekly_count'] > 0]\n", | ||
"combined_data.to_csv('/Users/kyma3609/Documents/Recreation_Wildfire/MORPHO/CombinedWeeklyVisitationData.csv', index=False)\n", | ||
"\n", | ||
"print(\"Data successfully merged, cleaned, and saved.\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "77952fc6-d1f0-48e8-853c-4363d95a3c35", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.12.3" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |