diff --git a/VisitationModelDataPrep.ipynb b/VisitationModelDataPrep.ipynb new file mode 100644 index 0000000..4586217 --- /dev/null +++ b/VisitationModelDataPrep.ipynb @@ -0,0 +1,347 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "28c1cdfa-b828-40fc-b5b5-dc7688c619a1", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import geopandas as gpd\n", + "from shapely.geometry import Point\n", + "from datetime import timedelta\n", + "import requests\n", + "from geopy.distance import geodesic\n", + "import holidays\n", + "\n", + "# Load the point datasets (social sensing data)\n", + "points1 = pd.read_csv('/Users/kyma3609/Documents/Recreation_Wildfire/MORPHO/ProxyData/filtered_eBird2024_1km.csv', sep='\\t')\n", + "points2 = pd.read_csv('/Users/kyma3609/Documents/Recreation_Wildfire/MORPHO/ProxyData/filtered_COiNaturalist_1km.csv', sep='\\t')\n", + "points3 = pd.read_csv('/Users/kyma3609/Documents/Recreation_Wildfire/MORPHO/ProxyData/filtered_COFlickr_1km.csv')\n", + "\n", + "# Convert date columns to datetime format and to geodataframes\n", + "points1['eventDate'] = pd.to_datetime(points1['eventDate'], errors='coerce')\n", + "points2['eventDate'] = pd.to_datetime(points2['eventDate'], errors='coerce')\n", + "points3['eventDate'] = pd.to_datetime(points3['eventDate'], errors='coerce')\n", + "\n", + "geometry1 = [Point(xy) for xy in zip(points1['decimalLongitude'], points1['decimalLatitude'])]\n", + "points_gdf1 = gpd.GeoDataFrame(points1, geometry=geometry1, crs='EPSG:4326')\n", + "\n", + "geometry2 = [Point(xy) for xy in zip(points2['decimalLongitude'], points2['decimalLatitude'])]\n", + "points_gdf2 = gpd.GeoDataFrame(points2, geometry=geometry2, crs='EPSG:4326')\n", + "\n", + "geometry3 = [Point(xy) for xy in zip(points3['decimalLongitude'], points3['decimalLatitude'])]\n", + "points_gdf3 = gpd.GeoDataFrame(points3, geometry=geometry3, crs='EPSG:4326')\n", + "\n", + "# Load the trails shapefile containing 
trails with training data\n", + "trails = gpd.read_file('/Users/kyma3609/Downloads/OSMP_Trails/OSMP_Trails.shp')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4b233d7b-4008-4cf4-8448-5a5ab3bb390d", + "metadata": {}, + "outputs": [], + "source": [ + "# Dictionary to map alternate trail names to dataset names where there are discrepancies\n", + "trail_name_mapping = {\n", + "    'Boulder Valley Ranch - Sage Trail': 'Boulder Valley - Sage Trail',\n", + "}\n", + "\n", + "# List of trails by name that have on-site data from OSMP to train/validate on\n", + "selected_trail_names = ['Doudy Draw', 'South Mesa Trail', 'Flatirons Vista', 'Boulder Valley Ranch - Sage Trail', 'Chautauqua Trail', 'Dakota Ridge', 'Eagle Trailhead', 'Foothills South', 'Sanitas Valley Trail', 'South Boulder Creek', 'South Boulder Creek West', 'Coal Seam TH Access', 'Sawhill Ponds', 'East Boulder - Teller Farm', 'East Boulder - White Rocks', 'Lehigh Connector - North', 'East Boulder - Gunbarrel', 'Joder Ranch TH', 'Green Mountain West Ridge', 'South Boulder Creek Marshall', 'Fowler Trail']\n", + "\n", + "# Prepare empty DataFrame to store results for all trails\n", + "all_trails_visitation = pd.DataFrame()\n", + "\n", + "# Loop through each trail\n", + "for trail_name in selected_trail_names:\n", + "    print(f\"Processing {trail_name}...\")\n", + "    dataset_trail_name = trail_name_mapping.get(trail_name, trail_name)\n", + "    selected_trail = trails[trails['GISPROD313'] == dataset_trail_name]\n", + "    \n", + "    if selected_trail.empty:\n", + "        print(f\"No data for trail: {trail_name}\")\n", + "        continue\n", + "    buffered_trail = selected_trail.buffer(30.48)\n", + "    \n", + "    if buffered_trail.is_empty.any():\n", + "        print(f\"Empty buffer for trail: {trail_name}\")\n", + "        continue\n", + "    \n", + "# Spatial join to get the social sensing data within the buffer for each dataset\n", + "    points_within_buffer_ebird = gpd.sjoin(points_gdf1, gpd.GeoDataFrame(geometry=buffered_trail, 
crs=buffered_trail.crs), how=\"inner\", predicate=\"within\")\n", + " points_within_buffer_inaturalist = gpd.sjoin(points_gdf2, gpd.GeoDataFrame(geometry=buffered_trail, crs=buffered_trail.crs), how=\"inner\", predicate=\"within\")\n", + " points_within_buffer_flickr = gpd.sjoin(points_gdf3, gpd.GeoDataFrame(geometry=buffered_trail, crs=buffered_trail.crs), how=\"inner\", predicate=\"within\")\n", + " print(f\"{trail_name}: {len(points_within_buffer_ebird)} eBird points, {len(points_within_buffer_inaturalist)} iNaturalist points, {len(points_within_buffer_flickr)} Flickr points\")\n", + " \n", + " if len(points_within_buffer_ebird) == 0 and len(points_within_buffer_inaturalist) == 0 and len(points_within_buffer_flickr) == 0:\n", + " print(f\"No points found within buffer for trail: {trail_name}\")\n", + " continue\n", + "# Normalize the eventDate to ensure all timestamps are in the same format\n", + " points_within_buffer_ebird['eventDate'] = points_within_buffer_ebird['eventDate'].dt.date\n", + " points_within_buffer_inaturalist['eventDate'] = points_within_buffer_inaturalist['eventDate'].dt.date\n", + " points_within_buffer_flickr['eventDate'] = points_within_buffer_flickr['eventDate'].dt.date\n", + " # Count visitation per day for each dataset\n", + " visitation_ebird = points_within_buffer_ebird.groupby('eventDate').size().reset_index(name='visitation_ebird')\n", + " visitation_inaturalist = points_within_buffer_inaturalist.groupby('eventDate').size().reset_index(name='visitation_inaturalist')\n", + " visitation_flickr = points_within_buffer_flickr.groupby('eventDate').size().reset_index(name='visitation_flickr')\n", + " visitation_ebird['eventDate'] = pd.to_datetime(visitation_ebird['eventDate'])\n", + " visitation_inaturalist['eventDate'] = pd.to_datetime(visitation_inaturalist['eventDate'])\n", + " visitation_flickr['eventDate'] = pd.to_datetime(visitation_flickr['eventDate'])\n", + "# Create a DataFrame with all dates in the range\n", + " 
visitation_start_date = '2020-01-01'\n", + " visitation_end_date = '2024-12-31'\n", + " all_dates = pd.DataFrame({'date': pd.date_range(start=visitation_start_date, end=visitation_end_date)})\n", + "# Merge visitation data with all_dates to include zeroes for missing dates\n", + " full_visitation_ebird = all_dates.merge(visitation_ebird, left_on='date', right_on='eventDate', how='left').fillna(0)\n", + " full_visitation_inaturalist = all_dates.merge(visitation_inaturalist, left_on='date', right_on='eventDate', how='left').fillna(0)\n", + " full_visitation_flickr = all_dates.merge(visitation_flickr, left_on='date', right_on='eventDate', how='left').fillna(0)\n", + " full_visitation_ebird.drop(columns=['eventDate'], inplace=True)\n", + " full_visitation_inaturalist.drop(columns=['eventDate'], inplace=True)\n", + " full_visitation_flickr.drop(columns=['eventDate'], inplace=True)\n", + "# Combine the visitation data into one DataFrame\n", + " full_visitation = full_visitation_ebird.merge(full_visitation_inaturalist, on='date', how='left')\n", + " full_visitation = full_visitation.merge(full_visitation_flickr, on='date', how='left')\n", + "# Calculate the total visitation\n", + " full_visitation['visitation_count'] = full_visitation[['visitation_ebird', 'visitation_inaturalist', 'visitation_flickr']].sum(axis=1).astype(int)\n", + " full_visitation['trail_name'] = trail_name\n", + " all_trails_visitation = pd.concat([all_trails_visitation, full_visitation])\n", + "all_trails_visitation.to_csv('/Users/kyma3609/Documents/Recreation_Wildfire/MORPHO/VisitationModelData_AllTrails.csv', index=False)\n", + "\n", + "print(\"Processing complete\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c4967c8-5521-47e9-9394-30e3bc2a04c1", + "metadata": {}, + "outputs": [], + "source": [ + "# Load visitation data\n", + "visitation_data = pd.read_csv('/Users/kyma3609/Documents/Recreation_Wildfire/MORPHO/VisitationModelData_AllTrails.csv')\n", + "\n", + "# Load 
NOAA weather station data\n", + "weather_stations = pd.read_csv('/Users/kyma3609/Documents/Recreation_Wildfire/MORPHO/ghcnd_stations_final.csv')\n", + "\n", + "# Function to get the mapped trail name\n", + "def get_mapped_trail_name(trail_name, mapping):\n", + " return mapping.get(trail_name, trail_name)\n", + "\n", + "# Select all trails based on the mapped names\n", + "selected_trails = trails[trails['GISPROD313'].isin([get_mapped_trail_name(name, trail_name_mapping) for name in selected_trail_names])]\n", + "if selected_trails.empty:\n", + " raise ValueError(\"None of the selected trails were found in the shapefile.\")\n", + "all_trails_centroid = selected_trails.geometry.centroid.unary_union.centroid\n", + "all_trails_centroid_geo = gpd.GeoSeries([all_trails_centroid], crs=\"EPSG:26913\").to_crs(epsg=4326).geometry.iloc[0]\n", + "print(f\"All trails centroid coordinates (latitude/longitude): {all_trails_centroid_geo.y}, {all_trails_centroid_geo.x}\")\n", + "\n", + "# Function to find the closest weather stations\n", + "def find_closest_stations(trail_centroid, weather_stations, num_stations=25):\n", + " weather_stations['distance'] = weather_stations.apply(\n", + " lambda row: geodesic((trail_centroid.y, trail_centroid.x), (row['Latitude'], row['Longitude'])).km, axis=1)\n", + " return weather_stations.sort_values('distance').head(num_stations)\n", + "\n", + "# NOAA API token\n", + "token = 'AYmKyDYxGCqUWzGOxWcupbuRpfMPWboP'\n", + "\n", + "# Function to get weather data from the NOAA API\n", + "def get_weather_data(token, station_ids, start_date, end_date, datatype):\n", + " url = f\"https://www.ncdc.noaa.gov/cdo-web/api/v2/data\"\n", + " headers = {'token': token}\n", + " all_data = pd.DataFrame()\n", + "\n", + " for station_id in station_ids:\n", + " current_start_date = pd.to_datetime(start_date)\n", + " current_end_date = min(current_start_date + timedelta(days=364), pd.to_datetime(end_date))\n", + " station_data = pd.DataFrame()\n", + "\n", + " while 
current_start_date <= pd.to_datetime(end_date):\n", + " params = {\n", + " 'datasetid': 'GHCND',\n", + " 'stationid': f'GHCND:{station_id}',\n", + " 'startdate': current_start_date.strftime('%Y-%m-%d'),\n", + " 'enddate': current_end_date.strftime('%Y-%m-%d'),\n", + " 'datatypeid': datatype,\n", + " 'limit': 1000,\n", + " 'units': 'metric'\n", + " }\n", + "\n", + " response = requests.get(url, params=params, headers=headers)\n", + " if response.status_code == 200:\n", + " data = response.json().get('results', [])\n", + " if data:\n", + " df = pd.DataFrame(data)\n", + " df['station_id'] = station_id\n", + " station_data = pd.concat([station_data, df], ignore_index=True)\n", + " else:\n", + " print(f\"No data available for {datatype} at station {station_id} from {current_start_date} to {current_end_date}.\")\n", + " else:\n", + " print(f\"Error fetching data: {response.status_code} {response.text}\")\n", + " break\n", + "\n", + " current_start_date = current_end_date + timedelta(days=1)\n", + " current_end_date = min(current_start_date + timedelta(days=364), pd.to_datetime(end_date))\n", + "\n", + " if not station_data.empty:\n", + " all_data = pd.concat([all_data, station_data], ignore_index=True).drop_duplicates(subset=['date'], keep='first')\n", + "\n", + " return all_data\n", + "\n", + "# Find the closest weather stations and define start/end dates\n", + "closest_stations = find_closest_stations(all_trails_centroid_geo, weather_stations)\n", + "start_date = visitation_data['date'].min()\n", + "end_date = visitation_data['date'].max()\n", + "\n", + "# Pull precipitation and temperature data from closest station for defined timeframe\n", + "precipitation_data = get_weather_data(token, closest_stations['Station_ID'], start_date, end_date, 'PRCP')\n", + "temperature_max_data = get_weather_data(token, closest_stations['Station_ID'], start_date, end_date, 'TMAX')\n", + "visitation_data['date'] = pd.to_datetime(visitation_data['date']).dt.date\n", + 
"precipitation_data['date'] = pd.to_datetime(precipitation_data['date']).dt.date\n", + "temperature_max_data['date'] = pd.to_datetime(temperature_max_data['date']).dt.date\n", + "\n", + "# Merge weather data with visitation data\n", + "visitation_with_weather = pd.merge(visitation_data, precipitation_data[['date', 'value']], on='date', how='left', suffixes=('', '_precip'))\n", + "visitation_with_weather = pd.merge(visitation_with_weather, temperature_max_data[['date', 'value']], on='date', how='left', suffixes=('', '_tmax'))\n", + "visitation_with_weather.rename(columns={'value': 'precipitation', 'value_tmax': 'max_temperature'}, inplace=True)\n", + "visitation_with_weather.to_csv('/Users/kyma3609/Documents/Recreation_Wildfire/MORPHO/VisitationModelData_WithWeather.csv', index=False)\n", + "\n", + "print(\"Weather data has been successfully appended to the visitation dataset.\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3e2c0136-2173-4483-870b-a2f6ae1db306", + "metadata": {}, + "outputs": [], + "source": [ + "# Load the visitation data with weather included\n", + "all_trails_visitation = pd.read_csv('/Users/kyma3609/Documents/Recreation_Wildfire/MORPHO/VisitationModelData_WithWeather.csv')\n", + "all_trails_visitation['date'] = pd.to_datetime(all_trails_visitation['date'])\n", + "visitation_start_date = all_trails_visitation['date'].min()\n", + "visitation_end_date = all_trails_visitation['date'].max()\n", + "\n", + "# Generate holidays column for weekly holiday binary variable\n", + "us_holidays = holidays.US(years=range(visitation_start_date.year, visitation_end_date.year + 1))\n", + "all_trails_visitation['holiday'] = all_trails_visitation['date'].apply(lambda x: 1 if x in us_holidays else 0)\n", + "\n", + "# Add the 'week_start_date' and 'week_of_year' to maintain weekly records\n", + "all_trails_visitation['week_start_date'] = all_trails_visitation['date'] - pd.to_timedelta(all_trails_visitation['date'].dt.dayofweek, 
unit='d')\n", + "all_trails_visitation['week_of_year'] = all_trails_visitation['date'].dt.isocalendar().week\n", + "\n", + "# Group by both trail name and week to maintain unique trail data and aggregate weekly\n", + "weekly_data = all_trails_visitation.groupby(['trail_name', 'week_start_date', 'week_of_year']).agg({\n", + " 'visitation_ebird': 'sum',\n", + " 'visitation_inaturalist': 'sum',\n", + " 'visitation_flickr': 'sum',\n", + " 'visitation_count': 'sum',\n", + " 'precipitation': 'sum', \n", + " 'holiday': lambda x: 1 if x.sum() > 0 else 0,\n", + " 'max_temperature': 'mean'\n", + "}).reset_index()\n", + "\n", + "# Reorder columns for clarity\n", + "weekly_data = weekly_data[['trail_name', 'week_start_date', 'week_of_year', 'visitation_ebird',\n", + " 'visitation_inaturalist', 'visitation_flickr', 'visitation_count', \n", + " 'precipitation', 'holiday', 'max_temperature']]\n", + "\n", + "# Save the final weekly aggregated dataset\n", + "weekly_data.to_csv('/Users/kyma3609/Documents/Recreation_Wildfire/MORPHO/WeeklyVisitationModelData_Trails.csv', index=False)\n", + "print(\"Processing complete. 
Data saved.\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2ff1bbe0-ac0f-4f77-947a-dba9223f91a8", + "metadata": {}, + "outputs": [], + "source": [ + "from datetime import timedelta\n", + "\n", + "# Load the OSMP visitation data\n", + "osmp_visitation_data = pd.read_csv('/Users/kyma3609/Documents/Recreation_Wildfire/OSMP_daily_counts_continuous.csv')\n", + "osmp_visitation_data['date'] = pd.to_datetime(osmp_visitation_data['date'])\n", + "start_date = pd.Timestamp('2019-12-30')\n", + "osmp_visitation_data = osmp_visitation_data.dropna(subset=['daily_visit'])\n", + "osmp_visitation_data = osmp_visitation_data[osmp_visitation_data['daily_visit'] > 0]\n", + "current_date = pd.Timestamp.today().normalize() \n", + "osmp_visitation_data = osmp_visitation_data[osmp_visitation_data['date'] <= current_date]\n", + "\n", + "# Calculate the week start date based on the custom starting date for each entry\n", + "osmp_visitation_data['week_start_date'] = osmp_visitation_data['date'].apply(\n", + "    lambda x: start_date + timedelta(days=((x - start_date).days // 7) * 7)\n", + ")\n", + "\n", + "# Aggregate OSMP data weekly summing the 'daily_visit' for each trail\n", + "osmp_weekly = osmp_visitation_data.groupby(['trail_name', 'week_start_date']).agg(weekly_count=('daily_visit', 'sum')).reset_index()\n", + "osmp_weekly = osmp_weekly[osmp_weekly['weekly_count'] > 0]\n", + "osmp_weekly.to_csv('/Users/kyma3609/Documents/Recreation_Wildfire/MORPHO/OSMP_VisitationWeekly_Cleaned.csv', index=False)\n", + "\n", + "print(\"OSMP weekly visitation data (cleaned) saved successfully.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f90e5789-2ded-4d07-a156-4c41a192cd06", + "metadata": {}, + "outputs": [], + "source": [ + "# Load the model dataset and cleaned OSMP dataset\n", + "trail_data = pd.read_csv('/Users/kyma3609/Documents/Recreation_Wildfire/MORPHO/WeeklyVisitationModelData_Trails.csv')\n", + "osmp_weekly = 
pd.read_csv('/Users/kyma3609/Documents/Recreation_Wildfire/MORPHO/OSMP_VisitationWeekly_Cleaned.csv')\n", + "trail_data['week_start_date'] = pd.to_datetime(trail_data['week_start_date'])\n", + "osmp_weekly['week_start_date'] = pd.to_datetime(osmp_weekly['week_start_date'])\n", + "\n", + "# Uncomment and adjust the below code if there are discrepancies in trail names\n", + "# osmp_weekly['trail_name'] = osmp_weekly['trail_name'].replace({\n", + "# 'Doudy Draw Trailhead': 'Doudy Draw',\n", + "# 'Marshall Mesa Trailhead': 'Marshall Mesa',\n", + "# 'Sanitas Valley Trail': 'Sanitas Valley'\n", + "# })\n", + "\n", + "# Merge the OSMP weekly data with the trail data\n", + "combined_data = pd.merge(trail_data, osmp_weekly, on=['trail_name', 'week_start_date'], how='left')\n", + "combined_data = combined_data.dropna(subset=['weekly_count'])\n", + "combined_data = combined_data[combined_data['weekly_count'] > 0]\n", + "combined_data.to_csv('/Users/kyma3609/Documents/Recreation_Wildfire/MORPHO/CombinedWeeklyVisitationData.csv', index=False)\n", + "\n", + "print(\"Data successfully merged, cleaned, and saved.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "77952fc6-d1f0-48e8-853c-4363d95a3c35", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}