app.py
import argparse
import json
import re
from datetime import datetime, timedelta
from pathlib import Path
from isodate import duration_isoformat
from scraper import MTBEventsPage, MTBRacesPage, MTBResultsPage
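
# Example invocation (a sketch; assumes the companion ``scraper`` module is on
# the import path and that the UCI MTB World Series site layout matches what
# it expects):
#
#     python app.py 2024 --output ./data
#
# Scraped results are written under ./data/<year>/<NN>_<location>/.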


def custom_serializer(obj):
    """Custom JSON serializer for wonky data types."""
    if isinstance(obj, datetime):
        return obj.date().isoformat()  # Convert datetime to ISO 8601 date
    if isinstance(obj, timedelta):
        return duration_isoformat(obj)  # Convert timedelta to ISO 8601 period
    raise TypeError(f"Type {type(obj)} not serializable")
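
# Illustrative use of the serializer hook (hypothetical values):
#     json.dumps({"date": datetime(2024, 6, 1, 14, 30)}, default=custom_serializer)
#     -> '{"date": "2024-06-01"}'
# For timedeltas, duration_isoformat() returns an ISO 8601 duration string,
# e.g. something like "PT1H23M" for timedelta(hours=1, minutes=23).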

if __name__ == "__main__":
    # Parse command line arguments
    description = ("Scrape MTB event results from the UCI MTB World Series "
                   "website and save them to a local folder.")
    parser = argparse.ArgumentParser(description=description)
    # Add arguments
    parser.add_argument(
        "year",
        type=int,
        help="Event year to scrape."
    )
    parser.add_argument(
        "--output", "-o",
        default="./data",
        type=Path,
        help="Output folder for the scraped data. (default: ./data)"
    )
    args = parser.parse_args()
    year = args.year
    root = args.output

    # Create the output folder if it doesn't exist
    year_folder = Path(f"{root}/{year}")
    year_folder.mkdir(parents=True, exist_ok=True)

    # Extract all event information for the given year
    event_page = MTBEventsPage(year, use_selenium=True)
    print(f"Extracting events for {year}...")
    events = event_page.fetch_events()

    # Enrich events with race information
    enriched_events = []
    for event in events:
        print(f"Extracting race info for {event['location']}...")
        enriched_event = {
            **event,
            'races': MTBRacesPage(event['results_url']).fetch_races()
        }
        enriched_events.append(enriched_event)
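
    # Each enriched event keeps all keys returned by the scraper (via **event);
    # the code below relies on 'location' plus a 'races' list whose entries
    # carry 'discipline', 'gender', 'category', 'race_type' and 'url'.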

    # Extract results for each race and event and save
    for num, event in enumerate(enriched_events):
        # Event number for the year
        event_num = num + 1
        # Create a folder for each event for the year
        location = re.sub(r'[\s\-\–]+', '_', event["location"]) \
            .replace(",", "") \
            .lower()
        event_folder = year_folder / f"{event_num:02d}_{location}"
        event_folder.mkdir(parents=True, exist_ok=True)
        # Extract results for each race
        for i, race in enumerate(event['races']):
            # Define a name for the race
            name = (f"{race['discipline']}_{race['gender']}_{race['category']}"
                    f"_{race['race_type']}")
            name = re.sub(r'\s+|-', '_', name).lower()
            # Create a folder and file for the race
            race_folder = event_folder / "results" / race['discipline']
            race_folder.mkdir(parents=True, exist_ok=True)
            race_file = race_folder / f"{name}.json"
if race_file.exists():
# Load the results from the file into the event object
# so that we can save the event object to a file with
# complete information
with open(race_file, "r") as f:
event['races'][i] = json.load(f)
continue
            # Fetch race results
            print(f"Extracting results for {event['location']} {name}...")
            page = MTBResultsPage(race['url'])
            race['event'] = event['location']
            race.update(page.fetch_results())
            # Save race results to a JSON file (race_folder and race_file
            # were already created above)
            with open(race_file, "w") as f:
                json.dump(race, f, default=custom_serializer,
                          ensure_ascii=False, indent=2)
            # Remove event name from each race for saving event file
            race.pop("event")

        # Save the event details to a JSON file
        event_file = event_folder / "event.json"
        with open(event_file, "w") as f:
            json.dump(event, f, default=custom_serializer,
                      ensure_ascii=False, indent=2)
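
# Resulting folder layout (a sketch based on the paths built above; actual
# names depend on the scraped events and races):
#
#     <output>/<year>/
#         <NN>_<location>/
#             event.json
#             results/
#                 <discipline>/
#                     <discipline>_<gender>_<category>_<race_type>.json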