From c7cd33df9433b4cbe6dc12873719e228e0c5642f Mon Sep 17 00:00:00 2001 From: vcai122 Date: Tue, 5 Mar 2024 20:37:50 -0500 Subject: [PATCH] fix some date parsing + refactor --- .../commands/get_penn_today_events.py | 69 +++++++++---------- 1 file changed, 33 insertions(+), 36 deletions(-) diff --git a/backend/penndata/management/commands/get_penn_today_events.py b/backend/penndata/management/commands/get_penn_today_events.py index 3286db4f..8d5b5c85 100644 --- a/backend/penndata/management/commands/get_penn_today_events.py +++ b/backend/penndata/management/commands/get_penn_today_events.py @@ -10,6 +10,7 @@ from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.ui import WebDriverWait from webdriver_manager.firefox import GeckoDriverManager +from selenium.webdriver.firefox.options import Options from penndata.models import Event @@ -28,21 +29,8 @@ def handle(self, *args, **kwargs): # past_events.delete() # Scrapes Penn Today - try: - driver = webdriver.Firefox(service=FirefoxService(GeckoDriverManager().install())) - - driver.get(PENN_TODAY_WEBSITE) - events_list = WebDriverWait(driver, 10).until( - EC.presence_of_element_located((By.ID, "events-list")) - ) - - html_content = events_list.get_attribute("innerHTML") - driver.quit() - except ConnectionError: - print("Connection Error to webdriver") - return None - - soup = BeautifulSoup(html_content, "html.parser") + if not (soup := self.connect_and_parse_html(PENN_TODAY_WEBSITE, EC.presence_of_element_located((By.ID, "events-list")))): + return event_articles = soup.find_all("article", class_="tease") @@ -76,7 +64,8 @@ def handle(self, *args, **kwargs): if start_date.month < current_month: # If scraped month is before current month, increment year start_date = start_date.replace(year=current_year + 1) - if start_time_str == ALL_DAY: + print(start_date_str) + if ALL_DAY in start_time_str.lower(): start_time = datetime.time(0, 0) else: start_time = datetime.datetime.strptime(start_time_str, "%I:%M%p").time() @@ -98,19 +87,20 @@ def handle(self, *args, **kwargs): end_of_day = datetime.time(23, 59, 59) if end_date_elem: # end date but no end time end_date_str = end_date_elem.text.strip().split(" ")[-1] - end_date = datetime.combine( + end_date = datetime.datetime.combine( datetime.datetime.strptime(end_date_str, "%m/%d/%Y"), end_of_day ) + else: # no end date or end time - end_date = datetime.combine(start_date, end_of_day) + end_date = datetime.datetime.combine(start_date, end_of_day) Event.objects.update_or_create( name=name, defaults={ "event_type": "", "image_url": "", - "start": start_date, - "end": end_date, + "start": timezone.make_aware(start_date), + "end": timezone.make_aware(end_date), "location": location, "website": event_url, "description": description, @@ -120,25 +110,32 @@ def handle(self, *args, **kwargs): self.stdout.write("Uploaded Events!") + def connect_and_parse_html(self, event_url, condition): + try: + options = Options() + options.add_argument("--headless") + driver = webdriver.Firefox(service=FirefoxService(GeckoDriverManager().install()), options=options) + + driver.get(event_url) + print("WAITING FOR ELEMENT") + element = WebDriverWait(driver, 10).until(condition) + print("ELEMENT FOUND") + + html_content = element.get_attribute("innerHTML") + driver.quit() + return BeautifulSoup(html_content, "html.parser") + except ConnectionError: + print("Connection Error to webdriver") + return None + def get_end_time(self, event_url): - driver = webdriver.Chrome() - driver.get(event_url) - event_element = WebDriverWait(driver, 10).until( - EC.presence_of_element_located((By.CLASS_NAME, "event__topper-content")) - ) - end_time_soup = BeautifulSoup(event_element.get_attribute("innerHTML"), "html.parser") + end_time_soup = self.connect_and_parse_html(event_url, EC.presence_of_element_located((By.CLASS_NAME, "event__topper-content"))) end_time_range_str = ( end_time_soup.find("p", class_="event__meta event__time").text.strip().replace(".", "") ) - print(end_time_range_str) - if not end_time_range_str or ALL_DAY in end_time_range_str.lower(): - driver.quit() + + if not end_time_range_str or ALL_DAY in end_time_range_str.lower() or len(times := end_time_range_str.split(" - ")) <= 1: return None # No end time if the event is all day - times = end_time_range_str.split(" - ") - if len(times) <= 1: - driver.quit() - return None - end_time_str = times[1] - driver.quit() - return end_time_str + + return times[1]