Skip to content

Commit

Permalink
fix some date parsing + refactor
Browse files Browse the repository at this point in the history
  • Loading branch information
vcai122 committed Mar 6, 2024
1 parent b8547c1 commit c7cd33d
Showing 1 changed file with 33 additions and 36 deletions.
69 changes: 33 additions & 36 deletions backend/penndata/management/commands/get_penn_today_events.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.firefox import GeckoDriverManager
from selenium.webdriver.firefox.options import Options

from penndata.models import Event

Expand All @@ -28,21 +29,8 @@ def handle(self, *args, **kwargs):
# past_events.delete()

# Scrapes Penn Today
try:
driver = webdriver.Firefox(service=FirefoxService(GeckoDriverManager().install()))

driver.get(PENN_TODAY_WEBSITE)
events_list = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.ID, "events-list"))
)

html_content = events_list.get_attribute("innerHTML")
driver.quit()
except ConnectionError:
print("Connection Error to webdriver")
return None

soup = BeautifulSoup(html_content, "html.parser")
if not (soup := self.connect_and_parse_html(PENN_TODAY_WEBSITE, EC.presence_of_element_located((By.ID, "events-list")))):
return

event_articles = soup.find_all("article", class_="tease")

Expand Down Expand Up @@ -76,7 +64,8 @@ def handle(self, *args, **kwargs):
if start_date.month < current_month:
# If scraped month is before current month, increment year
start_date = start_date.replace(year=current_year + 1)
if start_time_str == ALL_DAY:
print(start_date_str)
if ALL_DAY in start_time_str.lower():
start_time = datetime.time(0, 0)
else:
start_time = datetime.datetime.strptime(start_time_str, "%I:%M%p").time()
Expand All @@ -98,19 +87,20 @@ def handle(self, *args, **kwargs):
end_of_day = datetime.time(23, 59, 59)
if end_date_elem: # end date but no end time
end_date_str = end_date_elem.text.strip().split(" ")[-1]
end_date = datetime.combine(
end_date = datetime.datetime.combine(
datetime.datetime.strptime(end_date_str, "%m/%d/%Y"), end_of_day
)

else: # no end date or end time
end_date = datetime.combine(start_date, end_of_day)
end_date = datetime.datetime.combine(start_date, end_of_day)

Event.objects.update_or_create(
name=name,
defaults={
"event_type": "",
"image_url": "",
"start": start_date,
"end": end_date,
"start": timezone.make_aware(start_date),
"end": timezone.make_aware(end_date),
"location": location,
"website": event_url,
"description": description,
Expand All @@ -120,25 +110,32 @@ def handle(self, *args, **kwargs):

self.stdout.write("Uploaded Events!")

def connect_and_parse_html(self, event_url, condition):
    """Load a page in headless Firefox and parse the awaited element's HTML.

    Args:
        event_url: URL to open in the browser.
        condition: A Selenium expected-condition callable passed to
            ``WebDriverWait.until``; it should resolve to the element whose
            ``innerHTML`` is wanted.

    Returns:
        A ``BeautifulSoup`` tree built from the element's ``innerHTML``, or
        ``None`` when a ``ConnectionError`` is raised while driving the
        browser.

    Any other exception (for example a wait timeout) propagates to the
    caller, but the browser is shut down first in every case.
    """
    driver = None
    try:
        options = Options()
        options.add_argument("--headless")
        driver = webdriver.Firefox(
            service=FirefoxService(GeckoDriverManager().install()),
            options=options,
        )

        driver.get(event_url)
        print("WAITING FOR ELEMENT")
        # Wait up to 10 seconds for the condition to produce the element.
        element = WebDriverWait(driver, 10).until(condition)
        print("ELEMENT FOUND")

        html_content = element.get_attribute("innerHTML")
        return BeautifulSoup(html_content, "html.parser")
    except ConnectionError:
        print("Connection Error to webdriver")
        return None
    finally:
        # Quit on every path; previously a failure after the browser started
        # left an orphaned headless Firefox process behind.
        if driver is not None:
            driver.quit()

def get_end_time(self, event_url):
    """Scrape an event's detail page and return its end-time string.

    The page's time element is expected to hold a range written as
    ``"<start> - <end>"``; periods are stripped from the text before it is
    split.

    Args:
        event_url: URL of the individual event page.

    Returns:
        The text after ``" - "`` in the time range, or ``None`` when the page
        could not be loaded, the time element is missing or empty, the event
        is marked all day, or the text contains no ``" - "`` separator.
    """
    # Browser setup and teardown are delegated to the shared helper, so no
    # driver is created or quit here.
    end_time_soup = self.connect_and_parse_html(
        event_url,
        EC.presence_of_element_located((By.CLASS_NAME, "event__topper-content")),
    )
    if end_time_soup is None:
        return None

    time_elem = end_time_soup.find("p", class_="event__meta event__time")
    if time_elem is None:
        return None

    end_time_range_str = time_elem.text.strip().replace(".", "")
    print(end_time_range_str)

    if not end_time_range_str or ALL_DAY in end_time_range_str.lower():
        return None  # No end time if the event is all day

    times = end_time_range_str.split(" - ")
    if len(times) <= 1:
        return None  # Only a start time was listed
    return times[1]

0 comments on commit c7cd33d

Please sign in to comment.