Skip to content

Commit

Permalink
fixup! Add scraper for new Scottish Parliament site
Browse files Browse the repository at this point in the history
  • Loading branch information
ajparsons committed Apr 23, 2024
1 parent 930cfaa commit 6a17a4c
Showing 1 changed file with 23 additions and 8 deletions.
31 changes: 23 additions & 8 deletions pyscraper/sp_2024/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from pathlib import Path

from bs4 import BeautifulSoup, Tag
from bs4 import BeautifulSoup, Tag, NavigableString

# HTML elements we accept moving from raw_html to parsed
acceptable_elements = [
Expand Down Expand Up @@ -61,6 +61,7 @@
"tt",
"u",
"ul",
"timestamp"
]


Expand Down Expand Up @@ -137,13 +138,27 @@ def process_raw_html(raw_html: Tag, agenda_item_url: str) -> BeautifulSoup:
# sequential tags from an acceptable element should be grouped together under the speech
# to create the speech object
for speaker in soup.find_all("speech"):
next_sibling = speaker.find_next_sibling()
while next_sibling and next_sibling.name in acceptable_elements:
# if the class is 'or-contribution-box' remove that class
if next_sibling.get("class") == ["or-contribution-box"]:
del next_sibling["class"]
speaker.append(next_sibling)
next_sibling = speaker.find_next_sibling()
next_sibling = speaker.next_sibling
while next_sibling:
if isinstance(next_sibling, NavigableString) and str(next_sibling).strip():
nt = soup.new_tag("timestamp")
nt.string = next_sibling.strip()
# replace non-breaking spaces (\xa0) with normal spaces
nt.string = nt.string.replace("\xa0", " ")
nt.next_sibling = next_sibling.next_sibling
# delete original navigablestring
next_sibling.extract()
next_sibling = nt
if next_sibling.name in acceptable_elements:
# if the class is 'or-contribution-box' remove that class
if next_sibling.get("class") == ["or-contribution-box"]:
del next_sibling["class"]
speaker.append(next_sibling)
next_sibling = speaker.next_sibling
elif next_sibling.name == "speech":
break
else:
next_sibling = next_sibling.next_sibling

# now, in each speech - we want to iterate through and check for a p tag that's just 'For' or 'Against'
# if so the next sibling will be a list of speakers separated by <br/>
Expand Down

0 comments on commit 6a17a4c

Please sign in to comment.