Skip to content

Commit

Permalink
fixup! fixup! fixup! fixup! Add scraper for new Scottish Parliament site
Browse files Browse the repository at this point in the history
Metaspeeches
  • Loading branch information
dracos committed Apr 22, 2024
1 parent 5a915f7 commit f135873
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 3 deletions.
6 changes: 3 additions & 3 deletions pyscraper/sp_2024/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,9 +112,9 @@ def convert_xml_to_twfy(file_path: Path, output_dir: Path, verbose: bool = False
print(f"Could not find person id for {speaker_name}")
speech = etree.Element("speech")
speech.set("id", id_factory.get_next_minor_id())
speech.set("url", subitem.get("speech_url"))
speech.set("url", subitem.get("speech_url") or "")
speech.set("speakername", speaker_name)
speech.set("person_id", person_id or "Unknown")
speech.set("person_id", person_id or "unknown")
for child in subitem:
speech.append(child)
root.append(speech)
Expand Down Expand Up @@ -149,7 +149,7 @@ def convert_xml_to_twfy(file_path: Path, output_dir: Path, verbose: bool = False
person_id = is_member_vote(person_name, iso_date)
if person_id is None:
print(f"Could not find person id for {person_name}")
mspname.set("id", person_id or "Unknown")
mspname.set("id", person_id or "unknown")

# write the new xml to a file
etree.indent(root, space=" ")
Expand Down
18 changes: 18 additions & 0 deletions pyscraper/sp_2024/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,12 +108,30 @@ def process_raw_html(html: str, agenda_item_url: str):
speaker["speech_url"] = agenda_item_url + "#" + p["id"]
p.replace_with(speaker)

# Topical question headings: turn each <p class="lead"> into a <heading> linking back to its anchor
for p in soup.find_all('p', class_='lead'):
heading = soup.new_tag("heading")
heading.string = p.string
heading["url"] = agenda_item_url + "#" + p["id"]
p.replace_with(heading)

# Contribution boxes: a named speaker accompanied by meta-speech (e.g. "rose-"),
# or a collective speaker rather than an individual (e.g. "Members: No")
for p in soup.find_all('p', class_='or-contribution-box'):
bold = p.find(class_='or-bill-section-bold')
italic = p.find(class_='or-italic')
if bold:
speech = soup.new_tag("speech")
speech["speaker_name"] = bold.text.strip()
bold.decompose()
text = p.text.strip()
new_p = soup.new_tag("p")
if italic:
new_p["class"] = "italic"
new_p.string = text
speech.append(new_p)
p.replace_with(speech)

# Sequential tags of an acceptable element type should be grouped together under the
# speech to create the complete speech object
for speaker in soup.find_all("speech"):
Expand Down

0 comments on commit f135873

Please sign in to comment.