fixup! fixup! fixup! Add scraper for new Scottish Parliament site
Add topical question heading support.
dracos committed Apr 22, 2024
1 parent 3a09dd0 commit 5a915f7
Showing 3 changed files with 16 additions and 30 deletions.
2 changes: 1 addition & 1 deletion pyscraper/sp_2024/__main__.py
@@ -99,7 +99,7 @@ def cmd_parse_downloaded(verbose: bool = False, pattern: str | None = None):
"""
Re-parse downloaded
"""
parse_downloaded(cache_dir=cache_dir, glob_pattern=pattern, verbose=verbose)
parse_downloaded(cache_dir=cache_dir, partial_file_name=pattern, verbose=verbose)


@cli.command(name="convert-twfy")
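The only change to __main__.py is the keyword passed to parse_downloaded: glob_pattern becomes partial_file_name, presumably to match the parameter name the function actually defines (its definition is not part of this diff). The rename suggests the filter is a plain substring match against downloaded file names rather than a glob. A purely hypothetical sketch of such a filter, with the signature and body assumed rather than taken from the repository:

# Hypothetical sketch only: parse_downloaded's real signature and body are not shown in this commit.
from pathlib import Path

def parse_downloaded(cache_dir: Path, partial_file_name: str | None = None, verbose: bool = False):
    for path in sorted(cache_dir.glob("*.html")):  # assumed file layout
        if partial_file_name and partial_file_name not in path.name:
            continue  # plain substring match, not a glob
        if verbose:
            print(f"re-parsing {path}")
        # ... actual parsing of the file would happen here ...
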
11 changes: 9 additions & 2 deletions pyscraper/sp_2024/convert.py
@@ -5,7 +5,7 @@
Link to TWFY IDs for members and debate items.
"""

import datetime as datetime
import datetime
import re
from dataclasses import dataclass
from pathlib import Path
@@ -82,7 +82,7 @@ def convert_xml_to_twfy(file_path: Path, output_dir: Path, verbose: bool = False
title = title.replace("[Draft]", "").strip()

# get the date in format Thursday 9 June 2005
date_str = datetime.datetime.fromisoformat(iso_date).strftime("%A %d %B %Y")
date_str = datetime.date.fromisoformat(iso_date).strftime("%A %d %B %Y")

committee_slug = slugify_committee(title)

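The one-line change above swaps datetime.datetime.fromisoformat for datetime.date.fromisoformat when building the display date. Assuming iso_date is a plain YYYY-MM-DD string, both calls accept it, but parsing it as a date matches the intent. A small illustration with a made-up date (note that %d zero-pads, so a single-digit day renders as, e.g., 09 June rather than 9 June):

import datetime

iso_date = "2024-04-22"  # example value; the real string comes from the scraped agenda item
date_str = datetime.date.fromisoformat(iso_date).strftime("%A %d %B %Y")
print(date_str)  # Monday 22 April 2024
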
@@ -120,6 +120,13 @@ def convert_xml_to_twfy(file_path: Path, output_dir: Path, verbose: bool = False
root.append(speech)
previous_speech = speech

elif subitem.tag == "heading":
minor_heading = etree.Element("minor-heading")
minor_heading.set("id", id_factory.get_next_minor_id())
minor_heading.set("url", item.get("url"))
minor_heading.text = subitem.text
root.append(minor_heading)

elif subitem.tag == "division":
# get previous sibling of the subitem to get the speech info

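The new elif branch above handles heading elements emitted by the parse step, converting each one into a minor-heading element in the TheyWorkForYou-style output: it copies the heading text, assigns the next minor id from id_factory, and takes the url from the enclosing agenda item. A standalone sketch of the same element construction with lxml, where the id and url values are placeholders rather than real data:

# Illustrative sketch only; the id and url here are placeholders.
from lxml import etree

subitem = etree.fromstring('<heading url="https://example.com/agenda-item#p1">Topical Question Time</heading>')

minor_heading = etree.Element("minor-heading")
minor_heading.set("id", "a1.2")  # placeholder; the real value comes from id_factory.get_next_minor_id()
minor_heading.set("url", "https://example.com/agenda-item")  # in the diff this comes from the parent item element
minor_heading.text = subitem.text

print(etree.tostring(minor_heading).decode())
# <minor-heading id="a1.2" url="https://example.com/agenda-item">Topical Question Time</minor-heading>
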
33 changes: 6 additions & 27 deletions pyscraper/sp_2024/parse.py
@@ -17,63 +17,39 @@
"abbr",
"acronym",
"address",
"area",
"b",
"big",
"blockquote",
"body",
"br",
"button",
"caption",
"center",
"cite",
"code",
"col",
"colgroup",
"dd",
"del",
"dfn",
"dir",
"div",
"dl",
"dt",
"em",
"font",
"form",
"head",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"hr",
"html",
"i",
"img",
"input",
"ins",
"kbd",
"label",
"legend",
"li",
"link",
"map",
"menu",
"meta",
"noscript",
"ol",
"p",
"pre",
"q",
"s",
"samp",
"script",
"small",
"span",
"strike",
"strong",
"style",
"sub",
"sup",
"table",
@@ -87,9 +63,6 @@
"tt",
"u",
"ul",
"var",
"form",
"body",
]


@@ -135,6 +108,12 @@ def process_raw_html(html: str, agenda_item_url: str):
speaker["speech_url"] = agenda_item_url + "#" + p["id"]
p.replace_with(speaker)

for p in soup.find_all('p', class_='lead'):
heading = soup.new_tag("heading")
heading.string = p.string
heading["url"] = agenda_item_url + "#" + p["id"]
p.replace_with(heading)

# sequential tags from an acceptable element should be grouped together under the speech
# to create the speech object
for speaker in soup.find_all("speech"):
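The new loop in process_raw_html treats paragraphs carrying the lead class as headings (on the new Scottish Parliament pages these appear to mark the topical question headings named in the commit message), replacing each one with a heading tag that the convert step above then turns into a minor-heading. A standalone illustration of the same BeautifulSoup rewrite, with the sample HTML invented for the example:

# Standalone illustration of the <p class="lead"> -> <heading> rewrite; the sample HTML is made up.
from bs4 import BeautifulSoup

agenda_item_url = "https://example.com/agenda-item"
html = '<div><p class="lead" id="q1">Topical Question Time</p><p id="q2">Some speech text.</p></div>'

soup = BeautifulSoup(html, "html.parser")
for p in soup.find_all("p", class_="lead"):
    heading = soup.new_tag("heading")
    heading.string = p.string
    heading["url"] = agenda_item_url + "#" + p["id"]
    p.replace_with(heading)

print(soup)
# <div><heading url="https://example.com/agenda-item#q1">Topical Question Time</heading><p id="q2">Some speech text.</p></div>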
