fixup! fixup! fixup! Add scraper for new Scottish Parliament site
Add topical question heading support.
dracos committed Apr 22, 2024
1 parent 3a09dd0 commit 5a915f7
Showing 3 changed files with 16 additions and 30 deletions.
2 changes: 1 addition & 1 deletion pyscraper/sp_2024/__main__.py
@@ -99,7 +99,7 @@ def cmd_parse_downloaded(verbose: bool = False, pattern: str | None = None):
"""
Re-parse downloaded
"""
parse_downloaded(cache_dir=cache_dir, glob_pattern=pattern, verbose=verbose)
parse_downloaded(cache_dir=cache_dir, partial_file_name=pattern, verbose=verbose)


@cli.command(name="convert-twfy")
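The only change to __main__.py is the keyword passed to parse_downloaded: glob_pattern becomes partial_file_name, presumably to match the parameter name the function actually defines (its definition is not part of this diff). The rename suggests the filter is a plain substring match against downloaded file names rather than a glob. A purely hypothetical sketch of such a filter, with the signature and body assumed rather than taken from the repository:

# Hypothetical sketch only: parse_downloaded's real signature and body are not shown in this commit.
from pathlib import Path

def parse_downloaded(cache_dir: Path, partial_file_name: str | None = None, verbose: bool = False):
    for path in sorted(cache_dir.glob("*.html")):  # assumed file layout
        if partial_file_name and partial_file_name not in path.name:
            continue  # plain substring match, not a glob
        if verbose:
            print(f"re-parsing {path}")
        # ... actual parsing of the file would happen here ...
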
11 changes: 9 additions & 2 deletions pyscraper/sp_2024/convert.py
@@ -5,7 +5,7 @@
Link to TWFY IDs for members and debate items.
"""

import datetime as datetime
import datetime
import re
from dataclasses import dataclass
from pathlib import Path
@@ -82,7 +82,7 @@ def convert_xml_to_twfy(file_path: Path, output_dir: Path, verbose: bool = False
title = title.replace("[Draft]", "").strip()

# get the date in format Thursday 9 June 2005
date_str = datetime.datetime.fromisoformat(iso_date).strftime("%A %d %B %Y")
date_str = datetime.date.fromisoformat(iso_date).strftime("%A %d %B %Y")

committee_slug = slugify_committee(title)

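The one-line change above swaps datetime.datetime.fromisoformat for datetime.date.fromisoformat when building the display date. Assuming iso_date is a plain YYYY-MM-DD string, both calls accept it, but parsing it as a date matches the intent. A small illustration with a made-up date (note that %d zero-pads, so a single-digit day renders as, e.g., 09 June rather than 9 June):

import datetime

iso_date = "2024-04-22"  # example value; the real string comes from the scraped agenda item
date_str = datetime.date.fromisoformat(iso_date).strftime("%A %d %B %Y")
print(date_str)  # Monday 22 April 2024
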
@@ -120,6 +120,13 @@ def convert_xml_to_twfy(file_path: Path, output_dir: Path, verbose: bool = False
root.append(speech)
previous_speech = speech

elif subitem.tag == "heading":
minor_heading = etree.Element("minor-heading")
minor_heading.set("id", id_factory.get_next_minor_id())
minor_heading.set("url", item.get("url"))
minor_heading.text = subitem.text
root.append(minor_heading)

elif subitem.tag == "division":
# get previous sibling of the subitem to get the speech info

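The new elif branch above handles heading elements emitted by the parse step, converting each one into a minor-heading element in the TheyWorkForYou-style output: it copies the heading text, assigns the next minor id from id_factory, and takes the url from the enclosing agenda item. A standalone sketch of the same element construction with lxml, where the id and url values are placeholders rather than real data:

# Illustrative sketch only; the id and url here are placeholders.
from lxml import etree

subitem = etree.fromstring('<heading url="https://example.com/agenda-item#p1">Topical Question Time</heading>')

minor_heading = etree.Element("minor-heading")
minor_heading.set("id", "a1.2")  # placeholder; the real value comes from id_factory.get_next_minor_id()
minor_heading.set("url", "https://example.com/agenda-item")  # in the diff this comes from the parent item element
minor_heading.text = subitem.text

print(etree.tostring(minor_heading).decode())
# <minor-heading id="a1.2" url="https://example.com/agenda-item">Topical Question Time</minor-heading>
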
33 changes: 6 additions & 27 deletions pyscraper/sp_2024/parse.py
@@ -17,63 +17,39 @@
"abbr",
"acronym",
"address",
"area",
"b",
"big",
"blockquote",
"body",
"br",
"button",
"caption",
"center",
"cite",
"code",
"col",
"colgroup",
"dd",
"del",
"dfn",
"dir",
"div",
"dl",
"dt",
"em",
"font",
"form",
"head",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"hr",
"html",
"i",
"img",
"input",
"ins",
"kbd",
"label",
"legend",
"li",
"link",
"map",
"menu",
"meta",
"noscript",
"ol",
"p",
"pre",
"q",
"s",
"samp",
"script",
"small",
"span",
"strike",
"strong",
"style",
"sub",
"sup",
"table",
@@ -87,9 +63,6 @@
"tt",
"u",
"ul",
"var",
"form",
"body",
]


@@ -135,6 +108,12 @@ def process_raw_html(html: str, agenda_item_url: str):
speaker["speech_url"] = agenda_item_url + "#" + p["id"]
p.replace_with(speaker)

for p in soup.find_all('p', class_='lead'):
heading = soup.new_tag("heading")
heading.string = p.string
heading["url"] = agenda_item_url + "#" + p["id"]
p.replace_with(heading)

# sequential tags from an acceptable element should be grouped together under the speech
# to create the speech object
for speaker in soup.find_all("speech"):
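The new loop in process_raw_html treats paragraphs carrying the lead class as headings (on the new Scottish Parliament pages these appear to mark the topical question headings named in the commit message), replacing each one with a heading tag that the convert step above then turns into a minor-heading. A standalone illustration of the same BeautifulSoup rewrite, with the sample HTML invented for the example:

# Standalone illustration of the <p class="lead"> -> <heading> rewrite; the sample HTML is made up.
from bs4 import BeautifulSoup

agenda_item_url = "https://example.com/agenda-item"
html = '<div><p class="lead" id="q1">Topical Question Time</p><p id="q2">Some speech text.</p></div>'

soup = BeautifulSoup(html, "html.parser")
for p in soup.find_all("p", class_="lead"):
    heading = soup.new_tag("heading")
    heading.string = p.string
    heading["url"] = agenda_item_url + "#" + p["id"]
    p.replace_with(heading)

print(soup)
# <div><heading url="https://example.com/agenda-item#q1">Topical Question Time</heading><p id="q2">Some speech text.</p></div>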
