fixup! fixup! fixup! fixup! Add scraper for new Scottish Parliament site

Metaspeeches
mysociety · Apr 22, 2024 · f135873 · f135873
1 parent 5a915f7
commit f135873
Show file tree

Hide file tree

Showing 2 changed files with 21 additions and 3 deletions.
diff --git a/pyscraper/sp_2024/convert.py b/pyscraper/sp_2024/convert.py
@@ -112,9 +112,9 @@ def convert_xml_to_twfy(file_path: Path, output_dir: Path, verbose: bool = False
                     print(f"Could not find person id for {speaker_name}")
                 speech = etree.Element("speech")
                 speech.set("id", id_factory.get_next_minor_id())
-                speech.set("url", subitem.get("speech_url"))
+                speech.set("url", subitem.get("speech_url") or "")
                 speech.set("speakername", speaker_name)
-                speech.set("person_id", person_id or "Unknown")
+                speech.set("person_id", person_id or "unknown")
                 for child in subitem:
                     speech.append(child)
                 root.append(speech)
@@ -149,7 +149,7 @@ def convert_xml_to_twfy(file_path: Path, output_dir: Path, verbose: bool = False
         person_id = is_member_vote(person_name, iso_date)
         if person_id is None:
             print(f"Could not find person id for {person_name}")
-        mspname.set("id", person_id or "Unknown")
+        mspname.set("id", person_id or "unknown")
 
     # write the new xml to a file
     etree.indent(root, space="    ")

diff --git a/pyscraper/sp_2024/parse.py b/pyscraper/sp_2024/parse.py
@@ -108,12 +108,30 @@ def process_raw_html(html: str, agenda_item_url: str):
             speaker["speech_url"] = agenda_item_url + "#" + p["id"]
             p.replace_with(speaker)
 
+    # Topical question headings are marked up as <p class="lead">; turn each into a <heading> tag
     for p in soup.find_all('p', class_='lead'):
         heading = soup.new_tag("heading")
         heading.string = p.string
         heading["url"] = agenda_item_url + "#" + p["id"]
         p.replace_with(heading)
 
+    # Contribution boxes: a bold speaker name followed by meta-speech (e.g. "rose-"),
+    # or a collective speaker (e.g. "Members: No"); wrap the remaining text in a <speech>
+    for p in soup.find_all('p', class_='or-contribution-box'):
+        bold = p.find(class_='or-bill-section-bold')
+        italic = p.find(class_='or-italic')
+        if bold:
+            speech = soup.new_tag("speech")
+            speech["speaker_name"] = bold.text.strip()
+            bold.decompose()
+            text = p.text.strip()
+            new_p = soup.new_tag("p")
+            if italic:
+                new_p["class"] = "italic"
+            new_p.string = text
+            speech.append(new_p)
+            p.replace_with(speech)
+
     # sequential tags from an acceptable element should be grouped together under the speech
     # to create the speech object
     for speaker in soup.find_all("speech"):