Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Kirkland scraper #445

Open
wants to merge 35 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
49fd814
Fix Kirkland scraper
rafe-murray May 30, 2024
6f52701
Update Montreal Est scraper
rafe-murray Jun 5, 2024
707e503
Merge branch 'master' into montreal_est_scraper
samJMA Oct 29, 2024
ca587f3
Fixed Errors
samJMA Oct 29, 2024
d35d876
Merge branch 'master' into kirkland_scraper
samJMA Oct 29, 2024
1bd0cde
Similar solution to burnaby web scraper.
samJMA Oct 29, 2024
0b49b23
Updated formatting
samJMA Nov 6, 2024
f8d2789
Updated formatting
samJMA Nov 6, 2024
f8a69cf
Merge branch 'opencivicdata:master' into kirkland_scraper
iepmas Nov 6, 2024
29c7f83
Merge branch 'opencivicdata:master' into montreal_est_scraper
iepmas Nov 6, 2024
3083d46
ca_qc_montreal_est: Make changes closer to original
jpmckinney Nov 7, 2024
6e7c15e
Fixed email encryption error
samJMA Nov 8, 2024
a4770b7
ca_qc_sainte_anne_de_bellevue: Squash #447 after simplifying changes
jpmckinney Nov 11, 2024
3fb90a2
build: Upgrade opencivicdata
jpmckinney Nov 11, 2024
fef5d3f
ca_on_wilmot: Skip executive officer
jpmckinney Nov 11, 2024
1e96ba1
ca_ns_halifax: Allow Jean St.Amand
jpmckinney Nov 11, 2024
d6249fc
ca_on_kawartha_lakes: Ignore names including content like "RESIGNED A…
jpmckinney Nov 11, 2024
e50e780
ca_on_thunder_bay: Fix SSL error
jpmckinney Nov 12, 2024
744835b
ca_yt: Set user-agent and cookie (DEFAULT_USER_AGENT and cookie from …
jpmckinney Nov 12, 2024
b5c125a
ca_on_thunder_bay: patching requests didn't work on Heroku
jpmckinney Nov 12, 2024
d68412c
ca_ns_cape_breton: Escape quotation marks in name
jpmckinney Nov 12, 2024
b48c025
ca_yt: Add comment about Cloudflare bot products
jpmckinney Nov 12, 2024
6ceb72b
Update people.py
bzhangjma Nov 18, 2024
adff484
Update people.py
bzhangjma Nov 18, 2024
4c6e6d4
Update people.py
bzhangjma Nov 18, 2024
4f8f4a7
chore: Remove unused imports
jpmckinney Nov 18, 2024
6de91ee
ca_bc: Use multiline string for readability
jpmckinney Nov 18, 2024
05054d9
ca_bc: Fix validation to allow "Hon Chan" and "A'aliya
jpmckinney Nov 18, 2024
d1f8d1e
Merge branch 'ca_bc_fix_2'
jpmckinney Nov 18, 2024
8f5acfb
Removed unused function
samJMA Nov 20, 2024
83a5a04
Fix Kirkland scraper
rafe-murray May 30, 2024
01536a3
Similar solution to burnaby web scraper.
samJMA Oct 29, 2024
42571bd
Fixed email encryption error
samJMA Nov 8, 2024
d3894c4
Removed unused function
samJMA Nov 20, 2024
ac99311
Merge branch 'kirkland_scraper' of https://github.com/JMAConsulting/s…
samJMA Nov 20, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 46 additions & 61 deletions ca_bc/people.py
Original file line number Diff line number Diff line change
@@ -1,76 +1,61 @@
import re
import json
from textwrap import dedent

from utils import CanadianPerson as Person
from utils import CanadianScraper

COUNCIL_PAGE = "https://www.leg.bc.ca/_api/search/query?querytext='(contentclass:sts_listitem%20OR%20IsDocument:True)%20SPSiteUrl:/content%20ListId:8ecafcaa-2bf9-4434-a60c-3663a9afd175%20MLAActiveOWSBOOL:1%20-LastNameOWSTEXT:Vacant'&selectproperties='Picture1OWSIMGE,Title,Path'&&sortlist='LastNameSort:ascending'&rowlimit=100&QueryTemplatePropertiesUrl='spfile://webroot/queryparametertemplate.xml'"
COUNCIL_PAGE = "https://www.leg.bc.ca/members"


class BritishColumbiaPersonScraper(CanadianScraper):
def scrape(self):
parties = {
"BC NDP": "New Democratic Party of British Columbia",
"BC Liberal Party": "British Columbia Liberal Party",
}
response = self.post(url="https://lims.leg.bc.ca/graphql", json={
"query": dedent("""\
query GetMLAsByConstituency($parliamentId: Int!) {
allMemberParliaments(condition: {parliamentId: $parliamentId, active: true}) {
nodes {
image: imageBySmallImageId {
path
description
__typename
}
constituency: constituencyByConstituencyId {
name
__typename
}
member: memberByMemberId {
firstName
lastName
__typename
}
isCounsel
isDoctor
isHonourable
party: partyByPartyId {
name
abbreviation
__typename
}
nodeId
__typename
}
__typename
}
}"""
),
"variables": {"parliamentId": 43},
})
data = json.loads(response.content.decode("utf-8"))
members = data["data"]["allMemberParliaments"]["nodes"]

page = self.lxmlize(COUNCIL_PAGE, xml=True)

nsmap = {"d": "http://schemas.microsoft.com/ado/2007/08/dataservices"}
members = page.xpath("//d:Cells", namespaces=nsmap)
assert len(members), "No members found"
for member in members:
url = member.xpath('./d:element/d:Key[text()="Path"]/following-sibling::d:Value/text()', namespaces=nsmap)[
0
]
if "vacant" in url.lower():
continue
page = self.lxmlize(url)
image = "https://lims.leg.bc.ca/public" + member["image"]["path"]
district = member["constituency"]["name"]
name = member["member"]["firstName"] + " " + member["member"]["lastName"]
party = member["party"]["name"]

name = (
page.xpath('//div[contains(@class, "BCLASS-pagetitle")]//h3/text()')[0]
.replace("Wm.", "")
.replace(", Q.C.", "")
.replace(", K.C.", "")
.strip()
)
district, party = cleanup_list(page.xpath('//div[@id="MinisterTitle"]/following-sibling::text()'))
party = parties.get(party, party)
p = Person(primary_org="legislature", name=name, district=district, role="MLA", party=party)
p = Person(primary_org="legislature", name=name, district=district, role="MLA", party=party, image=image)
p.add_source(COUNCIL_PAGE)
p.add_source(url)

p.image = page.xpath('//img[contains(@src, "Members")]/@src')[0]

email = page.xpath('//div[@class="convertToEmail"]//text()')[0].strip()
if "#" in email:
email = email.split("#")[0]
if email:
p.add_contact("email", email)

office = ", ".join(cleanup_list(page.xpath('//h4[contains(text(), "Office:")]/ancestor::div/text()')))
office = re.sub(r"\s{2,}", " ", office)
p.add_contact("address", office, "legislature")

constituency = ", ".join(
cleanup_list(page.xpath('//h4[contains(text(), "Constituency:")]/ancestor::div[1]//text()'))
)
constituency = re.sub(r"\s{2,}", " ", constituency).split(", Phone")[0]
p.add_contact("address", constituency, "constituency")

phones = cleanup_list(page.xpath('//span[contains(text(), "Phone:")]/following-sibling::text()'))

office_phone = phones[0]
p.add_contact("voice", office_phone, "legislature")
if len(phones) > 1:
constituency_phone = phones[1]
p.add_contact("voice", constituency_phone, "constituency")

roles = page.xpath('//div[@id="MinisterTitle"]/text()')[0].strip()
if roles:
p.extras["roles"] = [roles]

yield p


def cleanup_list(dirty_list):
return list(filter(None, (x.strip() for x in dirty_list)))
3 changes: 2 additions & 1 deletion ca_ns_cape_breton/people.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,8 @@ def decode_email(script):
councillor_url = councillor.xpath(".//a/@href")[0]
p.add_source(councillor_url)
page = self.lxmlize(councillor_url, user_agent=CUSTOM_USER_AGENT)
image = page.xpath(f'//img[contains(@title, "{name}")]/@src')
escaped_name = name.replace('"', "&quot;")
image = page.xpath(f'//img[contains(@title, "{escaped_name}")]/@src')
if image:
p.image = image[0]
yield p
Expand Down
3 changes: 3 additions & 0 deletions ca_on_kawartha_lakes/people.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ def scrape(self):
name = councillor.text_content().replace("Mayor", "").strip()
role = "Mayor"

if "RESIGNED" in name:
continue

info_node = councillor.xpath("./following-sibling::*")[0]
email = self.get_email(info_node)
phone = self.get_phone(info_node)
Expand Down
13 changes: 4 additions & 9 deletions ca_on_thunder_bay/people.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
import requests

from utils import DEFAULT_USER_AGENT, CanadianScraper
from utils import CanadianScraper
from utils import CanadianPerson as Person

COUNCIL_PAGE = "https://www.thunderbay.ca/en/city-hall/mayor-and-council-profiles.aspx"
Expand All @@ -9,13 +7,14 @@
class ThunderBayPersonScraper(CanadianScraper):
def scrape(self):
seat_number = 1
page = self.lxmlize(COUNCIL_PAGE)
# SSLError(SSLError(1, '[SSL: DH_KEY_TOO_SMALL] dh key too small (_ssl.c:1133)'))
page = self.lxmlize(COUNCIL_PAGE, verify=False)

councillors = page.xpath("//p[@class='Center']/a[@href]")
assert len(councillors), "No councillors found"
for councillor in councillors:
url = councillor.xpath("./@href")[0]
councillor_page = self.lxmlize(url)
councillor_page = self.lxmlize(url, verify=False)
info = councillor_page.xpath("//div[@class='iCreateDynaToken']")[1]
role = info.xpath("./h2")[0].text_content()
name = info.xpath("./h3")[0].text_content()
Expand All @@ -42,7 +41,3 @@ def scrape(self):
p.image = photo

yield p

def lxmlize(self, url, encoding=None, *, user_agent=DEFAULT_USER_AGENT, cookies=None, xml=False):
requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS += ":HIGH:!DH:!aNULL" # site uses a weak DH key
return super().lxmlize(url, encoding, user_agent=user_agent, cookies=cookies, xml=xml)
3 changes: 3 additions & 0 deletions ca_on_wilmot/people.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ def scrape(self):
role_name, contact_info = councillors[i], councillors[i + 1]
role, name = role_name.text_content().strip().replace("\xa0", " ").split("— ")

if "Executive Officer to the Mayor and Council" in role:
continue

# "Ward 1 Councillor"
if "Councillor" in role:
district = role.split(" Councillor")[0]
Expand Down
8 changes: 6 additions & 2 deletions ca_qc_kirkland/people.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,18 @@ def scrape(self):

name = councillor.xpath(".//strong/text()")[0]

# Using self.get_phone does not include the extension #
phone = (
councillor.xpath('.//div[contains(text(), "#")]/text()')[0]
.replace("T ", "")
.replace(" ", "-")
.replace(".", ",") # correcting a typo
.replace(".", ",")
.replace(",-#-", " x")
)
email = self.get_email(councillor)

# cloudflare encrypts the email data
encrypted_email = councillor.xpath('.//@href[contains(., "email")]')[0]
email = self._cloudflare_decode(encrypted_email)

p = Person(primary_org="legislature", name=name, district=district, role=role)
p.add_source(COUNCIL_PAGE)
Expand Down
12 changes: 7 additions & 5 deletions ca_qc_montreal_est/people.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,20 +7,22 @@
class MontrealEstPersonScraper(CanadianScraper):
def scrape(self):
page = self.lxmlize(COUNCIL_PAGE)
councillors = page.xpath("//div[contains(@id, 'membres-conseil-block_')]")
councillors = page.xpath(
'//div[contains (@class, "membreimg text-center membres-conseil")]//div//div[@class="col-lg-6"]'
)
assert len(councillors), "No councillors found"
for councillor in councillors:
name, role_district = councillor.xpath(".//span[@class='h3 d-block']")[0].text_content().split(" – ", 1)
name, role_district = councillor.xpath('.//div[@class="bg-trans-gris"]/span/text()')[0].split(" – ", 1)

if "Maire" in role_district:
if "Maire" in role_district or "Mairesse" in role_district:
district = "Montréal-Est"
role = "Maire"
else:
district = f"District {role_district[-1]}"
role = "Conseiller"

p = Person(primary_org="legislature", name=name, district=district, role=role)
p.image = councillor.xpath(".//@data-lazy-src")[0]
p = Person(primary_org="legislature", name=name.strip(), district=district, role=role)
p.image = councillor.xpath(".//div[not(@id)]/img//@src")[0]
p.add_contact("email", self.get_email(councillor))
p.add_source(COUNCIL_PAGE)
yield p
8 changes: 4 additions & 4 deletions ca_qc_sainte_anne_de_bellevue/people.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,12 @@ class SainteAnneDeBellevuePersonScraper(CanadianScraper):
def scrape(self):
page = self.lxmlize(COUNCIL_PAGE)

councillors = page.xpath('//div[@class="block text"]')
councillors = page.xpath('//p[a[contains(@href, "@")]]')
assert len(councillors), "No councillors found"

for councillor in councillors:
name = councillor.xpath('.//div[@class="content-writable"]//strong/text()')[0]
district = councillor.xpath(".//h2/text()")[0]
name = councillor.text_content().split(" |", 1)[0]
district = councillor.xpath("./preceding-sibling::h2[1]/text()")[0]

if "Maire" in district:
district = "Sainte-Anne-de-Bellevue"
Expand All @@ -26,6 +27,5 @@ def scrape(self):
p = Person(primary_org="legislature", name=name, district=district, role=role)
p.add_source(COUNCIL_PAGE)

p.image = councillor.xpath(".//@src")[0]
p.add_contact("email", self.get_email(councillor))
yield p
2 changes: 2 additions & 0 deletions ca_yt/people.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
COUNCIL_PAGE = "https://yukonassembly.ca/mlas"


# This website uses Cloudflare bot products (setting a __cf_bm cookie), which is hard to circumvent.
# https://developers.cloudflare.com/fundamentals/reference/policies-compliances/cloudflare-cookies/
class YukonPersonScraper(CanadianScraper):
def scrape(self):
page = self.lxmlize(COUNCIL_PAGE)
Expand Down
6 changes: 3 additions & 3 deletions patch.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,17 +121,17 @@
r"(?:Jr|Rev|Sr|St)\.|"
r"da|de|den|der|la|van|von|"
r'[("](?:\p{Lu}+|\p{Lu}\p{Ll}*(?:-\p{Lu}\p{Ll}*)*)[)"]|'
r"(?:D'|d'|De|de|Des|Di|Du|L'|La|Le|Mac|Mc|O'|San|Van|Vander?|van|vanden)?\p{Lu}\p{Ll}+|"
r"(?:D'|d'|De|de|Des|Di|Du|L'|La|Le|Mac|Mc|O'|San|St\.|Van|Vander?|van|vanden)?\p{Lu}\p{Ll}+|"
r"\p{Lu}\p{Ll}+Anne?|Marie\p{Lu}\p{Ll}+|"
r"Ch'ng|Prud'homme|"
r"A'aliya|Ch'ng|Prud'homme|"
r"D!ONNE|IsaBelle|Ya'ara"
r")"
)

# Name components can be joined by apostrophes, hyphens or spaces.
person_schema["properties"]["name"]["pattern"] = re.compile(
r"\A"
r"(?!(?:Chair|Commissioner|Conseiller|Councillor|Deputy|Dr|Hon|M|Maire|Mayor|Miss|Mme|Mr|Mrs|Ms|Regional|Warden)\b)"
r"(?!(?:Chair|Commissioner|Conseiller|Councillor|Deputy|Dr|M|Maire|Mayor|Miss|Mme|Mr|Mrs|Ms|Regional|Warden)\b)"
r"(?:" + name_fragment + r"(?:'|-| - | )"
r")+" + name_fragment + r"\Z"
)
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ lxml==4.9.1
# via -r requirements.in
olefile==0.47
# via agate-excel
opencivicdata==3.3.1
opencivicdata==3.4.0
# via
# -r requirements.in
# pupa
Expand Down
17 changes: 10 additions & 7 deletions utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,15 +193,17 @@ def get_link(self, node, substring, *, error=True):
return None

def get(self, *args, **kwargs):
return super().get(*args, verify=SSL_VERIFY, **kwargs)
return super().get(*args, verify=kwargs.pop("verify", SSL_VERIFY), **kwargs)

def post(self, *args, **kwargs):
return super().post(*args, verify=SSL_VERIFY, **kwargs)
return super().post(*args, verify=kwargs.pop("verify", SSL_VERIFY), **kwargs)

def lxmlize(self, url, encoding=None, *, user_agent=DEFAULT_USER_AGENT, cookies=None, xml=False):
def lxmlize(self, url, encoding=None, *, user_agent=DEFAULT_USER_AGENT, cookies=None, xml=False, verify=SSL_VERIFY):
# Sets User-Agent header.
# https://github.com/jamesturk/scrapelib/blob/5ce0916/scrapelib/__init__.py#L505
self.user_agent = user_agent

response = self.get(url, cookies=cookies)
response = self.get(url, cookies=cookies, verify=verify)
if encoding:
response.encoding = encoding

Expand Down Expand Up @@ -737,9 +739,10 @@ def clean_string(s):


def clean_name(s):
return honorific_suffix_re.sub(
"", honorific_prefix_re.sub("", whitespace_re.sub(" ", str(s).translate(table)).strip())
)
name = honorific_suffix_re.sub("", whitespace_re.sub(" ", str(s).translate(table)).strip())
if name.count(" ") > 1:
return honorific_prefix_re.sub("", name) # Avoid truncating names like "Hon Chan"
return name


def clean_type_id(type_id):
Expand Down