diff --git a/ca_bc/people.py b/ca_bc/people.py
index 356e4379..68a7984c 100644
--- a/ca_bc/people.py
+++ b/ca_bc/people.py
@@ -1,76 +1,61 @@
-import re
+import json
+from textwrap import dedent
 
 from utils import CanadianPerson as Person
 from utils import CanadianScraper
 
-COUNCIL_PAGE = "https://www.leg.bc.ca/_api/search/query?querytext='(contentclass:sts_listitem%20OR%20IsDocument:True)%20SPSiteUrl:/content%20ListId:8ecafcaa-2bf9-4434-a60c-3663a9afd175%20MLAActiveOWSBOOL:1%20-LastNameOWSTEXT:Vacant'&selectproperties='Picture1OWSIMGE,Title,Path'&&sortlist='LastNameSort:ascending'&rowlimit=100&QueryTemplatePropertiesUrl='spfile://webroot/queryparametertemplate.xml'"
+COUNCIL_PAGE = "https://www.leg.bc.ca/members"
 
 
 class BritishColumbiaPersonScraper(CanadianScraper):
     def scrape(self):
-        parties = {
-            "BC NDP": "New Democratic Party of British Columbia",
-            "BC Liberal Party": "British Columbia Liberal Party",
-        }
+        response = self.post(url="https://lims.leg.bc.ca/graphql", json={
+            "query": dedent("""\
+                query GetMLAsByConstituency($parliamentId: Int!) {
+                  allMemberParliaments(condition: {parliamentId: $parliamentId, active: true}) {
+                    nodes {
+                      image: imageBySmallImageId {
+                        path
+                        description
+                        __typename
+                      }
+                      constituency: constituencyByConstituencyId {
+                        name
+                        __typename
+                      }
+                      member: memberByMemberId {
+                        firstName
+                        lastName
+                        __typename
+                      }
+                      isCounsel
+                      isDoctor
+                      isHonourable
+                      party: partyByPartyId {
+                        name
+                        abbreviation
+                        __typename
+                      }
+                      nodeId
+                      __typename
+                    }
+                    __typename
+                  }
+                }"""
+            ),
+            "variables": {"parliamentId": 43},
+        })
+        data = json.loads(response.content.decode("utf-8"))
+        members = data["data"]["allMemberParliaments"]["nodes"]
 
-        page = self.lxmlize(COUNCIL_PAGE, xml=True)
-
-        nsmap = {"d": "http://schemas.microsoft.com/ado/2007/08/dataservices"}
-        members = page.xpath("//d:Cells", namespaces=nsmap)
         assert len(members), "No members found"
         for member in members:
-            url = member.xpath('./d:element/d:Key[text()="Path"]/following-sibling::d:Value/text()', namespaces=nsmap)[
-                0
-            ]
-            if "vacant" in url.lower():
-                continue
-            page = self.lxmlize(url)
+            image = "https://lims.leg.bc.ca/public" + member["image"]["path"]
+            district = member["constituency"]["name"]
+            name = member["member"]["firstName"] + " " + member["member"]["lastName"]
+            party = member["party"]["name"]
 
-            name = (
-                page.xpath('//div[contains(@class, "BCLASS-pagetitle")]//h3/text()')[0]
-                .replace("Wm.", "")
-                .replace(", Q.C.", "")
-                .replace(", K.C.", "")
-                .strip()
-            )
-            district, party = cleanup_list(page.xpath('//div[@id="MinisterTitle"]/following-sibling::text()'))
-            party = parties.get(party, party)
-            p = Person(primary_org="legislature", name=name, district=district, role="MLA", party=party)
+            p = Person(primary_org="legislature", name=name, district=district, role="MLA", party=party, image=image)
             p.add_source(COUNCIL_PAGE)
-            p.add_source(url)
-
-            p.image = page.xpath('//img[contains(@src, "Members")]/@src')[0]
-
-            email = page.xpath('//div[@class="convertToEmail"]//text()')[0].strip()
-            if "#" in email:
-                email = email.split("#")[0]
-            if email:
-                p.add_contact("email", email)
-
-            office = ", ".join(cleanup_list(page.xpath('//h4[contains(text(), "Office:")]/ancestor::div/text()')))
-            office = re.sub(r"\s{2,}", " ", office)
-            p.add_contact("address", office, "legislature")
-
-            constituency = ", ".join(
-                cleanup_list(page.xpath('//h4[contains(text(), "Constituency:")]/ancestor::div[1]//text()'))
-            )
-            constituency = re.sub(r"\s{2,}", " ", constituency).split(", Phone")[0]
-            p.add_contact("address", constituency, "constituency")
-
-            phones = cleanup_list(page.xpath('//span[contains(text(), "Phone:")]/following-sibling::text()'))
-
-            office_phone = phones[0]
-            p.add_contact("voice", office_phone, "legislature")
-            if len(phones) > 1:
-                constituency_phone = phones[1]
-                p.add_contact("voice", constituency_phone, "constituency")
-
-            roles = page.xpath('//div[@id="MinisterTitle"]/text()')[0].strip()
-            if roles:
-                p.extras["roles"] = [roles]
             yield p
-
-
-def cleanup_list(dirty_list):
-    return list(filter(None, (x.strip() for x in dirty_list)))
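Note on the ca_bc rewrite above: it switches from scraping the legislature's SharePoint search API to querying the LIMS GraphQL endpoint directly. A minimal standalone sketch of the same request, using plain requests rather than the scraper's self.post wrapper (the query and parliamentId 43 are taken from the patch; the response shape follows from the query itself):

    import requests

    # Same query as the patch, trimmed to the fields the scraper actually uses.
    QUERY = """
    query GetMLAsByConstituency($parliamentId: Int!) {
      allMemberParliaments(condition: {parliamentId: $parliamentId, active: true}) {
        nodes {
          image: imageBySmallImageId { path }
          constituency: constituencyByConstituencyId { name }
          member: memberByMemberId { firstName lastName }
          party: partyByPartyId { name }
        }
      }
    }
    """

    response = requests.post(
        "https://lims.leg.bc.ca/graphql",
        json={"query": QUERY, "variables": {"parliamentId": 43}},
        timeout=30,
    )
    response.raise_for_status()
    for node in response.json()["data"]["allMemberParliaments"]["nodes"]:
        name = f"{node['member']['firstName']} {node['member']['lastName']}"
        print(name, "|", node["constituency"]["name"], "|", node["party"]["name"])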
constituency, "constituency") - - phones = cleanup_list(page.xpath('//span[contains(text(), "Phone:")]/following-sibling::text()')) - - office_phone = phones[0] - p.add_contact("voice", office_phone, "legislature") - if len(phones) > 1: - constituency_phone = phones[1] - p.add_contact("voice", constituency_phone, "constituency") - - roles = page.xpath('//div[@id="MinisterTitle"]/text()')[0].strip() - if roles: - p.extras["roles"] = [roles] yield p - - -def cleanup_list(dirty_list): - return list(filter(None, (x.strip() for x in dirty_list))) diff --git a/ca_ns_cape_breton/people.py b/ca_ns_cape_breton/people.py index 9d9272a5..14b749ac 100644 --- a/ca_ns_cape_breton/people.py +++ b/ca_ns_cape_breton/people.py @@ -53,7 +53,8 @@ def decode_email(script): councillor_url = councillor.xpath(".//a/@href")[0] p.add_source(councillor_url) page = self.lxmlize(councillor_url, user_agent=CUSTOM_USER_AGENT) - image = page.xpath(f'//img[contains(@title, "{name}")]/@src') + escaped_name = name.replace('"', """) + image = page.xpath(f'//img[contains(@title, "{escaped_name}")]/@src') if image: p.image = image[0] yield p diff --git a/ca_on_kawartha_lakes/people.py b/ca_on_kawartha_lakes/people.py index ad2d33db..1c075f87 100644 --- a/ca_on_kawartha_lakes/people.py +++ b/ca_on_kawartha_lakes/people.py @@ -23,6 +23,9 @@ def scrape(self): name = councillor.text_content().replace("Mayor", "").strip() role = "Mayor" + if "RESIGNED" in name: + continue + info_node = councillor.xpath("./following-sibling::*")[0] email = self.get_email(info_node) phone = self.get_phone(info_node) diff --git a/ca_on_thunder_bay/people.py b/ca_on_thunder_bay/people.py index 86d56487..e39dee76 100644 --- a/ca_on_thunder_bay/people.py +++ b/ca_on_thunder_bay/people.py @@ -1,6 +1,4 @@ -import requests - -from utils import DEFAULT_USER_AGENT, CanadianScraper +from utils import CanadianScraper from utils import CanadianPerson as Person COUNCIL_PAGE = "https://www.thunderbay.ca/en/city-hall/mayor-and-council-profiles.aspx" @@ -9,13 +7,14 @@ class ThunderBayPersonScraper(CanadianScraper): def scrape(self): seat_number = 1 - page = self.lxmlize(COUNCIL_PAGE) + # SSLError(SSLError(1, '[SSL: DH_KEY_TOO_SMALL] dh key too small (_ssl.c:1133)')) + page = self.lxmlize(COUNCIL_PAGE, verify=False) councillors = page.xpath("//p[@class='Center']/a[@href]") assert len(councillors), "No councillors found" for councillor in councillors: url = councillor.xpath("./@href")[0] - councillor_page = self.lxmlize(url) + councillor_page = self.lxmlize(url, verify=False) info = councillor_page.xpath("//div[@class='iCreateDynaToken']")[1] role = info.xpath("./h2")[0].text_content() name = info.xpath("./h3")[0].text_content() @@ -42,7 +41,3 @@ def scrape(self): p.image = photo yield p - - def lxmlize(self, url, encoding=None, *, user_agent=DEFAULT_USER_AGENT, cookies=None, xml=False): - requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS += ":HIGH:!DH:!aNULL" # site uses a weak DH key - return super().lxmlize(url, encoding, user_agent=user_agent, cookies=cookies, xml=xml) diff --git a/ca_on_wilmot/people.py b/ca_on_wilmot/people.py index d4ac6f1c..ff35f3a2 100644 --- a/ca_on_wilmot/people.py +++ b/ca_on_wilmot/people.py @@ -14,6 +14,9 @@ def scrape(self): role_name, contact_info = councillors[i], councillors[i + 1] role, name = role_name.text_content().strip().replace("\xa0", " ").split("— ") + if "Executive Officer to the Mayor and Council" in role: + continue + # "Ward 1 Councillor" if "Councillor" in role: district = role.split(" Councillor")[0] diff --git 
diff --git a/ca_qc_kirkland/people.py b/ca_qc_kirkland/people.py
index 3f0bab4b..d4ae8099 100644
--- a/ca_qc_kirkland/people.py
+++ b/ca_qc_kirkland/people.py
@@ -23,14 +23,18 @@ def scrape(self):
 
             name = councillor.xpath(".//strong/text()")[0]
 
+            # Using self.get_phone does not include the extension #
             phone = (
                 councillor.xpath('.//div[contains(text(), "#")]/text()')[0]
                 .replace("T ", "")
                 .replace(" ", "-")
-                .replace(".", ",")  # correcting a typo
+                .replace(".", ",")
                 .replace(",-#-", " x")
             )
-            email = self.get_email(councillor)
+
+            # cloudflare encrypts the email data
+            encrypted_email = councillor.xpath('.//@href[contains(., "email")]')[0]
+            email = self._cloudflare_decode(encrypted_email)
 
             p = Person(primary_org="legislature", name=name, district=district, role=role)
             p.add_source(COUNCIL_PAGE)
diff --git a/ca_qc_montreal_est/people.py b/ca_qc_montreal_est/people.py
index 8d7fb6d4..322234aa 100644
--- a/ca_qc_montreal_est/people.py
+++ b/ca_qc_montreal_est/people.py
@@ -7,20 +7,22 @@ class MontrealEstPersonScraper(CanadianScraper):
     def scrape(self):
         page = self.lxmlize(COUNCIL_PAGE)
 
-        councillors = page.xpath("//div[contains(@id, 'membres-conseil-block_')]")
+        councillors = page.xpath(
+            '//div[contains (@class, "membreimg text-center membres-conseil")]//div//div[@class="col-lg-6"]'
+        )
         assert len(councillors), "No councillors found"
         for councillor in councillors:
-            name, role_district = councillor.xpath(".//span[@class='h3 d-block']")[0].text_content().split(" – ", 1)
+            name, role_district = councillor.xpath('.//div[@class="bg-trans-gris"]/span/text()')[0].split(" – ", 1)
 
-            if "Maire" in role_district:
+            if "Maire" in role_district or "Mairesse" in role_district:
                 district = "Montréal-Est"
                 role = "Maire"
             else:
                 district = f"District {role_district[-1]}"
                 role = "Conseiller"
 
-            p = Person(primary_org="legislature", name=name, district=district, role=role)
-            p.image = councillor.xpath(".//@data-lazy-src")[0]
+            p = Person(primary_org="legislature", name=name.strip(), district=district, role=role)
+            p.image = councillor.xpath(".//div[not(@id)]/img//@src")[0]
             p.add_contact("email", self.get_email(councillor))
             p.add_source(COUNCIL_PAGE)
             yield p
diff --git a/ca_qc_sainte_anne_de_bellevue/people.py b/ca_qc_sainte_anne_de_bellevue/people.py
index 30d4662b..e2fb06cc 100644
--- a/ca_qc_sainte_anne_de_bellevue/people.py
+++ b/ca_qc_sainte_anne_de_bellevue/people.py
@@ -10,11 +10,12 @@ class SainteAnneDeBellevuePersonScraper(CanadianScraper):
     def scrape(self):
         page = self.lxmlize(COUNCIL_PAGE)
 
-        councillors = page.xpath('//div[@class="block text"]')
+        councillors = page.xpath('//p[a[contains(@href, "@")]]')
         assert len(councillors), "No councillors found"
+
        for councillor in councillors:
-            name = councillor.xpath('.//div[@class="content-writable"]//strong/text()')[0]
-            district = councillor.xpath(".//h2/text()")[0]
+            name = councillor.text_content().split(" |", 1)[0]
+            district = councillor.xpath("./preceding-sibling::h2[1]/text()")[0]
 
             if "Maire" in district:
                 district = "Sainte-Anne-de-Bellevue"
@@ -26,6 +27,5 @@ def scrape(self):
 
             p = Person(primary_org="legislature", name=name, district=district, role=role)
             p.add_source(COUNCIL_PAGE)
-            p.image = councillor.xpath(".//@src")[0]
             p.add_contact("email", self.get_email(councillor))
             yield p
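The ca_qc_kirkland hunk depends on self._cloudflare_decode, the scraper-side helper for Cloudflare's email obfuscation. The scheme itself is a fixed XOR: the hex string after email-protection# starts with a one-byte key, and every following byte is XORed with that key. A self-contained sketch of the decoding, assuming the utils helper does essentially this:

    def cloudflare_decode(href):
        """Decode a /cdn-cgi/l/email-protection#... href into a plain address."""
        hex_str = href.split("#")[-1]
        key = int(hex_str[:2], 16)  # first byte is the XOR key
        return "".join(
            chr(int(hex_str[i : i + 2], 16) ^ key) for i in range(2, len(hex_str), 2)
        )

    assert cloudflare_decode("#422b2c242d02273a232f322e276c212d2f") == "info@example.com"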
diff --git a/ca_yt/people.py b/ca_yt/people.py
index b6dd62f4..4808f220 100644
--- a/ca_yt/people.py
+++ b/ca_yt/people.py
@@ -6,6 +6,8 @@
 COUNCIL_PAGE = "https://yukonassembly.ca/mlas"
 
 
+# This website uses Cloudflare bot products (setting a __cf_bm cookie), which is hard to circumvent.
+# https://developers.cloudflare.com/fundamentals/reference/policies-compliances/cloudflare-cookies/
 class YukonPersonScraper(CanadianScraper):
     def scrape(self):
         page = self.lxmlize(COUNCIL_PAGE)
diff --git a/patch.py b/patch.py
index 6d6312e8..4540cfc8 100644
--- a/patch.py
+++ b/patch.py
@@ -121,9 +121,9 @@
     r"(?:Jr|Rev|Sr|St)\.|"
     r"da|de|den|der|la|van|von|"
     r'[("](?:\p{Lu}+|\p{Lu}\p{Ll}*(?:-\p{Lu}\p{Ll}*)*)[)"]|'
-    r"(?:D'|d'|De|de|Des|Di|Du|L'|La|Le|Mac|Mc|O'|San|Van|Vander?|van|vanden)?\p{Lu}\p{Ll}+|"
+    r"(?:D'|d'|De|de|Des|Di|Du|L'|La|Le|Mac|Mc|O'|San|St\.|Van|Vander?|van|vanden)?\p{Lu}\p{Ll}+|"
     r"\p{Lu}\p{Ll}+Anne?|Marie\p{Lu}\p{Ll}+|"
-    r"Ch'ng|Prud'homme|"
+    r"A'aliya|Ch'ng|Prud'homme|"
     r"D!ONNE|IsaBelle|Ya'ara"
     r")"
 )
@@ -131,7 +131,7 @@
 # Name components can be joined by apostrophes, hyphens or spaces.
 person_schema["properties"]["name"]["pattern"] = re.compile(
     r"\A"
-    r"(?!(?:Chair|Commissioner|Conseiller|Councillor|Deputy|Dr|Hon|M|Maire|Mayor|Miss|Mme|Mr|Mrs|Ms|Regional|Warden)\b)"
+    r"(?!(?:Chair|Commissioner|Conseiller|Councillor|Deputy|Dr|M|Maire|Mayor|Miss|Mme|Mr|Mrs|Ms|Regional|Warden)\b)"
     r"(?:" + name_fragment + r"(?:'|-| - | )"
     r")+" + name_fragment + r"\Z"
 )
diff --git a/requirements.txt b/requirements.txt
index 080af981..5b06ad27 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -35,7 +35,7 @@ lxml==4.9.1
     # via -r requirements.in
 olefile==0.47
     # via agate-excel
-opencivicdata==3.3.1
+opencivicdata==3.4.0
     # via
     #   -r requirements.in
     #   pupa
diff --git a/utils.py b/utils.py
index 3a5ded50..52ef6e34 100644
--- a/utils.py
+++ b/utils.py
@@ -193,15 +193,17 @@ def get_link(self, node, substring, *, error=True):
         return None
 
     def get(self, *args, **kwargs):
-        return super().get(*args, verify=SSL_VERIFY, **kwargs)
+        return super().get(*args, verify=kwargs.pop("verify", SSL_VERIFY), **kwargs)
 
     def post(self, *args, **kwargs):
-        return super().post(*args, verify=SSL_VERIFY, **kwargs)
+        return super().post(*args, verify=kwargs.pop("verify", SSL_VERIFY), **kwargs)
 
-    def lxmlize(self, url, encoding=None, *, user_agent=DEFAULT_USER_AGENT, cookies=None, xml=False):
+    def lxmlize(self, url, encoding=None, *, user_agent=DEFAULT_USER_AGENT, cookies=None, xml=False, verify=SSL_VERIFY):
+        # Sets User-Agent header.
+        # https://github.com/jamesturk/scrapelib/blob/5ce0916/scrapelib/__init__.py#L505
         self.user_agent = user_agent
 
-        response = self.get(url, cookies=cookies)
+        response = self.get(url, cookies=cookies, verify=verify)
 
         if encoding:
             response.encoding = encoding
@@ -737,9 +739,10 @@ def clean_string(s):
 
 
 def clean_name(s):
-    return honorific_suffix_re.sub(
-        "", honorific_prefix_re.sub("", whitespace_re.sub(" ", str(s).translate(table)).strip())
-    )
+    name = honorific_suffix_re.sub("", whitespace_re.sub(" ", str(s).translate(table)).strip())
+    if name.count(" ") > 1:
+        return honorific_prefix_re.sub("", name)  # Avoid truncating names like "Hon Chan"
+    return name
 
 
 def clean_type_id(type_id):
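On the utils.py change: popping verify out of kwargs lets a call site such as ca_on_thunder_bay pass verify=False through lxmlize and get without colliding with the previously hard-coded verify=SSL_VERIFY, which raised TypeError: got multiple values for keyword argument 'verify'. A toy sketch of the pattern, standalone rather than the actual class:

    SSL_VERIFY = True  # module-level default, as in utils.py

    def get(*args, **kwargs):
        # An explicit verify= from the caller wins; otherwise use the default.
        verify = kwargs.pop("verify", SSL_VERIFY)
        return args[0], verify

    print(get("https://example.com"))                # ('https://example.com', True)
    print(get("https://example.com", verify=False))  # ('https://example.com', False)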