From 49fd81486de4a75d25995281b2c5319fa213362f Mon Sep 17 00:00:00 2001 From: Rafe Murray Date: Thu, 30 May 2024 15:46:16 -0400 Subject: [PATCH 01/29] Fix Kirkland scraper --- ca_qc_kirkland/people.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/ca_qc_kirkland/people.py b/ca_qc_kirkland/people.py index 03a1536b..3425e57c 100644 --- a/ca_qc_kirkland/people.py +++ b/ca_qc_kirkland/people.py @@ -8,13 +8,20 @@ class KirklandPersonScraper(CanadianScraper): def scrape(self): - page = self.lxmlize(COUNCIL_PAGE, "iso-8859-1") + def decode_email(e): + de = "" + k = int(e[:2], 16) - councillors = page.xpath('//div[@id="PageContent"]/table/tbody/tr/td') + for i in range(2, len(e) - 1, 2): + de += chr(int(e[i : i + 2], 16) ^ k) + + return de + + page = self.lxmlize(COUNCIL_PAGE) + + councillors = page.xpath('//div[@class="container_content"]//tbody/tr') assert len(councillors), "No councillors found" for councillor in councillors: - if not councillor.text_content().strip(): - continue if councillor == councillors[0]: district = "Kirkland" role = "Maire" @@ -29,9 +36,11 @@ def scrape(self): councillor.xpath('.//div[contains(text(), "#")]/text()')[0] .replace("T ", "") .replace(" ", "-") + .replace(".", ",") # correcting a typo .replace(",-#-", " x") ) - email = self.get_email(councillor) + encrypted_email = councillor.xpath('.//@href[contains(., "email")]')[0].split("#")[1] + email = decode_email(encrypted_email) p = Person(primary_org="legislature", name=name, district=district, role=role) p.add_source(COUNCIL_PAGE) From 6f527011988eb4ce8c66c7fa95220a1c8967b163 Mon Sep 17 00:00:00 2001 From: Rafe Murray Date: Wed, 5 Jun 2024 15:17:00 -0400 Subject: [PATCH 02/29] Update Montreal Est scraper --- ca_qc_montreal_est/people.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/ca_qc_montreal_est/people.py b/ca_qc_montreal_est/people.py index 9dd0c650..5409ca97 100644 --- a/ca_qc_montreal_est/people.py +++ 
b/ca_qc_montreal_est/people.py @@ -1,28 +1,26 @@ from utils import CanadianPerson as Person from utils import CanadianScraper -COUNCIL_PAGE = "http://ville.montreal-est.qc.ca/la-ville/conseil-municipal/conseils-municipaux/" +COUNCIL_PAGE = "https://ville.montreal-est.qc.ca/vie-democratique/conseil-municipal/" class MontrealEstPersonScraper(CanadianScraper): def scrape(self): page = self.lxmlize(COUNCIL_PAGE) - - councillors = page.xpath("//table") + councillors = page.xpath("//div[contains(@id, 'membres-conseil-block_')]") assert len(councillors), "No councillors found" for councillor in councillors: - name = councillor.xpath(".//h3")[0].text_content() + name, role_district = councillor.xpath(".//span[@class='h3 d-block']")[0].text_content().split(" – ", 1) - if "maire" in name: - name = name.split(" ", 2)[-1] + if "Maire" in role_district: district = "Montréal-Est" role = "Maire" else: - district = "District {}".format(councillor.xpath(".//h3")[1].text_content()[-1]) + district = "District {}".format(role_district[-1]) role = "Conseiller" p = Person(primary_org="legislature", name=name, district=district, role=role) - p.image = councillor.xpath(".//@src")[0] + p.image = councillor.xpath(".//@data-lazy-src")[0] p.add_contact("email", self.get_email(councillor)) p.add_source(COUNCIL_PAGE) yield p From ca587f3f5b4939e113f91809222e89db264f7605 Mon Sep 17 00:00:00 2001 From: Samuel Pei Date: Tue, 29 Oct 2024 13:36:18 -0400 Subject: [PATCH 03/29] Fixed Errors --- ca_qc_montreal_est/people.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/ca_qc_montreal_est/people.py b/ca_qc_montreal_est/people.py index 8d7fb6d4..9457cb05 100644 --- a/ca_qc_montreal_est/people.py +++ b/ca_qc_montreal_est/people.py @@ -1,26 +1,27 @@ from utils import CanadianPerson as Person from utils import CanadianScraper -COUNCIL_PAGE = "https://ville.montreal-est.qc.ca/vie-democratique/conseil-municipal/" - - +COUNCIL_PAGE = 
"http://ville.montreal-est.qc.ca/vie-democratique/conseil-municipal/" class MontrealEstPersonScraper(CanadianScraper): def scrape(self): page = self.lxmlize(COUNCIL_PAGE) - councillors = page.xpath("//div[contains(@id, 'membres-conseil-block_')]") + councillors = page.xpath('//div[contains (@class, "membreimg text-center membres-conseil")]//div//div[@class="col-lg-6"]') assert len(councillors), "No councillors found" for councillor in councillors: - name, role_district = councillor.xpath(".//span[@class='h3 d-block']")[0].text_content().split(" – ", 1) + name = councillor.xpath('.//div[@class="bg-trans-gris"]/span/text()')[0] - if "Maire" in role_district: + if "Maire" in name or "Mairesse" in name: + name = name.split(" ", 2)[:2] + name = " ".join(name) district = "Montréal-Est" role = "Maire" else: - district = f"District {role_district[-1]}" + name, district = name.split(" ", 2)[:2], "District " + name.split(" ")[-1] + name = " ".join(name) role = "Conseiller" p = Person(primary_org="legislature", name=name, district=district, role=role) - p.image = councillor.xpath(".//@data-lazy-src")[0] + p.image = councillor.xpath(".//div[not(@id)]/img//@src")[0] p.add_contact("email", self.get_email(councillor)) p.add_source(COUNCIL_PAGE) yield p From 1bd0cdec18ad552e5937c868484574f91ccced3d Mon Sep 17 00:00:00 2001 From: Samuel Pei Date: Tue, 29 Oct 2024 13:41:23 -0400 Subject: [PATCH 04/29] Similar solution to burnaby web scraper. 
--- ca_qc_kirkland/people.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/ca_qc_kirkland/people.py b/ca_qc_kirkland/people.py index cadb544d..4e8ff0df 100644 --- a/ca_qc_kirkland/people.py +++ b/ca_qc_kirkland/people.py @@ -8,9 +8,16 @@ class KirklandPersonScraper(CanadianScraper): def scrape(self): - page = self.lxmlize(COUNCIL_PAGE) + def decode_email(e): + de = "" + k = int(e[:2], 16) + + for i in range(2, len(e) - 1, 2): + de += chr(int(e[i : i + 2], 16) ^ k) + + return de + page = self.lxmlize(COUNCIL_PAGE, "iso-8859-1") - # councillors = page.xpath('//div[@id="PageContent"]/table/tbody/tr/td') councillors = page.xpath('//table/tbody[not(@id)]/tr/td[@valign="top"]') assert len(councillors), "No councillors found" for councillor in councillors: From 0b49b23918cd42d584d382a88a4712547fdf4098 Mon Sep 17 00:00:00 2001 From: Samuel Pei Date: Wed, 6 Nov 2024 11:12:10 -0500 Subject: [PATCH 05/29] Updated formatting --- ca_qc_kirkland/people.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ca_qc_kirkland/people.py b/ca_qc_kirkland/people.py index 4e8ff0df..6b64b6e5 100644 --- a/ca_qc_kirkland/people.py +++ b/ca_qc_kirkland/people.py @@ -16,6 +16,7 @@ def decode_email(e): de += chr(int(e[i : i + 2], 16) ^ k) return de + page = self.lxmlize(COUNCIL_PAGE, "iso-8859-1") councillors = page.xpath('//table/tbody[not(@id)]/tr/td[@valign="top"]') @@ -43,7 +44,7 @@ def decode_email(e): email = decode_email(encrypted_email) # cloudflare encrypts the email data - email = councillor.xpath('.//div/*/*/@href | .//div/*/@href | .//@href')[0] + email = councillor.xpath(".//div/*/*/@href | .//div/*/@href | .//@href")[0] decoded_email = decode_email(email.split("#", 1)[1]) p = Person(primary_org="legislature", name=name, district=district, role=role) p.add_source(COUNCIL_PAGE) From f8d27893515934c6937c56761c3b3ddbd6be5ea9 Mon Sep 17 00:00:00 2001 From: Samuel Pei Date: Wed, 6 Nov 2024 11:13:54 -0500 Subject: [PATCH 06/29] Updated 
formatting --- ca_qc_montreal_est/people.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ca_qc_montreal_est/people.py b/ca_qc_montreal_est/people.py index 9457cb05..ca966495 100644 --- a/ca_qc_montreal_est/people.py +++ b/ca_qc_montreal_est/people.py @@ -2,10 +2,14 @@ from utils import CanadianScraper COUNCIL_PAGE = "http://ville.montreal-est.qc.ca/vie-democratique/conseil-municipal/" + + class MontrealEstPersonScraper(CanadianScraper): def scrape(self): page = self.lxmlize(COUNCIL_PAGE) - councillors = page.xpath('//div[contains (@class, "membreimg text-center membres-conseil")]//div//div[@class="col-lg-6"]') + councillors = page.xpath( + '//div[contains (@class, "membreimg text-center membres-conseil")]//div//div[@class="col-lg-6"]' + ) assert len(councillors), "No councillors found" for councillor in councillors: name = councillor.xpath('.//div[@class="bg-trans-gris"]/span/text()')[0] From 3083d469e1272fab6e6c22a46331aa472527aafe Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Thu, 7 Nov 2024 15:18:14 -0500 Subject: [PATCH 07/29] ca_qc_montreal_est: Make changes closer to original --- ca_qc_montreal_est/people.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/ca_qc_montreal_est/people.py b/ca_qc_montreal_est/people.py index ca966495..322234aa 100644 --- a/ca_qc_montreal_est/people.py +++ b/ca_qc_montreal_est/people.py @@ -1,7 +1,7 @@ from utils import CanadianPerson as Person from utils import CanadianScraper -COUNCIL_PAGE = "http://ville.montreal-est.qc.ca/vie-democratique/conseil-municipal/" +COUNCIL_PAGE = "https://ville.montreal-est.qc.ca/vie-democratique/conseil-municipal/" class MontrealEstPersonScraper(CanadianScraper): @@ -12,19 +12,16 @@ def scrape(self): ) assert len(councillors), "No councillors found" for councillor in councillors: - name = councillor.xpath('.//div[@class="bg-trans-gris"]/span/text()')[0] + name, role_district = 
councillor.xpath('.//div[@class="bg-trans-gris"]/span/text()')[0].split(" – ", 1) - if "Maire" in name or "Mairesse" in name: - name = name.split(" ", 2)[:2] - name = " ".join(name) + if "Maire" in role_district or "Mairesse" in role_district: district = "Montréal-Est" role = "Maire" else: - name, district = name.split(" ", 2)[:2], "District " + name.split(" ")[-1] - name = " ".join(name) + district = f"District {role_district[-1]}" role = "Conseiller" - p = Person(primary_org="legislature", name=name, district=district, role=role) + p = Person(primary_org="legislature", name=name.strip(), district=district, role=role) p.image = councillor.xpath(".//div[not(@id)]/img//@src")[0] p.add_contact("email", self.get_email(councillor)) p.add_source(COUNCIL_PAGE) From 6e7c15ef88b1af61a8a1b4f51f4f54c8c329725a Mon Sep 17 00:00:00 2001 From: Samuel Pei Date: Fri, 8 Nov 2024 13:08:12 -0500 Subject: [PATCH 08/29] Fixed email encryption error --- ca_qc_kirkland/people.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/ca_qc_kirkland/people.py b/ca_qc_kirkland/people.py index 6b64b6e5..f6dd40e9 100644 --- a/ca_qc_kirkland/people.py +++ b/ca_qc_kirkland/people.py @@ -40,16 +40,15 @@ def decode_email(e): .replace(".", ",") .replace(",-#-", " x") ) - encrypted_email = councillor.xpath('.//@href[contains(., "email")]')[0].split("#")[1] - email = decode_email(encrypted_email) # cloudflare encrypts the email data - email = councillor.xpath(".//div/*/*/@href | .//div/*/@href | .//@href")[0] - decoded_email = decode_email(email.split("#", 1)[1]) + encrypted_email = councillor.xpath('.//@href[contains(., "email")]')[0] + email = self._cloudflare_decode(encrypted_email) + p = Person(primary_org="legislature", name=name, district=district, role=role) p.add_source(COUNCIL_PAGE) p.add_contact("voice", phone, "legislature") - p.add_contact("email", decoded_email) + p.add_contact("email", email) image = councillor.xpath(".//img/@src") if image: p.image = image[0] From 
a4770b7b0ed8e15feedbb70d818e6c2cd9778fc9 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Mon, 11 Nov 2024 15:20:36 -0500 Subject: [PATCH 09/29] ca_qc_sainte_anne_de_bellevue: Squash #447 after simplifying changes --- ca_qc_sainte_anne_de_bellevue/people.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ca_qc_sainte_anne_de_bellevue/people.py b/ca_qc_sainte_anne_de_bellevue/people.py index 30d4662b..e2fb06cc 100644 --- a/ca_qc_sainte_anne_de_bellevue/people.py +++ b/ca_qc_sainte_anne_de_bellevue/people.py @@ -10,11 +10,12 @@ class SainteAnneDeBellevuePersonScraper(CanadianScraper): def scrape(self): page = self.lxmlize(COUNCIL_PAGE) - councillors = page.xpath('//div[@class="block text"]') + councillors = page.xpath('//p[a[contains(@href, "@")]]') assert len(councillors), "No councillors found" + for councillor in councillors: - name = councillor.xpath('.//div[@class="content-writable"]//strong/text()')[0] - district = councillor.xpath(".//h2/text()")[0] + name = councillor.text_content().split(" |", 1)[0] + district = councillor.xpath("./preceding-sibling::h2[1]/text()")[0] if "Maire" in district: district = "Sainte-Anne-de-Bellevue" @@ -26,6 +27,5 @@ def scrape(self): p = Person(primary_org="legislature", name=name, district=district, role=role) p.add_source(COUNCIL_PAGE) - p.image = councillor.xpath(".//@src")[0] p.add_contact("email", self.get_email(councillor)) yield p From 3fb90a221cf1db795325af320da755d50b875807 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Mon, 11 Nov 2024 17:18:40 -0500 Subject: [PATCH 10/29] build: Upgrade opencivicdata --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 080af981..5b06ad27 100644 --- a/requirements.txt +++ b/requirements.txt @@ -35,7 +35,7 @@ lxml==4.9.1 # via -r requirements.in olefile==0.47 # via agate-excel 
-opencivicdata==3.3.1 +opencivicdata==3.4.0 # via # -r requirements.in # pupa From fef5d3f68071338a96c0b0e772ce0234abc07fa2 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Mon, 11 Nov 2024 17:29:36 -0500 Subject: [PATCH 11/29] ca_on_wilmot: Skip executive officer --- ca_on_wilmot/people.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ca_on_wilmot/people.py b/ca_on_wilmot/people.py index d4ac6f1c..ff35f3a2 100644 --- a/ca_on_wilmot/people.py +++ b/ca_on_wilmot/people.py @@ -14,6 +14,9 @@ def scrape(self): role_name, contact_info = councillors[i], councillors[i + 1] role, name = role_name.text_content().strip().replace("\xa0", " ").split("— ") + if "Executive Officer to the Mayor and Council" in role: + continue + # "Ward 1 Councillor" if "Councillor" in role: district = role.split(" Councillor")[0] From 1e96ba1b4faeea9bf0decbcf185ac5e5bf4d7dc5 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Mon, 11 Nov 2024 17:32:35 -0500 Subject: [PATCH 12/29] ca_ns_halifax: Allow Jean St.Amand --- patch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/patch.py b/patch.py index 6d6312e8..d681e08b 100644 --- a/patch.py +++ b/patch.py @@ -121,7 +121,7 @@ r"(?:Jr|Rev|Sr|St)\.|" r"da|de|den|der|la|van|von|" r'[("](?:\p{Lu}+|\p{Lu}\p{Ll}*(?:-\p{Lu}\p{Ll}*)*)[)"]|' - r"(?:D'|d'|De|de|Des|Di|Du|L'|La|Le|Mac|Mc|O'|San|Van|Vander?|van|vanden)?\p{Lu}\p{Ll}+|" + r"(?:D'|d'|De|de|Des|Di|Du|L'|La|Le|Mac|Mc|O'|San|St\.|Van|Vander?|van|vanden)?\p{Lu}\p{Ll}+|" r"\p{Lu}\p{Ll}+Anne?|Marie\p{Lu}\p{Ll}+|" r"Ch'ng|Prud'homme|" r"D!ONNE|IsaBelle|Ya'ara" From d6249fcb263d31ff7e6fc7f2dcf7790820dedc6f Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Mon, 11 Nov 2024 17:33:57 -0500 Subject: [PATCH 13/29] ca_on_kawartha_lakes: Ignore names including content like "RESIGNED AS OF NOV. 
1, 2024" --- ca_on_kawartha_lakes/people.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ca_on_kawartha_lakes/people.py b/ca_on_kawartha_lakes/people.py index ad2d33db..1c075f87 100644 --- a/ca_on_kawartha_lakes/people.py +++ b/ca_on_kawartha_lakes/people.py @@ -23,6 +23,9 @@ def scrape(self): name = councillor.text_content().replace("Mayor", "").strip() role = "Mayor" + if "RESIGNED" in name: + continue + info_node = councillor.xpath("./following-sibling::*")[0] email = self.get_email(info_node) phone = self.get_phone(info_node) From e50e78053c492198e4a0e62e6d07078250d7e2ed Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Tue, 12 Nov 2024 16:44:33 -0500 Subject: [PATCH 14/29] ca_on_thunder_bay: Fix SSL error --- ca_on_thunder_bay/people.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/ca_on_thunder_bay/people.py b/ca_on_thunder_bay/people.py index 86d56487..93c11e1e 100644 --- a/ca_on_thunder_bay/people.py +++ b/ca_on_thunder_bay/people.py @@ -44,5 +44,11 @@ def scrape(self): yield p def lxmlize(self, url, encoding=None, *, user_agent=DEFAULT_USER_AGENT, cookies=None, xml=False): - requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS += ":HIGH:!DH:!aNULL" # site uses a weak DH key + # SSLError(SSLError(1, '[SSL: DH_KEY_TOO_SMALL] dh key too small (_ssl.c:1133)')) + # https://stackoverflow.com/a/41041028/244258 + requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS += ":HIGH:!DH:!aNULL" + try: + requests.packages.urllib3.contrib.pyopenssl.util.ssl_.DEFAULT_CIPHERS += ':HIGH:!DH:!aNULL' + except AttributeError: + pass return super().lxmlize(url, encoding, user_agent=user_agent, cookies=cookies, xml=xml) From 744835b940f5901bea8109b61ad57cb82c6e5666 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Tue, 12 Nov 2024 17:05:54 -0500 Subject: [PATCH 15/29] ca_yt: Set user-agent and cookie (DEFAULT_USER_AGENT and cookie from previous request would 
not work) --- ca_yt/people.py | 9 +++++++-- utils.py | 2 ++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/ca_yt/people.py b/ca_yt/people.py index b6dd62f4..ae18f27d 100644 --- a/ca_yt/people.py +++ b/ca_yt/people.py @@ -1,21 +1,26 @@ import contextlib +import requests from utils import CanadianPerson as Person from utils import CanadianScraper COUNCIL_PAGE = "https://yukonassembly.ca/mlas" +USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.0.1 Safari/605.1.15' +COOKIES = { + '__cf_bm': 'F6Hu6MMBLKVvWHRnv4jMKjzC6rPO.eZiP7e2wFmDDuk-1731447448-1.0.1.1-rOXfHAF4pu2oOjWi79k_ktxvpxutL0x.BKYzcxgqooaC0mZe.oRHqJe_bLzTcFHixlhjd4luXPSxO9kv08_7vw' +} class YukonPersonScraper(CanadianScraper): def scrape(self): - page = self.lxmlize(COUNCIL_PAGE) + page = self.lxmlize(COUNCIL_PAGE, cookies=COOKIES, user_agent=USER_AGENT) members = page.xpath('//*[@id="block-views-block-members-listing-block-1"]/div/div/div[2]/div') assert len(members), "No members found" for member in members: if "Vacant" not in member.xpath("./div/span")[0].text_content(): url = member.xpath("./div/span/a/@href")[0] - page = self.lxmlize(url) + page = self.lxmlize(url, cookies=COOKIES, user_agent=USER_AGENT) name = page.xpath("//html/body/div[1]/div/div/section/div[2]/article/div/h1/span/span")[ 0 ].text_content() diff --git a/utils.py b/utils.py index 3a5ded50..3f487911 100644 --- a/utils.py +++ b/utils.py @@ -199,6 +199,8 @@ def post(self, *args, **kwargs): return super().post(*args, verify=SSL_VERIFY, **kwargs) def lxmlize(self, url, encoding=None, *, user_agent=DEFAULT_USER_AGENT, cookies=None, xml=False): + # Sets User-Agent header. 
+ # https://github.com/jamesturk/scrapelib/blob/5ce0916/scrapelib/__init__.py#L505 self.user_agent = user_agent response = self.get(url, cookies=cookies) From b5c125a4577808ae78a95170cd19681c174fb8c2 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Tue, 12 Nov 2024 17:10:58 -0500 Subject: [PATCH 16/29] ca_on_thunder_bay: patching requests didn't work on Heroku --- ca_on_thunder_bay/people.py | 15 +++------------ utils.py | 8 ++++---- 2 files changed, 7 insertions(+), 16 deletions(-) diff --git a/ca_on_thunder_bay/people.py b/ca_on_thunder_bay/people.py index 93c11e1e..ee51abc5 100644 --- a/ca_on_thunder_bay/people.py +++ b/ca_on_thunder_bay/people.py @@ -9,13 +9,14 @@ class ThunderBayPersonScraper(CanadianScraper): def scrape(self): seat_number = 1 - page = self.lxmlize(COUNCIL_PAGE) + # SSLError(SSLError(1, '[SSL: DH_KEY_TOO_SMALL] dh key too small (_ssl.c:1133)')) + page = self.lxmlize(COUNCIL_PAGE, verify=False) councillors = page.xpath("//p[@class='Center']/a[@href]") assert len(councillors), "No councillors found" for councillor in councillors: url = councillor.xpath("./@href")[0] - councillor_page = self.lxmlize(url) + councillor_page = self.lxmlize(url, verify=False) info = councillor_page.xpath("//div[@class='iCreateDynaToken']")[1] role = info.xpath("./h2")[0].text_content() name = info.xpath("./h3")[0].text_content() @@ -42,13 +43,3 @@ def scrape(self): p.image = photo yield p - - def lxmlize(self, url, encoding=None, *, user_agent=DEFAULT_USER_AGENT, cookies=None, xml=False): - # SSLError(SSLError(1, '[SSL: DH_KEY_TOO_SMALL] dh key too small (_ssl.c:1133)')) - # https://stackoverflow.com/a/41041028/244258 - requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS += ":HIGH:!DH:!aNULL" - try: - requests.packages.urllib3.contrib.pyopenssl.util.ssl_.DEFAULT_CIPHERS += ':HIGH:!DH:!aNULL' - except AttributeError: - pass - return super().lxmlize(url, encoding, user_agent=user_agent, cookies=cookies, xml=xml) diff --git 
a/utils.py b/utils.py index 3f487911..59561840 100644 --- a/utils.py +++ b/utils.py @@ -193,17 +193,17 @@ def get_link(self, node, substring, *, error=True): return None def get(self, *args, **kwargs): - return super().get(*args, verify=SSL_VERIFY, **kwargs) + return super().get(*args, verify=kwargs.pop("verify", SSL_VERIFY), **kwargs) def post(self, *args, **kwargs): - return super().post(*args, verify=SSL_VERIFY, **kwargs) + return super().post(*args, verify=kwargs.pop("verify", SSL_VERIFY), **kwargs) - def lxmlize(self, url, encoding=None, *, user_agent=DEFAULT_USER_AGENT, cookies=None, xml=False): + def lxmlize(self, url, encoding=None, *, user_agent=DEFAULT_USER_AGENT, cookies=None, xml=False, verify=SSL_VERIFY): # Sets User-Agent header. # https://github.com/jamesturk/scrapelib/blob/5ce0916/scrapelib/__init__.py#L505 self.user_agent = user_agent - response = self.get(url, cookies=cookies) + response = self.get(url, cookies=cookies, verify=verify) if encoding: response.encoding = encoding From d68412c74a5c326e147594c2dd5544f7a6df05fc Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Tue, 12 Nov 2024 17:17:31 -0500 Subject: [PATCH 17/29] ca_ns_cape_breton: Escape quotation marks in name --- ca_ns_cape_breton/people.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ca_ns_cape_breton/people.py b/ca_ns_cape_breton/people.py index 9d9272a5..14b749ac 100644 --- a/ca_ns_cape_breton/people.py +++ b/ca_ns_cape_breton/people.py @@ -53,7 +53,8 @@ def decode_email(script): councillor_url = councillor.xpath(".//a/@href")[0] p.add_source(councillor_url) page = self.lxmlize(councillor_url, user_agent=CUSTOM_USER_AGENT) - image = page.xpath(f'//img[contains(@title, "{name}")]/@src') + escaped_name = name.replace('"', "&quot;") + image = page.xpath(f'//img[contains(@title, "{escaped_name}")]/@src') if image: p.image = image[0] yield p From b48c0251c72a9d4a726df9b7e7b6f3a9e0e09842 Mon Sep 17 00:00:00 2001 From: James
McKinney <26463+jpmckinney@users.noreply.github.com> Date: Tue, 12 Nov 2024 17:22:32 -0500 Subject: [PATCH 18/29] ca_yt: Add comment about Cloudflare bot products --- ca_yt/people.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/ca_yt/people.py b/ca_yt/people.py index ae18f27d..293c8690 100644 --- a/ca_yt/people.py +++ b/ca_yt/people.py @@ -5,22 +5,20 @@ from utils import CanadianScraper COUNCIL_PAGE = "https://yukonassembly.ca/mlas" -USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.0.1 Safari/605.1.15' -COOKIES = { - '__cf_bm': 'F6Hu6MMBLKVvWHRnv4jMKjzC6rPO.eZiP7e2wFmDDuk-1731447448-1.0.1.1-rOXfHAF4pu2oOjWi79k_ktxvpxutL0x.BKYzcxgqooaC0mZe.oRHqJe_bLzTcFHixlhjd4luXPSxO9kv08_7vw' -} +# This website uses Cloudflare bot products (setting a __cf_bm cookie), which is hard to circumvent. +# https://developers.cloudflare.com/fundamentals/reference/policies-compliances/cloudflare-cookies/ class YukonPersonScraper(CanadianScraper): def scrape(self): - page = self.lxmlize(COUNCIL_PAGE, cookies=COOKIES, user_agent=USER_AGENT) + page = self.lxmlize(COUNCIL_PAGE) members = page.xpath('//*[@id="block-views-block-members-listing-block-1"]/div/div/div[2]/div') assert len(members), "No members found" for member in members: if "Vacant" not in member.xpath("./div/span")[0].text_content(): url = member.xpath("./div/span/a/@href")[0] - page = self.lxmlize(url, cookies=COOKIES, user_agent=USER_AGENT) + page = self.lxmlize(url) name = page.xpath("//html/body/div[1]/div/div/section/div[2]/article/div/h1/span/span")[ 0 ].text_content() From 6ceb72bb318e91703a63517940050b4e9e0d38e3 Mon Sep 17 00:00:00 2001 From: bzhangjma Date: Mon, 18 Nov 2024 14:47:06 -0500 Subject: [PATCH 19/29] Update people.py --- ca_bc/people.py | 78 ++++++++++--------------------------------------- 1 file changed, 15 insertions(+), 63 deletions(-) diff --git a/ca_bc/people.py b/ca_bc/people.py index 356e4379..e8f00f4a 
100644 --- a/ca_bc/people.py +++ b/ca_bc/people.py @@ -1,76 +1,28 @@ -import re +import json +import requests from utils import CanadianPerson as Person from utils import CanadianScraper -COUNCIL_PAGE = "https://www.leg.bc.ca/_api/search/query?querytext='(contentclass:sts_listitem%20OR%20IsDocument:True)%20SPSiteUrl:/content%20ListId:8ecafcaa-2bf9-4434-a60c-3663a9afd175%20MLAActiveOWSBOOL:1%20-LastNameOWSTEXT:Vacant'&selectproperties='Picture1OWSIMGE,Title,Path'&&sortlist='LastNameSort:ascending'&rowlimit=100&QueryTemplatePropertiesUrl='spfile://webroot/queryparametertemplate.xml'" +COUNCIL_PAGE = "https://www.leg.bc.ca/members" +query = """ +query GetMLAsByConstituency($parliamentId: Int!) {\n allMemberParliaments(condition: {parliamentId: $parliamentId, active: true}) {\n nodes {\n image: imageBySmallImageId {\n path\n description\n __typename\n }\n constituency: constituencyByConstituencyId {\n name\n __typename\n }\n member: memberByMemberId {\n firstName\n lastName\n __typename\n }\n isCounsel\n isDoctor\n isHonourable\n party: partyByPartyId {\n name\n abbreviation\n __typename\n }\n nodeId\n __typename\n }\n __typename\n }\n} +""" +variables = {"parliamentId": 43} class BritishColumbiaPersonScraper(CanadianScraper): def scrape(self): - parties = { - "BC NDP": "New Democratic Party of British Columbia", - "BC Liberal Party": "British Columbia Liberal Party", - } - - page = self.lxmlize(COUNCIL_PAGE, xml=True) - - nsmap = {"d": "http://schemas.microsoft.com/ado/2007/08/dataservices"} - members = page.xpath("//d:Cells", namespaces=nsmap) - assert len(members), "No members found" + response = requests.post(url="https://lims.leg.bc.ca/graphql", json={"query": query, "variables": variables}) + data = json.loads(response.content.decode("utf-8")) + members = data["data"]["allMemberParliaments"]["nodes"] for member in members: - url = member.xpath('./d:element/d:Key[text()="Path"]/following-sibling::d:Value/text()', namespaces=nsmap)[ - 0 - ] - if "vacant" in 
url.lower(): - continue - page = self.lxmlize(url) + image = "https://lims.leg.bc.ca/public" + member["image"]["path"] + district = member["constituency"]["name"] + name = member["member"]["firstName"] + " " + member["member"]["lastName"] + party = member["party"]["name"] - name = ( - page.xpath('//div[contains(@class, "BCLASS-pagetitle")]//h3/text()')[0] - .replace("Wm.", "") - .replace(", Q.C.", "") - .replace(", K.C.", "") - .strip() - ) - district, party = cleanup_list(page.xpath('//div[@id="MinisterTitle"]/following-sibling::text()')) - party = parties.get(party, party) - p = Person(primary_org="legislature", name=name, district=district, role="MLA", party=party) + p = Person(primary_org="legislature", name=name, district=district, role="MLA", party=party, image=image) p.add_source(COUNCIL_PAGE) - p.add_source(url) - - p.image = page.xpath('//img[contains(@src, "Members")]/@src')[0] - - email = page.xpath('//div[@class="convertToEmail"]//text()')[0].strip() - if "#" in email: - email = email.split("#")[0] - if email: - p.add_contact("email", email) - - office = ", ".join(cleanup_list(page.xpath('//h4[contains(text(), "Office:")]/ancestor::div/text()'))) - office = re.sub(r"\s{2,}", " ", office) - p.add_contact("address", office, "legislature") - - constituency = ", ".join( - cleanup_list(page.xpath('//h4[contains(text(), "Constituency:")]/ancestor::div[1]//text()')) - ) - constituency = re.sub(r"\s{2,}", " ", constituency).split(", Phone")[0] - p.add_contact("address", constituency, "constituency") - - phones = cleanup_list(page.xpath('//span[contains(text(), "Phone:")]/following-sibling::text()')) - - office_phone = phones[0] - p.add_contact("voice", office_phone, "legislature") - if len(phones) > 1: - constituency_phone = phones[1] - p.add_contact("voice", constituency_phone, "constituency") - - roles = page.xpath('//div[@id="MinisterTitle"]/text()')[0].strip() - if roles: - p.extras["roles"] = [roles] yield p - - -def cleanup_list(dirty_list): - return 
list(filter(None, (x.strip() for x in dirty_list))) From adff484f47c5963023a44efacfeadad738046dfb Mon Sep 17 00:00:00 2001 From: bzhangjma Date: Mon, 18 Nov 2024 14:48:24 -0500 Subject: [PATCH 20/29] Update people.py --- ca_bc/people.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ca_bc/people.py b/ca_bc/people.py index e8f00f4a..dfa1bef4 100644 --- a/ca_bc/people.py +++ b/ca_bc/people.py @@ -16,6 +16,7 @@ def scrape(self): response = requests.post(url="https://lims.leg.bc.ca/graphql", json={"query": query, "variables": variables}) data = json.loads(response.content.decode("utf-8")) members = data["data"]["allMemberParliaments"]["nodes"] + assert len(members), "No members found" for member in members: image = "https://lims.leg.bc.ca/public" + member["image"]["path"] district = member["constituency"]["name"] From 4c6e6d493043f43599c5c434080caef058309596 Mon Sep 17 00:00:00 2001 From: bzhangjma Date: Mon, 18 Nov 2024 15:09:47 -0500 Subject: [PATCH 21/29] Update people.py --- ca_bc/people.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ca_bc/people.py b/ca_bc/people.py index dfa1bef4..2eed34b5 100644 --- a/ca_bc/people.py +++ b/ca_bc/people.py @@ -5,15 +5,15 @@ from utils import CanadianScraper COUNCIL_PAGE = "https://www.leg.bc.ca/members" -query = """ -query GetMLAsByConstituency($parliamentId: Int!) {\n allMemberParliaments(condition: {parliamentId: $parliamentId, active: true}) {\n nodes {\n image: imageBySmallImageId {\n path\n description\n __typename\n }\n constituency: constituencyByConstituencyId {\n name\n __typename\n }\n member: memberByMemberId {\n firstName\n lastName\n __typename\n }\n isCounsel\n isDoctor\n isHonourable\n party: partyByPartyId {\n name\n abbreviation\n __typename\n }\n nodeId\n __typename\n }\n __typename\n }\n} -""" -variables = {"parliamentId": 43} +JSON = { + "query": "query GetMLAsByConstituency($parliamentId: Int!) 
{\n allMemberParliaments(condition: {parliamentId: $parliamentId, active: true}) {\n nodes {\n image: imageBySmallImageId {\n path\n description\n __typename\n }\n constituency: constituencyByConstituencyId {\n name\n __typename\n }\n member: memberByMemberId {\n firstName\n lastName\n __typename\n }\n isCounsel\n isDoctor\n isHonourable\n party: partyByPartyId {\n name\n abbreviation\n __typename\n }\n nodeId\n __typename\n }\n __typename\n }\n}", + "variables": {"parliamentId": 43}, +} class BritishColumbiaPersonScraper(CanadianScraper): def scrape(self): - response = requests.post(url="https://lims.leg.bc.ca/graphql", json={"query": query, "variables": variables}) + response = requests.post(url="https://lims.leg.bc.ca/graphql", json=JSON) data = json.loads(response.content.decode("utf-8")) members = data["data"]["allMemberParliaments"]["nodes"] assert len(members), "No members found" From 4f8f4a79b531bd7cee399a5ca9dfb8498cd8cfb5 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Mon, 18 Nov 2024 15:41:25 -0500 Subject: [PATCH 22/29] chore: Remove unused imports --- ca_on_thunder_bay/people.py | 4 +--- ca_yt/people.py | 1 - 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/ca_on_thunder_bay/people.py b/ca_on_thunder_bay/people.py index ee51abc5..e39dee76 100644 --- a/ca_on_thunder_bay/people.py +++ b/ca_on_thunder_bay/people.py @@ -1,6 +1,4 @@ -import requests - -from utils import DEFAULT_USER_AGENT, CanadianScraper +from utils import CanadianScraper from utils import CanadianPerson as Person COUNCIL_PAGE = "https://www.thunderbay.ca/en/city-hall/mayor-and-council-profiles.aspx" diff --git a/ca_yt/people.py b/ca_yt/people.py index 293c8690..4808f220 100644 --- a/ca_yt/people.py +++ b/ca_yt/people.py @@ -1,5 +1,4 @@ import contextlib -import requests from utils import CanadianPerson as Person from utils import CanadianScraper From 6de91eeaf3d6620590310fc2d95f0a1f587e47b6 Mon Sep 17 00:00:00 2001 From: James 
McKinney <26463+jpmckinney@users.noreply.github.com> Date: Mon, 18 Nov 2024 15:56:58 -0500 Subject: [PATCH 23/29] ca_bc: Use multiline string for readability --- ca_bc/people.py | 44 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 6 deletions(-) diff --git a/ca_bc/people.py b/ca_bc/people.py index 2eed34b5..68a7984c 100644 --- a/ca_bc/people.py +++ b/ca_bc/people.py @@ -1,21 +1,53 @@ import json -import requests +from textwrap import dedent from utils import CanadianPerson as Person from utils import CanadianScraper COUNCIL_PAGE = "https://www.leg.bc.ca/members" -JSON = { - "query": "query GetMLAsByConstituency($parliamentId: Int!) {\n allMemberParliaments(condition: {parliamentId: $parliamentId, active: true}) {\n nodes {\n image: imageBySmallImageId {\n path\n description\n __typename\n }\n constituency: constituencyByConstituencyId {\n name\n __typename\n }\n member: memberByMemberId {\n firstName\n lastName\n __typename\n }\n isCounsel\n isDoctor\n isHonourable\n party: partyByPartyId {\n name\n abbreviation\n __typename\n }\n nodeId\n __typename\n }\n __typename\n }\n}", - "variables": {"parliamentId": 43}, -} class BritishColumbiaPersonScraper(CanadianScraper): def scrape(self): - response = requests.post(url="https://lims.leg.bc.ca/graphql", json=JSON) + response = self.post(url="https://lims.leg.bc.ca/graphql", json={ + "query": dedent("""\ + query GetMLAsByConstituency($parliamentId: Int!) 
{ + allMemberParliaments(condition: {parliamentId: $parliamentId, active: true}) { + nodes { + image: imageBySmallImageId { + path + description + __typename + } + constituency: constituencyByConstituencyId { + name + __typename + } + member: memberByMemberId { + firstName + lastName + __typename + } + isCounsel + isDoctor + isHonourable + party: partyByPartyId { + name + abbreviation + __typename + } + nodeId + __typename + } + __typename + } + }""" + ), + "variables": {"parliamentId": 43}, + }) data = json.loads(response.content.decode("utf-8")) members = data["data"]["allMemberParliaments"]["nodes"] + assert len(members), "No members found" for member in members: image = "https://lims.leg.bc.ca/public" + member["image"]["path"] From 05054d96cb9a91c2313ee8546b508028ee2433f5 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Mon, 18 Nov 2024 15:57:53 -0500 Subject: [PATCH 24/29] ca_bc: Fix validation to allow "Hon Chan" and "A'aliya --- patch.py | 4 ++-- utils.py | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/patch.py b/patch.py index d681e08b..4540cfc8 100644 --- a/patch.py +++ b/patch.py @@ -123,7 +123,7 @@ r'[("](?:\p{Lu}+|\p{Lu}\p{Ll}*(?:-\p{Lu}\p{Ll}*)*)[)"]|' r"(?:D'|d'|De|de|Des|Di|Du|L'|La|Le|Mac|Mc|O'|San|St\.|Van|Vander?|van|vanden)?\p{Lu}\p{Ll}+|" r"\p{Lu}\p{Ll}+Anne?|Marie\p{Lu}\p{Ll}+|" - r"Ch'ng|Prud'homme|" + r"A'aliya|Ch'ng|Prud'homme|" r"D!ONNE|IsaBelle|Ya'ara" r")" ) @@ -131,7 +131,7 @@ # Name components can be joined by apostrophes, hyphens or spaces. 
person_schema["properties"]["name"]["pattern"] = re.compile( r"\A" - r"(?!(?:Chair|Commissioner|Conseiller|Councillor|Deputy|Dr|Hon|M|Maire|Mayor|Miss|Mme|Mr|Mrs|Ms|Regional|Warden)\b)" + r"(?!(?:Chair|Commissioner|Conseiller|Councillor|Deputy|Dr|M|Maire|Mayor|Miss|Mme|Mr|Mrs|Ms|Regional|Warden)\b)" r"(?:" + name_fragment + r"(?:'|-| - | )" r")+" + name_fragment + r"\Z" ) diff --git a/utils.py b/utils.py index 59561840..52ef6e34 100644 --- a/utils.py +++ b/utils.py @@ -739,9 +739,10 @@ def clean_string(s): def clean_name(s): - return honorific_suffix_re.sub( - "", honorific_prefix_re.sub("", whitespace_re.sub(" ", str(s).translate(table)).strip()) - ) + name = honorific_suffix_re.sub("", whitespace_re.sub(" ", str(s).translate(table)).strip()) + if name.count(" ") > 1: + return honorific_prefix_re.sub("", name) # Avoid truncating names like "Hon Chan" + return name def clean_type_id(type_id): From 8f5acfb4270d5055ead5de4ff9b8f93d1524e6ad Mon Sep 17 00:00:00 2001 From: Samuel Pei Date: Wed, 20 Nov 2024 17:34:50 -0500 Subject: [PATCH 25/29] Removed unused function --- ca_qc_kirkland/people.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/ca_qc_kirkland/people.py b/ca_qc_kirkland/people.py index f6dd40e9..3ff3df41 100644 --- a/ca_qc_kirkland/people.py +++ b/ca_qc_kirkland/people.py @@ -8,15 +8,6 @@ class KirklandPersonScraper(CanadianScraper): def scrape(self): - def decode_email(e): - de = "" - k = int(e[:2], 16) - - for i in range(2, len(e) - 1, 2): - de += chr(int(e[i : i + 2], 16) ^ k) - - return de - page = self.lxmlize(COUNCIL_PAGE, "iso-8859-1") councillors = page.xpath('//table/tbody[not(@id)]/tr/td[@valign="top"]') From 83a5a0416c5217cd1b1f358a1bf374a7856d6d1d Mon Sep 17 00:00:00 2001 From: Rafe Murray Date: Thu, 30 May 2024 15:46:16 -0400 Subject: [PATCH 26/29] Fix Kirkland scraper --- ca_qc_kirkland/people.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ca_qc_kirkland/people.py b/ca_qc_kirkland/people.py index 
3f0bab4b..37da350f 100644 --- a/ca_qc_kirkland/people.py +++ b/ca_qc_kirkland/people.py @@ -30,7 +30,8 @@ def scrape(self): .replace(".", ",") # correcting a typo .replace(",-#-", " x") ) - email = self.get_email(councillor) + encrypted_email = councillor.xpath('.//@href[contains(., "email")]')[0].split("#")[1] + email = decode_email(encrypted_email) p = Person(primary_org="legislature", name=name, district=district, role=role) p.add_source(COUNCIL_PAGE) From 01536a3e17b44f9c97631888ed49f0f21902f8df Mon Sep 17 00:00:00 2001 From: Samuel Pei Date: Tue, 29 Oct 2024 13:41:23 -0400 Subject: [PATCH 27/29] Similar solution to burnaby web scraper. --- ca_qc_kirkland/people.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ca_qc_kirkland/people.py b/ca_qc_kirkland/people.py index 37da350f..9ef3d32e 100644 --- a/ca_qc_kirkland/people.py +++ b/ca_qc_kirkland/people.py @@ -10,7 +10,8 @@ class KirklandPersonScraper(CanadianScraper): def scrape(self): page = self.lxmlize(COUNCIL_PAGE) - councillors = page.xpath('//div[@class="container_content"]//tbody/tr') + # councillors = page.xpath('//div[@id="PageContent"]/table/tbody/tr/td') + councillors = page.xpath('//table/tbody[not(@id)]/tr/td[@valign="top"]') assert len(councillors), "No councillors found" for councillor in councillors: if councillor == councillors[0]: From 42571bda117cd6345a0620f56df4c2fc3f8e6dfe Mon Sep 17 00:00:00 2001 From: Samuel Pei Date: Fri, 8 Nov 2024 13:08:12 -0500 Subject: [PATCH 28/29] Fixed email encryption error --- ca_qc_kirkland/people.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ca_qc_kirkland/people.py b/ca_qc_kirkland/people.py index 9ef3d32e..431211dd 100644 --- a/ca_qc_kirkland/people.py +++ b/ca_qc_kirkland/people.py @@ -31,8 +31,10 @@ def scrape(self): .replace(".", ",") # correcting a typo .replace(",-#-", " x") ) - encrypted_email = councillor.xpath('.//@href[contains(., "email")]')[0].split("#")[1] - email = decode_email(encrypted_email) + 
+ # cloudflare encrypts the email data + encrypted_email = councillor.xpath('.//@href[contains(., "email")]')[0] + email = self._cloudflare_decode(encrypted_email) p = Person(primary_org="legislature", name=name, district=district, role=role) p.add_source(COUNCIL_PAGE) From d3894c4a67b2bab1acc2719e44ccde4e1d2ffbe9 Mon Sep 17 00:00:00 2001 From: Samuel Pei Date: Wed, 20 Nov 2024 17:34:50 -0500 Subject: [PATCH 29/29] Removed unused function --- ca_qc_kirkland/people.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/ca_qc_kirkland/people.py b/ca_qc_kirkland/people.py index 431211dd..477eab9d 100644 --- a/ca_qc_kirkland/people.py +++ b/ca_qc_kirkland/people.py @@ -8,9 +8,7 @@ class KirklandPersonScraper(CanadianScraper): def scrape(self): - page = self.lxmlize(COUNCIL_PAGE) - - # councillors = page.xpath('//div[@id="PageContent"]/table/tbody/tr/td') + page = self.lxmlize(COUNCIL_PAGE, "iso-8859-1") councillors = page.xpath('//table/tbody[not(@id)]/tr/td[@valign="top"]') assert len(councillors), "No councillors found" for councillor in councillors: