From 5a67f286920b4ad6cf2592067558f6abe5fdb87e Mon Sep 17 00:00:00 2001 From: Rafe Murray Date: Thu, 6 Jun 2024 09:36:21 -0400 Subject: [PATCH 1/4] Fix getting emails protected by cloudflare --- ca_qc_sainte_anne_de_bellevue/people.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/ca_qc_sainte_anne_de_bellevue/people.py b/ca_qc_sainte_anne_de_bellevue/people.py index 48cbbdab..e4a338e0 100644 --- a/ca_qc_sainte_anne_de_bellevue/people.py +++ b/ca_qc_sainte_anne_de_bellevue/people.py @@ -8,11 +8,20 @@ class SainteAnneDeBellevuePersonScraper(CanadianScraper): def scrape(self): + def decode_email(e): + de = "" + k = int(e[:2], 16) + + for i in range(2, len(e) - 1, 2): + de += chr(int(e[i : i + 2], 16) ^ k) + + return de + page = self.lxmlize(COUNCIL_PAGE) councillors = page.xpath('//div[@class="block text"]') assert len(councillors), "No councillors found" - for i, councillor in enumerate(councillors): + for councillor in councillors: name = councillor.xpath('.//div[@class="content-writable"]//strong/text()')[0] district = councillor.xpath(".//h2/text()")[0] @@ -23,9 +32,11 @@ def scrape(self): district = "District {}".format(re.search(r"\d+", district)[0]) role = "Conseiller" + encoded_email = councillor.xpath('.//@href[contains(., "email-protection")]')[0].split("#")[1] + p = Person(primary_org="legislature", name=name, district=district, role=role) p.add_source(COUNCIL_PAGE) p.image = councillor.xpath(".//@src")[0] - p.add_contact("email", self.get_email(councillor)) + p.add_contact("email", decode_email(encoded_email)) yield p From e2d1b3931bc47a580f2ed4850f3eba57a3a6801f Mon Sep 17 00:00:00 2001 From: Samuel Pei Date: Tue, 29 Oct 2024 14:36:27 -0400 Subject: [PATCH 2/4] All the councillors are in one div so I took a different approach to extracting the data. --- ca_qc_sainte_anne_de_bellevue/people.py | 55 ++++++++++++++++++------- 1 file changed, 39 insertions(+), 16 deletions(-) diff --git a/ca_qc_sainte_anne_de_bellevue/people.py b/ca_qc_sainte_anne_de_bellevue/people.py index e4a338e0..e7b660b1 100644 --- a/ca_qc_sainte_anne_de_bellevue/people.py +++ b/ca_qc_sainte_anne_de_bellevue/people.py @@ -5,7 +5,6 @@ COUNCIL_PAGE = "https://www.ville.sainte-anne-de-bellevue.qc.ca/fr/199/elus-municipaux" - class SainteAnneDeBellevuePersonScraper(CanadianScraper): def scrape(self): def decode_email(e): @@ -19,24 +18,48 @@ def decode_email(e): page = self.lxmlize(COUNCIL_PAGE) - councillors = page.xpath('//div[@class="block text"]') + councillors = page.xpath('//div[@class="col-md-12"]')[0] assert len(councillors), "No councillors found" - for councillor in councillors: - name = councillor.xpath('.//div[@class="content-writable"]//strong/text()')[0] - district = councillor.xpath(".//h2/text()")[0] + + roles_and_districts = councillors.xpath('.//h2/text()') + roles = [] + districts = [] + names = [] + emails = [] - if "Maire" in district: - district = "Sainte-Anne-de-Bellevue" - role = "Maire" - else: - district = "District {}".format(re.search(r"\d+", district)[0]) - role = "Conseiller" + # Fill in roles and districts via h2 tags + for role in roles_and_districts: + role_and_district = role.split() - encoded_email = councillor.xpath('.//@href[contains(., "email-protection")]')[0].split("#")[1] + roles.append(role_and_district[0]) + + if len(role_and_district) == 1: + districts.append("Sainte-Anne-de-Bellevue") + else: + districts.append("District " + role_and_district[2]) + + # Fill in contact info via p tags. + contact_info = councillors.xpath('.//p[a[contains(@href, "@")]]') + for contact in contact_info: + contact = contact.text_content().split() + print(contact) + input() + name = " ".join(contact[:2]) + names.append(name) + + email = contact[3] + email = email.replace("Président", "") + emails.append(email) + + print(roles) + print(districts) + print(names) + print(emails) + input() - p = Person(primary_org="legislature", name=name, district=district, role=role) + assert len(roles) == len(districts) == len(names) == len(emails), "Lists are not of equal length" + for i in range(len(roles)): + p = Person(primary_org="legislature", name=names[i], district=districts[i], role=roles[i]) p.add_source(COUNCIL_PAGE) - - p.image = councillor.xpath(".//@src")[0] - p.add_contact("email", decode_email(encoded_email)) + p.add_contact("email", emails[i]) yield p From 8b408b9ec97c69c9958930eb4d7418cf093af52e Mon Sep 17 00:00:00 2001 From: Samuel Pei Date: Wed, 6 Nov 2024 11:14:46 -0500 Subject: [PATCH 3/4] Updated formatting --- ca_qc_sainte_anne_de_bellevue/people.py | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/ca_qc_sainte_anne_de_bellevue/people.py b/ca_qc_sainte_anne_de_bellevue/people.py index e7b660b1..5b6ba033 100644 --- a/ca_qc_sainte_anne_de_bellevue/people.py +++ b/ca_qc_sainte_anne_de_bellevue/people.py @@ -1,10 +1,9 @@ -import re - from utils import CanadianPerson as Person from utils import CanadianScraper COUNCIL_PAGE = "https://www.ville.sainte-anne-de-bellevue.qc.ca/fr/199/elus-municipaux" + class SainteAnneDeBellevuePersonScraper(CanadianScraper): def scrape(self): def decode_email(e): @@ -20,8 +19,8 @@ def decode_email(e): councillors = page.xpath('//div[@class="col-md-12"]')[0] assert len(councillors), "No councillors found" - - roles_and_districts = councillors.xpath('.//h2/text()') + + roles_and_districts = councillors.xpath(".//h2/text()") roles = [] districts = [] names = [] @@ -32,30 +31,22 @@ def decode_email(e): role_and_district = role.split() roles.append(role_and_district[0]) - + if len(role_and_district) == 1: districts.append("Sainte-Anne-de-Bellevue") else: districts.append("District " + role_and_district[2]) - + # Fill in contact info via p tags. contact_info = councillors.xpath('.//p[a[contains(@href, "@")]]') for contact in contact_info: contact = contact.text_content().split() - print(contact) - input() name = " ".join(contact[:2]) names.append(name) - + email = contact[3] email = email.replace("Président", "") emails.append(email) - - print(roles) - print(districts) - print(names) - print(emails) - input() assert len(roles) == len(districts) == len(names) == len(emails), "Lists are not of equal length" for i in range(len(roles)): From 6ba3826af1d9f2607db5400f85387c536168464b Mon Sep 17 00:00:00 2001 From: Samuel Pei Date: Fri, 8 Nov 2024 14:26:18 -0500 Subject: [PATCH 4/4] Re-implemented scraper using preceding-sibling --- ca_qc_sainte_anne_de_bellevue/people.py | 49 +++++++------------------ 1 file changed, 13 insertions(+), 36 deletions(-) diff --git a/ca_qc_sainte_anne_de_bellevue/people.py b/ca_qc_sainte_anne_de_bellevue/people.py index 5b6ba033..376a0baf 100644 --- a/ca_qc_sainte_anne_de_bellevue/people.py +++ b/ca_qc_sainte_anne_de_bellevue/people.py @@ -6,51 +6,28 @@ class SainteAnneDeBellevuePersonScraper(CanadianScraper): def scrape(self): - def decode_email(e): - de = "" - k = int(e[:2], 16) - - for i in range(2, len(e) - 1, 2): - de += chr(int(e[i : i + 2], 16) ^ k) - - return de - page = self.lxmlize(COUNCIL_PAGE) - councillors = page.xpath('//div[@class="col-md-12"]')[0] + councillors = page.xpath('//p[a[contains(@href, "@")]]') assert len(councillors), "No councillors found" - roles_and_districts = councillors.xpath(".//h2/text()") - roles = [] - districts = [] - names = [] - emails = [] + for councillor in councillors: + role = councillor.xpath("./preceding-sibling::h2[1]/text()")[0] - # Fill in roles and districts via h2 tags - for role in roles_and_districts: - role_and_district = role.split() - - roles.append(role_and_district[0]) - - if len(role_and_district) == 1: - districts.append("Sainte-Anne-de-Bellevue") + if role == "Maire": + district = "Sainte-Anne-de-Bellevue" else: - districts.append("District " + role_and_district[2]) + district = "District " + role.split()[2] + role = "Conseiller" - # Fill in contact info via p tags. - contact_info = councillors.xpath('.//p[a[contains(@href, "@")]]') - for contact in contact_info: - contact = contact.text_content().split() - name = " ".join(contact[:2]) - names.append(name) + councillor = councillor.text_content().split() - email = contact[3] + name = " ".join(councillor[:2]) + email = councillor[3] email = email.replace("Président", "") - emails.append(email) - assert len(roles) == len(districts) == len(names) == len(emails), "Lists are not of equal length" - for i in range(len(roles)): - p = Person(primary_org="legislature", name=names[i], district=districts[i], role=roles[i]) + p = Person(primary_org="legislature", name=name, district=district, role=role) p.add_source(COUNCIL_PAGE) - p.add_contact("email", emails[i]) + p.add_contact("email", email) + yield p