From 0922876ebba07006f92ce44e5adfd41675964bff Mon Sep 17 00:00:00 2001 From: Seamus Lee Date: Mon, 4 Nov 2024 17:52:40 +1100 Subject: [PATCH] Modify scraper as per James to be less breaky in future --- ca_on_mississauga/people.py | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/ca_on_mississauga/people.py b/ca_on_mississauga/people.py index b903579b..42e2238f 100644 --- a/ca_on_mississauga/people.py +++ b/ca_on_mississauga/people.py @@ -2,7 +2,6 @@ from utils import CanadianScraper COUNCIL_PAGE = "http://www.mississauga.ca/portal/cityhall/mayorandcouncil" -MAYOR_PAGE = "http://www.mississauga.ca/portal/cityhall/mayorsoffice" CONTACT_PAGE = "http://www.mississauga.ca/portal/helpfeedback/contactus" @@ -24,17 +23,14 @@ def councillor_data(self, url): page = self.lxmlize(url) name_district = page.xpath('//*[@id="com-main"]/div/div/div/h1/text()')[0] - hyphen = name_district.find("Councillor") - if hyphen == -1: - hyphen = 9 - district = name_district[: hyphen - 3] - name = name_district[hyphen:] - mayor = name.find("Deputy") - if mayor != -1: - name = name[27:] - bracket = name.find("(") - if bracket != -1: - name = name[:bracket] + name_district_parts = name_district.split() + district = f"{name_district_parts[0]} {name_district_parts[1]}" + # Remove the first 3 elements of the name_district_parts which should include the district + '-' + name_district_parts.pop(0) + name_district_parts.pop(0) + name_district_parts.pop(0) + name = " ".join(name_district_parts) + name = name.replace("Councillor and Deputy Mayor", "").strip() email = self.get_email(page, '//section[contains(@class, "module-content")]') photo = page.xpath( '//section[contains(@class, "module-content")]/p[1]/img/@src|//section[contains(@class, "module-content")]/p[1]/b/img/@src|//section[contains(@class, "module-content")]/p[1]/strong/img/@src' @@ -52,7 +48,7 @@ def mayor_data(self, url): page = self.lxmlize(url) name = page.xpath('//*[@id="com-main"]/div/div/div/h1/text()')[0] - name = name[8:] + name = name.replace("Mayor – ", "").strip() photo = page.xpath('//*[@id="65a01af8598b7"]/p[1]/img/@src')[0] p = Person(primary_org="legislature", name=name, district="Mississauga", role="Mayor")