Skip to content

Commit

Permalink
Modify scraper as per James to be less breaky in future
Browse files Browse the repository at this point in the history
  • Loading branch information
seamuslee001 committed Nov 4, 2024
1 parent 5cd9c90 commit 0922876
Showing 1 changed file with 9 additions and 13 deletions.
22 changes: 9 additions & 13 deletions ca_on_mississauga/people.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
from utils import CanadianScraper

COUNCIL_PAGE = "http://www.mississauga.ca/portal/cityhall/mayorandcouncil"
MAYOR_PAGE = "http://www.mississauga.ca/portal/cityhall/mayorsoffice"
CONTACT_PAGE = "http://www.mississauga.ca/portal/helpfeedback/contactus"


Expand All @@ -24,17 +23,14 @@ def councillor_data(self, url):
page = self.lxmlize(url)

name_district = page.xpath('//*[@id="com-main"]/div/div/div/h1/text()')[0]
hyphen = name_district.find("Councillor")
if hyphen == -1:
hyphen = 9
district = name_district[: hyphen - 3]
name = name_district[hyphen:]
mayor = name.find("Deputy")
if mayor != -1:
name = name[27:]
bracket = name.find("(")
if bracket != -1:
name = name[:bracket]
name_district_parts = name_district.split()
district = f"{name_district_parts[0]} {name_district_parts[1]}"
# Remove the first three elements of name_district_parts (the two district tokens and the '-' separator) so only the name remains
name_district_parts.pop(0)
name_district_parts.pop(0)
name_district_parts.pop(0)
name = " ".join(name_district_parts)
name = name.replace("Councillor and Deputy Mayor", "").strip()
email = self.get_email(page, '//section[contains(@class, "module-content")]')
photo = page.xpath(
'//section[contains(@class, "module-content")]/p[1]/img/@src|//section[contains(@class, "module-content")]/p[1]/b/img/@src|//section[contains(@class, "module-content")]/p[1]/strong/img/@src'
Expand All @@ -52,7 +48,7 @@ def mayor_data(self, url):
page = self.lxmlize(url)

name = page.xpath('//*[@id="com-main"]/div/div/div/h1/text()')[0]
name = name[8:]
name = name.replace("Mayor – ", "").strip()
photo = page.xpath('//*[@id="65a01af8598b7"]/p[1]/img/@src')[0]

p = Person(primary_org="legislature", name=name, district="Mississauga", role="Mayor")
Expand Down

0 comments on commit 0922876

Please sign in to comment.