Skip to content

Commit

Permalink
Merge branch 'opencivicdata:master' into montreal_est_scraper
Browse files Browse the repository at this point in the history
  • Loading branch information
iepmas authored Nov 6, 2024
2 parents f8d2789 + 26b8e72 commit 29c7f83
Show file tree
Hide file tree
Showing 8 changed files with 73 additions and 63 deletions.
1 change: 1 addition & 0 deletions .python-version
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
3.9
4 changes: 3 additions & 1 deletion ca_bc_victoria/people.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,9 @@ def scrape(self):
'//ul[@class="menu menu--level-0"]//a[contains(., "Mayor") and not(contains(., "Council"))]/@href'
)[0]
page = self.lxmlize(mayor_url)
role, name = page.xpath("//h1/span")[0].text_content().split(" ", 1)
role, name = page.xpath(
'//ul[@class="menu menu--level-0"]//a[contains(., "Mayor") and not(contains(., "Council"))]/text()'
)[0].split(" ", 1)
photo = councillor.xpath('//div[@class="field__item"]/img/@src')[0]
email = self.get_email(page)
phone = self.get_phone(page)
Expand Down
4 changes: 2 additions & 2 deletions ca_on_guelph/people.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@


class GuelphPersonScraper(CSVScraper):
    """CSV-driven scraper for Guelph's city council.

    The class defines no methods of its own; it only sets the class
    attributes below and inherits everything else from ``CSVScraper``.
    """

    # Dataset landing page:
    # https://explore.guelph.ca/documents/5ec8d85028c94e83be12a9f01d14eb7f/about
    csv_url = "https://gismaps.guelph.ca/OpenData/guelph-city-council.csv"
    # NOTE(review): presumably tells CSVScraper that a single area can hold
    # several posts -- confirm against the CSVScraper implementation.
    many_posts_per_area = True
55 changes: 37 additions & 18 deletions ca_on_markham/people.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,8 @@
import re

from utils import CanadianPerson as Person
from utils import CanadianScraper

# Pages on markham.ca that the scraper reads.
# MAYOR_PAGE is the URL handed to scrape_mayor().
COUNCIL_PAGE = "https://www.markham.ca/about-city-markham/city-hall/regional-ward-councillors"
MAYOR_PAGE = "https://www.markham.ca/about-city-markham/city-hall/mayors-office"


class MarkhamPersonScraper(CanadianScraper):
Expand All @@ -17,10 +13,15 @@ def scrape(self):

yield self.scrape_mayor(MAYOR_PAGE)

councillors = page.xpath('//div[@class="col-sm-3 col-xs-6"]')
councillors = page.xpath(
'//div[@class="grid md:grid-cols-2 grid-cols-1 lg:grid-cols-4 gap-4 scrollablec"]/div'
)
assert len(councillors), "No councillors found"

for councillor in councillors:
name, district = councillor.xpath(".//h4/text()")[0].split(", ")
name = councillor.xpath(".//h3/text()")[0].strip()
district = councillor.xpath(".//p/text()")[0].strip()

if "Ward" in district:
district = district.replace("Councillor", "").strip()
role = "Councillor"
Expand All @@ -33,9 +34,7 @@ def scrape(self):
district = "Markham"

image = councillor.xpath(".//img/@src")[0]
url = "https://www.markham.ca/wps/portal/home/about" + re.search(
r"(?<=about).*(?='\))", councillor.xpath(".//a/@href")[0]
).group(0)
url = councillor.xpath(".//a/@href")[0]

address, phone, email, links = self.get_contact(url)

Expand All @@ -56,10 +55,27 @@ def scrape(self):
def get_contact(self, url):
page = self.lxmlize(url)

contact_node = page.xpath('//div[@class="vcard col-sm-6"]')[0]
contact_node = page.xpath(
'//div[@class="pd-x-16 pd-y-32 bg-white committee-right-info-section layout__region layout__region--second"]'
)[0]
links = []

address = contact_node.xpath(".//p/text()")[:2]
if contact_node.xpath('.//span[@class="address-line1"]/text()'):
address = " ".join(
(
contact_node.xpath('.//span[@class="address-line1"]/text()')[0],
contact_node.xpath('.//span[@class="locality"]/text()')[0],
contact_node.xpath('.//span[@class="administrative-area"]/text()')[0],
contact_node.xpath('.//span[@class="postal-code"]/text()')[0],
contact_node.xpath('.//span[@class="country"]/text()')[0],
)
)
else:
contact_node = page.xpath(
'//div[@class="formatted-text field-content field-content--label--body field-content--entity-type--block-content field-content--name--body"]'
)[0]
address = f'{contact_node.xpath(".//p/text()")[0]} {contact_node.xpath(".//p/text()")[1]}'

links = get_links(contact_node)
phone = self.get_phone(contact_node)
email = self.get_email(contact_node)
Expand All @@ -68,12 +84,15 @@ def get_contact(self, url):

def scrape_mayor(self, url):
page = self.lxmlize(url)
name = page.xpath('//img/@alt[contains(., "Mayor")]')[0].split(", ", 1)[1]
email = self.get_email(page)
phone = self.get_phone(page)
name = page.xpath(
'.//div[@class="formatted-text field-content field-content--label--body field-content--entity-type--block-content field-content--name--body"]/h1/span/span/text()'
)[0]
contact_node = page.xpath('.//div[@class="dept-contact-info--block"]')[0]
email = self.get_email(contact_node)
phone = self.get_phone(contact_node)

p = Person(primary_org="legislature", name=name, district="Markham", role="Mayor")
p.image = page.xpath('//img[contains(./@alt, "Mayor")]/@src')[0]
p.image = page.xpath('.//div[@class="align-right media--image"]/div/img/@src')[0]
p.add_contact("email", email)
p.add_contact("voice", phone, "legislature")
p.add_source(url)
Expand All @@ -86,6 +105,6 @@ def get_links(elem):
links = elem.xpath(".//a")
for link in links:
link = link.attrib["href"]
if "http://www.markham.ca" not in link and "mail" not in link:
if "http://www.markham.ca" not in link and "mail" not in link and "tel" not in link:
links_r.append(link)
return links_r
23 changes: 9 additions & 14 deletions ca_on_mississauga/people.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import re

from utils import CanadianPerson as Person
from utils import CanadianScraper

# Pages on mississauga.ca referenced by this scraper.
# NOTE(review): these are plain-http URLs -- confirm the site still serves
# (or redirects) them.
COUNCIL_PAGE = "http://www.mississauga.ca/portal/cityhall/mayorandcouncil"
MAYOR_PAGE = "http://www.mississauga.ca/portal/cityhall/mayorsoffice"
CONTACT_PAGE = "http://www.mississauga.ca/portal/helpfeedback/contactus"


Expand All @@ -16,21 +17,15 @@ def scrape(self):
if "vacant" not in councillor_url.xpath(".//div//div[1]/text()")[0].lower():
yield self.councillor_data(councillor_url.attrib["href"])

mayor_page = self.lxmlize(MAYOR_PAGE)
mayor_name = mayor_page.xpath('//*[@id="com-main"]/div/div/div/h1/text()')[0]
if "vacant" not in mayor_name.lower():
yield self.mayor_data(MAYOR_PAGE)
mayor_url = page.xpath('//li/a[contains(@href, "mayor")]')[0]
if "vacant" not in mayor_url.xpath(".//div//div[1]/text()")[0].lower():
yield self.mayor_data(mayor_url.attrib["href"])

def councillor_data(self, url):
page = self.lxmlize(url)

name_district = page.xpath('//*[@id="com-main"]/div/div/div/h1/text()')[0]
hyphen = name_district.find("Councillor")
district = name_district[: hyphen - 3]
name = name_district[hyphen:]
bracket = name.find("(")
if bracket != -1:
name = name[:bracket]
district, name = re.split(r" – (?:Councillor (?:and Deputy Mayor )?)?", name_district) # n-dash
email = self.get_email(page, '//section[contains(@class, "module-content")]')
photo = page.xpath(
'//section[contains(@class, "module-content")]/p[1]/img/@src|//section[contains(@class, "module-content")]/p[1]/b/img/@src|//section[contains(@class, "module-content")]/p[1]/strong/img/@src'
Expand All @@ -47,9 +42,9 @@ def councillor_data(self, url):
def mayor_data(self, url):
page = self.lxmlize(url)

name_text = page.xpath('//*[@id="com-main"]/div/div/div/h1/text()')[0]
name = name_text.split(",")[0]
photo = page.xpath('//img[contains(@src, "mayor")]/@src')[0]
name = page.xpath('//*[@id="com-main"]/div/div/div/h1/text()')[0]
name = name.replace("Mayor – ", "")
photo = page.xpath('//*[@id="65a01af8598b7"]/p[1]/img/@src')[0]

p = Person(primary_org="legislature", name=name, district="Mississauga", role="Mayor")
p.add_source(url)
Expand Down
2 changes: 1 addition & 1 deletion ca_on_thunder_bay/people.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,4 +45,4 @@ def scrape(self):

def lxmlize(self, url, encoding=None, *, user_agent=DEFAULT_USER_AGENT, cookies=None, xml=False):
    """Fetch ``url`` via the parent ``lxmlize`` after adjusting the cipher list.

    The site uses a weak DH key, so DH key exchange is excluded from the
    cipher string that ``requests``' bundled urllib3 exposes as
    ``DEFAULT_CIPHERS`` before the request is made.

    ``user_agent``, ``cookies`` and ``xml`` are keyword-only in this
    override, so they are forwarded by keyword rather than by position.
    NOTE(review): presumably the parent declares them keyword-only as
    well -- confirm against ``CanadianScraper.lxmlize``.

    Returns:
        Whatever the parent ``lxmlize`` returns for ``url``.
    """
    cipher_suffix = ":HIGH:!DH:!aNULL"  # site uses a weak DH key
    ssl_util = requests.packages.urllib3.util.ssl_
    # DEFAULT_CIPHERS is a module-level string; append the suffix only once
    # so repeated calls do not keep growing it.
    if cipher_suffix not in ssl_util.DEFAULT_CIPHERS:
        ssl_util.DEFAULT_CIPHERS += cipher_suffix
    return super().lxmlize(url, encoding, user_agent=user_agent, cookies=cookies, xml=xml)
42 changes: 17 additions & 25 deletions ca_on_wilmot/people.py
Original file line number Diff line number Diff line change
@@ -1,39 +1,31 @@
from utils import CanadianPerson as Person
from utils import CanadianScraper

# Council listing page read by WilmotPersonScraper.scrape().
COUNCIL_PAGE = "https://www.wilmot.ca/en/township-office/council.aspx"


class WilmotPersonScraper(CanadianScraper):
    """Scraper for the office holders listed on the Wilmot council page."""

    def scrape(self):
        """Yield one ``Person`` per heading/contact row pair on ``COUNCIL_PAGE``.

        The rows of the ``icrtAccordion`` table are consumed two at a time:
        the first row of each pair is split on "— " into role and name, and
        the second row is searched for the phone number and e-mail address.

        Raises:
            AssertionError: if the table yields no rows.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        councillors = page.xpath('//table[@class="icrtAccordion"]//tr')
        assert len(councillors), "No councillors found"

        for i in range(0, len(councillors), 2):
            role_name, contact_info = councillors[i], councillors[i + 1]
            # Normalise non-breaking spaces so the "— " separator is found.
            role, name = role_name.text_content().strip().replace("\xa0", " ").split("— ")

            # "Ward 1 Councillor"
            if "Councillor" in role:
                district = role.split(" Councillor")[0]
                role = "Councillor"
            # "Mayor", "Executive Officer to the Mayor and Council"
            else:
                district = "Wilmot"

            phone = self.get_phone(contact_info)
            email = self.get_email(contact_info)

            p = Person(primary_org="legislature", name=name, district=district, role=role)
            p.add_source(COUNCIL_PAGE)
            p.add_contact("voice", phone, "legislature")
            p.add_contact("email", email)
            yield p


def scrape_mayor(div, name):
    """Build the mayor's ``Person`` from a contact-list element.

    Args:
        div: element containing the ``contactListAddress``,
            ``contactListMainNumber`` and ``contactListPhNumber`` blocks.
        name: the mayor's name.

    Returns:
        A ``Person`` with role "Mayor" and district "Wilmot", sourced to
        ``COUNCIL_PAGE``, carrying an address and two voice contacts.

    Raises:
        IndexError: if any of the three contact blocks is missing.
    """
    mayor = Person(primary_org="legislature", name=name, district="Wilmot", role="Mayor")
    mayor.add_source(COUNCIL_PAGE)

    # Extract every value first, so a missing block fails before any contact
    # is attached.
    contacts = (
        ("address", div.xpath('.//div[@class="contactListAddress"]')[0].text_content(), "legislature"),
        ("voice", div.xpath('.//div[@class="contactListMainNumber"]/a/text()')[0], "legislature"),
        ("voice", div.xpath('.//div[@class="contactListPhNumber"]/a/text()')[0], "office"),
    )
    for kind, value, note in contacts:
        mayor.add_contact(kind, value, note)

    return mayor
5 changes: 3 additions & 2 deletions ca_qc_dollard_des_ormeaux/people.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,9 @@ def scrape(self):

p = Person(primary_org="legislature", name=name, district=district, role=role)
p.add_source(COUNCIL_PAGE)
p.image = councillor.xpath(".//@data-src")[0]

image = councillor.xpath(".//@data-src")
if image:
p.image = image[0]
p.add_contact("email", email)
p.add_contact("voice", general_phone, "legislature")
p.add_contact("fax", general_fax, "legislature")
Expand Down

0 comments on commit 29c7f83

Please sign in to comment.