diff --git a/.github/dependabot.yml b/.github/dependabot.yml
new file mode 100644
index 00000000..12301490
--- /dev/null
+++ b/.github/dependabot.yml
@@ -0,0 +1,6 @@
+version: 2
+updates:
+  - package-ecosystem: "github-actions"  # update the actions referenced by this repository's workflows
+    directory: "/"  # for github-actions, "/" makes Dependabot look in .github/workflows (per Dependabot docs)
+    schedule:
+      interval: "daily"
diff --git a/.github/workflows/automerge.yml b/.github/workflows/automerge.yml
new file mode 100644
index 00000000..55365732
--- /dev/null
+++ b/.github/workflows/automerge.yml
@@ -0,0 +1,41 @@
+# The pull_request_target workflow trigger is dangerous. Do not add unrelated logic to this workflow.
+# https://securitylab.github.com/research/github-actions-preventing-pwn-requests/
+# https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#pull_request_target
+name: Auto-merge
+on: pull_request_target
+permissions:
+  pull-requests: write # to approve the PR
+  contents: write # to merge the PR
+jobs:
+  dependabot:
+    if: ${{ github.event.pull_request.user.login == 'dependabot[bot]' }}
+    runs-on: ubuntu-latest
+    steps:
+      - id: dependabot-metadata
+        uses: dependabot/fetch-metadata@v2
+        with:
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+      # Approve and auto-merge everything except semver-major updates; GitHub Actions updates are always merged.
+      # The PR URL reaches the shell through the quoted PR_URL variable instead of being expanded inline in `run`.
+      - if: ${{ steps.dependabot-metadata.outputs.update-type != 'version-update:semver-major' || steps.dependabot-metadata.outputs.package-ecosystem == 'github_actions' }}
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          PR_URL: ${{ github.event.pull_request.html_url }}
+        run: gh pr review --approve "$PR_URL"
+      - if: ${{ steps.dependabot-metadata.outputs.update-type != 'version-update:semver-major' || steps.dependabot-metadata.outputs.package-ecosystem == 'github_actions' }}
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          PR_URL: ${{ github.event.pull_request.html_url }}
+        run: gh pr merge --auto --squash "$PR_URL"
+  precommit:
+    if: ${{ github.event.pull_request.user.login == 'pre-commit-ci[bot]' }}
+    runs-on: ubuntu-latest
+    steps:
+      - env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          PR_URL: ${{ github.event.pull_request.html_url }}
+        run: gh pr review --approve "$PR_URL"
+      - env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          PR_URL: ${{ github.event.pull_request.html_url }}
+        run: gh pr merge --auto --squash "$PR_URL"
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index bba16ab1..2aea3241 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,16 +1,17 @@
ci:
autoupdate_schedule: quarterly
+ skip: [pip-compile]
+default_language_version:
+ python: python3.10
repos:
- - repo: https://github.com/psf/black
- rev: 24.3.0
+ - repo: https://github.com/astral-sh/ruff-pre-commit
+ rev: v0.6.9
hooks:
- - id: black
- - repo: https://github.com/pycqa/flake8
- rev: 7.0.0
+ - id: ruff
+ - id: ruff-format
+ - repo: https://github.com/astral-sh/uv-pre-commit
+ rev: 0.4.18
hooks:
- - id: flake8
- additional_dependencies: [flake8-comprehensions]
- - repo: https://github.com/pycqa/isort
- rev: 5.13.2
- hooks:
- - id: isort
+ - id: pip-compile
+ name: pip-compile requirements.in
+ args: [requirements.in, -o, requirements.txt]
diff --git a/ca/people.py b/ca/people.py
index 6cf7b5f4..49d55060 100644
--- a/ca/people.py
+++ b/ca/people.py
@@ -59,7 +59,7 @@ def scrape_people(self, rows, gender):
photo_response = self.get(photo)
if (
photo_response.status_code == 200
- and hashlib.sha1(photo_response.content).hexdigest() not in IMAGE_PLACEHOLDER_SHA1
+ and hashlib.sha1(photo_response.content).hexdigest() not in IMAGE_PLACEHOLDER_SHA1 # noqa: S324 # non-cryptographic
):
m.image = photo
@@ -119,7 +119,7 @@ def scrape_people(self, rows, gender):
):
note = "constituency"
if i:
- note += " ({})".format(i + 1)
+ note += f" ({i + 1})"
address = constituency_office_el.xpath("./p[1]")[0]
address = address.text_content().strip().splitlines()
diff --git a/ca_ab/people.py b/ca_ab/people.py
index 601fa525..93696d59 100644
--- a/ca_ab/people.py
+++ b/ca_ab/people.py
@@ -24,7 +24,7 @@
def get_party(abbr):
- """Return full party name from abbreviation"""
+ """Return full party name from abbreviation."""
return PARTIES[abbr]
@@ -59,8 +59,8 @@ def scrape(self):
field_names = next(reader)
for name in OFFICE_FIELDS:
assert field_names.count(name) == 2
- field_names[field_names.index(name)] = "{} 1".format(name)
- field_names[field_names.index(name)] = "{} 2".format(name)
+ field_names[field_names.index(name)] = f"{name} 1"
+ field_names[field_names.index(name)] = f"{name} 2"
rows = [dict(zip_longest(field_names, row)) for row in reader]
assert len(rows), "No members found"
for mla in rows:
@@ -76,8 +76,8 @@ def scrape(self):
row_xpath = '//td[normalize-space()="{}"]/..'.format(
mla["Constituency Name"],
)
- (detail_url,) = index.xpath("{}//a/@href".format(row_xpath))
- (photo_url,) = index.xpath("{}//img/@src".format(row_xpath))
+ (detail_url,) = index.xpath(f"{row_xpath}//a/@href")
+ (photo_url,) = index.xpath(f"{row_xpath}//img/@src")
district = mla["Constituency Name"]
if district == "Calgary-Bhullar-McCall":
district = "Calgary-McCall"
@@ -108,10 +108,10 @@ def scrape(self):
for suffix, note in addresses:
for key, contact_type in (("Phone", "voice"), ("Fax", "fax")):
- value = mla["{} Number {}".format(key, suffix)]
+ value = mla[f"{key} Number {suffix}"]
if value and value != "Pending":
p.add_contact(contact_type, value, note)
- address = ", ".join(filter(bool, [mla["{} {}".format(field, suffix)] for field in ADDRESS_FIELDS]))
+ address = ", ".join(filter(bool, [mla[f"{field} {suffix}"] for field in ADDRESS_FIELDS]))
if address:
p.add_contact("address", address, note)
diff --git a/ca_ab_grande_prairie/__init__.py b/ca_ab_grande_prairie/__init__.py
index 67a6f1e5..42329fcc 100644
--- a/ca_ab_grande_prairie/__init__.py
+++ b/ca_ab_grande_prairie/__init__.py
@@ -17,7 +17,7 @@ def get_organizations(self):
for seat_number in range(1, 9):
organization.add_post(
role="Councillor",
- label="{} (seat {})".format(self.division_name, seat_number),
+ label=f"{self.division_name} (seat {seat_number})",
division_id=self.division_id,
)
diff --git a/ca_ab_grande_prairie/people.py b/ca_ab_grande_prairie/people.py
index fce739e2..9ff05187 100644
--- a/ca_ab_grande_prairie/people.py
+++ b/ca_ab_grande_prairie/people.py
@@ -1,7 +1,30 @@
-from utils import CSVScraper
+from utils import CanadianPerson as Person
+from utils import CanadianScraper
+COUNCIL_PAGE = "https://cityofgp.com/city-government/mayor-city-council/council-members"
-class GrandePrairiePersonScraper(CSVScraper):
- # https://data.cityofgp.com/Community/City-Council-Contact-Information/vcfc-gi78
- csv_url = "https://data.cityofgp.com/api/views/vcfc-gi78/rows.csv?accessType=DOWNLOAD"
- many_posts_per_area = True
+
+class GrandePrairiePersonScraper(CanadianScraper):
+    """Scrape Grande Prairie council members from the rows of COUNCIL_PAGE, yielding one Person each."""
+
+    def scrape(self):
+        page = self.lxmlize(COUNCIL_PAGE)
+        councillors = page.xpath('//div[contains(@class, "council-bios")]//div[@class="views-row"]')
+        assert len(councillors), "No councillors found"
+        seat_number = 1  # councillors get "Grande Prairie (seat N)", numbered in page order
+        for councillor in councillors:
+            # The <h3> text is split at its first space: the first word is the role, the rest is the name.
+            role, name = councillor.xpath(".//h3")[0].text_content().split(" ", 1)
+            if role == "Councillor":
+                district = f"Grande Prairie (seat {seat_number})"
+                seat_number += 1
+            else:
+                district = "Grande Prairie"  # was " Grande Prairie"; the stray leading space is removed
+            email = self.get_email(councillor)
+            phone = self.get_phone(councillor)
+            image = councillor.xpath(".//img/@src")[0]
+            p = Person(primary_org="legislature", name=name, district=district, role=role, image=image)
+            p.add_contact("email", email)
+            p.add_contact("voice", phone, "legislature")
+            p.add_source(COUNCIL_PAGE)
+            yield p
diff --git a/ca_ab_grande_prairie_county_no_1/__init__.py b/ca_ab_grande_prairie_county_no_1/__init__.py
index 632cdb9d..fc7fd5da 100644
--- a/ca_ab_grande_prairie_county_no_1/__init__.py
+++ b/ca_ab_grande_prairie_county_no_1/__init__.py
@@ -16,8 +16,8 @@ def get_organizations(self):
for division_number in range(1, 10):
organization.add_post(
role="Councillor",
- label="Division {}".format(division_number),
- division_id="{}/division:{}".format(self.division_id, division_number),
+ label=f"Division {division_number}",
+ division_id=f"{self.division_id}/division:{division_number}",
)
yield organization
diff --git a/ca_ab_lethbridge/__init__.py b/ca_ab_lethbridge/__init__.py
index 40d32197..d4e4c9c6 100644
--- a/ca_ab_lethbridge/__init__.py
+++ b/ca_ab_lethbridge/__init__.py
@@ -17,7 +17,7 @@ def get_organizations(self):
for seat_number in range(1, 9):
organization.add_post(
role="Councillor",
- label="{} (seat {})".format(self.division_name, seat_number),
+ label=f"{self.division_name} (seat {seat_number})",
division_id=self.division_id,
)
diff --git a/ca_ab_lethbridge/people.py b/ca_ab_lethbridge/people.py
index 74f91e0e..808c98e5 100644
--- a/ca_ab_lethbridge/people.py
+++ b/ca_ab_lethbridge/people.py
@@ -8,7 +8,7 @@
class LethbridgePersonScraper(CanadianScraper):
def scrape_mayor(self):
page = self.lxmlize(MAYOR_PAGE)
- paragraph = page.xpath("//p[1]")[0].text_content().split()
+ paragraph = page.xpath("//h4[contains(., 'Mayor')]/following-sibling::p")[0].text_content().split()
name = " ".join([paragraph[0], paragraph[1]])
p = Person(primary_org="legislature", name=name, district="Lethbridge", role="Mayor")
@@ -24,7 +24,7 @@ def scrape_person(self, url, seat_number):
p = Person(
primary_org="legislature",
name=name,
- district="Lethbridge (seat {})".format(seat_number + 1),
+ district=f"Lethbridge (seat {seat_number + 1})",
role="Councillor",
)
diff --git a/ca_ab_wood_buffalo/__init__.py b/ca_ab_wood_buffalo/__init__.py
index 40ae2c69..91eee478 100644
--- a/ca_ab_wood_buffalo/__init__.py
+++ b/ca_ab_wood_buffalo/__init__.py
@@ -17,16 +17,16 @@ def get_organizations(self):
for seat_number in range(1, 7):
organization.add_post(
role="Councillor",
- label="Ward 1 (seat {})".format(seat_number),
- division_id="{}/ward:1".format(self.division_id),
+ label=f"Ward 1 (seat {seat_number})",
+ division_id=f"{self.division_id}/ward:1",
)
for seat_number in range(1, 3):
organization.add_post(
role="Councillor",
- label="Ward 2 (seat {})".format(seat_number),
- division_id="{}/ward:2".format(self.division_id),
+ label=f"Ward 2 (seat {seat_number})",
+ division_id=f"{self.division_id}/ward:2",
)
- organization.add_post(role="Councillor", label="Ward 3", division_id="{}/ward:3".format(self.division_id))
- organization.add_post(role="Councillor", label="Ward 4", division_id="{}/ward:4".format(self.division_id))
+ organization.add_post(role="Councillor", label="Ward 3", division_id=f"{self.division_id}/ward:3")
+ organization.add_post(role="Councillor", label="Ward 4", division_id=f"{self.division_id}/ward:4")
yield organization
diff --git a/ca_ab_wood_buffalo/people.py b/ca_ab_wood_buffalo/people.py
index 2760aedd..b53bc03c 100644
--- a/ca_ab_wood_buffalo/people.py
+++ b/ca_ab_wood_buffalo/people.py
@@ -33,13 +33,13 @@ def scrape(self):
for ward in wards:
area = ward.text_content().split("–", 1)[1].strip()
councillors = ward.xpath("./following-sibling::table[1]/tbody/tr/td/h3")
- assert len(councillors), "No councillors found for {}".format(area)
+ assert len(councillors), f"No councillors found for {area}"
for councillor in councillors:
name = councillor.text_content()
if area in ("Ward 1", "Ward 2"):
seat_numbers[area] += 1
- district = "{} (seat {})".format(area, seat_numbers[area])
+ district = f"{area} (seat {seat_numbers[area]})"
else:
district = area
diff --git a/ca_bc_abbotsford/people.py b/ca_bc_abbotsford/people.py
index 5d29da2f..db72003b 100644
--- a/ca_bc_abbotsford/people.py
+++ b/ca_bc_abbotsford/people.py
@@ -19,12 +19,12 @@ def scrape(self):
]
assert len(councillors), "No councillors found"
- assert len(councillors) == len(contact_data), "Expected {}, got {}".format(len(councillors), len(contact_data))
+ assert len(councillors) == len(contact_data), f"Expected {len(councillors)}, got {len(contact_data)}"
for councillor, contact in zip(councillors, contact_data):
text = councillor.xpath(".//h3/a")[0].text_content()
if text.startswith("Councill"):
role = "Councillor"
- district = "Abbotsford (seat {})".format(councillor_seat_number)
+ district = f"Abbotsford (seat {councillor_seat_number})"
councillor_seat_number += 1
else:
role = "Mayor"
diff --git a/ca_bc_burnaby/people.py b/ca_bc_burnaby/people.py
index 981607ae..856a1b34 100644
--- a/ca_bc_burnaby/people.py
+++ b/ca_bc_burnaby/people.py
@@ -12,16 +12,6 @@ def scrape(self):
councillors = page.xpath("//a[@class='biography__link']/@href")
assert len(councillors), "No councillors found"
for person_url in councillors:
-
- def decode_email(e):
- de = ""
- k = int(e[:2], 16)
-
- for i in range(2, len(e) - 1, 2):
- de += chr(int(e[i : i + 2], 16) ^ k)
-
- return de
-
page = self.lxmlize(person_url)
role, name = page.xpath("//h1/span")[0].text_content().strip().split(" ", 1)
@@ -29,22 +19,20 @@ def decode_email(e):
contact_node = page.xpath('//div[@class="contact"]')[0]
- email = page.xpath('//div[@class = "contact__detail contact__detail--email"]/a/@href')[0]
- decoded_email = decode_email(email.split("#", 1)[1]) # cloudflare encrypts the email data
-
+ email = self.get_email(contact_node)
phone = self.get_phone(contact_node, area_codes=[604, 778])
if role == "Mayor":
district = "Burnaby"
else:
- district = "Burnaby (seat {})".format(councillor_seat_number)
+ district = f"Burnaby (seat {councillor_seat_number})"
councillor_seat_number += 1
p = Person(primary_org="legislature", name=name, district=district, role=role, image=photo_url)
p.add_source(COUNCIL_PAGE)
p.add_source(person_url)
if email:
- p.add_contact("email", decoded_email)
+ p.add_contact("email", email)
if phone:
p.add_contact("voice", phone, "legislature")
yield p
diff --git a/ca_bc_coquitlam/people.py b/ca_bc_coquitlam/people.py
index e43370f4..4f7200e2 100644
--- a/ca_bc_coquitlam/people.py
+++ b/ca_bc_coquitlam/people.py
@@ -7,18 +7,16 @@
class CoquitlamPersonScraper(CanadianScraper):
-
def scrape(self):
def build_email(script):
w = re.findall(r'w = "(.*?)"', script)[0]
x = re.findall(r'x = "(.*?)"', script)[0]
- email = w + "@" + x
- return email
+ return w + "@" + x
councillor_seat_number = 1
page = self.lxmlize(COUNCIL_PAGE, user_agent="Mozilla/5.0")
- councillors = page.xpath('//table[@id="cityDirectoryDepartmentDetails"]/tr')
+ councillors = page.xpath('//table[contains(@id, "cityDirectoryDepartmentDetails")]/tr')
assert len(councillors), "No councillors found"
for councillor in councillors:
name = " ".join(
@@ -36,7 +34,7 @@ def build_email(script):
if role == "Mayor":
district = "Coquitlam"
else:
- district = "Coquitlam (seat {})".format(councillor_seat_number)
+ district = f"Coquitlam (seat {councillor_seat_number})"
councillor_seat_number += 1
p = Person(primary_org="legislature", name=name, district=district, role=role)
diff --git a/ca_bc_langley/people.py b/ca_bc_langley/people.py
index f6863abb..b453cdfe 100644
--- a/ca_bc_langley/people.py
+++ b/ca_bc_langley/people.py
@@ -15,7 +15,7 @@ def scrape(self):
page = self.lxmlize(url)
name = page.xpath("//h1")[0].text_content().strip()
- district = "Langley (seat {})".format(seat_number)
+ district = f"Langley (seat {seat_number})"
seat_number += 1
email = self.get_email(page)
phone = self.get_phone(page)
@@ -34,7 +34,7 @@ def scrape(self):
address_block = page.xpath('//p/a[@rel="noopener noreferrer"]/parent::p')[0].text_content()
line1 = address_block[address_block.find("Facility") + 8 : address_block.find("Langley,")]
line2 = address_block[address_block.find("Langley,") : address_block.find("Phone") - 1]
- address = ", ".join([line1, line2])
+ address = f"{line1}, {line2}"
p = Person(primary_org="legislature", name=name, role="Mayor", district="Langley")
p.add_contact("email", email)
p.add_contact("voice", phone, "legislature")
diff --git a/ca_bc_langley_city/people.py b/ca_bc_langley_city/people.py
index e7d88464..0db77a03 100644
--- a/ca_bc_langley_city/people.py
+++ b/ca_bc_langley_city/people.py
@@ -1,9 +1,7 @@
-import re
-
from utils import CanadianPerson as Person
from utils import CanadianScraper
-COUNCIL_PAGE = "http://www.city.langley.bc.ca/index.php/city-hall/city-council"
+COUNCIL_PAGE = "https://city.langley.bc.ca/cityhall/city-council/council-members"
class LangleyPersonScraper(CanadianScraper):
@@ -12,60 +10,35 @@ def scrape(self):
page = self.lxmlize(COUNCIL_PAGE)
- councillors = page.xpath('//div[@class="menuitems"]/ul//li/a[contains(text(), "Councillor")]/@href')
- mayor = page.xpath('//div[@class="menuitems"]/ul//li/a[contains(text(), "Mayor")]/@href')[0]
+ councillors = page.xpath(
+ '//div[@class="field field--name-field-ec-section-title field--type-string field--label-hidden field__item"]'
+ )[:-1]
assert len(councillors), "No councillors found"
- for url in councillors:
- district = "Langley (seat {})".format(councillor_seat_number)
- councillor_seat_number += 1
- yield self.scrape_person(url, district)
-
- yield self.scrape_mayor(mayor)
-
- def scrape_person(self, url, district):
- infos_page = self.lxmlize(url)
- infos = infos_page.xpath('//div[@class="item-page"]')[0]
-
- name = " ".join(infos.xpath("p[2]/text()")[0].split(" ")[1:3])
- lname = name.lower()
- email = lname.split(" ")[0][0] + lname.split(" ")[1] + "@langleycity.ca"
- photo_url = infos.xpath("p[1]/img/@src")[0]
-
- p = Person(primary_org="legislature", name=name, district=district, role="Councillor", image=photo_url)
- p.add_source(COUNCIL_PAGE)
- p.add_source(url)
- p.add_contact("email", email)
-
- personal_infos = infos.xpath("p[last()]/text()")
-
- if "Residence" in personal_infos[0]:
- phone = re.findall(r"(Phone|Res)(:?) (.*)", "\n".join(personal_infos))[0][2]
- address = re.findall(r"Address: (.*) (Phone|Res)", " ".join(personal_infos))[0][0]
- p.add_contact("address", address, "residence")
- p.add_contact("voice", phone, "residence")
-
- return p
-
- def scrape_mayor(self, url):
- infos_page = self.lxmlize(url)
- infos = infos_page.xpath('//div[@class="item-page"]')[0]
-
- name = " ".join(infos.xpath("p[2]/text()")[0].split(" ")[2:4])
- lname = name.lower()
- email = lname.split(" ")[0][0] + lname.split(" ")[1] + "@langleycity.ca"
- photo_url = infos.xpath("p[1]/img/@src")[0]
-
- p = Person(primary_org="legislature", name=name, district="Langley", role="Mayor", image=photo_url)
- p.add_source(COUNCIL_PAGE)
- p.add_source(url)
- p.add_contact("email", email)
-
- personal_infos = infos.xpath("p[last()]/text()")
-
- phone = re.findall(r"Phone(:?) (.*)", "\n".join(personal_infos))[0][1]
- address = re.findall(r"Address: (.*) Phone", " ".join(personal_infos))[0]
- p.add_contact("address", address, "office")
- p.add_contact("voice", phone, "office")
-
- return p
+ for councillor in councillors:
+ role, name = councillor.text_content().split(" ", 1)
+ if role == "Mayor":
+ district = "Langley"
+ phone_div = councillor.xpath('..//p[contains(., "Phone:")]')[0]
+ phone = self.get_phone(phone_div)
+ else:
+ district = f"Langley (seat {councillor_seat_number})"
+ phone = (
+ "604 514 2800" # According to their site, all councillors can be contacted at this phone number
+ )
+ councillor_seat_number += 1
+ email = (
+ councillor.xpath('..//p[contains(., "Email:")]')[0]
+ .text_content()
+ .split("Email:", 1)[1]
+ .strip()
+ .replace("(at)", "@")
+ )
+ image = councillor.xpath("..//img/@src")[0]
+
+ p = Person(primary_org="legislature", name=name, district=district, role=role, image=image)
+ p.add_contact("voice", phone, "legislature")
+ p.add_contact("email", email)
+ p.add_source(COUNCIL_PAGE)
+
+ yield p
diff --git a/ca_bc_new_westminster/people.py b/ca_bc_new_westminster/people.py
index 44e96727..a6f4a8a0 100644
--- a/ca_bc_new_westminster/people.py
+++ b/ca_bc_new_westminster/people.py
@@ -15,7 +15,7 @@ def scrape(self):
assert len(councillors), "No councillors found"
for councillor in councillors:
name = councillor.xpath(".//a[@name]")[0].text_content()
- district = "New Westminster (seat {})".format(seat_number)
+ district = f"New Westminster (seat {seat_number})"
seat_number += 1
p = Person(primary_org="legislature", name=name, role="Councillor", district=district)
photo = councillor.xpath("//img/@src")[0]
diff --git a/ca_bc_richmond/people.py b/ca_bc_richmond/people.py
index 795637b3..c90d664b 100644
--- a/ca_bc_richmond/people.py
+++ b/ca_bc_richmond/people.py
@@ -21,7 +21,7 @@ def scrape(self):
if role == "Mayor":
district = "Richmond"
else:
- district = "Richmond (seat {})".format(councillor_seat_number)
+ district = f"Richmond (seat {councillor_seat_number})"
councillor_seat_number += 1
p = Person(primary_org="legislature", name=name, district=district, role=role)
diff --git a/ca_bc_saanich/people.py b/ca_bc_saanich/people.py
index e7088e99..d3ea7da2 100644
--- a/ca_bc_saanich/people.py
+++ b/ca_bc_saanich/people.py
@@ -26,7 +26,7 @@ def scrape(self):
district = "Saanich"
else:
role = "Councillor"
- district = "Saanich (seat {})".format(councillor_seat_number)
+ district = f"Saanich (seat {councillor_seat_number})"
councillor_seat_number += 1
p = Person(primary_org="legislature", name=name, district=district, role=role)
diff --git a/ca_bc_surrey/people.py b/ca_bc_surrey/people.py
index b0240acd..bf877b91 100644
--- a/ca_bc_surrey/people.py
+++ b/ca_bc_surrey/people.py
@@ -12,9 +12,8 @@ def scrape(self):
assert len(members), "No members found"
seat_number = 1
for member in members:
-
role, name = member.xpath('.//a[@class="teaser__link"]/h4')[0].text_content().split(" ", 1)
- district = "Surrey (seat {})".format(seat_number)
+ district = f"Surrey (seat {seat_number})"
seat_number += 1
photo_url = member.xpath(".//figure//img/@src")[0]
diff --git a/ca_bc_vancouver/__init__.py b/ca_bc_vancouver/__init__.py
index f07c572d..3fa273f6 100644
--- a/ca_bc_vancouver/__init__.py
+++ b/ca_bc_vancouver/__init__.py
@@ -17,13 +17,13 @@ def get_organizations(self):
for seat_number in range(1, 11):
organization.add_post(
role="Councillor",
- label="{} (seat {})".format(self.division_name, seat_number),
+ label=f"{self.division_name} (seat {seat_number})",
division_id=self.division_id,
)
for seat_number in range(1, 8):
organization.add_post(
role="Commissioner",
- label="{} (seat {})".format(self.division_name, seat_number),
+ label=f"{self.division_name} (seat {seat_number})",
division_id=self.division_id,
)
diff --git a/ca_bc_victoria/people.py b/ca_bc_victoria/people.py
index b6c05cc8..9796b6ca 100644
--- a/ca_bc_victoria/people.py
+++ b/ca_bc_victoria/people.py
@@ -20,7 +20,7 @@ def scrape(self):
phone = self.get_phone(councillor)
url = councillor.xpath(".//h3/a/@href")[0]
- district = "Victoria (seat {})".format(seat_number)
+ district = f"Victoria (seat {seat_number})"
seat_number += 1
p = Person(primary_org="legislature", name=name, district=district, role=role)
diff --git a/ca_mb_winnipeg/people.py b/ca_mb_winnipeg/people.py
index 16e2af66..c3a2daa7 100644
--- a/ca_mb_winnipeg/people.py
+++ b/ca_mb_winnipeg/people.py
@@ -1,7 +1,5 @@
import json
-import requests
-
from utils import CanadianPerson as Person
from utils import CanadianScraper
@@ -12,7 +10,7 @@ class WinnipegPersonScraper(CanadianScraper):
def scrape(self):
# from https://data.winnipeg.ca/Council-Services/Council-Data/r4tk-7dip/about_data
api_url = "https://data.winnipeg.ca/resource/r4tk-7dip.json"
- data = json.loads(requests.get(api_url).content)
+ data = json.loads(self.get(api_url).content)
assert len(data), "No councillors found via API"
page = self.lxmlize(COUNCIL_PAGE)
diff --git a/ca_nb_fredericton/people.py b/ca_nb_fredericton/people.py
index f5977c6c..6ade8cb9 100644
--- a/ca_nb_fredericton/people.py
+++ b/ca_nb_fredericton/people.py
@@ -1,3 +1,5 @@
+import re
+
from utils import CanadianPerson as Person
from utils import CanadianScraper
@@ -17,9 +19,9 @@ def scrape(self):
text = councillor.xpath('.//div[@class="views-field views-field-field-councillor-title"]/div')[
0
].text_content()
- ward_start = text.find("Ward")
- if ward_start + 1:
- district = text[ward_start : ward_start + 7].strip()
+ ward = re.findall(r"Ward \d+", text)
+ if ward:
+ district = ward[0]
role = "Councillor"
else:
district = "Fredericton"
diff --git a/ca_nb_moncton/__init__.py b/ca_nb_moncton/__init__.py
index c4a931d0..5d2abbe6 100644
--- a/ca_nb_moncton/__init__.py
+++ b/ca_nb_moncton/__init__.py
@@ -17,15 +17,15 @@ def get_organizations(self):
for seat_number in range(1, 3):
organization.add_post(
role="Councillor at Large",
- label="{} (seat {})".format(self.division_name, seat_number),
+ label=f"{self.division_name} (seat {seat_number})",
division_id=self.division_id,
)
for ward_number in range(1, 5):
for seat_number in range(1, 3):
organization.add_post(
role="Councillor",
- label="Ward {} (seat {})".format(ward_number, seat_number),
- division_id="{}/ward:{}".format(self.division_id, ward_number),
+ label=f"Ward {ward_number} (seat {seat_number})",
+ division_id=f"{self.division_id}/ward:{ward_number}",
)
yield organization
diff --git a/ca_nb_moncton/people.py b/ca_nb_moncton/people.py
index f94e09db..aa47ed0f 100644
--- a/ca_nb_moncton/people.py
+++ b/ca_nb_moncton/people.py
@@ -1,8 +1,6 @@
import json
from collections import defaultdict
-import requests
-
from utils import CanadianPerson as Person
from utils import CanadianScraper
@@ -13,7 +11,7 @@
class MonctonPersonScraper(CanadianScraper):
def scrape(self):
seat_numbers = defaultdict(int)
- data = json.loads(requests.get(API_URL).content)["features"]
+ data = json.loads(self.get(API_URL).content)["features"]
assert len(data), "No councillors found"
for item in data:
@@ -24,7 +22,7 @@ def scrape(self):
role = councillor["Primary_role"]
if role != "Mayor":
seat_numbers[ward] += 1
- district = ward + " (seat {})".format(seat_numbers[ward])
+ district = ward + f" (seat {seat_numbers[ward]})"
else:
district = ward
name = councillor["Name"]
diff --git a/ca_nb_saint_john/__init__.py b/ca_nb_saint_john/__init__.py
index 6372f776..407b9322 100644
--- a/ca_nb_saint_john/__init__.py
+++ b/ca_nb_saint_john/__init__.py
@@ -18,15 +18,15 @@ def get_organizations(self):
for seat_number in range(1, 3):
organization.add_post(
role="Councillor",
- label="{} (seat {})".format(self.division_name, seat_number),
+ label=f"{self.division_name} (seat {seat_number})",
division_id=self.division_id,
)
for ward_number in range(1, 5):
for seat_number in range(1, 3):
organization.add_post(
role="Councillor",
- label="Ward {} (seat {})".format(ward_number, seat_number),
- division_id="{}/ward:{}".format(self.division_id, ward_number),
+ label=f"Ward {ward_number} (seat {seat_number})",
+ division_id=f"{self.division_id}/ward:{ward_number}",
)
yield organization
diff --git a/ca_nl/people.py b/ca_nl/people.py
index a75bec04..3a86fcef 100644
--- a/ca_nl/people.py
+++ b/ca_nl/people.py
@@ -1,9 +1,8 @@
import json
import re
-from utils import CUSTOM_USER_AGENT
+from utils import CUSTOM_USER_AGENT, CanadianScraper
from utils import CanadianPerson as Person
-from utils import CanadianScraper
COUNCIL_PAGE = "https://www.assembly.nl.ca/js/members-index.js"
@@ -26,9 +25,7 @@ def scrape(self):
page = self.get(COUNCIL_PAGE)
members = re.search(
r"members = (\[(.+)\]);", page.content.decode().replace("[Member-elect]", ""), re.DOTALL
- ).groups()[
- 0
- ] # extract javascript array
+ ).groups()[0] # extract javascript array
members = re.sub("", "", members) # remove comments
members = re.sub("", "", members).replace("", "") # tags
members = members.replace('"', r"\"") # escape double quotes
@@ -37,10 +34,8 @@ def scrape(self):
assert len(members), "No members found"
for member in json.loads(members):
if not member["name"].strip():
- print("Skipping blank member: {}".format(member))
continue
if member["name"] == "Vacant":
- print("Skipping vacant 'member': {}".format(member))
continue
name = " ".join(reversed(member["name"].split(","))).strip()
district = (
@@ -60,7 +55,8 @@ def scrape(self):
)
if member.get("email"):
p.add_contact(
- "email", member["email"].replace("@gov.nl.ca@gov.nl.ca", "@gov.nl.ca") # seriously guys?!
+ "email",
+ member["email"].replace("@gov.nl.ca@gov.nl.ca", "@gov.nl.ca"), # seriously guys?!
)
p.add_source(COUNCIL_PAGE)
diff --git a/ca_nl_st_john_s/__init__.py b/ca_nl_st_john_s/__init__.py
index 5b8632a8..c3fbca30 100644
--- a/ca_nl_st_john_s/__init__.py
+++ b/ca_nl_st_john_s/__init__.py
@@ -18,14 +18,14 @@ def get_organizations(self):
for seat_number in range(1, 5):
organization.add_post(
role="Councillor at Large",
- label="St. John's (seat {})".format(seat_number),
+ label=f"St. John's (seat {seat_number})",
division_id=self.division_id,
)
for ward_number in range(1, 6):
organization.add_post(
role="Councillor",
- label="Ward {}".format(ward_number),
- division_id="{}/ward:{}".format(self.division_id, ward_number),
+ label=f"Ward {ward_number}",
+ division_id=f"{self.division_id}/ward:{ward_number}",
)
yield organization
diff --git a/ca_nl_st_john_s/people.py b/ca_nl_st_john_s/people.py
index ee1f41e2..657fd8e9 100644
--- a/ca_nl_st_john_s/people.py
+++ b/ca_nl_st_john_s/people.py
@@ -23,9 +23,9 @@ def scrape(self):
district = description[index : index + 6]
else:
district = "St. John's"
- if role != "Mayor" and role != "Deputy Mayor":
+ if role not in ("Mayor", "Deputy Mayor"):
role = "Councillor at Large"
- district = "St. John's (seat {})".format(councillor_seat_number)
+ district = f"St. John's (seat {councillor_seat_number})"
councillor_seat_number += 1
email = self.get_email(page)
diff --git a/ca_ns/people.py b/ca_ns/people.py
index 4013d33f..97763d39 100644
--- a/ca_ns/people.py
+++ b/ca_ns/people.py
@@ -18,7 +18,7 @@ def scrape(self):
page = self.lxmlize(COUNCIL_PAGE)
members = page.xpath(
'//div[contains(@class, "view-display-id-page_mlas_current_tiles")]//div[contains(@class, "views-row-")]'
- ) # noqa
+ )
assert len(members), "No members found"
for member in members:
district = member.xpath('.//div[contains(@class, "views-field-field-constituency")]/div/text()')[0]
@@ -66,16 +66,14 @@ def scrape(self):
if len(mailing_address) > 0:
address = mailing_address
- else:
- if len(civic_address) > 0 or len(civic_address_alt) > 0:
- if len(civic_address_alt) > 0:
- address = civic_address_alt
- else:
- address = civic_address
- address.remove(address[0]) # remove civic address
+ elif len(civic_address) > 0 or len(civic_address_alt) > 0:
+ if len(civic_address_alt) > 0:
+ address = civic_address_alt
else:
- if len(business_address) > 0:
- address = business_address
+ address = civic_address
+ address.remove(address[0]) # remove civic address
+ elif len(business_address) > 0:
+ address = business_address
address = list(map(str.strip, address))
p.add_contact("address", "\n".join(address), "constituency")
diff --git a/ca_ns_cape_breton/people.py b/ca_ns_cape_breton/people.py
index dad984d0..9d9272a5 100644
--- a/ca_ns_cape_breton/people.py
+++ b/ca_ns_cape_breton/people.py
@@ -1,9 +1,8 @@
import html
import re
-from utils import CUSTOM_USER_AGENT
+from utils import CUSTOM_USER_AGENT, CanadianScraper
from utils import CanadianPerson as Person
-from utils import CanadianScraper
COUNCIL_PAGE = "http://www.cbrm.ns.ca/mayor-council-2.html"
MAYOR_PAGE = "http://www.cbrm.ns.ca/mayor"
@@ -14,8 +13,7 @@ def scrape(self):
def decode_email(script):
raw_address = re.findall(r"(?<=addy).*?;\s*addy", script)
local_part = html.unescape(raw_address[0]).split("= ", 1)[1].split(";", 1)[0]
- email = re.sub(r"['\s+]", "", local_part) + "cbrm.ns.ca"
- return email
+ return re.sub(r"['\s+]", "", local_part) + "cbrm.ns.ca"
page = self.lxmlize(COUNCIL_PAGE, user_agent=CUSTOM_USER_AGENT)
@@ -55,7 +53,7 @@ def decode_email(script):
councillor_url = councillor.xpath(".//a/@href")[0]
p.add_source(councillor_url)
page = self.lxmlize(councillor_url, user_agent=CUSTOM_USER_AGENT)
- image = page.xpath('//img[contains(@title, "{0}")]/@src'.format(name))
+ image = page.xpath(f'//img[contains(@title, "{name}")]/@src')
if image:
p.image = image[0]
yield p
diff --git a/ca_ns_halifax/people.py b/ca_ns_halifax/people.py
index c8cf9e7b..06064166 100644
--- a/ca_ns_halifax/people.py
+++ b/ca_ns_halifax/people.py
@@ -4,52 +4,43 @@
from utils import CanadianScraper
COUNCIL_PAGE = "https://www.halifax.ca/city-hall/districts-councillors"
-MAYOR_PAGE = "https://www.halifax.ca/city-hall/mayor-mike-savage"
-MAYOR_CONTACT_URL = "http://www.halifax.ca/mayor/contact.php"
class HalifaxPersonScraper(CanadianScraper):
def scrape(self):
- page = self.lxmlize(COUNCIL_PAGE)
- councillors = page.xpath('//div[@id = "block-districtdistrictindex"]/ul/li')[1:]
+ page = self.lxmlize(COUNCIL_PAGE, user_agent="Mozilla/5.0")
+ councillors = page.xpath('//div[@id = "block-districtdistrictindex"]//ul/li')
assert len(councillors), "No councillors found"
for councillor in councillors:
photo_div = councillor.xpath("./a/div[1]")[0]
info_div = councillor.xpath("./a/div[2]")[0]
district = re.sub(r"\s*[–—-]\s*", "—", "—".join(info_div.xpath("./p/text()")))
- # FIXME: we special-case one malformed district name. If you're editing this file,
- # try removing these lines
- if district.startswith("District 16 "):
- district = district[len("District 16 ") :]
+ # District name different than in database
+ if "Westphal" in district:
+ district = "Cole Harbour—Westphal"
name = info_div.xpath("./strong/p/text()")[0].replace("Councillor ", "").replace("Deputy Mayor ", "")
+ if "Mayor" in name:
+ role = "Mayor"
+ name = name.replace("Mayor ", "")
+ district = "Halifax"
+ else:
+ role = "Councillor"
+
if name != "To be determined":
photo = photo_div.xpath(".//img/@src")[0]
url = councillor.xpath("./a/@href")[0]
- councillor_page = self.lxmlize(url)
+ councillor_page = self.lxmlize(url, user_agent="Mozilla/5.0")
- contact_node = councillor_page.xpath('//div[@id = "block-districtdistrictprofile"]')[0]
- phone = self.get_phone(contact_node, area_codes=[902])
- email = self.get_email(contact_node)
+ phone = self.get_phone(councillor_page, area_codes=[902])
+ email = self.get_email(councillor_page)
- p = Person(primary_org="legislature", name=name, district=district, role="Councillor")
+ p = Person(primary_org="legislature", name=name, district=district, role=role)
p.add_source(COUNCIL_PAGE)
p.add_source(url)
p.add_contact("voice", phone, "legislature")
p.add_contact("email", email)
p.image = photo
yield p
-
- mayor_page = self.lxmlize(MAYOR_PAGE, "iso-8859-1")
- name = " ".join(mayor_page.xpath("//h1/text()")).replace("Mayor", "").strip()
- contact_div = mayor_page.xpath('//aside[contains(@class, "layout-sidebar-second")]/section/div[1]')[0]
- phone = self.get_phone(contact_div.xpath("./p[2]")[0])
- email = self.get_email(contact_div.xpath("./p[2]")[0])
-
- p = Person(primary_org="legislature", name=name, district="Halifax", role="Mayor")
- p.add_source(MAYOR_PAGE)
- p.add_contact("email", email)
- p.add_contact("voice", phone, "legislature")
- yield p
diff --git a/ca_nt/people.py b/ca_nt/people.py
index d9460a78..d04fb9db 100644
--- a/ca_nt/people.py
+++ b/ca_nt/people.py
@@ -1,3 +1,5 @@
+import contextlib
+
from utils import CanadianPerson as Person
from utils import CanadianScraper
@@ -24,10 +26,8 @@ def scrape(self):
p = Person(primary_org="legislature", name=name, district=district, role="MLA")
p.add_source(COUNCIL_PAGE)
p.add_source(url)
- try:
+ with contextlib.suppress(IndexError):
p.image = page.xpath('//div[contains(@class, "field--name-field-media-image")]/img/@src')[0]
- except IndexError:
- pass
contact = page.xpath('//*[contains(@class, "paragraph--type--office")]')[0]
if len(contact.xpath('./div[contains(@class, "office-address-wrapper")]')) == 0:
@@ -35,7 +35,7 @@ def scrape(self):
else:
address_section = contact
- def handle_address(contact, address_type):
+ def handle_address(p, contact, address_type):
address_lines = []
po_box_line = (
"PO Box "
@@ -56,7 +56,7 @@ def handle_address(contact, address_type):
address_type,
)
- def handle_phone(lines, phone_type):
+ def handle_phone(p, lines, phone_type):
first_phone_added = False
for line in lines:
if "Assistant" in line.strip():
@@ -71,8 +71,8 @@ def handle_phone(lines, phone_type):
first_phone_added = True
contact_lines = contact.xpath(".//text()")
- handle_address(address_section, "legislature")
- handle_phone(contact_lines, "legislature")
+ handle_address(p, address_section, "legislature")
+ handle_phone(p, contact_lines, "legislature")
email_elements = page.xpath(
'//*[contains(@class, "field--paragraph--field-email")]/div[@class="field__item"]'
diff --git a/ca_nu/people.py b/ca_nu/people.py
index 363f2c98..38b1c8eb 100644
--- a/ca_nu/people.py
+++ b/ca_nu/people.py
@@ -1,3 +1,5 @@
+import contextlib
+
from utils import CanadianPerson as Person
from utils import CanadianScraper
@@ -22,17 +24,15 @@ def scrape(self):
p = Person(primary_org="legislature", name=name, district=district, role="MLA", party=party)
p.add_source(COUNCIL_PAGE)
p.add_source(url)
- try:
+ with contextlib.suppress(IndexError):
p.image = page.xpath('//div[contains(@class, "field--name-field-member-photo")]/div[2]/img/@src')[0]
- except IndexError:
- pass
contact = page.xpath('//div[contains(@class, "field--name-field-member-constituency")]/div[2]/div/p')[0]
website = contact.xpath("./div[3]/div[3]/div[2]/a")
if website:
p.add_link(website[0].text_content())
- def handle_address(lines, address_type):
+ def handle_address(p, lines, address_type):
address_lines = []
for line in lines:
if ":" in line.strip(): # Room:, Phone:, Fax:
@@ -45,15 +45,15 @@ def handle_address(lines, address_type):
address_type,
)
- def handle_phone(lines, phone_type):
+ def handle_phone(p, lines, phone_type):
for line in lines:
if "Phone:" in line:
number = line.replace("Phone: (867) ", "")
p.add_contact("voice", number, phone_type, area_code=867)
address_lines = contact.xpath("./text()")
- handle_address(address_lines, "legislature")
- handle_phone(address_lines, "legislature")
+ handle_address(p, address_lines, "legislature")
+ handle_phone(p, address_lines, "legislature")
email = self.get_email(contact, error=False)
if email:
diff --git a/ca_on/people.py b/ca_on/people.py
index 82865e66..885aa199 100644
--- a/ca_on/people.py
+++ b/ca_on/people.py
@@ -42,7 +42,7 @@ def scrape(self):
'//div[@block="block-views-block-member-current-party-block"]//div[@class="view-content"]//text()'
)
- party = [item for item in party if item.strip()][0]
+ party = next(item for item in party if item.strip())
p = Person(primary_org="legislature", name=name, district=district, role="MPP", party=party)
p.add_source(COUNCIL_PAGE)
p.add_source(url)
@@ -58,7 +58,7 @@ def scrape(self):
p.extras["constituency_email"] = emails.pop(0)
for heading, note in headings.items():
- office = node.xpath('//h3[contains(., "{}")]'.format(heading))
+ office = node.xpath(f'//h3[contains(., "{heading}")]')
if office:
try:
office_info = office[0].xpath(
diff --git a/ca_on_ajax/__init__.py b/ca_on_ajax/__init__.py
index 5f8340f8..ddbda0dc 100644
--- a/ca_on_ajax/__init__.py
+++ b/ca_on_ajax/__init__.py
@@ -15,10 +15,8 @@ def get_organizations(self):
organization.add_post(role="Mayor", label=self.division_name, division_id=self.division_id)
for ward_number in range(1, 4):
- division_id = "{}/ward:{}".format(self.division_id, ward_number)
- organization.add_post(
- role="Regional Councillor", label="Ward {}".format(ward_number), division_id=division_id
- )
- organization.add_post(role="Councillor", label="Ward {}".format(ward_number), division_id=division_id)
+ division_id = f"{self.division_id}/ward:{ward_number}"
+ organization.add_post(role="Regional Councillor", label=f"Ward {ward_number}", division_id=division_id)
+ organization.add_post(role="Councillor", label=f"Ward {ward_number}", division_id=division_id)
yield organization
diff --git a/ca_on_belleville/__init__.py b/ca_on_belleville/__init__.py
index 6894249e..2c46ecb3 100644
--- a/ca_on_belleville/__init__.py
+++ b/ca_on_belleville/__init__.py
@@ -18,8 +18,8 @@ def get_organizations(self):
for seat_number in range(1, stop):
organization.add_post(
role="Councillor",
- label="Ward {} (seat {})".format(ward_number, seat_number),
- division_id="{}/ward:{}".format(self.division_id, ward_number),
+ label=f"Ward {ward_number} (seat {seat_number})",
+ division_id=f"{self.division_id}/ward:{ward_number}",
)
yield organization
diff --git a/ca_on_belleville/people.py b/ca_on_belleville/people.py
index fce0386d..3aebe5c6 100644
--- a/ca_on_belleville/people.py
+++ b/ca_on_belleville/people.py
@@ -36,7 +36,7 @@ def scrape(self):
councillors = ward.xpath("./following-sibling::*[img]")
for councillor in councillors:
self.seat_numbers[ward_name] += 1
- district = "{} (seat {})".format(ward_name, self.seat_numbers[ward_name])
+ district = f"{ward_name} (seat {self.seat_numbers[ward_name]})"
role = "Councillor"
name = councillor.xpath("./following-sibling::p")[0].text_content()
diff --git a/ca_on_brampton/__init__.py b/ca_on_brampton/__init__.py
index 1c6f28d4..ca0cf1aa 100644
--- a/ca_on_brampton/__init__.py
+++ b/ca_on_brampton/__init__.py
@@ -15,10 +15,8 @@ def get_organizations(self):
organization.add_post(role="Mayor", label=self.division_name, division_id=self.division_id)
for ward_number in range(1, 11):
- division_id = "{}/ward:{}".format(self.division_id, ward_number)
- organization.add_post(
- role="Regional Councillor", label="Ward {}".format(ward_number), division_id=division_id
- )
- organization.add_post(role="Councillor", label="Ward {}".format(ward_number), division_id=division_id)
+ division_id = f"{self.division_id}/ward:{ward_number}"
+ organization.add_post(role="Regional Councillor", label=f"Ward {ward_number}", division_id=division_id)
+ organization.add_post(role="Councillor", label=f"Ward {ward_number}", division_id=division_id)
yield organization
diff --git a/ca_on_brantford/__init__.py b/ca_on_brantford/__init__.py
index 86cb306a..5bac19e3 100644
--- a/ca_on_brantford/__init__.py
+++ b/ca_on_brantford/__init__.py
@@ -18,8 +18,8 @@ def get_organizations(self):
for seat_number in range(1, 3):
organization.add_post(
role="Councillor",
- label="Ward {} (seat {})".format(ward_number, seat_number),
- division_id="{}/ward:{}".format(self.division_id, ward_number),
+ label=f"Ward {ward_number} (seat {seat_number})",
+ division_id=f"{self.division_id}/ward:{ward_number}",
)
yield organization
diff --git a/ca_on_caledon/people.py b/ca_on_caledon/people.py
index 03ec332d..837019c1 100644
--- a/ca_on_caledon/people.py
+++ b/ca_on_caledon/people.py
@@ -1,7 +1,5 @@
import re
-import requests
-
from utils import CanadianPerson as Person
from utils import CanadianScraper
@@ -33,10 +31,8 @@ def scrape(self):
# phone numbers populated by JS request
contact_num = page.xpath('//div[@class="contactBody"]/div/@id')[0].replace("contactEntry_", "")
- contact_data = requests.get(
- "https://www.caledon.ca//Modules/Contact/services/GetContactHTML.ashx?isMobile=false&param={}&lang=en".format(
- contact_num
- )
+ contact_data = self.get(
+ f"https://www.caledon.ca//Modules/Contact/services/GetContactHTML.ashx?isMobile=false&param={contact_num}&lang=en"
).text
voice = re.findall(r"(?<=tel://)\d+(?=\">)", contact_data)
@@ -46,7 +42,7 @@ def scrape(self):
if "&" in district: # Councillor for multiple wards
wards = re.findall(r"\d", district)
for ward_num in wards:
- p = Person(primary_org="legislature", name=name, district="Ward {}".format(ward_num), role=role)
+ p = Person(primary_org="legislature", name=name, district=f"Ward {ward_num}", role=role)
if voice:
p.add_contact("voice", voice[0], "legislature")
p.image = image
diff --git a/ca_on_cambridge/__init__.py b/ca_on_cambridge/__init__.py
index a3b13617..cbaa6a01 100644
--- a/ca_on_cambridge/__init__.py
+++ b/ca_on_cambridge/__init__.py
@@ -17,14 +17,14 @@ def get_organizations(self):
for seat_number in range(1, 3):
organization.add_post(
role="Regional Councillor",
- label="{} (seat {})".format(self.division_name, seat_number),
+ label=f"{self.division_name} (seat {seat_number})",
division_id=self.division_id,
)
for ward_number in range(1, 9):
organization.add_post(
role="Councillor",
- label="Ward {}".format(ward_number),
- division_id="{}/ward:{}".format(self.division_id, ward_number),
+ label=f"Ward {ward_number}",
+ division_id=f"{self.division_id}/ward:{ward_number}",
)
yield organization
diff --git a/ca_on_chatham_kent/__init__.py b/ca_on_chatham_kent/__init__.py
index 86be75b6..0a696434 100644
--- a/ca_on_chatham_kent/__init__.py
+++ b/ca_on_chatham_kent/__init__.py
@@ -18,8 +18,8 @@ def get_organizations(self):
for seat_number in range(1, stop):
organization.add_post(
role="Councillor",
- label="Ward {} (seat {})".format(ward_number, seat_number),
- division_id="{}/ward:{}".format(self.division_id, ward_number),
+ label=f"Ward {ward_number} (seat {seat_number})",
+ division_id=f"{self.division_id}/ward:{ward_number}",
)
yield organization
diff --git a/ca_on_chatham_kent/people.py b/ca_on_chatham_kent/people.py
index 290940d8..c5a02d6a 100644
--- a/ca_on_chatham_kent/people.py
+++ b/ca_on_chatham_kent/people.py
@@ -1,7 +1,6 @@
import re
from collections import defaultdict
-import requests
from lxml import etree
from utils import CanadianPerson as Person
@@ -19,8 +18,8 @@ def scrape(self):
headers = {"content-type": "text/xml"}
body = 'councillorsByWard50'
- response = requests.post(url=COUNCIL_DATA_URL, data=body, headers=headers)
- page = etree.fromstring(response.content)
+ response = self.post(url=COUNCIL_DATA_URL, data=body, headers=headers)
+ page = etree.fromstring(response.content) # noqa: S320
namespace = {"z": "#RowsetSchema", "rs": "urn:schemas-microsoft-com:rowset"}
councillors = page.findall(".//z:row", namespace)
@@ -30,7 +29,7 @@ def scrape(self):
ward, name = re.split(r"(?<=\d)\s", title)
- name.replace("Councillor ", "")
+ name = name.replace("Councillor ", "")  # str.replace returns a new string; keep the result
seat_numbers[ward] += 1
- district = "{} (seat {})".format(ward, seat_numbers[ward])
+ district = f"{ward} (seat {seat_numbers[ward]})"
url = councillor.xpath("./@ows_URL")[0].split(",")[0]
page = self.lxmlize(url, user_agent="Mozilla/5.0")
diff --git a/ca_on_clarington/__init__.py b/ca_on_clarington/__init__.py
index c8732d1e..6c2b2c7f 100644
--- a/ca_on_clarington/__init__.py
+++ b/ca_on_clarington/__init__.py
@@ -17,6 +17,6 @@ def get_organizations(self):
organization.add_post(role="Regional Councillor", label="Wards 1 and 2")
organization.add_post(role="Regional Councillor", label="Wards 3 and 4")
for ward_number in range(1, 5):
- organization.add_post(role="Councillor", label="Ward {}".format(ward_number))
+ organization.add_post(role="Councillor", label=f"Ward {ward_number}")
yield organization
diff --git a/ca_on_clarington/people.py b/ca_on_clarington/people.py
index 036d40c2..01869853 100644
--- a/ca_on_clarington/people.py
+++ b/ca_on_clarington/people.py
@@ -3,26 +3,33 @@
from utils import CanadianPerson as Person
from utils import CanadianScraper
-COUNCIL_PAGE = "http://www.clarington.net/index.php?content=townhall/council"
+COUNCIL_PAGE = "https://www.clarington.net/en/town-hall/Meet-Your-Councillors.aspx"
+MAYOR_PAGE = "https://www.clarington.net/en/town-hall/mayor.aspx"
class ClaringtonPersonScraper(CanadianScraper):
def scrape(self):
page = self.lxmlize(COUNCIL_PAGE)
- councillors = page.xpath("//h2")
+ councillors = page.xpath("//td[@data-name='accParent']")
assert len(councillors), "No councillors found"
- for person_header_elem in councillors:
- role, name_post = person_header_elem.text.split(" - ")
- try:
- name, caps_post = re.match(r"(.+) \((.+)\)", name_post).groups()
- post = caps_post.title()
- except AttributeError:
- name = name_post
- post = "Clarington"
- email = person_header_elem.xpath("./following-sibling::a[1]/@href")[0][len("mailto:") :]
- photo_url = person_header_elem.xpath("./following-sibling::img[1]/@src")[0]
- p = Person(primary_org="legislature", name=name, district=post, role=role, image=photo_url)
+ for councillor in councillors:
+ name, role_district = councillor.text_content().split(" - ")
+ role, district = re.split(r"(?<=Councillor) ", role_district, maxsplit=1)
+ content_node = councillor.xpath("../following-sibling::tr")[0]
+ email = self.get_email(content_node)
+ photo_url = content_node.xpath(".//img/@src")[0]
+ p = Person(primary_org="legislature", name=name, district=district, role=role, image=photo_url)
p.add_source(COUNCIL_PAGE)
p.add_contact("email", email)
yield p
+
+ page = self.lxmlize(MAYOR_PAGE).xpath('//div[@id="mainContent"]')[0]
+ name = page.xpath(".//img/@alt")[0].replace("Mayor", "").strip()
+ photo_url = page.xpath(".//img/@src")[0]
+ email = self.get_email(page)
+
+ p = Person(primary_org="legislature", name=name, district="Clarington", role="Mayor", image=photo_url)
+ p.add_contact("email", email)
+ p.add_source(MAYOR_PAGE)
+ yield p
diff --git a/ca_on_fort_erie/__init__.py b/ca_on_fort_erie/__init__.py
index 6bc1d6fb..8d016a6f 100644
--- a/ca_on_fort_erie/__init__.py
+++ b/ca_on_fort_erie/__init__.py
@@ -15,6 +15,6 @@ def get_organizations(self):
organization.add_post(role="Mayor", label=self.division_name, division_id=self.division_id)
for ward_number in range(1, 7):
- organization.add_post(role="Councillor", label="Ward {}".format(ward_number), division_id=self.division_id)
+ organization.add_post(role="Councillor", label=f"Ward {ward_number}", division_id=self.division_id)
yield organization
diff --git a/ca_on_georgina/__init__.py b/ca_on_georgina/__init__.py
index 3dcef56c..ed2903dd 100644
--- a/ca_on_georgina/__init__.py
+++ b/ca_on_georgina/__init__.py
@@ -20,7 +20,7 @@ def get_organizations(self):
# organization.add_post(role='Councillor', label='Ward {}'.format(ward_number), division_id=self.division_id)
organization.add_post(
role="Councillor",
- label="{} (seat {})".format(self.division_name, ward_number),
+ label=f"{self.division_name} (seat {ward_number})",
division_id=self.division_id,
)
diff --git a/ca_on_grimsby/__init__.py b/ca_on_grimsby/__init__.py
index 3ce8ccb9..abf7b183 100644
--- a/ca_on_grimsby/__init__.py
+++ b/ca_on_grimsby/__init__.py
@@ -18,8 +18,8 @@ def get_organizations(self):
for seat_number in range(1, 3):
organization.add_post(
role="Councillor",
- label="Ward {} (seat {})".format(ward_number, seat_number),
- division_id="{}/ward:{}".format(self.division_id, ward_number),
+ label=f"Ward {ward_number} (seat {seat_number})",
+ division_id=f"{self.division_id}/ward:{ward_number}",
)
yield organization
diff --git a/ca_on_grimsby/people.py b/ca_on_grimsby/people.py
index 65f0572e..1a384ad2 100644
--- a/ca_on_grimsby/people.py
+++ b/ca_on_grimsby/people.py
@@ -11,19 +11,19 @@ class GrimsbyPersonScraper(CanadianScraper):
def scrape(self):
page = self.lxmlize(COUNCIL_PAGE)
- wards = page.xpath("//div[@id='printAreaContent']//tbody/tr[td/h4]")
+ wards = page.xpath("//p[@class='tab ']")
assert len(wards), "No wards found"
for ward in wards:
- area = ward.xpath(".//h4")[0].text_content()
- councillors_node = ward.xpath("./following-sibling::tr/td")[0]
+ area = ward.xpath(".//a")[0].text_content().strip()
+ councillors_node = ward.xpath("./following-sibling::div")[0]
for i in range(2):
name_node = councillors_node.xpath(
'.//h5[contains(./strong, "Councillor")]|.//h5[contains(., "Councillor")]'
)[i]
- name = re.split(r"\s", name_node.text_content(), 1)[1]
- district = "{} (seat {})".format(area, i + 1)
+ name = re.split(r"\s", name_node.text_content(), maxsplit=1)[1]
+ district = f"{area} (seat {i + 1})"
phone = self.get_phone(name_node.xpath('./following-sibling::*[contains(., "Phone")]')[0])
email = self.get_email(name_node.xpath("./following-sibling::p[contains(., 'Email')]")[0])
image = councillors_node.xpath(".//@src")[i]
@@ -39,8 +39,8 @@ def scrape(self):
role, name = page.xpath("//h3")[0].text_content().split(" ", 1)
email = self.get_email(page)
- phone = self.get_phone(page.xpath("//div[@id='printAreaContent']/p[contains(., '905')]")[0])
- image = page.xpath("//h3//@src")[0]
+ phone = self.get_phone(page.xpath("//div[contains(@class, 'left')]//p[contains(., '905')]")[0])
+ image = page.xpath("//p//@src")[0]
p = Person(primary_org="legislature", name=name, district="Grimsby", role=role, image=image)
p.add_contact("email", email)
diff --git a/ca_on_guelph/__init__.py b/ca_on_guelph/__init__.py
index cfb78e38..4b265924 100644
--- a/ca_on_guelph/__init__.py
+++ b/ca_on_guelph/__init__.py
@@ -19,8 +19,8 @@ def get_organizations(self):
for seat_number in range(1, 3):
organization.add_post(
role="Councillor",
- label="Ward {} (seat {})".format(ward_number, seat_number),
- division_id="{}/ward:{}".format(self.division_id, ward_number),
+ label=f"Ward {ward_number} (seat {seat_number})",
+ division_id=f"{self.division_id}/ward:{ward_number}",
)
yield organization
diff --git a/ca_on_huron/__init__.py b/ca_on_huron/__init__.py
index 793a2a6c..389fc94d 100644
--- a/ca_on_huron/__init__.py
+++ b/ca_on_huron/__init__.py
@@ -56,7 +56,7 @@ def get_organizations(self):
for seat_number in range(1, division["count"] + 1):
organization.add_post(
role="Councillor",
- label="{} (seat {})".format(division_name, seat_number),
+ label=f"{division_name} (seat {seat_number})",
division_id=division_id,
)
diff --git a/ca_on_kawartha_lakes/people.py b/ca_on_kawartha_lakes/people.py
index 767970f1..ad2d33db 100644
--- a/ca_on_kawartha_lakes/people.py
+++ b/ca_on_kawartha_lakes/people.py
@@ -3,34 +3,34 @@
from utils import CanadianPerson as Person
from utils import CanadianScraper
-COUNCIL_PAGE = "http://www.city.kawarthalakes.on.ca/city-hall/mayor-council/members-of-council"
+COUNCIL_PAGE = "https://www.kawarthalakes.ca/en/municipal-services/contact-a-council-member.aspx"
class KawarthaLakesPersonScraper(CanadianScraper):
def scrape(self):
page = self.lxmlize(COUNCIL_PAGE)
- councillors = page.xpath('//p[@class="WSIndent"]/a')
+ councillors = page.xpath("//tr[.//h2]")
assert len(councillors), "No councillors found"
for councillor in councillors:
- district = re.findall(r"(Ward [0-9]{1,2})", councillor.text_content())
+ district = re.findall(r"(Ward \d)", councillor.text_content())
if district:
district = district[0]
- name = councillor.text_content().replace(district, "").strip()
+ name = re.sub(r"Ward \d|Councillor|Deputy Mayor|-", "", councillor.text_content()).strip()
role = "Councillor"
else:
district = "Kawartha Lakes"
name = councillor.text_content().replace("Mayor", "").strip()
role = "Mayor"
- url = councillor.attrib["href"]
- page = self.lxmlize(url)
- email = self.get_email(page)
- image = page.xpath('//img[@class="image-right"]/@src')[0]
+ info_node = councillor.xpath("./following-sibling::*")[0]
+ email = self.get_email(info_node)
+ phone = self.get_phone(info_node)
+ image = info_node.xpath(".//img/@src")[0]  # relative path: a leading "//" searches the whole document
p = Person(primary_org="legislature", name=name, district=district, role=role)
p.add_source(COUNCIL_PAGE)
- p.add_source(url)
+ p.add_contact("voice", phone, "legislature")
p.add_contact("email", email)
p.image = image
yield p
diff --git a/ca_on_lambton/__init__.py b/ca_on_lambton/__init__.py
index 57d42f26..eeaf9aae 100644
--- a/ca_on_lambton/__init__.py
+++ b/ca_on_lambton/__init__.py
@@ -18,7 +18,7 @@ def get_organizations(self): # @todo Fix labels along the lines of the regions
for seat_number in range(1, 16):
organization.add_post(
role="Councillor",
- label="{} (seat {})".format(self.division_name, seat_number),
+ label=f"{self.division_name} (seat {seat_number})",
division_id=self.division_id,
)
diff --git a/ca_on_lambton/people.py b/ca_on_lambton/people.py
index 8cb9dc7d..2757d6d7 100644
--- a/ca_on_lambton/people.py
+++ b/ca_on_lambton/people.py
@@ -24,7 +24,7 @@ def scrape(self):
else:
role = "Councillor"
name = text.replace("Councillor ", "")
- district = "Lambton (seat {})".format(councillor_seat_number)
+ district = f"Lambton (seat {councillor_seat_number})"
councillor_seat_number += 1
p = Person(primary_org="legislature", name=name, district=district, role=role)
diff --git a/ca_on_lasalle/__init__.py b/ca_on_lasalle/__init__.py
index fa036878..003ae973 100644
--- a/ca_on_lasalle/__init__.py
+++ b/ca_on_lasalle/__init__.py
@@ -18,7 +18,7 @@ def get_organizations(self):
for seat_number in range(1, 6):
organization.add_post(
role="Councillor",
- label="{} (seat {})".format(self.division_name, seat_number),
+ label=f"{self.division_name} (seat {seat_number})",
division_id=self.division_id,
)
diff --git a/ca_on_lasalle/people.py b/ca_on_lasalle/people.py
index 0930fc12..d0af6041 100644
--- a/ca_on_lasalle/people.py
+++ b/ca_on_lasalle/people.py
@@ -3,7 +3,7 @@
from utils import CanadianPerson as Person
from utils import CanadianScraper
-COUNCIL_PAGE = "http://www.town.lasalle.on.ca/en/town-hall/LaSalle-Council.asp"
+COUNCIL_PAGE = "https://www.lasalle.ca/en/town-hall/town-of-lasalle-council.aspx"
class LaSallePersonScraper(CanadianScraper):
@@ -12,39 +12,23 @@ def scrape(self):
 page = self.lxmlize(COUNCIL_PAGE)
- councillors = page.xpath('//table[@id="Table1table"]//td/p')
+ councillors = page.xpath('//div[@class="fbg-row lb-imageBox cm-datacontainer"]')
 assert len(councillors), "No councillors found"
 for councillor in councillors:
- if not councillor.text_content().strip():
- continue
- name = councillor.xpath("./font/b/text()")
- if not name:
- name = councillor.xpath("./font/text()")
- if "email" in name[0]:
- name = councillor.xpath("./b/font/text()")
- name = name[0]
- role = "Councillor"
- if "Mayor" in name:
- name = name.replace("Mayor", "")
- role = "Mayor"
- district = "LaSalle"
- else:
- district = "LaSalle (seat {})".format(councillor_seat_number)
- councillor_seat_number += 1
-
- p = Person(primary_org="legislature", name=name, district=district, role=role)
+ role, name = re.split(
+ r"(?<=Mayor)|(?<=Councillor)", councillor.xpath(".//a/div")[0].text_content(), maxsplit=1
+ )
+ district = "LaSalle" if "Mayor" in role else f"LaSalle (seat {councillor_seat_number})"
+ councillor_seat_number += 0 if "Mayor" in role else 1  # advance the seat only for councillors
+ image = councillor.xpath(".//img/@src")[0]
+ voice = re.search(r"\d{3}-\d{3}-\d{4} ext. \d+", councillor.text_content())
+ cell = re.search(r"\d{3}-\d{3}-\d{4}(?! ext)", councillor.text_content())
+
+ p = Person(primary_org="legislature", name=name, role=role, district=district, image=image)
p.add_source(COUNCIL_PAGE)
+ if voice:
+ p.add_contact("voice", voice.group(0), "legislature")
+ if cell:
+ p.add_contact("cell", cell.group(0), "legislature")
- photo_url = councillor.xpath("./parent::td//img/@src")[0]
- p.image = photo_url
-
- email = self.get_email(councillor)
- p.add_contact("email", email)
-
- phone = re.findall(r"(?<=phone:)(.*)(?=home)", councillor.text_content(), flags=re.DOTALL)
- if phone:
- p.add_contact("voice", phone[0].strip(), "legislature")
-
- home_phone = re.findall(r"(?<=home phone:)(.*)", councillor.text_content(), flags=re.DOTALL)[0]
- p.add_contact("voice", home_phone.strip(), "residence")
yield p
diff --git a/ca_on_lincoln/__init__.py b/ca_on_lincoln/__init__.py
index 3f3bba06..7624ad1c 100644
--- a/ca_on_lincoln/__init__.py
+++ b/ca_on_lincoln/__init__.py
@@ -18,8 +18,8 @@ def get_organizations(self):
for seat_number in range(1, 3):
organization.add_post(
role="Councillor",
- label="Ward {} (seat {})".format(ward_number, seat_number),
- division_id="{}/ward:{}".format(self.division_id, ward_number),
+ label=f"Ward {ward_number} (seat {seat_number})",
+ division_id=f"{self.division_id}/ward:{ward_number}",
)
yield organization
diff --git a/ca_on_markham/__init__.py b/ca_on_markham/__init__.py
index f229f92a..26c51504 100644
--- a/ca_on_markham/__init__.py
+++ b/ca_on_markham/__init__.py
@@ -18,14 +18,14 @@ def get_organizations(self):
for seat_number in range(1, 4):
organization.add_post(
role="Regional Councillor",
- label="{} (seat {})".format(self.division_name, seat_number),
+ label=f"{self.division_name} (seat {seat_number})",
division_id=self.division_id,
)
for ward_number in range(1, 9):
organization.add_post(
role="Councillor",
- label="Ward {}".format(ward_number),
- division_id="{}/ward:{}".format(self.division_id, ward_number),
+ label=f"Ward {ward_number}",
+ division_id=f"{self.division_id}/ward:{ward_number}",
)
yield organization
diff --git a/ca_on_markham/people.py b/ca_on_markham/people.py
index d2f73fa4..2b01dfd7 100644
--- a/ca_on_markham/people.py
+++ b/ca_on_markham/people.py
@@ -26,7 +26,7 @@ def scrape(self):
role = "Councillor"
elif "Regional" in district:
role = "Regional Councillor"
- district = "Markham (seat {})".format(regional_councillor_seat_number)
+ district = f"Markham (seat {regional_councillor_seat_number})"
regional_councillor_seat_number += 1
else:
role = district
diff --git a/ca_on_milton/__init__.py b/ca_on_milton/__init__.py
index a1247215..09f6e79f 100644
--- a/ca_on_milton/__init__.py
+++ b/ca_on_milton/__init__.py
@@ -15,10 +15,8 @@ def get_organizations(self):
organization.add_post(role="Mayor", label=self.division_name, division_id=self.division_id)
for ward_number in range(1, 5):
- division_id = "{}/ward:{}".format(self.division_id, ward_number)
- organization.add_post(
- role="Regional Councillor", label="Ward {}".format(ward_number), division_id=division_id
- )
- organization.add_post(role="Councillor", label="Ward {}".format(ward_number), division_id=division_id)
+ division_id = f"{self.division_id}/ward:{ward_number}"
+ organization.add_post(role="Regional Councillor", label=f"Ward {ward_number}", division_id=division_id)
+ organization.add_post(role="Councillor", label=f"Ward {ward_number}", division_id=division_id)
yield organization
diff --git a/ca_on_newmarket/__init__.py b/ca_on_newmarket/__init__.py
index 82f8dbe4..3e8bc0e0 100644
--- a/ca_on_newmarket/__init__.py
+++ b/ca_on_newmarket/__init__.py
@@ -18,8 +18,8 @@ def get_organizations(self):
for ward_number in range(1, 8):
organization.add_post(
role="Councillor",
- label="Ward {}".format(ward_number),
- division_id="{}/ward:{}".format(self.division_id, ward_number),
+ label=f"Ward {ward_number}",
+ division_id=f"{self.division_id}/ward:{ward_number}",
)
yield organization
diff --git a/ca_on_niagara/__init__.py b/ca_on_niagara/__init__.py
index 7c6bc4e9..bc5cc9dc 100644
--- a/ca_on_niagara/__init__.py
+++ b/ca_on_niagara/__init__.py
@@ -70,7 +70,7 @@ def get_organizations(self):
organization.add_post(role="Mayor", label=division_name, division_id=division_id)
for seat_number in range(1, division["count"] + 1):
organization.add_post(
- role="Councillor", label="{} (seat {})".format(division_name, seat_number), division_id=division_id
+ role="Councillor", label=f"{division_name} (seat {seat_number})", division_id=division_id
)
yield organization
diff --git a/ca_on_niagara_on_the_lake/__init__.py b/ca_on_niagara_on_the_lake/__init__.py
index 92dd00b7..d195985e 100644
--- a/ca_on_niagara_on_the_lake/__init__.py
+++ b/ca_on_niagara_on_the_lake/__init__.py
@@ -18,7 +18,7 @@ def get_organizations(self):
for seat_number in range(1, 9):
organization.add_post(
role="Councillor",
- label="{} (seat {})".format(self.division_name, seat_number),
+ label=f"{self.division_name} (seat {seat_number})",
division_id=self.division_id,
)
diff --git a/ca_on_north_dumfries/people.py b/ca_on_north_dumfries/people.py
index 9573eaa8..ffe19fc5 100644
--- a/ca_on_north_dumfries/people.py
+++ b/ca_on_north_dumfries/people.py
@@ -23,10 +23,7 @@ def scrape(self):
role = match.group(2)
name = match.group(3)
- if role == "Mayor":
- district = "North Dumfries"
- else:
- district = "Ward {}".format(word_to_number[match.group(1)])
+ district = "North Dumfries" if role == "Mayor" else f"Ward {word_to_number[match.group(1)]}"
p = Person(primary_org="legislature", name=name, district=district, role=role)
p.add_source(COUNCIL_PAGE)
diff --git a/ca_on_oakville/__init__.py b/ca_on_oakville/__init__.py
index 9c0373c2..113d8b9b 100644
--- a/ca_on_oakville/__init__.py
+++ b/ca_on_oakville/__init__.py
@@ -15,10 +15,8 @@ def get_organizations(self):
organization.add_post(role="Mayor", label=self.division_name, division_id=self.division_id)
for ward_number in range(1, 8):
- division_id = "{}/ward:{}".format(self.division_id, ward_number)
- organization.add_post(
- role="Regional Councillor", label="Ward {}".format(ward_number), division_id=division_id
- )
- organization.add_post(role="Councillor", label="Ward {}".format(ward_number), division_id=division_id)
+ division_id = f"{self.division_id}/ward:{ward_number}"
+ organization.add_post(role="Regional Councillor", label=f"Ward {ward_number}", division_id=division_id)
+ organization.add_post(role="Councillor", label=f"Ward {ward_number}", division_id=division_id)
yield organization
diff --git a/ca_on_oakville/people.py b/ca_on_oakville/people.py
index e51f9ef1..c7362962 100644
--- a/ca_on_oakville/people.py
+++ b/ca_on_oakville/people.py
@@ -19,11 +19,8 @@ def scrape(self):
district = "Oakville"
role = district_role
else:
- district, role = re.split(r"(?<=\d)\s+", district_role, 1)
- if "Regional" in role:
- role = "Regional Councillor"
- else:
- role = "Councillor"
+ district, role = re.split(r"(?<=\d)\s+", district_role, maxsplit=1)
+ role = "Regional Councillor" if "Regional" in role else "Councillor"
name = councillor.xpath(".//div[@class='user-name']/text()")[0]
email = self.get_email(councillor)
diff --git a/ca_on_oshawa/__init__.py b/ca_on_oshawa/__init__.py
index eae140e7..14e62cd1 100644
--- a/ca_on_oshawa/__init__.py
+++ b/ca_on_oshawa/__init__.py
@@ -15,10 +15,8 @@ def get_organizations(self):
organization.add_post(role="Mayor", label=self.division_name, division_id=self.division_id)
for ward_number in range(1, 6):
- division_id = "{}/ward:{}".format(self.division_id, ward_number)
- organization.add_post(
- role="Regional Councillor", label="Ward {}".format(ward_number), division_id=division_id
- )
- organization.add_post(role="Councillor", label="Ward {}".format(ward_number), division_id=division_id)
+ division_id = f"{self.division_id}/ward:{ward_number}"
+ organization.add_post(role="Regional Councillor", label=f"Ward {ward_number}", division_id=division_id)
+ organization.add_post(role="Councillor", label=f"Ward {ward_number}", division_id=division_id)
yield organization
diff --git a/ca_on_oshawa/people.py b/ca_on_oshawa/people.py
index 7105e8e2..df02029e 100644
--- a/ca_on_oshawa/people.py
+++ b/ca_on_oshawa/people.py
@@ -20,12 +20,9 @@ def scrape(self):
district = "Oshawa"
name = info.replace("Mayor ", "")
else:
- district, role_name = re.split(r"(?<=\d)\s", info, 1)
- if "Regional" in role_name:
- role = "Regional Councillor"
- else:
- role = "Councillor"
- name = re.split(r"Councillor\s", role_name, 1)[1]
+ district, role_name = re.split(r"(?<=\d)\s", info, maxsplit=1)
+ role = "Regional Councillor" if "Regional" in role_name else "Councillor"
+ name = re.split(r"Councillor\s", role_name, maxsplit=1)[1]
photo_url = councillor.xpath(".//img/@src")[0]
phone = self.get_phone(councillor)
diff --git a/ca_on_ottawa/people.py b/ca_on_ottawa/people.py
index 5d1054ee..9db7f422 100644
--- a/ca_on_ottawa/people.py
+++ b/ca_on_ottawa/people.py
@@ -2,11 +2,7 @@
class OttawaPersonScraper(CSVScraper):
- # http://data.ottawa.ca/dataset/elected-officials
- csv_url = "http://data.ottawa.ca/dataset/fd26ae83-fe1a-40d8-8951-72df40021c82/resource/3cd1b14d-cb45-4c4d-b22a-a607946e2ec2/download/elected-officials-2018-2022.csv"
- encoding = "utf-8-sig"
- corrections = {
- "district name": {
- "Orl\u0082ans": "Orléans",
- },
- }
+ # https://open.ottawa.ca/documents/ottawa::elected-officials-2022-2026/about
+ csv_url = "https://www.arcgis.com/sharing/rest/content/items/a5e9dc2425274bb796d3ded47b0d7b00/data"
+ fallbacks = {"district name": "ward name"}
+ extension = ".xls"
diff --git a/ca_on_peel/__init__.py b/ca_on_peel/__init__.py
index b22abfc0..4f2a0754 100644
--- a/ca_on_peel/__init__.py
+++ b/ca_on_peel/__init__.py
@@ -20,21 +20,21 @@ def get_organizations(self):
for ward_number in range(1, 7):
organization.add_post(
role="Councillor",
- label="Caledon Ward {} (seat 1)".format(ward_number),
- division_id="ocd-division/country:ca/csd:3521024/ward:{}".format(ward_number),
+ label=f"Caledon Ward {ward_number} (seat 1)",
+ division_id=f"ocd-division/country:ca/csd:3521024/ward:{ward_number}",
)
for ward_number in range(1, 11):
for seat_number in range(1, 3 if ward_number <= 6 else 2):
organization.add_post(
role="Councillor",
- label="Brampton Ward {} (seat {})".format(ward_number, seat_number),
- division_id="ocd-division/country:ca/csd:3521010/ward:{}".format(ward_number),
+ label=f"Brampton Ward {ward_number} (seat {seat_number})",
+ division_id=f"ocd-division/country:ca/csd:3521010/ward:{ward_number}",
)
for ward_number in range(1, 12):
organization.add_post(
role="Councillor",
- label="Mississauga Ward {} (seat 1)".format(ward_number),
- division_id="ocd-division/country:ca/csd:3521005/ward:{}".format(ward_number),
+ label=f"Mississauga Ward {ward_number} (seat 1)",
+ division_id=f"ocd-division/country:ca/csd:3521005/ward:{ward_number}",
)
yield organization
diff --git a/ca_on_pickering/__init__.py b/ca_on_pickering/__init__.py
index 60739797..0acff44d 100644
--- a/ca_on_pickering/__init__.py
+++ b/ca_on_pickering/__init__.py
@@ -15,7 +15,7 @@ def get_organizations(self):
organization.add_post(role="Mayor", label=self.division_name, division_id=self.division_id)
for ward_number in range(1, 4):
- organization.add_post(role="Regional Councillor", label="Ward {}".format(ward_number))
- organization.add_post(role="Councillor", label="Ward {}".format(ward_number))
+ organization.add_post(role="Regional Councillor", label=f"Ward {ward_number}")
+ organization.add_post(role="Councillor", label=f"Ward {ward_number}")
yield organization
diff --git a/ca_on_pickering/people.py b/ca_on_pickering/people.py
index 78bfdd3f..420b3ede 100644
--- a/ca_on_pickering/people.py
+++ b/ca_on_pickering/people.py
@@ -21,7 +21,7 @@ def scrape(self):
if "Councillor" in name:
name = name.replace("Councillor", "").strip()
role_ward = councillor.xpath(".//text()")[1]
- role, ward = re.split(r"\s(?=Ward)", role_ward, 1)
+ role, ward = re.split(r"\s(?=Ward)", role_ward, maxsplit=1)
else:
name = name.replace("Mayor", "")
role = "Mayor"
diff --git a/ca_on_richmond_hill/__init__.py b/ca_on_richmond_hill/__init__.py
index fd8c9735..32482ccc 100644
--- a/ca_on_richmond_hill/__init__.py
+++ b/ca_on_richmond_hill/__init__.py
@@ -17,14 +17,14 @@ def get_organizations(self):
for seat_number in range(1, 3):
organization.add_post(
role="Regional Councillor",
- label="{} (seat {})".format(self.division_name, seat_number),
+ label=f"{self.division_name} (seat {seat_number})",
division_id=self.division_id,
)
for ward_number in range(1, 7):
organization.add_post(
role="Councillor",
- label="Ward {}".format(ward_number),
- division_id="{}/ward:{}".format(self.division_id, ward_number),
+ label=f"Ward {ward_number}",
+ division_id=f"{self.division_id}/ward:{ward_number}",
)
yield organization
diff --git a/ca_on_richmond_hill/people.py b/ca_on_richmond_hill/people.py
index 9f92f22c..f6163c4a 100644
--- a/ca_on_richmond_hill/people.py
+++ b/ca_on_richmond_hill/people.py
@@ -14,7 +14,7 @@ def scrape(self):
urls = page.xpath('//h3[contains(text(), "Regional and Local Councillors")]/following-sibling::p[1]//@href')
assert len(urls), "No regional councillors found"
for index, url in enumerate(urls, 1):
- yield self.process(url, "Richmond Hill (seat {})".format(index), "Regional Councillor")
+ yield self.process(url, f"Richmond Hill (seat {index})", "Regional Councillor")
councillors = page.xpath('//h3[text()="Local Councillors"]/following-sibling::p')
assert len(councillors), "No councillors found"
diff --git a/ca_on_sault_ste_marie/__init__.py b/ca_on_sault_ste_marie/__init__.py
index 7bee6834..e329e1e8 100644
--- a/ca_on_sault_ste_marie/__init__.py
+++ b/ca_on_sault_ste_marie/__init__.py
@@ -1,3 +1,5 @@
+from pupa.scrape import Organization
+
from utils import CanadianJurisdiction
@@ -7,3 +9,17 @@ class SaultSteMarie(CanadianJurisdiction):
division_name = "Sault Ste. Marie"
name = "Sault Ste. Marie City Council"
url = "http://www.city.sault-ste-marie.on.ca"
+
+    def get_organizations(self):
+        council = Organization(self.name, classification=self.classification)
+        council.add_post(role="Mayor", label=self.division_name, division_id=self.division_id)
+
+        # Wards 1 through 5, two councillor seats in each.
+        for ward in range(1, 6):
+            ward_division_id = f"{self.division_id}/ward:{ward}"
+            for seat in (1, 2):
+                council.add_post(
+                    role="Councillor", label=f"Ward {ward} (seat {seat})", division_id=ward_division_id
+                )
+
+        yield council
diff --git a/ca_on_sault_ste_marie/people.py b/ca_on_sault_ste_marie/people.py
index ffc890fb..5ff2672c 100644
--- a/ca_on_sault_ste_marie/people.py
+++ b/ca_on_sault_ste_marie/people.py
@@ -1,57 +1,49 @@
-from urllib.parse import urljoin
+import re
+from collections import defaultdict
from utils import CanadianPerson as Person
from utils import CanadianScraper
-COUNCIL_PAGE = "http://www.city.sault-ste-marie.on.ca/Open_Page.aspx?ID=174&deptid=1"
-
-
-def word_to_number(word):
- words = ("one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten")
- return words.index(word.lower()) + 1
-
-
-def district_name_using_number(name):
- district_split = name.split()
- return " ".join([district_split[0], str(word_to_number(district_split[1]))])
+COUNCIL_PAGE = "https://saultstemarie.ca/Government/City-Council.aspx"
class SaultSteMariePersonScraper(CanadianScraper):
def scrape(self):
page = self.lxmlize(COUNCIL_PAGE)
- table_data = page.xpath('//div[@id="litcontentDiv"]//tr')
- council_data = table_data[2:-1]
-
- mayor_row = table_data[0]
-
- photo_url_rel = mayor_row.xpath("string(.//img/@src)") # can be empty
- photo_url = urljoin(COUNCIL_PAGE, photo_url_rel)
- contact_node = mayor_row.xpath("./td")[1]
- name = contact_node.xpath(".//font[1]/text()")[0]
- email = self.get_email(contact_node)
-
- p = Person(primary_org="legislature", name=name, district="Sault Ste. Marie", role="Mayor")
- p.add_source(COUNCIL_PAGE)
- p.add_contact("email", email)
- p.image = photo_url
- yield p
-
- # alternate between a row represneting a ward name and councilors
- assert len(council_data), "No councillors found"
- for ward_row, data_row in zip(*[iter(council_data)] * 2):
- district = ward_row.xpath('.//text()[contains(., "Ward")]')[0]
- district_num = district_name_using_number(district)
- for councillor_node in data_row.xpath("./td"):
- name = councillor_node.xpath(".//strong/text()|.//font[1]/text()")[0]
- email = self.get_email(councillor_node)
- photo_url_rel = councillor_node.xpath("string(.//img/@src)") # can be empty
- photo_url = urljoin(COUNCIL_PAGE, photo_url_rel)
- # address and phone are brittle, inconsistent
-
- p = Person(primary_org="legislature", name=name, district=district_num, role="Councillor")
- p.add_source(COUNCIL_PAGE)
- if email:
- p.add_contact("email", email)
- p.image = photo_url
-
- yield p
+        seat_numbers = defaultdict(int)  # running seat counter per ward label
+
+        councillors = page.xpath('//div[@class="mb-2"]//@href')
+        assert len(councillors), "No councillors found"
+
+        for link in councillors:
+            page = self.lxmlize(link)
+            title = page.xpath("//h1")[0].text_content()
+            if "Mayor" in title:
+                role = "Mayor"
+                name = title.replace("Mayor ", "")
+                district = "Sault Ste. Marie"
+                image = None  # No image on the Mayor's page at the moment
+                contact_node = page.xpath('//div[@id="mainContent_contactUs"]')[0]
+                phone_numbers = re.findall(r"\d{3}-\d{3}-\d{4}", contact_node.text_content())
+                phone = phone_numbers[0]
+                fax = phone_numbers[1] if len(phone_numbers) > 1 else None  # second number, when listed
+            else:
+                role = "Councillor"
+                area, name = title.split(" Councillor ")
+                seat_numbers[area] += 1
+                district = f"{area} (seat {seat_numbers[area]})"
+                image = page.xpath(".//h3/img/@src")[0]
+                contact_node = page.xpath('//div[@id="mainContent_left"]')[0]
+                phone, fax = self.get_phone(contact_node), None  # reset fax so the Mayor's is never reused
+            email = self.get_email(contact_node)
+
+            p = Person(primary_org="legislature", name=name, district=district, role=role)
+            if image:
+                p.image = image
+            if fax:
+                p.add_contact("fax", fax, "legislature")
+            p.add_contact("email", email)
+            p.add_contact("voice", phone, "legislature")
+            p.add_source(COUNCIL_PAGE)
+            p.add_source(link)
+            yield p
diff --git a/ca_on_school_boards_english_public/__init__.py b/ca_on_school_boards_english_public/__init__.py
deleted file mode 100644
index bc76eb9f..00000000
--- a/ca_on_school_boards_english_public/__init__.py
+++ /dev/null
@@ -1,25 +0,0 @@
-from opencivicdata.divisions import Division
-from pupa.scrape import Organization
-
-from utils import CanadianJurisdiction
-
-
-class OntarioEnglishPublicSchoolBoards(CanadianJurisdiction):
- classification = "school" # just to avoid clash
- division_id = "ocd-division/country:ca/province:on"
- division_name = 'Ontario English Public School Board boundary"'
- name = "Ontario English Public School Boards"
- url = "http://www.edu.gov.on.ca/eng/sbinfo/boardList.html"
-
- def get_organizations(self):
- organization = Organization(self.name, classification="committee")
- organization.add_source(self.url)
-
- for division in Division.get(self.division_id).children("school_district"):
- organization.add_post(role="Chair", label=division.name, division_id=division.id)
- for i in range(0, 22): # XXX made-up number
- organization.add_post(
- role="Trustee", label="{} (seat {})".format(division.name, i), division_id=division.id
- )
-
- yield organization
diff --git a/ca_on_school_boards_english_public/people.py b/ca_on_school_boards_english_public/people.py
deleted file mode 100644
index 2ee7369a..00000000
--- a/ca_on_school_boards_english_public/people.py
+++ /dev/null
@@ -1,18 +0,0 @@
-from datetime import date
-
-from utils import CSVScraper
-
-
-class OntarioEnglishPublicSchoolBoardsPersonScraper(CSVScraper):
- # CSV source: https://docs.google.com/spreadsheets/d/1smXFR3nB9lovc6bWWcLvr621wb6E5b2TZKqUtxRTUtE/edit#gid=785048945
- csv_url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vTbnQN0j_2Ky56MeRQsNTYXnt9Q6f_vFgH_KyAZ3O96QhjLqMK_Fzrjz2lI8ympE1FU0lkKgbGEvjW0/pub?gid=785048945&single=true&output=csv"
- updated_at = date(2019, 9, 13)
- contact_person = "andrew@newmode.net"
- many_posts_per_area = True
- unique_roles = ["Chair"]
- encoding = "utf-8"
- corrections = {"district name": {}}
- organization_classification = "committee"
-
- def is_valid_row(self, row):
- return any(row.values()) and row["last name"] and row["first name"]
diff --git a/ca_on_st_catharines/__init__.py b/ca_on_st_catharines/__init__.py
index 40fe57d7..ce5f9aa4 100644
--- a/ca_on_st_catharines/__init__.py
+++ b/ca_on_st_catharines/__init__.py
@@ -18,8 +18,8 @@ def get_organizations(self):
for seat_number in range(1, 3):
organization.add_post(
role="Councillor",
- label="{} (seat {})".format(ward_name, seat_number),
- division_id="{}/ward:{}".format(self.division_id, clean_type_id(ward_name)),
+ label=f"{ward_name} (seat {seat_number})",
+ division_id=f"{self.division_id}/ward:{clean_type_id(ward_name)}",
)
yield organization
diff --git a/ca_on_thunder_bay/__init__.py b/ca_on_thunder_bay/__init__.py
index 0f573abf..c94012f6 100644
--- a/ca_on_thunder_bay/__init__.py
+++ b/ca_on_thunder_bay/__init__.py
@@ -17,14 +17,14 @@ def get_organizations(self):
for seat_number in range(1, 6):
organization.add_post(
role="Councillor at Large",
- label="{} (seat {})".format(self.division_name, seat_number),
+ label=f"{self.division_name} (seat {seat_number})",
division_id=self.division_id,
)
for ward_number, ward_name in enumerate(
("Current River", "Red River", "McKellar", "McIntyre", "Northwood", "Westfort", "Neebing"), 1
):
organization.add_post(
- role="Councillor", label=ward_name, division_id="{}/ward:{}".format(self.division_id, ward_number)
+ role="Councillor", label=ward_name, division_id=f"{self.division_id}/ward:{ward_number}"
)
yield organization
diff --git a/ca_on_thunder_bay/people.py b/ca_on_thunder_bay/people.py
index f0123518..0c09dcac 100644
--- a/ca_on_thunder_bay/people.py
+++ b/ca_on_thunder_bay/people.py
@@ -1,7 +1,7 @@
import requests
+from utils import DEFAULT_USER_AGENT, CanadianScraper
from utils import CanadianPerson as Person
-from utils import CanadianScraper
COUNCIL_PAGE = "https://www.thunderbay.ca/en/city-hall/mayor-and-council-profiles.aspx"
@@ -29,7 +29,7 @@ def scrape(self):
].text_content()
if "At Large" in district:
role = "Councillor at Large"
- district = "Thunder Bay (seat {})".format(seat_number)
+ district = f"Thunder Bay (seat {seat_number})"
seat_number += 1
elif "Mayor" in district:
district = "Thunder Bay"
@@ -43,6 +43,6 @@ def scrape(self):
yield p
-    def lxmlize(self, url, encoding=None, user_agent=requests.utils.default_user_agent(), cookies=None, xml=False):
+    def lxmlize(self, url, encoding=None, *, user_agent=DEFAULT_USER_AGENT, cookies=None, xml=False):
         requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS += ":HIGH:!DH:!aNULL"  # site uses a weak DH key
-        return super().lxmlize(url, encoding, user_agent, cookies, xml)
+        return super().lxmlize(url, encoding, user_agent=user_agent, cookies=cookies, xml=xml)  # pass options by name
diff --git a/ca_on_toronto/people.py b/ca_on_toronto/people.py
index 010a9867..02661961 100644
--- a/ca_on_toronto/people.py
+++ b/ca_on_toronto/people.py
@@ -12,3 +12,6 @@ class TorontoPersonScraper(CSVScraper):
"councillor_ mckelvie@toronto.ca": "councillor_mckelvie@toronto.ca",
},
}
+
+ def is_valid_row(self, row):
+ return row["first name"] != "None" and row["last name"] != "None"
diff --git a/ca_on_uxbridge/__init__.py b/ca_on_uxbridge/__init__.py
index f429091e..f0144483 100644
--- a/ca_on_uxbridge/__init__.py
+++ b/ca_on_uxbridge/__init__.py
@@ -16,6 +16,6 @@ def get_organizations(self):
organization.add_post(role="Mayor", label=self.division_name, division_id=self.division_id)
organization.add_post(role="Regional Councillor", label=self.division_name, division_id=self.division_id)
for ward_number in range(1, 6):
- organization.add_post(role="Councillor", label="Ward {}".format(ward_number), division_id=self.division_id)
+ organization.add_post(role="Councillor", label=f"Ward {ward_number}", division_id=self.division_id)
yield organization
diff --git a/ca_on_vaughan/__init__.py b/ca_on_vaughan/__init__.py
index fc17bf88..77cf3f69 100644
--- a/ca_on_vaughan/__init__.py
+++ b/ca_on_vaughan/__init__.py
@@ -18,14 +18,14 @@ def get_organizations(self):
for seat_number in range(1, 5):
organization.add_post(
role="Regional Councillor",
- label="{} (seat {})".format(self.division_name, seat_number),
+ label=f"{self.division_name} (seat {seat_number})",
division_id=self.division_id,
)
for ward_number in range(1, 6):
organization.add_post(
role="Councillor",
- label="Ward {}".format(ward_number),
- division_id="{}/ward:{}".format(self.division_id, ward_number),
+ label=f"Ward {ward_number}",
+ division_id=f"{self.division_id}/ward:{ward_number}",
)
yield organization
diff --git a/ca_on_vaughan/people.py b/ca_on_vaughan/people.py
index 9d25f274..75adb737 100644
--- a/ca_on_vaughan/people.py
+++ b/ca_on_vaughan/people.py
@@ -22,7 +22,7 @@ def scrape(self):
district, name = title.split("Councillor")
if "Regional" in district:
role = "Regional Councillor"
- district = "Vaughan (seat {})".format(regional_councillor_seat_number)
+ district = f"Vaughan (seat {regional_councillor_seat_number})"
regional_councillor_seat_number += 1
elif "Ward" in district:
role = "Councillor"
diff --git a/ca_on_waterloo_region/__init__.py b/ca_on_waterloo_region/__init__.py
index 0c5ad0a4..fca132eb 100644
--- a/ca_on_waterloo_region/__init__.py
+++ b/ca_on_waterloo_region/__init__.py
@@ -29,19 +29,19 @@ def get_organizations(self):
for seat_number in range(1, 4):
organization.add_post(
role="Regional Councillor",
- label="Cambridge (seat {})".format(seat_number),
+ label=f"Cambridge (seat {seat_number})",
division_id="ocd-division/country:ca/csd:3530010",
)
for seat_number in range(1, 6):
organization.add_post(
role="Regional Councillor",
- label="Kitchener (seat {})".format(seat_number),
+ label=f"Kitchener (seat {seat_number})",
division_id="ocd-division/country:ca/csd:3530013",
)
for seat_number in range(1, 4):
organization.add_post(
role="Regional Councillor",
- label="Waterloo (seat {})".format(seat_number),
+ label=f"Waterloo (seat {seat_number})",
division_id="ocd-division/country:ca/csd:3530016",
)
diff --git a/ca_on_waterloo_region/people.py b/ca_on_waterloo_region/people.py
index ca0cdeb3..639006da 100644
--- a/ca_on_waterloo_region/people.py
+++ b/ca_on_waterloo_region/people.py
@@ -20,7 +20,7 @@ def scrape(self):
area = re.sub(r"(?:City|Region|Township) of ", "", area)
councillors = municipality.xpath("./following-sibling::tr[1]//a[not(@target)]")
- assert len(councillors), "No councillors found for {}".format(area)
+ assert len(councillors), f"No councillors found for {area}"
for councillor in councillors:
name = councillor.text_content()
@@ -29,7 +29,7 @@ def scrape(self):
if re.search("Waterloo|Cambridge|Kitchener", area):
seat_numbers[area] += 1
- district = "{} (seat {})".format(area, seat_numbers[area])
+ district = f"{area} (seat {seat_numbers[area]})"
else:
district = area
if "Regional Council" in area:
diff --git a/ca_on_welland/__init__.py b/ca_on_welland/__init__.py
index 5b8df234..5d388c54 100644
--- a/ca_on_welland/__init__.py
+++ b/ca_on_welland/__init__.py
@@ -18,8 +18,8 @@ def get_organizations(self):
for seat_number in range(1, 3):
organization.add_post(
role="Councillor",
- label="Ward {} (seat {})".format(ward_number, seat_number),
- division_id="{}/ward:{}".format(self.division_id, ward_number),
+ label=f"Ward {ward_number} (seat {seat_number})",
+ division_id=f"{self.division_id}/ward:{ward_number}",
)
yield organization
diff --git a/ca_on_wellesley/people.py b/ca_on_wellesley/people.py
index 8d189e6a..1514047b 100644
--- a/ca_on_wellesley/people.py
+++ b/ca_on_wellesley/people.py
@@ -15,10 +15,10 @@ def scrape(self):
page = self.lxmlize(COUNCIL_PAGE)
members = [
el
- for el in page.xpath('//div[@id="printAreaContent"]//td')
+ for el in page.xpath('//div//td[@data-name="accChild"]')
if el.text_content().strip().lower().split()[0] in ["mayor", "councillor"]
- ][1:]
- assert len(members) == 5
+ ]
+ assert len(members), "No councillors found"
for member in members:
position = member.text_content().split()[0]
@@ -26,12 +26,12 @@ def scrape(self):
name = srch.group(1).strip()
district = srch.group(2).strip()
phone = self.get_phone(member)
- if position == "Mayor":
- district = "Wellesley"
- else:
- district = post_number(district)
+ email = self.get_email(member, error=False)
+ district = "Wellesley" if position == "Mayor" else post_number(district)
p = Person(primary_org="legislature", name=name, district=district, role=position)
p.add_contact("voice", phone, "legislature")
+ if email:
+ p.add_contact("email", email)
p.add_source(COUNCIL_PAGE)
yield p
diff --git a/ca_on_whitby/__init__.py b/ca_on_whitby/__init__.py
index dc57319f..7a3d88b7 100644
--- a/ca_on_whitby/__init__.py
+++ b/ca_on_whitby/__init__.py
@@ -17,14 +17,14 @@ def get_organizations(self):
for seat_number in range(1, 5):
organization.add_post(
role="Regional Councillor",
- label="{} (seat {})".format(self.division_name, seat_number),
+ label=f"{self.division_name} (seat {seat_number})",
division_id=self.division_id,
)
for ward_number, ward_name in enumerate(("North", "West", "Centre", "East"), 1):
organization.add_post(
role="Councillor",
- label="{} Ward".format(ward_name),
- division_id="{}/ward:{}".format(self.division_id, ward_number),
+ label=f"{ward_name} Ward",
+ division_id=f"{self.division_id}/ward:{ward_number}",
)
yield organization
diff --git a/ca_on_whitby/people.py b/ca_on_whitby/people.py
index 24f08ab5..b0c5dd81 100644
--- a/ca_on_whitby/people.py
+++ b/ca_on_whitby/people.py
@@ -22,7 +22,7 @@ def scrape(self):
else:
name, role = name.split(", ")
if role == "Regional Councillor":
- district = "Whitby (seat {})".format(regional_councillor_seat_number)
+ district = f"Whitby (seat {regional_councillor_seat_number})"
regional_councillor_seat_number += 1
else:
district = role.split(" – ")[1]
diff --git a/ca_on_whitchurch_stouffville/__init__.py b/ca_on_whitchurch_stouffville/__init__.py
index 1e4c6e35..b48db3ee 100644
--- a/ca_on_whitchurch_stouffville/__init__.py
+++ b/ca_on_whitchurch_stouffville/__init__.py
@@ -19,7 +19,7 @@ def get_organizations(self):
# organization.add_post(role='Councillor', label='Ward {}'.format(ward_number), division_id=self.division_id)
organization.add_post(
role="Councillor",
- label="{} (seat {})".format(self.division_name, ward_number),
+ label=f"{self.division_name} (seat {ward_number})",
division_id=self.division_id,
)
diff --git a/ca_on_wilmot/people.py b/ca_on_wilmot/people.py
index 8c5898b6..d9676cfe 100644
--- a/ca_on_wilmot/people.py
+++ b/ca_on_wilmot/people.py
@@ -1,51 +1,27 @@
-import re
-
from utils import CanadianPerson as Person
from utils import CanadianScraper
-COUNCIL_PAGE = "http://www.wilmot.ca/current-council.php"
+COUNCIL_PAGE = "https://www.wilmot.ca/Modules/contact/search.aspx?s=EFHOVXSi8AOIMKMStZMNvAeQuAleQuAl"
class WilmotPersonScraper(CanadianScraper):
def scrape(self):
page = self.lxmlize(COUNCIL_PAGE)
- councillors = page.xpath('//table[@id="Main Content"]//td[@colspan="3"]//td/p/b')
+ councillors = page.xpath('//table[@class="contactList"]//tr')
assert len(councillors), "No councillors found"
for councillor in councillors:
- district, name = councillor.xpath("./text()")[0].split(":")
- if "Mayor" in district:
+ name, role_district = councillor.xpath(".//button/text()")[0].split(" - ", 1)
+ if "Mayor" in role_district:
yield scrape_mayor(councillor, name)
continue
+ role, district = role_district.split(" - ")
- p = Person(primary_org="legislature", name=name, district=district, role="Councillor")
+ p = Person(primary_org="legislature", name=name, district=district, role=role)
p.add_source(COUNCIL_PAGE)
- base_info = councillor.xpath("./parent::p/text()")
- for info in councillor.xpath("./parent::p/following-sibling::p"):
- if info.xpath(".//b"):
- break
- base_info = base_info + info.xpath("./text()")
-
- address = ""
- complete = False
- while not complete:
- address = address + " " + base_info.pop(0)
- if re.search(r"[A-Z][0-9A-Z][A-Z] \d[A-Z]\d", address):
- complete = True
- p.add_contact("address", address, "legislature")
-
- base_info.pop(-1)
- base_info = " ".join(base_info).split()
- for i, contact in enumerate(base_info):
- if re.match(r"[0-9]", contact):
- continue
- if "fax" in contact:
- p.add_contact("fax", base_info[i + 1], "legislature")
- else:
- p.add_contact(contact, base_info[i + 1], contact)
- email = self.get_email(councillor, "./parent::p/following-sibling::p")
- p.add_contact("email", email)
+ phone = self.get_phone(councillor).replace("/", "")
+ p.add_contact("voice", phone, "legislature")
yield p
@@ -53,14 +29,11 @@ def scrape_mayor(div, name):
p = Person(primary_org="legislature", name=name, district="Wilmot", role="Mayor")
p.add_source(COUNCIL_PAGE)
- info = div.xpath("./parent::p//text()")
- info.pop(0)
- address = " ".join(info[:3])
- phone = info[3].split()[1]
- fax = info[4].split()[1]
- email = info[-1]
+ address = div.xpath('.//div[@class="contactListAddress"]')[0].text_content()
+ phone = div.xpath('.//div[@class="contactListMainNumber"]/a/text()')[0]
+ other_phone = div.xpath('.//div[@class="contactListPhNumber"]/a/text()')[0]
p.add_contact("address", address, "legislature")
p.add_contact("voice", phone, "legislature")
- p.add_contact("fax", fax, "legislature")
- p.add_contact("email", email)
+ p.add_contact("voice", other_phone, "office")
+
return p
diff --git a/ca_on_windsor/people.py b/ca_on_windsor/people.py
index 62b21688..7ee6649c 100644
--- a/ca_on_windsor/people.py
+++ b/ca_on_windsor/people.py
@@ -1,14 +1,27 @@
+import json
+
from utils import CanadianPerson as Person
from utils import CanadianScraper
-COUNCIL_PAGE = "https://www.citywindsor.ca/mayor-and-council/city-councillors"
-MAYOR_PAGE = "https://www.citywindsor.ca/mayor-and-council/mayor-drew-dilkens"
+COUNCIL_PAGE = "https://www.citywindsor.ca/mayor-and-council"
class WindsorPersonScraper(CanadianScraper):
def scrape(self):
page = self.lxmlize(COUNCIL_PAGE)
-
+        data_url = page.xpath('//comment()[contains(., "SITE JS")]/following-sibling::script/@src')[0]
+        data = json.loads(self.get(data_url).text.split(" = ", 1)[1])  # split once: the JSON may contain " = "
+        nav_items, mayor_url, councillors_url = [], None, None
+        for item in data:
+            if item["RollupType"] == "SidebarNavigation":
+                nav_items = item["RollupFields"]
+        for item in nav_items:
+            if item["Title"].startswith("Mayor") and item["Parent"] == "Mayor and City Council":
+                mayor_url = "https://www.citywindsor.ca" + item["RelativeURL"]
+            if "Councillors" in item["Title"]:
+                councillors_url = "https://www.citywindsor.ca" + item["RelativeURL"]
+        assert mayor_url and councillors_url, "Mayor or councillors page not found in site navigation"
+        page = self.lxmlize(councillors_url, user_agent="Mozilla/5.0")
councillors = page.xpath("//h2")
assert len(councillors), "No councillors found"
for councillor in councillors:
@@ -28,12 +41,12 @@ def scrape(self):
yield p
- page = self.lxmlize(MAYOR_PAGE)
+ page = self.lxmlize(mayor_url)
title = page.xpath("//h1")[0].text_content()
name = title.replace("Mayor ", "")
image = page.xpath('//img[contains(./@alt, "Mayor")]/@src')[0]
p = Person(primary_org="legislature", name=name, district="Windsor", role="Mayor", image=image)
- p.add_source(MAYOR_PAGE)
+ p.add_source(mayor_url)
yield p
diff --git a/ca_on_woolwich/__init__.py b/ca_on_woolwich/__init__.py
index 8a4ba24f..e072a68f 100644
--- a/ca_on_woolwich/__init__.py
+++ b/ca_on_woolwich/__init__.py
@@ -1,3 +1,5 @@
+from pupa.scrape import Organization
+
from utils import CanadianJurisdiction
@@ -7,3 +9,17 @@ class Woolwich(CanadianJurisdiction):
division_name = "Woolwich"
name = "Woolwich Township Council"
url = "http://www.woolwich.ca"
+
+    def get_organizations(self):
+        council = Organization(self.name, classification=self.classification)
+        council.add_post(role="Mayor", label=self.division_name, division_id=self.division_id)
+
+        # Seats per ward, in ward order: Ward 1 has two, Ward 2 one, Ward 3 two.
+        for ward, seats in enumerate((2, 1, 2), start=1):
+            ward_division_id = f"{self.division_id}/ward:{ward}"
+            for seat in range(1, seats + 1):
+                council.add_post(
+                    role="Councillor", label=f"Ward {ward} (seat {seat})", division_id=ward_division_id
+                )
+
+        yield council
diff --git a/ca_on_woolwich/people.py b/ca_on_woolwich/people.py
index 66992769..e1baf6d5 100644
--- a/ca_on_woolwich/people.py
+++ b/ca_on_woolwich/people.py
@@ -1,4 +1,5 @@
import re
+from collections import defaultdict
from utils import CanadianPerson as Person
from utils import CanadianScraper
@@ -8,29 +9,32 @@
class WoolwichPersonScraper(CanadianScraper):
def scrape(self):
+ seat_numbers = defaultdict(int)
page = self.lxmlize(COUNCIL_PAGE)
- councillors = page.xpath('//div[@id="printArea"]//strong')
+ councillors = page.xpath('//td[@data-name="accParent"]/h2')
assert len(councillors), "No councillors found"
for councillor in councillors:
- info = councillor.xpath("./parent::p/text()")
- if not info:
- info = councillor.xpath("./parent::div/text()")
- info = [x for x in info if x.strip()]
- district = re.sub(r"(?<=Ward \d).+", "", info.pop(0))
- if "Mayor" in district:
+            role, name = re.split(r"\s", councillor.text_content(), maxsplit=1)
+            area = re.search(r"Ward \d", name)
+            if not area:
                 district = "Woolwich"
-                role = "Mayor"
             else:
-                district = district.replace("Councillor", "").strip()
-                role = "Councillor"
+                seat_numbers[area.group(0)] += 1  # count per ward label; every re.Match is a distinct dict key
+                district = f"{area.group(0)} (seat {seat_numbers[area.group(0)]})"
+            if "(" in name:
+                name = name.split(" (")[0]
+            info = councillor.xpath("./ancestor::tr[1]/following-sibling::tr")[0].text_content()
+            office = re.search(r"(?<=Office: )\d{3}-\d{3}-\d{4}", info).group(0)
+            voice = (
+                re.search(r"(?<=Toll Free: )(1-)?\d{3}-\d{3}-\d{4}( extension \d{4})?", info)
+                .group(0)
+                .replace("extension", "x")
+            )
- p = Person(primary_org="legislature", name=councillor.text_content(), district=district, role=role)
+ p = Person(primary_org="legislature", name=name, district=district, role=role)
p.add_source(COUNCIL_PAGE)
- p.image = councillor.xpath("./img/@src")[0]
+ p.add_contact("voice", office, "office")
+ p.add_contact("voice", voice, "legislature")
- for contact in info:
- note, num = contact.split(":")
- num = num.strip().replace("(", "").replace(") ", "-").replace("extension ", "x")
- p.add_contact(note, num, note)
yield p
diff --git a/ca_pe_charlottetown/people.py b/ca_pe_charlottetown/people.py
index 4bdc4c5d..ba216571 100644
--- a/ca_pe_charlottetown/people.py
+++ b/ca_pe_charlottetown/people.py
@@ -8,15 +8,6 @@
class CharlottetownPersonScraper(CanadianScraper):
def scrape(self):
- def decode_email(e):
- de = ""
- k = int(e[:2], 16)
-
- for i in range(2, len(e) - 1, 2):
- de += chr(int(e[i : i + 2], 16) ^ k)
-
- return de
-
page = self.lxmlize(COUNCIL_PAGE, user_agent="Mozilla/5.0")
nodes = page.xpath('//div[@id="ctl00_ContentPlaceHolder1_ctl13_divContent"]/*')
@@ -52,22 +43,13 @@ def decode_email(e):
p.image = image
- for node in group:
- email_node = node.xpath("//a[span/@data-cfemail]")
- if email_node:
- email = email_node[0].xpath("./@href")[0].split("#")[1]
- break
-
- decoded_email = decode_email(email).split("?")[0]
- p.add_contact("email", decoded_email)
+ email = self.get_email(para)
+ p.add_contact("email", email)
for text in para.xpath('.//strong[contains(., "Phone")]/following-sibling::text()'):
if re.search(r"\d", text):
match = re.search(r"(.+) \((.+)\)", text)
- if match.group(2) == "Fax":
- contact_type = "fax"
- else:
- contact_type = "voice"
+ contact_type = "fax" if match.group(2) == "Fax" else "voice"
p.add_contact(contact_type, match.group(1), match.group(2))
yield p
diff --git a/ca_pe_stratford/people.py b/ca_pe_stratford/people.py
index 253b8d0a..158caf15 100644
--- a/ca_pe_stratford/people.py
+++ b/ca_pe_stratford/people.py
@@ -1,66 +1,42 @@
import re
from collections import defaultdict
-from utils import CUSTOM_USER_AGENT
from utils import CanadianPerson as Person
from utils import CanadianScraper
-COUNCIL_PAGE = "http://www.townofstratford.ca/town-hall/government/town-council/"
+COUNCIL_PAGE = "https://www.townofstratford.ca/government/about_our_government/mayor_council"
class StratfordPersonScraper(CanadianScraper):
def scrape(self):
seat_numbers = defaultdict(int)
- page = self.lxmlize(COUNCIL_PAGE, user_agent=CUSTOM_USER_AGENT)
+ page = self.lxmlize(COUNCIL_PAGE, user_agent="Mozilla/5.0")
- yield self.scrape_mayor(page)
-
- councillors = page.xpath(
- '//div[@id="street-container"]//strong[contains(text(), "Councillor")]/parent::p|//div[@id="street-container"]//b[contains(text(), "Councillor")]/parent::p'
- )
+ councillors = page.xpath("//tr")
assert len(councillors), "No councillors found"
for councillor in councillors:
- name = councillor.xpath("./strong/text()|./b/text()")[0].replace("Councillor", "").strip()
- post = re.findall(r"(?<=Ward \d, ).*", councillor.text_content())[0].strip()
-
- seat_numbers[post] += 1
- post = "{} (seat {})".format(post, seat_numbers[post])
-
- p = Person(primary_org="legislature", name=name, district=post, role="Councillor")
+ name = councillor.xpath(".//strong/text()")[0]
+ if re.search(r"(?).+(?=<)", child["children"]["fr"]).group(0)
+ if child["parent"] == parent_id and "Conseill" not in text:
+ name = text.replace(" ", "")
+ elif not phone:
+ phone_pattern = re.search(r"\d{3} \d{3}-\d{4}(, poste \d{4})?", text)
+ if phone_pattern:
+ phone = phone_pattern.group(0)
- index = [i for i, link in enumerate(emails) if name in link.text_content().replace("\u2019", "'")][0]
- email = emails[index + 1]
- p.add_contact("email", re.match("mailto:(.+@brossard.ca)", email.attrib["href"]).group(1))
- phone = email.xpath('./preceding-sibling::text()[contains(., "450")]')
- phone = phone[-1]
+ p = Person(primary_org="legislature", name=name, district=district, role="Conseiller", image=photo)
+ p.add_contact("email", email)
p.add_contact("voice", phone, "legislature")
+ p.add_source(COUNCIL_PAGE)
yield p
+
+ for element in elements.values():
+ if (
+ isinstance(element.get("children"), dict)
+ and re.search(r"MAIRE", element.get("children").get("fr"))
+ and not element.get("children").get("en")
+ ):
+ mayor = element
+ parent_id = mayor["parent"]
+ children = get_children(parent_id, elements)
+ phone = None
+ for id in children:
+ child = elements[id]
+ if child["tag"] == "Image":
+ photo = "https://www.brossard.ca/in/rest/public/AttachmentThumb?id=" + child["children"]["fr"]
+ elif child["tag"] == "TextBox":
+ if not isinstance(child["children"], dict) or "MAIRE" in child["children"]["fr"]:
+ continue
+ text = re.search(r"(?<=>).+(?=<)", child["children"]["fr"]).group(0)
+ if child["parent"] == parent_id:
+ name = text.replace(" ", "")
+ elif not phone:
+ phone_pattern = re.search(r"\d{3} \d{3}-\d{4}(, poste \d{4})?", text)
+ if phone_pattern:
+ phone = phone_pattern.group(0)
+ p = Person(primary_org="legislature", name=name, district="Brossard", role="Maire", image=photo)
+ p.add_contact("voice", phone, "legislature")
+ p.add_source(COUNCIL_PAGE)
+ yield p
diff --git a/ca_qc_cote_saint_luc/people.py b/ca_qc_cote_saint_luc/people.py
index 011b005e..9670ea1c 100644
--- a/ca_qc_cote_saint_luc/people.py
+++ b/ca_qc_cote_saint_luc/people.py
@@ -1,21 +1,11 @@
-from utils import CUSTOM_USER_AGENT
+from utils import CUSTOM_USER_AGENT, CanadianScraper
from utils import CanadianPerson as Person
-from utils import CanadianScraper
COUNCIL_PAGE = "https://cotesaintluc.org/fr/affaires-municipales/membres-du-conseil/"
class CoteSaintLucPersonScraper(CanadianScraper):
def scrape(self):
- def decode_email(e):
- de = ""
- k = int(e[:2], 16)
-
- for i in range(2, len(e) - 1, 2):
- de += chr(int(e[i : i + 2], 16) ^ k)
-
- return de
-
page = self.lxmlize(COUNCIL_PAGE, user_agent=CUSTOM_USER_AGENT)
councillors = page.xpath('//div/div[contains(@class, "gb-container gb-container-") and .//img]')
assert len(councillors), "No councillors found"
@@ -39,13 +29,11 @@ def decode_email(e):
blog = councillor.xpath(
'.//p[contains(.,"Blog")]//@href[not(contains(., "twitter") or contains(., "facebook"))]'
)
- encrypted_email = councillor.xpath('.//@href[contains(., "email")]')[0].split("#")[1]
- email = decode_email(encrypted_email)
p = Person(primary_org="legislature", name=name, district=district, role=role)
p.add_source(COUNCIL_PAGE)
- p.add_contact("email", email)
+ p.add_contact("email", self.get_email(councillor))
p.add_contact("voice", self.get_phone(councillor, area_codes=[514]), "legislature")
p.image = image
if twitter:
diff --git a/ca_qc_gatineau/people.py b/ca_qc_gatineau/people.py
index 34a9c91e..60522652 100644
--- a/ca_qc_gatineau/people.py
+++ b/ca_qc_gatineau/people.py
@@ -4,7 +4,6 @@
from utils import CanadianScraper
COUNCIL_PAGE = "http://www.gatineau.ca/portail/default.aspx?p=guichet_municipal%2fconseil_municipal"
-MAYOR_CONTACT_PAGE = "http://www.gatineau.ca/portail/default.aspx?p=la_ville/conseil_municipal/maire"
class GatineauPersonScraper(CanadianScraper):
@@ -12,32 +11,34 @@ def scrape(self):
page = self.lxmlize(COUNCIL_PAGE)
# it's all javascript rendered on the client... wow.
- js = page.xpath('string(//div[@id="contenu-principal-centre-contenu-index"]/script[2])') # allow string()
- districts = re.findall(r'arrayDistricts\[a.+"(.+)"', js)
- names = re.findall(r'arrayMembres\[a.+"(.+)"', js)
- urls = re.findall(r'arrayLiens\[a.+"(.+)"', js)
- # first item in list is mayor
- p = Person(primary_org="legislature", name=names[0], district="Gatineau", role="Maire")
- p.add_source(COUNCIL_PAGE)
- p.add_source(MAYOR_CONTACT_PAGE)
- email = "maire@gatineau.ca" # hardcoded
- p.add_contact("email", email)
- yield p
+ js = page.xpath('string(//div[@id="contenu-principal-centre-contenu-index"]/script[1])') # allow string()
+ roles = re.findall(r'arrayMembres\[.+?"(.+?)"', js)
+ districts = re.findall(r'arrayMembres\[.+?, "(.*?)"', js)
+ names = re.findall(r'arrayMembres\[.+?,.+?, "(.*?)"', js)
+ urls = re.findall(r'arrayMembres\[.+"(.*?)",', js)
- councillors = list(zip(districts, names, urls))[1:]
+ councillors = list(zip(roles, districts, names, urls))
assert len(councillors), "No councillors found"
- for raw_district, name, url in councillors:
- if name == "Vacant":
+ for role, raw_district, name, url in councillors:
+ if name == "Vacant" or "(de " in role:
continue
-
profile_url = COUNCIL_PAGE + "/" + url.split("/")[-1]
profile_page = self.lxmlize(profile_url)
- photo_url = profile_page.xpath('//div[@class="colonnes-2"]//img/@src')[0]
- district = "District " + re.search(r"\d+", raw_district).group(0)
- email = self.get_email(profile_page)
- p = Person(primary_org="legislature", name=name, district=district, role="Conseiller")
+ photo_url = profile_page.xpath('//div[@class="colonnes-3"]//img/@src')[0]
+ if raw_district:
+ district = "District " + re.search(r"\d+", raw_district).group(0)
+ role = "Conseiller"
+ else:
+ district = "Gatineau"
+ role = "Maire"
+ email = self.get_email(profile_page, error=False)
+ phone = self.get_phone(profile_page, error=False)
+ p = Person(primary_org="legislature", name=name, district=district, role=role)
p.add_source(COUNCIL_PAGE)
p.add_source(profile_url)
p.image = photo_url
- p.add_contact("email", email)
+ if email:
+ p.add_contact("email", email)
+ if phone:
+ p.add_contact("voice", phone, "legislature")
yield p
diff --git a/ca_qc_kirkland/people.py b/ca_qc_kirkland/people.py
index 3425e57c..3f0bab4b 100644
--- a/ca_qc_kirkland/people.py
+++ b/ca_qc_kirkland/people.py
@@ -8,15 +8,6 @@
class KirklandPersonScraper(CanadianScraper):
def scrape(self):
- def decode_email(e):
- de = ""
- k = int(e[:2], 16)
-
- for i in range(2, len(e) - 1, 2):
- de += chr(int(e[i : i + 2], 16) ^ k)
-
- return de
-
page = self.lxmlize(COUNCIL_PAGE)
councillors = page.xpath('//div[@class="container_content"]//tbody/tr')
@@ -39,8 +30,7 @@ def decode_email(e):
.replace(".", ",") # correcting a typo
.replace(",-#-", " x")
)
- encrypted_email = councillor.xpath('.//@href[contains(., "email")]')[0].split("#")[1]
- email = decode_email(encrypted_email)
+ email = self.get_email(councillor)
p = Person(primary_org="legislature", name=name, district=district, role=role)
p.add_source(COUNCIL_PAGE)
diff --git a/ca_qc_longueuil/__init__.py b/ca_qc_longueuil/__init__.py
index c0c6b143..2a9425d9 100644
--- a/ca_qc_longueuil/__init__.py
+++ b/ca_qc_longueuil/__init__.py
@@ -21,7 +21,7 @@ def get_organizations(self):
for seat_number in range(1, 4):
organization.add_post(
role="Conseiller",
- label="{} (siège {})".format(division.name, seat_number),
+ label=f"{division.name} (siège {seat_number})",
division_id=division.id,
)
else:
diff --git a/ca_qc_longueuil/people.py b/ca_qc_longueuil/people.py
index 44d0c031..7227dbe1 100644
--- a/ca_qc_longueuil/people.py
+++ b/ca_qc_longueuil/people.py
@@ -20,7 +20,7 @@ def scrape(self):
district = tr.xpath('.//p[contains(./strong, "District")]/a/text()')[0]
if "Greenfield Park" in district:
- district = "Greenfield Park (siège {})".format(seat_number)
+ district = f"Greenfield Park (siège {seat_number})"
seat_number += 1
district = {
@@ -46,7 +46,7 @@ def scrape(self):
def scrape_mayor(self):
page = self.lxmlize(MAYOR_PAGE)
name = page.xpath("//h1[not(@class)]/text()")[0]
- img = page.xpath('//img[contains(./@alt, "{}")]/@src'.format(name))[0]
+ img = page.xpath(f'//img[contains(./@alt, "{name}")]/@src')[0]
p = Person(primary_org="legislature", name=name, district="Longueuil", role="Maire")
p.add_source(COUNCIL_PAGE)
p.add_source(MAYOR_PAGE)
diff --git a/ca_qc_mercier/people.py b/ca_qc_mercier/people.py
index e66dd79a..1d879f34 100644
--- a/ca_qc_mercier/people.py
+++ b/ca_qc_mercier/people.py
@@ -1,31 +1,40 @@
-import re
-
-from utils import CUSTOM_USER_AGENT
from utils import CanadianPerson as Person
from utils import CanadianScraper
-COUNCIL_PAGE = "http://www.ville.mercier.qc.ca/02_viedemocratique/default.asp"
+COUNCIL_PAGE = "https://www.ville.mercier.qc.ca/affaires-municipales/conseil-municipal/membres-du-conseil/"
class MercierPersonScraper(CanadianScraper):
def scrape(self):
- page = self.lxmlize(COUNCIL_PAGE, user_agent=CUSTOM_USER_AGENT, encoding="windows-1252")
+ page = self.lxmlize(COUNCIL_PAGE)
- councillors = page.xpath('//table[@width="800"]/tr')
+ councillors = page.xpath('//div[@class="wp-block-team-member"]')
assert len(councillors), "No councillors found"
for councillor in councillors:
- if councillor == councillors[0]:
- name = councillor.xpath(".//strong/text()")[0].replace("Monsieur", "").replace("Madame", "").strip()
- role = "Maire"
- district = "Mercier"
- else:
- name = councillor.xpath(".//strong/text()")[0].replace("Monsieur", "").replace("Madame", "").strip()
- role = "Conseiller"
- district = "District {}".format(re.search(r"(\d)", councillor.xpath(".//text()")[3]).group(1))
+ name = councillor.xpath(".//h4/text()")[0]
+ district = councillor.xpath(".//h5/text()")[0].split(" – ")[1]
email = self.get_email(councillor)
+ phone = self.get_phone(councillor)
+ image = councillor.xpath(".//img/@src")[0]
- p = Person(primary_org="legislature", name=name, district=district, role=role)
+ p = Person(primary_org="legislature", name=name, district=district, role="Conseiller", image=image)
p.add_source(COUNCIL_PAGE)
p.add_contact("email", email)
+ p.add_contact("voice", phone, "legislature")
+
yield p
+
+ mayor_node = page.xpath('//div[@class="wp-block-media-text alignwide is-stacked-on-mobile"]')[0]
+ name = mayor_node.xpath(".//h1")[0].text_content()
+
+ email = self.get_email(mayor_node)
+ phone = self.get_phone(mayor_node)
+ image = mayor_node.xpath(".//img/@src")[0]
+
+ p = Person(primary_org="legislature", name=name, district="Mercier", role="Maire", image=image)
+ p.add_source(COUNCIL_PAGE)
+ p.add_contact("email", email)
+ p.add_contact("voice", phone, "legislature")
+
+ yield p
diff --git a/ca_qc_montreal/__init__.py b/ca_qc_montreal/__init__.py
index d881c7dd..dfa79166 100644
--- a/ca_qc_montreal/__init__.py
+++ b/ca_qc_montreal/__init__.py
@@ -17,7 +17,7 @@ class Montreal(CanadianJurisdiction):
{"name": "Projet Montréal - Équipe Valérie Plante"},
{"name": "Vrai changement pour Montréal"},
{"name": "Équipe Anjou"},
- {"name": "Équipe Barbe Team"},
+ {"name": "Équipe LaSalle Team"},
{"name": "Équipe Dauphin Lachine"},
{"name": "Équipe Denis Coderre pour Montréal"},
]
diff --git a/ca_qc_montreal/people.py b/ca_qc_montreal/people.py
index 7bf5eeb1..046e95e0 100644
--- a/ca_qc_montreal/people.py
+++ b/ca_qc_montreal/people.py
@@ -3,15 +3,19 @@
class MontrealPersonScraper(CSVScraper):
# http://donnees.ville.montreal.qc.ca/dataset/listes-des-elus-de-la-ville-de-montreal
- csv_url = "http://donnees.ville.montreal.qc.ca/dataset/381d74ca-dadd-459f-95c9-db255b5f4480/resource/ce1315a3-50ee-48d0-a0f0-9bcc15f65643/download/listeelusmontreal.csv"
+ csv_url = "https://donnees.montreal.ca/dataset/381d74ca-dadd-459f-95c9-db255b5f4480/resource/ce1315a3-50ee-48d0-a0f0-9bcc15f65643/download/liste_elus_montreal.csv"
encoding = "utf-8"
locale = "fr"
corrections = {
"primary role": {
# Normalize to masculine role descriptor.
"Conseillère de la ville": "Conseiller de la ville",
+ "Conseiller(ère) de la ville": "Conseiller de la ville",
+ "Conseiller(ère) de la Ville": "Conseiller de la ville",
"Mairesse d'arrondissement": "Maire d'arrondissement",
+ "Maire(sse) d'arrondissement": "Maire d'arrondissement",
"Mairesse de la Ville de Montréal": "Maire de la Ville de Montréal",
+ "Maire(sse)": "Maire de la Ville de Montréal",
"Mairesse suppl\u00e9ante d'arrondissement": "Conseiller de la ville",
},
"arrondissement": {
@@ -23,18 +27,23 @@ class MontrealPersonScraper(CSVScraper):
"Rivière-des-Prairies - Pointe-aux-Trembles": "Rivière-des-Prairies—Pointe-aux-Trembles",
"Rosemont-La Petite-Patrie": "Rosemont—La Petite-Patrie",
"Villeray - Saint-Michel - Parc-Extension": "Villeray—Saint-Michel—Parc-Extension",
+ # Name.
+ "Ville de Montr\u00e9al": "Montr\u00e9al",
},
"district name": {
"Champlain—L'Île-des-Sœurs": "Champlain—L'Île-des-Soeurs",
"De Lorimier": "DeLorimier",
- "Saint-Henri-Est-Petite-Bourgogne-Pointe-Saint-Charles-Griffintown": "Saint-Henri—Petite-Bourgogne—Pointe-Saint-Charles",
- "Saint-Paul-Émard-Saint-Henri-Ouest": "Saint-Paul—Émard",
+ "Saint-Henri-Est–Petite-\nBourgogne–Pointe-Saint-\nCharles–Griffintown": "Saint-Henri—Petite-Bourgogne—Pointe-Saint-Charles",
+ "Saint-Paul–Émard– \nSaint-Henri-Ouest": "Saint-Paul—Émard",
# Hyphens.
"Maisonneuve-Longue-Pointe": "Maisonneuve—Longue-Pointe",
"Norman McLaren": "Norman-McLaren",
+ "Saint-Léonard Ouest": "Saint-Léonard-Ouest",
+ "Saint-Léonard Est": "Saint-Léonard-Est",
},
"party name": {
"Indépendante": "Indépendant",
+ "Ind\u00e9pendant(e)": "Indépendant",
},
"gender": {
"Madame": "female",
@@ -58,4 +67,8 @@ def header_converter(self, s):
}.get(s, s)
def is_valid_row(self, row):
- return row["primary role"] not in ("Conseiller d'arrondissement", "Conseillère d'arrondissement")
+ return row["primary role"] not in (
+ "Conseiller d'arrondissement",
+ "Conseillère d'arrondissement",
+ "Conseiller(\u00e8re) d'arrondissement",
+ )
diff --git a/ca_qc_montreal_est/people.py b/ca_qc_montreal_est/people.py
index 9dd0c650..8d7fb6d4 100644
--- a/ca_qc_montreal_est/people.py
+++ b/ca_qc_montreal_est/people.py
@@ -1,28 +1,26 @@
from utils import CanadianPerson as Person
from utils import CanadianScraper
-COUNCIL_PAGE = "http://ville.montreal-est.qc.ca/la-ville/conseil-municipal/conseils-municipaux/"
+COUNCIL_PAGE = "https://ville.montreal-est.qc.ca/vie-democratique/conseil-municipal/"
class MontrealEstPersonScraper(CanadianScraper):
def scrape(self):
page = self.lxmlize(COUNCIL_PAGE)
-
- councillors = page.xpath("//table")
+ councillors = page.xpath("//div[contains(@id, 'membres-conseil-block_')]")
assert len(councillors), "No councillors found"
for councillor in councillors:
- name = councillor.xpath(".//h3")[0].text_content()
+ name, role_district = councillor.xpath(".//span[@class='h3 d-block']")[0].text_content().split(" – ", 1)
- if "maire" in name:
- name = name.split(" ", 2)[-1]
+ if "Maire" in role_district:
district = "Montréal-Est"
role = "Maire"
else:
- district = "District {}".format(councillor.xpath(".//h3")[1].text_content()[-1])
+ district = f"District {role_district[-1]}"
role = "Conseiller"
p = Person(primary_org="legislature", name=name, district=district, role=role)
- p.image = councillor.xpath(".//@src")[0]
+ p.image = councillor.xpath(".//@data-lazy-src")[0]
p.add_contact("email", self.get_email(councillor))
p.add_source(COUNCIL_PAGE)
yield p
diff --git a/ca_qc_pointe_claire/people.py b/ca_qc_pointe_claire/people.py
index c32c6d39..910d4915 100644
--- a/ca_qc_pointe_claire/people.py
+++ b/ca_qc_pointe_claire/people.py
@@ -23,10 +23,10 @@ def scrape(self):
elif district:
district = district[0].text_content().split(" – ")[0].strip()
else:
- assert False, "error parsing district"
+ raise AssertionError("error parsing district")
p = Person(primary_org="legislature", name=name, district=district, role=role)
- p.image = councillor.xpath(".//@src")[0]
+ p.image = councillor.xpath(".//@data-src")[0]
p.add_contact("email", self.get_email(councillor))
p.add_contact("voice", self.get_phone(councillor, area_codes=[514]), "legislature")
p.add_source(COUNCIL_PAGE)
diff --git a/ca_qc_quebec/people.py b/ca_qc_quebec/people.py
index 129a8f1b..653a834d 100644
--- a/ca_qc_quebec/people.py
+++ b/ca_qc_quebec/people.py
@@ -1,5 +1,7 @@
import re
+from django.template.defaultfilters import slugify
+
from utils import CanadianPerson as Person
from utils import CanadianScraper
@@ -25,18 +27,37 @@ def scrape(self):
role = "Maire"
else:
district = councillor.xpath('./p[@itemprop="jobTitle"]/a/text()')[0]
- district = re.search(r"\ADistrict (?:de(?: la)?|du|des) ([\w —–-]+)", district, flags=re.U).group(
- 1
+ district = (
+ re.search(r"\ADistrict (?:de(?: la)?|du|des) ([\w —–-]+)", district, flags=re.UNICODE)
+ .group(1)
+ .strip()
)
role = "Conseiller"
- if district == "Saules":
+ if district == "Saules–Les Méandres":
district = "Les Saules"
+ elif district == "Neufch\u00e2tel\u2013Lebourgneuf":
+ district = "Neufchâtel-Lebourgneuf"
+ elif district == "Loretteville\u2013Les Ch\u00e2tels":
+ district = "Loretteville-Les Ch\u00e2tels"
else:
district = re.sub(r"–", "—", district) # n-dash, m-dash
- p = Person(primary_org="legislature", name=name, district=district, role=role)
- p.add_source(COUNCIL_PAGE)
- p.image = councillor.xpath("./figure//@src")[0]
- p.add_contact("voice", self.get_phone(councillor, area_codes=[418]), "legislature")
- yield p
+ districts = [district]
+
+ borough = None
+ borough_strings = councillor.xpath('.//p[@itemprop = "affiliation"]/text()')
+ for string in borough_strings:
+ borough = re.findall(r"Présidente? de l’arrondissement (.*)$", string)
+ if borough:
+ borough = borough[0].replace("des", "Les").replace("de ", "")
+ districts.append(borough)
+
+ for i, district in enumerate(districts):
+ p = Person(primary_org="legislature", name=name, district=district, role=role)
+ p.add_source(COUNCIL_PAGE)
+ p.image = councillor.xpath("./figure//@src")[0]
+ p.add_contact("voice", self.get_phone(councillor, area_codes=[418]), "legislature")
+ if i:
+ p._related[0].extras["boundary_url"] = f"/boundaries/quebec-boroughs/{slugify(district)}/"
+ yield p
diff --git a/ca_qc_saguenay/people.py b/ca_qc_saguenay/people.py
index 5cf7e813..2979234c 100644
--- a/ca_qc_saguenay/people.py
+++ b/ca_qc_saguenay/people.py
@@ -1,3 +1,5 @@
+from django.template.defaultfilters import slugify
+
from utils import CanadianPerson as Person
from utils import CanadianScraper
@@ -10,14 +12,12 @@ class SaguenayPersonScraper(CanadianScraper):
def scrape(self):
mayor_page = self.lxmlize(MAYOR_PAGE)
contact_page = self.lxmlize(CONTACT_PAGE)
-
- name = mayor_page.xpath('//span/text()[contains(., "maire")]')[0].split(", ", 1)[0]
+ name = mayor_page.xpath('//a[contains(., "maire")]/span/text()')[0]
p = Person(primary_org="legislature", name=name, district="Saguenay", role="Maire")
p.add_source(MAYOR_PAGE)
p.add_source(CONTACT_PAGE)
node = contact_page.xpath('//h2[contains(., "Coordonnées du cabinet")]/following-sibling::p')[1]
p.add_contact("voice", self.get_phone(node, area_codes=[418]), "legislature")
- p.add_contact("email", self.get_email(node))
yield p
page = self.lxmlize(COUNCIL_PAGE)
@@ -26,6 +26,20 @@ def scrape(self):
for councillor in councillors:
district = councillor.xpath("./h3/text()")[0].replace("#", "")
name = councillor.xpath(".//p/text()")[0]
+ borough = None
+ borough_node = councillor.xpath(".//p/strong")
+ if borough_node:
+ text = borough_node[0].text_content()
+ if "Président" in text:
+ borough = text.replace("Président de l'arrondissement de ", "")
+
+ if borough:
+ p = Person(primary_org="legislature", name=name, district=borough, role="Conseiller")
+ p.add_source(COUNCIL_PAGE)
+ p.add_contact("voice", self.get_phone(councillor), "legislature")
+ p.add_contact("email", self.get_email(councillor))
+ p._related[0].extras["boundary_url"] = f"/boundaries/saguenay-boroughs/{slugify(borough)}/"
+ yield p
p = Person(primary_org="legislature", name=name, district=district, role="Conseiller")
p.add_source(COUNCIL_PAGE)
diff --git a/ca_qc_saint_jerome/people.py b/ca_qc_saint_jerome/people.py
index 9defa48d..23a6f19d 100644
--- a/ca_qc_saint_jerome/people.py
+++ b/ca_qc_saint_jerome/people.py
@@ -21,13 +21,14 @@ def scrape(self):
role = "Conseiller"
image = councillor.xpath('.//div[@class="portrait_single"]/img/@data-lazy-src')[0]
- contact = councillor.xpath('.//div[contains(@class,"phone")]/text()')[0]
+ phone = self.get_phone(councillor, error=False)
p = Person(primary_org="legislature", name=name, district=district, role=role)
p.add_source(COUNCIL_PAGE)
p.image = image
- p.add_contact("voice", contact, "legislature")
+ if phone:
+ p.add_contact("voice", phone, "legislature")
p.add_contact("email", self.get_email(councillor))
yield p
diff --git a/ca_qc_senneville/people.py b/ca_qc_senneville/people.py
index 6233f7da..2d0d3948 100644
--- a/ca_qc_senneville/people.py
+++ b/ca_qc_senneville/people.py
@@ -1,20 +1,23 @@
from utils import CanadianPerson as Person
from utils import CanadianScraper
-COUNCIL_PAGE = "http://www.villagesenneville.qc.ca/fr/7/conseil-municipal"
+COUNCIL_PAGE = "https://www.senneville.ca/municipalite/vie-democratique/conseil-municipal/"
class SennevillePersonScraper(CanadianScraper):
def scrape(self):
page = self.lxmlize(COUNCIL_PAGE)
- councillors = page.xpath('//section[@class="block text"][./header/h2][position() > 1]')
+ councillors = page.xpath('//div[@class="wp-block-media-text is-stacked-on-mobile"]')
assert len(councillors), "No councillors found"
for councillor in councillors:
- role_and_district, name = councillor.xpath(".//h2/text()")[0].split("-")
- role, district = role_and_district.split(" ", 1)
- if role == "Maire":
+ role_and_district, name = councillor.xpath(".//h2")[0].text_content().split(" – ")
+ if "Maire" in role_and_district:
+ role = "Maire"
district = "Senneville"
+ else:
+ role, district = role_and_district.split(" ", 1)
+
email = self.get_email(councillor)
p = Person(primary_org="legislature", name=name, district=district, role=role)
p.add_source(COUNCIL_PAGE)
diff --git a/ca_qc_sherbrooke/people.py b/ca_qc_sherbrooke/people.py
index 9048e2d4..ecfa1bb3 100644
--- a/ca_qc_sherbrooke/people.py
+++ b/ca_qc_sherbrooke/people.py
@@ -1,44 +1,75 @@
+import json
+
+import lxml.html
+from django.template.defaultfilters import slugify
+
from utils import CanadianPerson as Person
from utils import CanadianScraper, clean_french_prepositions
-COUNCIL_PAGE = "http://www.ville.sherbrooke.qc.ca/mairie-et-vie-democratique/conseil-municipal/elus-municipaux/"
+COUNCIL_PAGE = "https://www.sherbrooke.ca/fr/vie-municipale/elues-et-elus-municipaux"
class SherbrookePersonScraper(CanadianScraper):
def scrape(self):
- page = self.lxmlize(COUNCIL_PAGE)
+ districts = []
+
+ # The whole site is rendered with Javascript, but has part of the html documents in the scripts
+ def get_content(url):
+ page = self.lxmlize(url)
+ script = page.xpath(".//script[not(@type)]")[0].text_content()
+ data = script.split(" = ", 1)[1]
+ data = json.loads(data)
+ content = data["value"]["selected"]["content"]["fr"]
+ return lxml.html.fromstring(content)
- councillors = page.xpath('//div[@id="c2087"]//a')
+ page = get_content(COUNCIL_PAGE)
+ councillors = page.xpath("//a[.//h3]")
assert len(councillors), "No councillors found"
for councillor in councillors:
- name = councillor.text_content()
- url = councillor.attrib["href"]
- page = self.lxmlize(url)
+ districts = []
+ name = councillor.xpath(".//h3")[0].text_content()
+ poste = councillor.xpath('.//div[@class="poste"]')[0].text_content()  # raw title, kept for checks
- if "Maire" in page.xpath("//h2/text()")[0]:
- district = "Sherbrooke"
+ if "Maire" in poste:
role = "Maire"
+ district = "Sherbrooke"
else:
- district = page.xpath('//div[@class="csc-default"]//a[contains(@href, "fileadmin")]/text()')[0]
- district = clean_french_prepositions(district).replace("district", "").strip()
role = "Conseiller"
+ district = councillor.xpath('.//div[@class="district"]')[0].text_content()
+ district = clean_french_prepositions(district).replace("District", "").strip()
+ if district == "Lac-Magog":
+ district = "Lac Magog"
+
+ districts.append(district)
+
+ if "président" in poste:
+ borough = councillor.xpath('.//div[@class="bloc_bas"]/p')[0].text_content()
+ borough = clean_french_prepositions(borough).replace("Arrondissement", "").strip()
+
+ if borough == "Brompton-Rock Forest-Saint-\u00c9lie-Deauville":
+ borough = "Brompton–Rock Forest–Saint-Élie–Deauville" # N-dashes
+ if borough != district: # Lennoxville
+ districts.append(borough)
+
+ url = "https://www.sherbrooke.ca" + councillor.xpath("./@href")[0]
+ page = get_content(url)
+
+ phone = self.get_phone(page, error=False)
+ email = self.get_email(page, error=False)
+ image = councillor.xpath(".//@src")[0]
+ if "https://" not in image:
+ image = "https://contenu.maruche.ca" + image
+
+ for i, district in enumerate(districts):
+ p = Person(primary_org="legislature", name=name, district=district, role=role)
+ p.add_source(COUNCIL_PAGE)
+ p.add_source(url)
+ p.image = image
- if district == "Lennoxville":
- district = "Arrondissement 3"
-
- p = Person(primary_org="legislature", name=name, district=district, role=role)
- p.add_source(COUNCIL_PAGE)
- p.add_source(url)
- p.image = page.xpath('//div[@class="csc-textpic-image csc-textpic-last"]//img/@src')[0]
- parts = page.xpath('//li[contains(text(), "phone")]/text()')[0].split(":")
- note = parts[0]
- phone = parts[1]
- p.add_contact(note, phone, note)
- email = self.get_email(page)
- if email:
- p.add_contact("email", email)
- if district == "Brompton":
- p._related[0].extras["boundary_url"] = "/boundaries/sherbrooke-boroughs/brompton/"
- elif district == "Lennoxville":
- p._related[0].extras["boundary_url"] = "/boundaries/sherbrooke-boroughs/lennoxville/"
- yield p
+ if email:
+ p.add_contact("email", email)
+ if phone:
+ p.add_contact("voice", phone, "legislature")
+ if i:
+ p._related[0].extras["boundary_url"] = f"/boundaries/sherbrooke-boroughs/{slugify(district)}/"
+ yield p
diff --git a/ca_qc_terrebonne/people.py b/ca_qc_terrebonne/people.py
index 023b0066..40b87c9e 100644
--- a/ca_qc_terrebonne/people.py
+++ b/ca_qc_terrebonne/people.py
@@ -1,34 +1,33 @@
from utils import CanadianPerson as Person
from utils import CanadianScraper
-COUNCIL_PAGE = "http://www.ville.terrebonne.qc.ca/fr/10/Conseil_municipal"
+COUNCIL_PAGE = "https://terrebonne.ca/membres-du-conseil-municipal/"
class TerrebonnePersonScraper(CanadianScraper):
def scrape(self):
page = self.lxmlize(COUNCIL_PAGE, "utf-8")
- councillors = page.xpath('//div[contains(@class, "member-box member-box--")]')
+ councillors = page.xpath('//div[contains(@class, "member-card jsBlockLink")]')
assert len(councillors), "No councillors found"
for councillor in councillors:
- name = councillor.xpath('.//div[@class="fiche__name"]/text()')[0]
- phone = councillor.xpath('.//div[@class="fiche__social"]/span/text()')[0].split("T")[1]
- email_mailto = councillor.xpath('.//div[@class="fiche__social"]/a[contains(@href, "mailto")]/@href')
- photo_url = councillor.xpath(".//img")[0].attrib["src"]
-
- page = self.lxmlize(councillor.xpath('.//a[@class="member-box__calltoaction"]/@href')[0])
- district = page.xpath('.//div[@class="fiche__category"]/text()')[0]
-
- if district == "Maire":
- district = "Terrebonne"
+ name = councillor.xpath('.//a[@class="name"]/text()')[0]
+ district = councillor.xpath('.//p[@class="district"]/text()')[0]
+ if "Maire" in district:
role = "Maire"
+ district = "Terrebonne"
else:
- district = "District {}".format(district)
role = "Conseiller"
+ district = district.split(" - ")[0]
+
+ photo_url = councillor.xpath(".//noscript/img/@src")[0]
+ url = councillor.xpath(".//@href")[0]
+
+ page = self.lxmlize(url)
+ email = self.get_email(page)
+ phone = self.get_phone(page)
p = Person(primary_org="legislature", name=name, district=district, role=role, image=photo_url)
p.add_source(COUNCIL_PAGE)
p.add_contact("voice", phone, "legislature")
- if email_mailto:
- email = email_mailto[0].split("mailto:")[1]
- p.add_contact("email", email)
+ p.add_contact("email", email)
yield p
diff --git a/ca_qc_trois_rivieres/people.py b/ca_qc_trois_rivieres/people.py
index f0c2ea31..b80f295a 100644
--- a/ca_qc_trois_rivieres/people.py
+++ b/ca_qc_trois_rivieres/people.py
@@ -11,17 +11,16 @@
class TroisRivieresPersonScraper(CanadianScraper):
def scrape(self):
page = self.lxmlize(COUNCIL_PAGE)
- members = page.xpath('//div[@class="photos_conseillers"]//figure')
+ members = page.xpath('//div[contains(@class, "photos_conseillers")]//figure')
assert len(members), "No councillors found"
for member in members:
photo_url = member.xpath(".//a//img/@src")[0]
url = member.xpath(".//figcaption//a/@href")[0]
- email = self.lxmlize(url).xpath('//div[@class="content-page"]//a[starts-with(@href, "mailto:")]/@href')[0]
+ email = self.get_email(self.lxmlize(url))
- email = re.sub("^mailto:", "", email)
name, district = [x.strip() for x in member.xpath(".//figcaption//text()")]
- district = re.sub(r"\A(?:de|des|du) ", lambda match: match.group(0).lower(), district, flags=re.I)
+ district = re.sub(r"\A(?:de|des|du) ", lambda match: match.group(0).lower(), district, flags=re.IGNORECASE)
role = "Conseiller"
if "Maire" in district:
diff --git a/ca_qc_westmount/people.py b/ca_qc_westmount/people.py
index a047d864..9ed24012 100644
--- a/ca_qc_westmount/people.py
+++ b/ca_qc_westmount/people.py
@@ -21,11 +21,14 @@ def scrape(self):
role = "Conseiller"
district = councillor.xpath(".//li//text()")[0]
+ email = self.get_email(councillor, error=False)
+
p = Person(primary_org="legislature", name=name, district=district, role=role)
p.add_source(COUNCIL_PAGE)
p.image = councillor.xpath(".//@src")[0]
p.add_contact("voice", self.get_phone(councillor), "legislature")
- p.add_contact("email", self.get_email(councillor))
+ if email:
+ p.add_contact("email", email)
yield p
diff --git a/ca_sk/people.py b/ca_sk/people.py
index 6f46fe52..c3ecfc0f 100644
--- a/ca_sk/people.py
+++ b/ca_sk/people.py
@@ -1,3 +1,6 @@
+import contextlib
+import re
+
from utils import CanadianPerson as Person
from utils import CanadianScraper
@@ -8,71 +11,61 @@ class SaskatchewanPersonScraper(CanadianScraper):
def scrape(self):
page = self.lxmlize(COUNCIL_PAGE)
- members = page.xpath('//table[@id="MLAs"]//tr')[1:]
+ members = page.xpath('//table[@id="mla-table"]//tr')[1:]
assert len(members), "No members found"
for member in members:
- if "Vacant" not in member.xpath("./td")[0].text_content():
- name = member.xpath("./td")[0].text_content().split(". ", 1)[1]
- district = member.xpath("./td")[2].text_content()
- url = member.xpath("./td[1]/a/@href")[0]
- page = self.lxmlize(url)
- party = page.xpath('//span[@id="ContentContainer_MainContent_ContentBottom_Property4"]' "/span")[
- 0
- ].text
-
- p = Person(primary_org="legislature", name=name, district=district, role="MLA", party=party)
- p.add_source(COUNCIL_PAGE)
- p.add_source(url)
- try:
- p.image = page.xpath('//div[contains(@class, "mla-image-cell")]/img/@src')[0]
- except IndexError:
- pass
+ if "Vacant" in member.xpath("./td")[1].text_content():
+ continue
+ name = member.xpath("./td")[0].text_content().split(". ", 1)[1].strip()
+ district = member.xpath("./td")[2].text_content().strip()
+ url = member.xpath("./td[1]/a/@href")[0]
+ page = self.lxmlize(url)
+ party = page.xpath('//div[contains(@class, "mla-header")]')[0].text.split(" - ")[1].strip()
- contact = page.xpath('//div[@id="mla-contact"]/div[2]')[0]
- website = contact.xpath("./div[3]/div[3]/div[2]/a")
- if website:
- p.add_link(website[0].text_content())
+ p = Person(primary_org="legislature", name=name, district=district, role="MLA", party=party)
+ p.add_source(COUNCIL_PAGE)
+ p.add_source(url)
+ with contextlib.suppress(IndexError):
+ p.image = page.xpath('//div[contains(@class, "mla-image-cell")]/img/@src')[0]
- def handle_address(lines, address_type):
- address_lines = []
- for line in lines:
- if line.endswith(":"): # Room:, Phone:, Fax:
- break
- address_lines.append(line)
- if address_lines:
- p.add_contact(
- "address",
- " ".join(address_lines),
- address_type,
- )
+ def handle_address(p, lines, address_type):
+ address_lines = []
+ for line in lines:
+ if re.match(r"(Room|Phone|Fax)\:", line):
+ break
+ address_lines.append(line)
+ if address_lines:
+ p.add_contact(
+ "address",
+ " ".join(address_lines),
+ address_type,
+ )
- def handle_phone(lines, phone_type):
- if "Phone:" in lines:
- next_line = lines[lines.index("Phone:") + 1]
- if next_line.endswith(":"):
- return
- number = None
- if "/" in next_line:
- for fragment in next_line.split("/"):
- if fragment.strip().startswith("306-"):
- number = fragment.strip()
- break
- else:
- number = next_line
- p.add_contact("voice", number, phone_type, area_code=306)
+ def handle_phone(p, lines, phone_type):
+ matches = re.findall(r"Phone\:\s*(306-[\d\-]+)", "\n".join(lines))
+ if len(matches) == 1:
+ p.add_contact("voice", matches[0], phone_type, area_code=306)
- legislature_lines = contact.xpath('.//div[@class="col-md-4"][1]/div//text()')
- assert legislature_lines[0] == "Legislative Building Address"
- handle_address(legislature_lines[1:], "legislature")
- handle_phone(legislature_lines[1:], "legislature")
+ for address in page.xpath('//div[@class="col-md-3"]'):
+ lines = address.xpath("./div//text()")
+ address_type = None
+ if lines[0] == "Legislative Building Address":
+ address_type = "legislature"
+ elif lines[0] == "Constituency Address":
+ address_type = "constituency"
+ else:
+ raise AssertionError(f"Unexpected address type: {lines[0]}")
+ handle_address(p, lines[1:], address_type)
+ handle_phone(p, lines[1:], address_type)
- constituency_lines = contact.xpath('.//div[@class="col-md-4"][2]/div//text()')
- assert constituency_lines[0] == "Constituency Address"
- handle_address(constituency_lines[1:], "constituency")
- handle_phone(constituency_lines[1:], "constituency")
+ email = self.get_email(page.xpath('//div[@id="content"]')[0], error=False)
+ if email:
+ p.add_contact("email", email)
- email = self.get_email(contact, error=False)
- if email:
- p.add_contact("email", email)
+ websites = re.findall(
+ r"Website:\s*(http\S+)", " ".join(page.xpath('//div[@class="col-md-4"]/div//text()'))
+ )
+ if len(websites) == 1:
+ p.add_link(websites[0])
- yield p
+ yield p
diff --git a/ca_sk_regina/people.py b/ca_sk_regina/people.py
index adfb1714..13565418 100644
--- a/ca_sk_regina/people.py
+++ b/ca_sk_regina/people.py
@@ -1,11 +1,10 @@
-import re
from urllib.parse import urljoin
from utils import CanadianPerson as Person
from utils import CanadianScraper
COUNCIL_PAGE = "https://www.regina.ca/city-government/city-council"
-MAYOR_CONTACT_URL = "https://www.regina.ca/city-government/city-council/mayors-office"
+MAYOR_CONTACT_URL = "https://www.regina.ca/city-government/city-council/mayors-office/contact-mayor/"
class ReginaPersonScraper(CanadianScraper):
@@ -26,7 +25,6 @@ def scrape(self):
def councillor_data(self, url, name, ward):
page = self.lxmlize(url)
- # sadly, email is a form on a separate page
photo_url_rel = page.xpath('//div[@class="councillor__image"]//img/@src')[0]
photo_url = urljoin(url, photo_url_rel)
@@ -34,12 +32,8 @@ def councillor_data(self, url, name, ward):
m.add_source(COUNCIL_PAGE)
m.add_source(url)
- # Scrape and add phone.
- phone_path = page.xpath('//div[@class="councillor__contact"]//ul/li/a/@href[contains(., "306")]')[0]
- phone_string = phone_path.rsplit("/", 1)[-1]
- phone = re.sub("[^0-9]", "", phone_string)
- if phone:
- m.add_contact("voice", phone, "legislature")
+ m.add_contact("voice", self.get_phone(page), "legislature")
+ m.add_contact("email", self.get_email(page))
m.image = photo_url
yield m
@@ -57,11 +51,8 @@ def mayor_data(self, url):
m.add_source(url)
m.image = photo_url
- # Scrape and add phone.
- phone_path = page.xpath('//div[@class="councillor__contact"]//ul/li/a/@href[contains(., "306")]')[0]
- phone_string = phone_path.rsplit("/", 1)[-1]
- phone = re.sub("[^0-9]", "", phone_string)
- if phone:
- m.add_contact("voice", phone, "legislature")
+ page = self.lxmlize(MAYOR_CONTACT_URL)
+ m.add_contact("voice", self.get_phone(page), "legislature")
+ m.add_contact("email", self.get_email(page))
return m
diff --git a/ca_yt/__init__.py b/ca_yt/__init__.py
index afb558fa..c80defd9 100644
--- a/ca_yt/__init__.py
+++ b/ca_yt/__init__.py
@@ -5,6 +5,6 @@ class Yukon(CanadianJurisdiction):
classification = "legislature"
division_id = "ocd-division/country:ca/territory:yt"
division_name = "Yukon"
- name = "Yukon Legislative Assembly"
+ name = "Legislative Assembly of Yukon"
url = "https://yukonassembly.ca"
parties = [{"name": "Yukon Liberal Party"}, {"name": "Yukon Party"}, {"name": "New Democratic Party"}]
diff --git a/ca_yt/people.py b/ca_yt/people.py
index d7f95eee..b6dd62f4 100644
--- a/ca_yt/people.py
+++ b/ca_yt/people.py
@@ -1,3 +1,5 @@
+import contextlib
+
from utils import CanadianPerson as Person
from utils import CanadianScraper
@@ -25,17 +27,15 @@ def scrape(self):
p = Person(primary_org="legislature", name=name, district=district, role="MLA", party=party)
p.add_source(COUNCIL_PAGE)
p.add_source(url)
- try:
+ with contextlib.suppress(IndexError):
p.image = page.xpath('//article[contains(@class, "member")]/p/img/@src')[0]
- except IndexError:
- pass
contact = page.xpath('//article[contains(@class, "members-sidebar")]')[0]
website = contact.xpath("./div[3]/div[3]/div[2]/a")
if website:
p.add_link(website[0].text_content())
- def handle_address(lines, address_type):
+ def handle_address(p, lines, address_type):
address_lines = []
for line in lines:
if line.endswith(":"): # Room:, Phone:, Fax:
@@ -48,7 +48,7 @@ def handle_address(lines, address_type):
address_type,
)
- def handle_phone(lines, phone_type):
+ def handle_phone(p, lines, phone_type):
if "Phone:" in lines:
next_line = lines[lines.index("Phone:") + 1]
if next_line.endswith(":"):
@@ -66,8 +66,8 @@ def handle_phone(lines, phone_type):
address_lines = contact.xpath("//address//text()")
contact_lines = contact.xpath("//p[2]//text()")
assert address_lines[0].strip() == "Yukon Legislative Assembly"
- handle_address(address_lines[1:], "legislature")
- handle_phone(contact_lines[1:], "legislature")
+ handle_address(p, address_lines[1:], "legislature")
+ handle_phone(p, contact_lines[1:], "legislature")
email = self.get_email(contact, error=False)
if email:
diff --git a/country-ca.csv b/country-ca.csv
index 3dbb69c6..0734bb82 100644
--- a/country-ca.csv
+++ b/country-ca.csv
@@ -2550,10 +2550,10 @@ ocd-division/country:ca/csd:2442110/district:4,District 4,,,,,,,,,,,,,,
ocd-division/country:ca/csd:2442110/district:5,District 5,,,,,,,,,,,,,,
ocd-division/country:ca/csd:2442110/district:6,District 6,,,,,,,,,,,,,,
ocd-division/country:ca/csd:2443027,Sherbrooke,,,,,V,Y,Sherbrooke,,Ville de Sherbrooke,,24,,,
-ocd-division/country:ca/csd:2443027/borough:1,Arrondissement 1,,,,,,,,,,,,,,
-ocd-division/country:ca/csd:2443027/borough:2,Arrondissement 2,,,,,,,,,,,,,,
-ocd-division/country:ca/csd:2443027/borough:3,Arrondissement 3,,,,,,,,,,,,,,
-ocd-division/country:ca/csd:2443027/borough:4,Arrondissement 4,,,,,,,,,,,,,,
+ocd-division/country:ca/csd:2443027/borough:1,Brompton–Rock Forest–Saint-Élie–Deauville,,,,,,,,,,,,,,
+ocd-division/country:ca/csd:2443027/borough:2,Fleurimont,,,,,,,,,,,,,,
+ocd-division/country:ca/csd:2443027/borough:3,Lennoxville,,,,,,,,,,,,,,
+ocd-division/country:ca/csd:2443027/borough:4,Nations,,,,,,,,,,,,,,
ocd-division/country:ca/csd:2443027/district:1.1,Lac Magog,,,,,,,,,,,,,,
ocd-division/country:ca/csd:2443027/district:1.2,Rock Forest,,,,,,,,,,,,,,
ocd-division/country:ca/csd:2443027/district:1.3,Saint-Élie,,,,,,,,,,,,,,
@@ -4148,24 +4148,25 @@ ocd-division/country:ca/csd:2480135,Duhamel,,,,,MÉ,N,Duhamel,,,,7,,,
ocd-division/country:ca/csd:2480140,Val-des-Bois,,,,,MÉ,N,Val-des-Bois,,,,7,,,
ocd-division/country:ca/csd:2480145,Bowman,,,,,MÉ,N,Bowman,,,,7,,,
ocd-division/country:ca/csd:2481017,Gatineau,,,,,V,Y,Gatineau,,Ville de Gatineau,,19,,,
-ocd-division/country:ca/csd:2481017/district:1,d'Aylmer,,,,,,,,,,,,,,
-ocd-division/country:ca/csd:2481017/district:10,de Touraine,,,,,,,,,,,,,,
-ocd-division/country:ca/csd:2481017/district:11,de Pointe-Gatineau,,,,,,,,,,,,,,
-ocd-division/country:ca/csd:2481017/district:12,du Carrefour-de-l'Hôpital,,,,,,,,,,,,,,
-ocd-division/country:ca/csd:2481017/district:13,du Versant,,,,,,,,,,,,,,
-ocd-division/country:ca/csd:2481017/district:14,de Bellevue,,,,,,,,,,,,,,
-ocd-division/country:ca/csd:2481017/district:15,du Lac-Beauchamp,,,,,,,,,,,,,,
-ocd-division/country:ca/csd:2481017/district:16,de la Rivière-Blanche,,,,,,,,,,,,,,
-ocd-division/country:ca/csd:2481017/district:17,de Masson-Angers,,,,,,,,,,,,,,
-ocd-division/country:ca/csd:2481017/district:18,de Buckingham,,,,,,,,,,,,,,
-ocd-division/country:ca/csd:2481017/district:2,de Lucerne,,,,,,,,,,,,,,
-ocd-division/country:ca/csd:2481017/district:3,de Deschênes,,,,,,,,,,,,,,
-ocd-division/country:ca/csd:2481017/district:4,du Plateau—Manoir-des-Trembles,,,,,,,,,,,,,,
-ocd-division/country:ca/csd:2481017/district:5,de Wright—Parc-de-la-Montagne,,,,,,,,,,,,,,
-ocd-division/country:ca/csd:2481017/district:6,de l'Orée-du-Parc,,,,,,,,,,,,,,
-ocd-division/country:ca/csd:2481017/district:7,de Saint-Raymond—Vanier,,,,,,,,,,,,,,
-ocd-division/country:ca/csd:2481017/district:8,de Hull—Val-Tétreau,,,,,,,,,,,,,,
-ocd-division/country:ca/csd:2481017/district:9,de Limbour,,,,,,,,,,,,,,
+ocd-division/country:ca/csd:2481017/district:1,Aylmer,,,,,,,,,,,,,,
+ocd-division/country:ca/csd:2481017/district:10,Limbour,,,,,,,,,,,,,,
+ocd-division/country:ca/csd:2481017/district:11,Touraine,,,,,,,,,,,,,,
+ocd-division/country:ca/csd:2481017/district:12,Pointe-Gatineau,,,,,,,,,,,,,,
+ocd-division/country:ca/csd:2481017/district:13,Carrefour-de-l'Hôpital,,,,,,,,,,,,,,
+ocd-division/country:ca/csd:2481017/district:14,Versant,,,,,,,,,,,,,,
+ocd-division/country:ca/csd:2481017/district:15,Bellevue,,,,,,,,,,,,,,
+ocd-division/country:ca/csd:2481017/district:16,Lac-Beauchamp,,,,,,,,,,,,,,
+ocd-division/country:ca/csd:2481017/district:17,Rivière-Blanche,,,,,,,,,,,,,,
+ocd-division/country:ca/csd:2481017/district:18,Masson-Angers,,,,,,,,,,,,,,
+ocd-division/country:ca/csd:2481017/district:19,Buckingham,,,,,,,,,,,,,,
+ocd-division/country:ca/csd:2481017/district:2,Lucerne,,,,,,,,,,,,,,
+ocd-division/country:ca/csd:2481017/district:3,Deschênes,,,,,,,,,,,,,,
+ocd-division/country:ca/csd:2481017/district:4,Plateau,,,,,,,,,,,,,,
+ocd-division/country:ca/csd:2481017/district:5,Mitigomijokan,,,,,,,,,,,,,,
+ocd-division/country:ca/csd:2481017/district:6,Manoir-des-Trembles—Val-Tétreau,,,,,,,,,,,,,,
+ocd-division/country:ca/csd:2481017/district:7,Hull–Wright,,,,,,,,,,,,,,
+ocd-division/country:ca/csd:2481017/district:8,Parc-de-la-Montagne—Saint-Raymond,,,,,,,,,,,,,,
+ocd-division/country:ca/csd:2481017/district:9,Orée-du-Parc,,,,,,,,,,,,,,
ocd-division/country:ca/csd:2482005,L'Ange-Gardien,,,,,MÉ,Y,L'Ange-Gardien,,,,7,,,
ocd-division/country:ca/csd:2482005/district:1,du Lièvre,,,,,,,,,,,,,,
ocd-division/country:ca/csd:2482005/district:2,Lac Donaldson,,,,,,,,,,,,,,
@@ -4722,7 +4723,7 @@ ocd-division/country:ca/csd:3502036,Clarence-Rockland,,,,,C,,Clarence-Rockland,,
ocd-division/country:ca/csd:3502044,Casselman,,,,,VL,,Casselman,,Village of Casselman,ocd-division/country:ca/cd:3502,,,,
ocd-division/country:ca/csd:3502048,Russell,,,,,TP,,Russell,,Township of Russell,ocd-division/country:ca/cd:3502,,,,
ocd-division/country:ca/csd:3506008,Ottawa,,,,,CV,,Ottawa,,City of Ottawa,,,,,
-ocd-division/country:ca/csd:3506008/ward:1,Orléans,,,,,,,,,,,,,,
+ocd-division/country:ca/csd:3506008/ward:1,Orléans East-Cumberland,,,,,,,,,,,,,,
ocd-division/country:ca/csd:3506008/ward:10,Gloucester-Southgate,,,,,,,,,,,,,,
ocd-division/country:ca/csd:3506008/ward:11,Beacon Hill-Cyrville,,,,,,,,,,,,,,
ocd-division/country:ca/csd:3506008/ward:12,Rideau-Vanier,,,,,,,,,,,,,,
@@ -4732,13 +4733,14 @@ ocd-division/country:ca/csd:3506008/ward:15,Kitchissippi,,,,,,,,,,,,,,
ocd-division/country:ca/csd:3506008/ward:16,River,,,,,,,,,,,,,,
ocd-division/country:ca/csd:3506008/ward:17,Capital,,,,,,,,,,,,,,
ocd-division/country:ca/csd:3506008/ward:18,Alta Vista,,,,,,,,,,,,,,
-ocd-division/country:ca/csd:3506008/ward:19,Cumberland,,,,,,,,,,,,,,
-ocd-division/country:ca/csd:3506008/ward:2,Innes,,,,,,,,,,,,,,
+ocd-division/country:ca/csd:3506008/ward:19,Orléans South-Navan,,,,,,,,,,,,,,
+ocd-division/country:ca/csd:3506008/ward:2,Orléans West-Innes,,,,,,,,,,,,,,
ocd-division/country:ca/csd:3506008/ward:20,Osgoode,,,,,,,,,,,,,,
-ocd-division/country:ca/csd:3506008/ward:21,Rideau-Goulbourn,,,,,,,,,,,,,,
-ocd-division/country:ca/csd:3506008/ward:22,Gloucester-South Nepean,,,,,,,,,,,,,,
+ocd-division/country:ca/csd:3506008/ward:21,Rideau-Jock,,,,,,,,,,,,,,
+ocd-division/country:ca/csd:3506008/ward:22,Riverside South-Findlay Creek,,,,,,,,,,,,,,
ocd-division/country:ca/csd:3506008/ward:23,Kanata South,,,,,,,,,,,,,,
-ocd-division/country:ca/csd:3506008/ward:3,Barrhaven,,,,,,,,,,,,,,
+ocd-division/country:ca/csd:3506008/ward:24,Barrhaven East,,,,,,,,,,,,,,
+ocd-division/country:ca/csd:3506008/ward:3,Barrhaven West,,,,,,,,,,,,,,
ocd-division/country:ca/csd:3506008/ward:4,Kanata North,,,,,,,,,,,,,,
ocd-division/country:ca/csd:3506008/ward:5,West Carleton-March,,,,,,,,,,,,,,
ocd-division/country:ca/csd:3506008/ward:6,Stittsville,,,,,,,,,,,,,,
@@ -4834,13 +4836,13 @@ ocd-division/country:ca/csd:3515037,North Kawartha,,,,,TP,,North Kawartha,,Towns
ocd-division/country:ca/csd:3515044,Trent Lakes,,,,,MU,,Trent Lakes,,Municipality of Trent Lakes,ocd-division/country:ca/cd:3515,,,,
ocd-division/country:ca/csd:3516010,Kawartha Lakes,,,,,CY,,Kawartha Lakes,,City of Kawartha Lakes,,,,,
ocd-division/country:ca/csd:3516010/ward:1,Ward 1,,,,,,,,,,,,,,
-ocd-division/country:ca/csd:3516010/ward:10,Ward 10,,,,,,,,,,,,,,
-ocd-division/country:ca/csd:3516010/ward:11,Ward 11,,,,,,,,,,,,,,
-ocd-division/country:ca/csd:3516010/ward:12,Ward 12,,,,,,,,,,,,,,
-ocd-division/country:ca/csd:3516010/ward:13,Ward 13,,,,,,,,,,,,,,
-ocd-division/country:ca/csd:3516010/ward:14,Ward 14,,,,,,,,,,,,,,
-ocd-division/country:ca/csd:3516010/ward:15,Ward 15,,,,,,,,,,,,,,
-ocd-division/country:ca/csd:3516010/ward:16,Ward 16,,,,,,,,,,,,,,
+ocd-division/country:ca/csd:3516010/ward:10,Ward 10,,2018-12-01,,,,,,,,,,,,
+ocd-division/country:ca/csd:3516010/ward:11,Ward 11,,2018-12-01,,,,,,,,,,,,
+ocd-division/country:ca/csd:3516010/ward:12,Ward 12,,2018-12-01,,,,,,,,,,,,
+ocd-division/country:ca/csd:3516010/ward:13,Ward 13,,2018-12-01,,,,,,,,,,,,
+ocd-division/country:ca/csd:3516010/ward:14,Ward 14,,2018-12-01,,,,,,,,,,,,
+ocd-division/country:ca/csd:3516010/ward:15,Ward 15,,2018-12-01,,,,,,,,,,,,
+ocd-division/country:ca/csd:3516010/ward:16,Ward 16,,2018-12-01,,,,,,,,,,,,
ocd-division/country:ca/csd:3516010/ward:2,Ward 2,,,,,,,,,,,,,,
ocd-division/country:ca/csd:3516010/ward:3,Ward 3,,,,,,,,,,,,,,
ocd-division/country:ca/csd:3516010/ward:4,Ward 4,,,,,,,,,,,,,,
@@ -4848,7 +4850,7 @@ ocd-division/country:ca/csd:3516010/ward:5,Ward 5,,,,,,,,,,,,,,
ocd-division/country:ca/csd:3516010/ward:6,Ward 6,,,,,,,,,,,,,,
ocd-division/country:ca/csd:3516010/ward:7,Ward 7,,,,,,,,,,,,,,
ocd-division/country:ca/csd:3516010/ward:8,Ward 8,,,,,,,,,,,,,,
-ocd-division/country:ca/csd:3516010/ward:9,Ward 9,,,,,,,,,,,,,,
+ocd-division/country:ca/csd:3516010/ward:9,Ward 9,,2018-12-01,,,,,,,,,,,,
ocd-division/country:ca/csd:3518001,Pickering,,,,,CY,,Pickering,,City of Pickering,ocd-division/country:ca/cd:3518,,,,
ocd-division/country:ca/csd:3518001/ward:1,Ward 1,,,,,,,,,,,,,,
ocd-division/country:ca/csd:3518001/ward:2,Ward 2,,,,,,,,,,,,,,
@@ -5564,7 +5566,7 @@ ocd-division/country:ca/csd:3557061/ward:2,Ward 2,,,,,,,,,,,,,,
ocd-division/country:ca/csd:3557061/ward:3,Ward 3,,,,,,,,,,,,,,
ocd-division/country:ca/csd:3557061/ward:4,Ward 4,,,,,,,,,,,,,,
ocd-division/country:ca/csd:3557061/ward:5,Ward 5,,,,,,,,,,,,,,
-ocd-division/country:ca/csd:3557061/ward:6,Ward 6,,,,,,,,,,,,,,
+ocd-division/country:ca/csd:3557061/ward:6,Ward 6,,2018-10-21,,,,,,,,,,,,
ocd-division/country:ca/csd:3557066,Prince,,,,,TP,,Prince,,Township of Prince,,,,,
ocd-division/country:ca/csd:3557071,Sagamok,,,,,IRI,N,Sagamok,,,,,,,
ocd-division/country:ca/csd:3557072,Serpent River 7,,,,,IRI,N,Serpent River 7,,,,,,,
diff --git a/disabled/ca_bc_municipalities/people.py b/disabled/ca_bc_municipalities/people.py
index 6e505ebb..f4100405 100644
--- a/disabled/ca_bc_municipalities/people.py
+++ b/disabled/ca_bc_municipalities/people.py
@@ -92,7 +92,7 @@ def scrape(self):
if division_id in exclude_divisions:
continue
if division_id in processed_ids:
- raise Exception("unhandled collision: {}".format(division_id))
+ raise Exception(f"unhandled collision: {division_id}")
division = Division.get(division_id)
processed_divisions.add(division_name)
diff --git a/disabled/ca_bc_municipalities_candidates/people.py b/disabled/ca_bc_municipalities_candidates/people.py
index 109e4aa3..dab5c42b 100644
--- a/disabled/ca_bc_municipalities_candidates/people.py
+++ b/disabled/ca_bc_municipalities_candidates/people.py
@@ -116,11 +116,8 @@ def scrape(self):
role = row["primary role"]
if role not in expected_roles:
- raise Exception("unexpected role: {}".format(role))
- if row["district id"]:
- district = format(division_id)
- else:
- district = division_name
+ raise Exception(f"unexpected role: {role}")
+ district = format(division_id) if row["district id"] else division_name
organization.add_post(role=role, label=district, division_id=division_id)
diff --git a/disabled/ca_mb_municipalities/people.py b/disabled/ca_mb_municipalities/people.py
index 674b114f..87c2923a 100644
--- a/disabled/ca_mb_municipalities/people.py
+++ b/disabled/ca_mb_municipalities/people.py
@@ -15,10 +15,7 @@ def scrape(self):
districts = page.xpath('//div[@id="ctl00_PublicContent_divSearchContent"]//tr')[5::3]
for district in districts:
title = district.xpath(".//td//text()")
- if len(title[0]) > 1:
- title = title[0]
- else:
- title = "".join(title[:2])
+ title = title[0] if len(title[0]) > 1 else "".join(title[:2])
# @todo Need to distinguish between, e.g., R.M. and Town
title = title.title()
diff --git a/disabled/ca_municipalities/people.py b/disabled/ca_municipalities/people.py
index c20c3878..e7537a4e 100644
--- a/disabled/ca_municipalities/people.py
+++ b/disabled/ca_municipalities/people.py
@@ -61,7 +61,7 @@ def scrape(self):
if self.many_posts_per_area and role not in self.unique_roles:
seat_numbers[role][district] += 1
- district = "{} (seat {})".format(district, seat_numbers[role][district])
+ district = f"{district} (seat {seat_numbers[role][district]})"
p = Person(
primary_org=organization_classification,
@@ -111,6 +111,5 @@ def scrape(self):
p.validate()
yield p
- except Exception as e:
- print(repr(e))
- continue
+ except Exception:
+ pass
diff --git a/disabled/ca_nb_municipalities/people.py b/disabled/ca_nb_municipalities/people.py
index c0c95580..a4f42954 100644
--- a/disabled/ca_nb_municipalities/people.py
+++ b/disabled/ca_nb_municipalities/people.py
@@ -52,7 +52,7 @@ def scrape(self):
if division.attrs["classification"] == "P":
continue
if division.name in names_to_ids:
- raise Exception("unhandled collision: {}".format(division.name))
+ raise Exception(f"unhandled collision: {division.name}")
else:
names_to_ids[division.name] = division.id
@@ -79,11 +79,11 @@ def scrape(self):
if division_id in exclude_divisions:
continue
if division_id in seen:
- raise Exception("unhandled collision: {}".format(division_id))
+ raise Exception(f"unhandled collision: {division_id}")
seen.add(division_id)
division_name = Division.get(division_id).name
- organization_name = "{} {} Council".format(division_name, classifications[list_link.text])
+ organization_name = f"{division_name} {classifications[list_link.text]} Council"
organization = Organization(name=organization_name, classification="government")
organization.add_source(detail_url)
@@ -104,7 +104,7 @@ def scrape(self):
for p in groups:
role = p.xpath("./b/text()")[0].rstrip("s")
if role not in expected_roles:
- raise Exception("unexpected role: {}".format(role))
+ raise Exception(f"unexpected role: {role}")
councillors = p.xpath("./text()")
assert len(councillors), "No councillors found"
@@ -112,10 +112,7 @@ def scrape(self):
if "vacant" in name.lower():
continue
- if role in unique_roles:
- district = division_name
- else:
- district = "{} (seat {})".format(division_name, seat_number)
+ district = division_name if role in unique_roles else f"{division_name} (seat {seat_number})"
organization.add_post(role=role, label=district, division_id=division_id)
diff --git a/disabled/ca_nl_municipalities/people.py b/disabled/ca_nl_municipalities/people.py
index 3fbd9c6e..499b0e73 100644
--- a/disabled/ca_nl_municipalities/people.py
+++ b/disabled/ca_nl_municipalities/people.py
@@ -1,7 +1,7 @@
import os
import re
import subprocess
-from urllib.request import urlopen
+import tempfile
from pupa.scrape import Organization
@@ -16,15 +16,14 @@ def scrape(self):
page = self.lxmlize(COUNCIL_PAGE)
url = page.xpath('//a[contains(text(),"Municipal Directory")]/@href')[0]
- response = urlopen(url).read()
- pdf = open("/tmp/nl.pdf", "w")
- pdf.write(response)
- pdf.close()
+ response = self.get(url).content
+ with tempfile.NamedTemporaryFile(delete=False) as pdf:  # kept on disk for pdftotext; removed explicitly by os.unlink()
+ pdf.write(response)
- data = subprocess.check_output(["pdftotext", "-layout", "/tmp/nl.pdf", "-"])
+ data = subprocess.check_output(["pdftotext", "-layout", pdf.name, "-"]) # noqa: S603,S607
pages = data.split("Municipal Directory")[1:]
for page in pages:
- page = page.splitlines(True)
+ page = page.splitlines(keepends=True)
column_index = {}
for line in page:
if "Official Name" in line:
@@ -81,4 +80,5 @@ def scrape(self):
if address:
membership.add_contact_detail("address", address, "legislature")
yield p
- os.system("rm /tmp/nl.pdf")
+
+ os.unlink(pdf.name)
diff --git a/disabled/ca_ns_municipalities/people.py b/disabled/ca_ns_municipalities/people.py
index 3533a463..2d8b9fbe 100644
--- a/disabled/ca_ns_municipalities/people.py
+++ b/disabled/ca_ns_municipalities/people.py
@@ -1,7 +1,7 @@
import os
import re
import subprocess
-from urllib.request import urlopen
+import tempfile
from pupa.scrape import Organization
@@ -13,16 +13,15 @@
class NovaScotiaMunicipalitiesPersonScraper(CanadianScraper):
def scrape(self):
- response = urlopen(COUNCIL_PAGE).read()
- pdf = open("/tmp/ns.pdf", "w")
- pdf.write(response)
- pdf.close()
+ response = self.get(COUNCIL_PAGE).content
+ with tempfile.NamedTemporaryFile(delete=False) as pdf:  # kept on disk for pdftotext; removed explicitly by os.unlink()
+ pdf.write(response)
- data = subprocess.check_output(["pdftotext", "/tmp/ns.pdf", "-"])
+ data = subprocess.check_output(["pdftotext", pdf.name, "-"]) # noqa: S603,S607
emails = re.findall(r"(?<=E-mail: ).+", data)
data = re.split(r"Mayor |Warden ", data)[1:]
for i, mayor in enumerate(data):
- lines = mayor.splitlines(True)
+ lines = mayor.splitlines(keepends=True)
name = lines.pop(0).strip()
if name == "Jim Smith":
continue
@@ -61,9 +60,9 @@ def scrape(self):
for i, email in enumerate(emails):
regex = name.split()[-1].lower() + "|" + "|".join(district.split()[-2:]).replace("of", "").lower()
regex = regex.replace("||", "|")
- matches = re.findall(r"{}".format(regex), email)
+ matches = re.findall(rf"{regex}", email)
if matches:
membership.add_contact_detail("email", emails.pop(i))
yield p
- os.system("rm /tmp/ns.pdf")
+ os.unlink(pdf.name)
diff --git a/disabled/ca_pe_municipalities/people.py b/disabled/ca_pe_municipalities/people.py
index 3eb0154f..f39591bb 100644
--- a/disabled/ca_pe_municipalities/people.py
+++ b/disabled/ca_pe_municipalities/people.py
@@ -44,7 +44,7 @@ def scrape(self):
councillors = page.xpath(
'//div[@style="WIDTH:750"]/dl/dt[contains(text(), "Elected Officials")]/parent::dl/dd/pre/text()'
- )[0].splitlines(True)
+ )[0].splitlines(keepends=True)
for councillor in councillors:
name = (
councillor.replace("(Mayor)", "")
diff --git a/disabled/ca_sk_municipalities/people.py b/disabled/ca_sk_municipalities/people.py
index c0f67240..b162995d 100644
--- a/disabled/ca_sk_municipalities/people.py
+++ b/disabled/ca_sk_municipalities/people.py
@@ -1,7 +1,7 @@
import os
import re
import subprocess
-from urllib.request import urlopen
+import tempfile
from pupa.scrape import Organization
@@ -14,14 +14,13 @@
class SaskatchewanMunicipalitiesPersonScraper(CanadianScraper):
def scrape(self):
- response = urlopen(COUNCIL_PAGE).read()
- pdf = open("/tmp/sk.pdf", "w")
- pdf.write(response)
- pdf.close()
+ response = self.get(COUNCIL_PAGE).content
+ with tempfile.NamedTemporaryFile(delete=False) as pdf:  # kept on disk for pdftotext; removed explicitly by os.unlink()
+ pdf.write(response)
- data = subprocess.check_output(["pdftotext", "-layout", "/tmp/sk.pdf", "-"])
+ data = subprocess.check_output(["pdftotext", "-layout", pdf.name, "-"]) # noqa: S603,S607
- data = data.splitlines(True)
+ data = data.splitlines(keepends=True)
pages = []
page = []
for line in data:
@@ -34,10 +33,7 @@ def scrape(self):
districts = []
for page in pages:
index = re.search(r"(\s{6,})", page[0])
- if index:
- index = index.end() - 1
- else:
- index = -1
+ index = index.end() - 1 if index else -1
dist1 = []
dist2 = []
for line in page:
@@ -99,4 +95,5 @@ def scrape(self):
for key, value in contacts.items():
membership.add_contact_detail(key, value, "" if key == "email" else "legislature")
yield p
- os.system("rm /tmp/sk.pdf")
+
+ os.unlink(pdf.name)
diff --git a/disabled/ca_yt_municipalities/people.py b/disabled/ca_yt_municipalities/people.py
index 130aa33c..7c7ba0af 100644
--- a/disabled/ca_yt_municipalities/people.py
+++ b/disabled/ca_yt_municipalities/people.py
@@ -1,7 +1,7 @@
import os
import re
import subprocess
-from urllib.request import urlopen
+import tempfile
from pupa.scrape import Organization
@@ -13,12 +13,11 @@
class YukonMunicipalitiesPersonScraper(CanadianScraper):
def scrape(self):
- response = urlopen(COUNCIL_PAGE).read()
- pdf = open("/tmp/yt.pdf", "w")
- pdf.write(response)
- pdf.close()
+ response = self.get(COUNCIL_PAGE).content
+ with tempfile.NamedTemporaryFile(delete=False) as pdf:  # kept on disk for pdftotext; removed explicitly by os.unlink()
+ pdf.write(response)
- data = subprocess.check_output(["pdftotext", "-layout", "/tmp/yt.pdf", "-"])
+ data = subprocess.check_output(["pdftotext", "-layout", pdf.name, "-"]) # noqa: S603,S607
data = re.split(r"\n\s*\n", data)
for municipality in data:
if "Councillors" not in municipality:
@@ -81,4 +80,4 @@ def scrape(self):
p.add_link(website)
yield p
- os.system("rm /tmp/yt.pdf")
+ os.unlink(pdf.name)
diff --git a/patch.py b/patch.py
index 2d6482a0..6d6312e8 100644
--- a/patch.py
+++ b/patch.py
@@ -27,9 +27,9 @@
(r"\A1 \d{3} \d{3}-\d{4}(?: x\d+)?\Z", lambda x: x["type"] in ("text", "voice", "fax", "cell", "video", "pager")),
]
# Validate the format of contact_details[].note.
-_contact_details["items"]["properties"]["note"][
- "pattern"
-] = r"\A(?:constituency|legislature|office|residence|)(?: \(\d\))?\Z"
+_contact_details["items"]["properties"]["note"]["pattern"] = (
+ r"\A(?:constituency|legislature|office|residence|)(?: \(\d\))?\Z"
+)
# contact_details[] must not include unexpected properties.
_contact_details["items"]["additionalProperties"] = False
@@ -57,7 +57,7 @@
social_re = re.compile(
r"(?:facebook|fb|instagram|linkedin|twitter|youtube)\.com|conservative\.ca"
-) # XXX ca_candidates
+) # special case: ca_candidates
facebook_re = re.compile(r"facebook\.com")
instagram_re = re.compile(r"instagram\.com")
linkedin_re = re.compile(r"linkedin\.com")
@@ -70,15 +70,15 @@
(1, lambda x: x["type"] == "email", "Membership has many emails"),
]
-for type in ("address", "cell", "fax", "voice"):
- for note in ("constituency", "legislature", "office", "residence"):
- matchers.append(
- (
- 1,
- lambda x, type=type, note=note: x["type"] == type and x["note"] == note,
- "Membership has contact_details with same type and note",
- )
- )
+matchers.extend(
+ (
+ 1,
+ lambda x, type=type, note=note: x["type"] == type and x["note"] == note,
+ "Membership has contact_details with same type and note",
+ )
+ for type in ("address", "cell", "fax", "voice")
+ for note in ("constituency", "legislature", "office", "residence")
+)
# A membership should not have notes on emails, should have notes on non-emails,
# should have at most one email, and should, in most cases, have at most one of
@@ -133,7 +133,7 @@
r"\A"
r"(?!(?:Chair|Commissioner|Conseiller|Councillor|Deputy|Dr|Hon|M|Maire|Mayor|Miss|Mme|Mr|Mrs|Ms|Regional|Warden)\b)"
r"(?:" + name_fragment + r"(?:'|-| - | )"
- r")+" + name_fragment + r"\Z" # noqa: W504
+ r")+" + name_fragment + r"\Z"
)
person_schema["properties"]["gender"]["enum"] = ["male", "female", ""]
# @note https://github.com/opennorth/represent-canada-images checks whether an
@@ -147,7 +147,7 @@
organization_schema["properties"]["classification"]["enum"] += ["government"]
-def validate_conditionalPattern(self, x, fieldname, schema, path, arguments=None):
+def validate_conditionalPattern(self, x, fieldname, schema, path, arguments=None): # noqa: N802
value = x.get(fieldname)
if isinstance(value, str):
for pattern, method in arguments:
@@ -158,7 +158,7 @@ def validate_conditionalPattern(self, x, fieldname, schema, path, arguments=None
DatetimeValidator.validate_conditionalPattern = validate_conditionalPattern
-def validate_maxMatchingItems(self, x, fieldname, schema, path, arguments=None):
+def validate_maxMatchingItems(self, x, fieldname, schema, path, arguments=None): # noqa: N802
value = x.get(fieldname)
if isinstance(value, list):
for length, method, message in arguments:
diff --git a/pyproject.toml b/pyproject.toml
index 8656c702..28cf62c2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,40 @@
-[tool.black]
+[project]
+name = "scrapers_ca"
+version = "0.0.1"
+
+[tool.ruff]
line-length = 119
+target-version = "py39"
+
+[tool.ruff.lint]
+select = ["ALL"]
+ignore = [
+ "ANN", "C901", "COM812", "D203", "D212", "D415", "EM", "ISC001", "PERF203", "PLR091", "Q000",
+ "D1", "D205",
+ "DTZ",
+ "E501",
+ "ERA001", # commented-out code
+ "PLR2004", # magic
+ "PLW2901",
+ "PTH",
+ "RUF012",
+ "S101", # assert
+ "S113", # timeout
+ "TRY003", # errors
+
+ # To fix:
+ "BLE001", # except Exception
+ "S110", # except pass
+ "TRY002", # raise Exception
+]
+allowed-confusables = ["’", "–"]
+
+[tool.ruff.lint.flake8-builtins]
+builtins-ignorelist = ["id", "type"]
+
+[tool.ruff.lint.flake8-self]
+extend-ignore-names = ["_ElementUnicodeResult", "_id", "_related", "_type"]
-[tool.isort]
-profile = 'black'
-line_length = 119
+[tool.ruff.lint.per-file-ignores]
+"patch.py" = ["ARG001"]
+"tasks.py" = ["T201"]
diff --git a/requirements.in b/requirements.in
new file mode 100644
index 00000000..3cb35058
--- /dev/null
+++ b/requirements.in
@@ -0,0 +1,11 @@
+# 0.9.0 uses jsonschema instead of validictory, so we use a commit after 0.8.0 that adds Django 2.0 support.
+git+https://github.com/opencivicdata/pupa@f0791f7de07574039eff10d804e4683399a16ec5
+agate
+agate-excel
+django<5
+invoke
+lxml
+opencivicdata
+regex
+requests[security]
+unidecode
diff --git a/requirements.txt b/requirements.txt
index c14d43a1..080af981 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,15 +1,83 @@
-# 0.9.0 uses jsonschema instead of validictory, so we use a commit after 0.8.0 that adds Django 2.0 support.
--e git+https://github.com/opencivicdata/pupa.git@f0791f7de07574039eff10d804e4683399a16ec5#egg=pupa
-opencivicdata==3.3.1
-Django==2.2.28
-
-# Scrapers
-agate
-agate-excel
-lxml==4.9.1
-regex==2014.04.10
-requests[security]==2.32.0
-
-# Maintenance
+# This file was autogenerated by uv via the following command:
+# uv pip compile requirements.in -o requirements.txt
+agate==1.12.0
+ # via
+ # -r requirements.in
+ # agate-excel
+agate-excel==0.4.1
+ # via -r requirements.in
+asgiref==3.8.1
+ # via django
+babel==2.16.0
+ # via agate
+certifi==2024.8.30
+ # via requests
+charset-normalizer==3.3.2
+ # via requests
+dj-database-url==0.3.0
+ # via pupa
+django==4.2.16
+ # via
+ # -r requirements.in
+ # opencivicdata
+ # pupa
+et-xmlfile==1.1.0
+ # via openpyxl
+idna==3.10
+ # via requests
invoke==0.11.1
-Unidecode==0.04.14
+ # via -r requirements.in
+isodate==0.6.1
+ # via agate
+leather==0.4.0
+ # via agate
+lxml==4.9.1
+ # via -r requirements.in
+olefile==0.47
+ # via agate-excel
+opencivicdata==3.3.1
+ # via
+ # -r requirements.in
+ # pupa
+openpyxl==3.1.5
+ # via agate-excel
+parsedatetime==2.6
+ # via agate
+psycopg2==2.9.9
+ # via pupa
+psycopg2-binary==2.9.9
+ # via opencivicdata
+pupa @ git+https://github.com/opencivicdata/pupa@f0791f7de07574039eff10d804e4683399a16ec5
+ # via -r requirements.in
+python-slugify==8.0.4
+ # via agate
+pytimeparse==1.1.8
+ # via agate
+pytz==2024.2
+ # via pupa
+regex==2014.4.10
+ # via -r requirements.in
+requests==2.32.3
+ # via
+ # -r requirements.in
+ # scrapelib
+scrapelib==2.3.0
+ # via pupa
+six==1.16.0
+ # via isodate
+sqlparse==0.5.1
+ # via django
+text-unidecode==1.3
+ # via python-slugify
+typing-extensions==4.12.2
+ # via asgiref
+unidecode==0.4.14
+ # via -r requirements.in
+urllib3==1.26.20
+ # via
+ # requests
+ # scrapelib
+validictory==1.1.3
+ # via pupa
+xlrd==2.0.1
+ # via agate-excel
diff --git a/setup.cfg b/setup.cfg
deleted file mode 100644
index cfb0df10..00000000
--- a/setup.cfg
+++ /dev/null
@@ -1,2 +0,0 @@
-[flake8]
-extend-ignore = E203,E501
diff --git a/setup.py b/setup.py
deleted file mode 100644
index 15719185..00000000
--- a/setup.py
+++ /dev/null
@@ -1,25 +0,0 @@
-# @see https://pythonhosted.org/an_example_pypi_project/setuptools.html
-# @see https://pythonhosted.org/setuptools/setuptools.html
-import os
-
-from setuptools import find_packages, setup
-
-
-def read(fname):
- return open(os.path.join(os.path.dirname(__file__), fname)).read()
-
-
-setup(
- name="scrapers_ca",
- version="0.0.1",
- author="Open North",
- author_email="info@opennorth.ca",
- description="Canadian legislative scrapers",
- license="MIT",
- url="https://github.com/opencivicdata/scrapers-ca",
- packages=find_packages(),
- long_description=read("README.md"),
- install_requires=[
- "lxml",
- ],
-)
diff --git a/tasks.py b/tasks.py
index d0581c57..004737c1 100644
--- a/tasks.py
+++ b/tasks.py
@@ -22,37 +22,29 @@
def module_names():
- """
- Returns all module names.
- """
+ """Return all module names."""
for module_name in os.listdir("."):
if os.path.isfile(os.path.join(module_name, "__init__.py")):
yield module_name
def modules_and_module_names_and_classes():
- """
- Returns modules, module names, and person scraper classes.
- """
+ """Return modules, module names, and person scraper classes."""
for module_name in module_names():
- module = importlib.import_module("{}.people".format(module_name))
- class_name = next(key for key in module.__dict__.keys() if "PersonScraper" in key)
+ module = importlib.import_module(f"{module_name}.people")
+ class_name = next(key for key in module.__dict__ if "PersonScraper" in key)
yield (module, module_name, module.__dict__[class_name])
def csv_dict_reader(url, encoding="utf-8"):
- """
- Reads a remote CSV file.
- """
+ """Read a remote CSV file."""
response = requests.get(url)
response.encoding = encoding
return csv.DictReader(StringIO(response.text))
def slug(name):
- """
- Slugifies a division name.
- """
+ """Slugify a division name."""
return unidecode(
str(name)
.lower()
@@ -78,16 +70,12 @@ def province_or_territory_abbreviation(code):
def type_id(id):
- """
- Returns an OCD identifier's type ID.
- """
+ """Return an OCD identifier's type ID."""
return id.rsplit(":", 1)[1]
-def get_definition(division_id, aggregation=False):
- """
- Returns the expected configuration for a given division.
- """
+def get_definition(division_id, *, aggregation=False):
+ """Return the expected configuration for a given division."""
if not ocdid_to_type_name_map:
# Map census division type codes to names.
census_division_type_names = {}
@@ -104,7 +92,7 @@ def get_definition(division_id, aggregation=False):
requests.get("https://www12.statcan.gc.ca/census-recensement/2016/ref/dict/tab/t1_5-eng.cfm").content
)
for text in document.xpath("//table//th[@headers]/text()"):
- code, name = text.split(" – ", 1) # non-breaking space
+ code, name = text.split("\xa0– ", 1)
census_subdivision_type_names[code] = name.split(" / ", 1)[0]
# Map OCD identifiers to census types.
@@ -129,32 +117,28 @@ def get_definition(division_id, aggregation=False):
pattern = "ca_{}_municipalities" if aggregation else "ca_{}"
expected["module_name"] = pattern.format(ocd_type_id)
if aggregation:
- expected["name"] = "{} Municipalities".format(division.name)
+ expected["name"] = f"{division.name} Municipalities"
elif ocd_type_id in ("nl", "ns"):
- expected["name"] = "{} House of Assembly".format(division.name)
+ expected["name"] = f"{division.name} House of Assembly"
elif ocd_type_id == "qc":
expected["name"] = "Assemblée nationale du Québec"
else:
- expected["name"] = "Legislative Assembly of {}".format(division.name)
+ expected["name"] = f"Legislative Assembly of {division.name}"
elif division._type == "cd":
- expected["module_name"] = "ca_{}_{}".format(
- province_or_territory_abbreviation(division.id), slug(division.name)
- )
+ expected["module_name"] = f"ca_{province_or_territory_abbreviation(division.id)}_{slug(division.name)}"
name_infix = ocdid_to_type_name_map[division.id]
if name_infix == "Regional municipality":
name_infix = "Regional"
- expected["name"] = "{} {} Council".format(division.name, name_infix)
+ expected["name"] = f"{division.name} {name_infix} Council"
elif division._type == "csd":
- expected["module_name"] = "ca_{}_{}".format(
- province_or_territory_abbreviation(division.id), slug(division.name)
- )
+ expected["module_name"] = f"ca_{province_or_territory_abbreviation(division.id)}_{slug(division.name)}"
if ocd_type_id[:2] == "24":
if division.name[0] in vowels:
- expected["name"] = "Conseil municipal d'{}".format(division.name)
+ expected["name"] = f"Conseil municipal d'{division.name}"
else:
- expected["name"] = "Conseil municipal de {}".format(division.name)
+ expected["name"] = f"Conseil municipal de {division.name}"
else:
name_infix = ocdid_to_type_name_map[division.id]
if name_infix in ("Municipality", "Specialized municipality"):
@@ -163,21 +147,21 @@ def get_definition(division_id, aggregation=False):
name_infix = "District"
elif name_infix == "Regional municipality":
name_infix = "Regional"
- expected["name"] = "{} {} Council".format(division.name, name_infix)
+ expected["name"] = f"{division.name} {name_infix} Council"
elif division._type == "arrondissement":
- expected["module_name"] = "ca_{}_{}_{}".format(
- province_or_territory_abbreviation(division.parent.id), slug(division.parent.name), slug(division.name)
+ expected["module_name"] = (
+ f"ca_{province_or_territory_abbreviation(division.parent.id)}_{slug(division.parent.name)}_{slug(division.name)}"
)
if division.name[0] in vowels:
- expected["name"] = "Conseil d'arrondissement d'{}".format(division.name)
+ expected["name"] = f"Conseil d'arrondissement d'{division.name}"
elif division.name[:3] == "Le ":
- expected["name"] = "Conseil d'arrondissement du {}".format(division.name[3:])
+ expected["name"] = f"Conseil d'arrondissement du {division.name[3:]}"
else:
- expected["name"] = "Conseil d'arrondissement de {}".format(division.name)
+ expected["name"] = f"Conseil d'arrondissement de {division.name}"
else:
- raise Exception("{}: Unrecognized OCD type {}".format(division.id, division._type))
+ raise Exception(f"{division.id}: Unrecognized OCD type {division._type}")
# Determine the class name.
class_name_parts = re.split("[ -]", re.sub("[—–]", "-", re.sub("['.]", "", division.name)))
@@ -198,46 +182,37 @@ def get_definition(division_id, aggregation=False):
@task
def council_pages():
- """
- Prints scrapers' council page, or warns if it is missing or unneeded.
- """
+ """Print scrapers' council page, or warn if it is missing or unneeded."""
for module, module_name, klass in modules_and_module_names_and_classes():
if klass.__bases__[0].__name__ == "CSVScraper":
if hasattr(module, "COUNCIL_PAGE"):
- print("{:<60} Delete COUNCIL_PAGE".format(module_name))
+ print(f"{module_name:<60} Delete COUNCIL_PAGE")
+ elif hasattr(module, "COUNCIL_PAGE"):
+ print(f"{module_name:<60} {module.COUNCIL_PAGE}")
else:
- if hasattr(module, "COUNCIL_PAGE"):
- print("{:<60} {}".format(module_name, module.COUNCIL_PAGE))
- else:
- print("{:<60} Missing COUNCIL_PAGE".format(module_name))
+ print(f"{module_name:<60} Missing COUNCIL_PAGE")
@task
def csv_list():
- """
- Lists scrapers with CSV data.
- """
- for module, module_name, klass in modules_and_module_names_and_classes():
+ """List scrapers with CSV data."""
+ for _module, module_name, klass in modules_and_module_names_and_classes():
if hasattr(klass, "csv_url"):
- print("{}: {}".format(module_name, klass.csv_url))
+ print(f"{module_name}: {klass.csv_url}")
@task
def csv_stale():
- """
- Lists scrapers with stale manual CSV data.
- """
- for module, module_name, klass in modules_and_module_names_and_classes():
+ """List scrapers with stale manual CSV data."""
+ for _module, module_name, klass in modules_and_module_names_and_classes():
if hasattr(klass, "updated_at") and klass.updated_at < date.today() - timedelta(days=365):
- print("{}: Created on {} by {}".format(module_name, klass.updated_at, klass.contact_person))
+ print(f"{module_name}: Created on {klass.updated_at} by {klass.contact_person}")
@task
def csv_error():
- """
- Notes corrections that CSV publishers should make.
- """
- for module, module_name, klass in modules_and_module_names_and_classes():
+ """Note corrections that CSV publishers should make."""
+ for _module, module_name, klass in modules_and_module_names_and_classes():
if klass.__bases__[0].__name__ == "CSVScraper":
if "_candidates" in module_name and hasattr(klass, "updated_at"):
continue
@@ -263,23 +238,19 @@ def csv_error():
keys -= {"encoding"}
if keys:
- print("\n{}\n{}".format(module_name, klass.csv_url))
+ print(f"\n{module_name}\n{klass.csv_url}")
extra_keys = keys - {"corrections", "encoding", "header_converter"}
if extra_keys:
print("- Manually check the configuration of: {}".format(", ".join(extra_keys)))
if "encoding" in keys:
- print(
- "- The CSV file should be encoded as 'utf-8' or 'windows-1252', not '{}'".format(
- klass.encoding
- )
- )
+ print(f"- The CSV file should be encoded as 'utf-8' or 'windows-1252', not '{klass.encoding}'")
if "corrections" in keys:
for key, values in klass.corrections.items():
for actual, expected in values.items():
- print("- Change '{}' to '{}' in {}".format(actual, expected, key))
+ print(f"- Change '{actual}' to '{expected}' in {key}")
if "header_converter" in keys:
print("- Correct column headers according to:")
@@ -288,17 +259,13 @@ def csv_error():
@task
def tidy():
- """
- Checks that modules are configured correctly.
- """
+ """Check that modules are configured correctly."""
# Map OCD identifiers to styles of address.
leader_styles = {}
member_styles = {}
for gid in range(3):
reader = csv_dict_reader(
- "https://docs.google.com/spreadsheets/d/11qUKd5bHeG5KIzXYERtVgs3hKcd9yuZlt-tCTLBFRpI/pub?single=true&gid={}&output=csv".format(
- gid
- )
+ f"https://docs.google.com/spreadsheets/d/11qUKd5bHeG5KIzXYERtVgs3hKcd9yuZlt-tCTLBFRpI/pub?single=true&gid={gid}&output=csv"
)
for row in reader:
key = row["Identifier"]
@@ -308,7 +275,7 @@ def tidy():
division_ids = set()
jurisdiction_ids = set()
for module_name in module_names():
- if module_name.endswith("_candidates") or module_name.endswith("_municipalities"):
+ if module_name.endswith(("_candidates", "_municipalities")):
continue
metadata = module_name_to_metadata(module_name)
@@ -316,29 +283,29 @@ def tidy():
# Ensure division_id is unique.
division_id = metadata["division_id"]
if division_id in division_ids:
- print("{:<60} Duplicate division_id {}".format(module_name, division_id))
+ print(f"{module_name:<60} Duplicate division_id {division_id}")
else:
division_ids.add(division_id)
# Ensure jurisdiction_id is unique.
jurisdiction_id = metadata["jurisdiction_id"]
if jurisdiction_id in jurisdiction_ids:
- print("{:<60} Duplicate jurisdiction_id {}".format(module_name, jurisdiction_id))
+ print(f"{module_name:<60} Duplicate jurisdiction_id {jurisdiction_id}")
else:
jurisdiction_ids.add(jurisdiction_id)
- expected = get_definition(division_id, bool(module_name.endswith("_municipalities")))
+ expected = get_definition(division_id, aggregation=bool(module_name.endswith("_municipalities")))
# Ensure presence of url and styles of address.
if division_id not in member_styles:
- print("{:<60} Missing member style of address: {}".format(module_name, division_id))
+ print(f"{module_name:<60} Missing member style of address: {division_id}")
if division_id not in leader_styles:
- print("{:<60} Missing leader style of address: {}".format(module_name, division_id))
+ print(f"{module_name:<60} Missing leader style of address: {division_id}")
url = metadata["url"]
if url and not expected["url"]:
parsed = urlsplit(url)
if parsed.scheme not in ("http", "https") or parsed.path or parsed.query or parsed.fragment:
- print("{:<60} Check: {}".format(module_name, url))
+ print(f"{module_name:<60} Check: {url}")
# Warn if the name or classification may be incorrect.
name = metadata["name"]
@@ -346,7 +313,7 @@ def tidy():
print("{:<60} Expected {}".format(name, expected["name"]))
classification = metadata["classification"]
if classification != "legislature":
- print("{:<60} Expected legislature".format(classification))
+ print(f"{classification:<60} Expected legislature")
# Name the classes correctly.
class_name = metadata["class_name"]
@@ -379,9 +346,7 @@ def tidy():
@task
def sources_and_assertions():
- """
- Checks that sources are attributed and assertions are made.
- """
+ """Check that sources are attributed and assertions are made."""
for module_name in module_names():
path = os.path.join(module_name, "people.py")
with codecs.open(path, "r", "utf-8") as f:
@@ -390,17 +355,15 @@ def sources_and_assertions():
source_count = content.count("add_source")
request_count = content.count("lxmlize") + content.count("self.get(") + content.count("requests.get")
if source_count < request_count:
- print("Expected {} sources after {} requests {}".format(source_count, request_count, path))
+ print(f"Expected {source_count} sources after {request_count} requests {path}")
if "CSVScraper" not in content and "assert len(" not in content:
- print("Expected an assertion like: assert len(councillors), 'No councillors found' {}".format(path))
+ print(f"Expected an assertion like: assert len(councillors), 'No councillors found' {path}")
@task
def validate_spreadsheet(url, identifier_header, geographic_name_header):
- """
- Validates the identifiers, geographic names and geographic types in a spreadsheet.
- """
+ """Validate the identifiers, geographic names and geographic types in a spreadsheet."""
sgc_to_id = {}
for division in Division.all("ca", from_csv=ocd_division_csv):
@@ -413,19 +376,17 @@ def validate_spreadsheet(url, identifier_header, geographic_name_header):
if len(identifier) == 2:
identifier = sgc_to_id[identifier]
elif len(identifier) == 4:
- identifier = "ocd-division/country:ca/cd:{}".format(identifier)
+ identifier = f"ocd-division/country:ca/cd:{identifier}"
elif len(identifier) == 7:
- identifier = "ocd-division/country:ca/csd:{}".format(identifier)
+ identifier = f"ocd-division/country:ca/csd:{identifier}"
division = Division.get(identifier)
if row[geographic_name_header] != division.name:
- print("{}: name: {} not {}".format(identifier, division.name, row[geographic_name_header]))
+ print(f"{identifier}: name: {division.name} not {row[geographic_name_header]}")
def module_name_to_metadata(module_name):
- """
- Copied from `reports.utils`.
- """
+ # Copied from reports.utils
module = importlib.import_module(module_name)
for obj in module.__dict__.values():
division_id = getattr(obj, "division_id", None)
@@ -442,3 +403,4 @@ def module_name_to_metadata(module_name):
getattr(obj, "classification", "legislature"),
),
}
+ return None
diff --git a/tox.ini b/tox.ini
deleted file mode 100644
index 2c8b2524..00000000
--- a/tox.ini
+++ /dev/null
@@ -1,5 +0,0 @@
-[flake8]
-exclude=disabled
-ignore=E501,E731
-# E501 line too long (X > 79 characters)
-# E731 do not assign a lambda expression, use a def
diff --git a/utils.py b/utils.py
index 93916f03..3a5ded50 100644
--- a/utils.py
+++ b/utils.py
@@ -9,7 +9,7 @@
from zipfile import ZipFile
import agate
-import agateexcel # noqa
+import agateexcel # noqa: F401
import lxml.html
import requests
from lxml import etree
@@ -17,11 +17,12 @@
from pupa.scrape import Jurisdiction, Organization, Person, Post, Scraper
from requests.packages.urllib3.exceptions import InsecureRequestWarning
-import patch # patch patches validictory # noqa
+import patch # patch patches validictory # noqa: F401
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
CUSTOM_USER_AGENT = "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)"
+DEFAULT_USER_AGENT = requests.utils.default_user_agent()
CONTACT_DETAIL_TYPE_MAP = {
"Address": "address",
@@ -82,10 +83,7 @@
"Voice Mail": "legislature",
"Work": "legislature",
}
-if os.getenv("SSL_VERIFY", False):
- SSL_VERIFY = "/usr/lib/ssl/certs/ca-certificates.crt"
-else:
- SSL_VERIFY = bool(os.getenv("SSL_VERIFY", False))
+SSL_VERIFY = "/usr/lib/ssl/certs/ca-certificates.crt" if os.getenv("SSL_VERIFY", "") else True
email_re = re.compile(r"([A-Za-z0-9._-]+@(?:[A-Za-z0-9-]+\.)+[A-Za-z]{2,})")
@@ -93,9 +91,7 @@
styles_of_address = {}
for gid in range(3):
response = requests.get(
- "https://docs.google.com/spreadsheets/d/11qUKd5bHeG5KIzXYERtVgs3hKcd9yuZlt-tCTLBFRpI/pub?single=true&gid={}&output=csv".format(
- gid
- ),
+ f"https://docs.google.com/spreadsheets/d/11qUKd5bHeG5KIzXYERtVgs3hKcd9yuZlt-tCTLBFRpI/pub?single=true&gid={gid}&output=csv",
verify=SSL_VERIFY,
)
if response.status_code == 200:
@@ -115,35 +111,51 @@ def get_email(self, node, expression=".", *, error=True):
Make sure that the node/expression is narrow enough to not capture a
generic email address in the footer of the page, for example.
"""
-
- matches = []
# If the text would be split across multiple sub-tags.
- for match in node.xpath('{}//*[contains(text(), "@")]'.format(expression)):
- matches.append(match.text_content())
+ matches = [match.text_content() for match in node.xpath(f'{expression}//*[contains(text(), "@")]')]
# The text version is more likely to be correct, as it is more visible,
# e.g. ca_bc has one `href` of `mailto:first.last.mla@leg.bc.ca`.
- for match in node.xpath('{}//a[contains(@href, "mailto:")]'.format(expression)):
- matches.append(unquote(match.attrib["href"]))
+ matches.extend(
+ unquote(match.attrib["href"]) for match in node.xpath(f'{expression}//a[contains(@href, "mailto:")]')
+ )
+ # Some emails are obfuscated by Cloudflare.
+ matches.extend(
+ self._cloudflare_decode(match)
+ for match in node.xpath(f'{expression}//@href[contains(., "cdn-cgi/l/email-protection")]')
+ )
# If the node has no sub-tags.
if not matches:
- for match in node.xpath('{}//text()[contains(., "@")]'.format(expression)):
- matches.append(match)
+ matches = list(node.xpath(f'{expression}//text()[contains(., "@")]'))
if matches:
for match in matches:
match = email_re.search(match)
if match:
return match.group(1)
if error:
- raise Exception("No email pattern in {}".format(matches))
- elif error:
- raise Exception("No email node in {}".format(etree.tostring(node)))
+ raise Exception(f"No email pattern in {matches}")
+ return None
+ if error:
+ raise Exception(f"No email node in {etree.tostring(node)}")
+ return None
+
+ # Helper function for self.get_email
+ def _cloudflare_decode(self, link):
+ hex_email = link.split("#", 1)[1]
+ decoded_email = ""
+ key = int(hex_email[:2], 16)
- def get_phone(self, node, *, area_codes=[], error=True):
+ for i in range(2, len(hex_email) - 1, 2):
+ decoded_email += chr(int(hex_email[i : i + 2], 16) ^ key)
+
+ return decoded_email
+
+ def get_phone(self, node, *, area_codes=None, error=True):
"""
Don't use if multiple telephone numbers are present, e.g. voice and fax.
If writing a new scraper, check that extensions are captured.
"""
-
+ if area_codes is None:
+ area_codes = []
if isinstance(node, etree._ElementUnicodeResult):
match = re.search(
r"(?:\A|\D)(\(?\d{3}\)?\D?\d{3}\D?\d{4}(?:\s*(?:/|x|ext[.:]?|poste)[\s-]?\d+)?)(?:\D|\Z)", node
@@ -169,14 +181,16 @@ def get_phone(self, node, *, area_codes=[], error=True):
if match:
return match.group(1)
if error:
- raise Exception("No phone pattern in {}".format(node.text_content()))
+ raise Exception(f"No phone pattern in {node.text_content()}")
+ return None
def get_link(self, node, substring, *, error=True):
- match = node.xpath('.//a[contains(@href,"{}")]/@href'.format(substring))
+ match = node.xpath(f'.//a[contains(@href,"{substring}")]/@href')
if match:
return match[0]
if error:
- raise Exception("No link matching {}".format(substring))
+ raise Exception(f"No link matching {substring}")
+ return None
def get(self, *args, **kwargs):
return super().get(*args, verify=SSL_VERIFY, **kwargs)
@@ -184,7 +198,7 @@ def get(self, *args, **kwargs):
def post(self, *args, **kwargs):
return super().post(*args, verify=SSL_VERIFY, **kwargs)
- def lxmlize(self, url, encoding=None, user_agent=requests.utils.default_user_agent(), cookies=None, xml=False):
+ def lxmlize(self, url, encoding=None, *, user_agent=DEFAULT_USER_AGENT, cookies=None, xml=False):
self.user_agent = user_agent
response = self.get(url, cookies=cookies)
@@ -194,31 +208,30 @@ def lxmlize(self, url, encoding=None, user_agent=requests.utils.default_user_age
try:
text = response.text
if xml:
- text = text.replace('', "") # XXX ca_bc
- page = etree.fromstring(text)
+ text = text.replace('', "") # special case: ca_bc
+ page = etree.fromstring(text) # noqa: S320
else:
page = lxml.html.fromstring(text)
- except etree.ParserError:
- raise etree.ParserError("Document is empty {}".format(url))
+ except etree.ParserError as e:
+ raise etree.ParserError(f"Document is empty {url}") from e
meta = page.xpath('//meta[@http-equiv="refresh"]')
if meta:
_, url = meta[0].attrib["content"].split("=", 1)
return self.lxmlize(url, encoding)
- elif xml:
- return page
- else:
- page.make_links_absolute(url)
+ if xml:
return page
+ page.make_links_absolute(url)
+ return page
- def csv_reader(self, url, delimiter=",", header=False, encoding=None, skip_rows=0, data=None, **kwargs):
+ def csv_reader(self, url, *, delimiter=",", header=False, encoding=None, skip_rows=0, data=None, **kwargs):
if not data:
result = urlparse(url)
if result.scheme == "ftp":
data = StringIO()
- ftp = FTP(result.hostname)
+ ftp = FTP(result.hostname) # noqa: S321
ftp.login(result.username, result.password)
- ftp.retrbinary("RETR {}".format(result.path), lambda block: data.write(block.decode("utf-8")))
+ ftp.retrbinary(f"RETR {result.path}", lambda block: data.write(block.decode("utf-8")))
ftp.quit()
data.seek(0)
else:
@@ -234,15 +247,13 @@ def csv_reader(self, url, delimiter=",", header=False, encoding=None, skip_rows=
data.readline()
if header:
return csv.DictReader(data, delimiter=delimiter)
- else:
- return csv.reader(data, delimiter=delimiter)
+ return csv.reader(data, delimiter=delimiter)
class CSVScraper(CanadianScraper):
# File flags
- """
- Set the CSV file's delimiter.
- """
+ """Set the CSV file's delimiter."""
+
delimiter = ","
"""
Set the CSV file's encoding, like 'windows-1252' ('utf-8' by default).
@@ -252,6 +263,10 @@ class CSVScraper(CanadianScraper):
If `csv_url` is a ZIP file, set the compressed file to read.
"""
filename = None
+ """
+ If `csv_url` is an XLS, XLSX or ZIP file, but has no extension, set the extension (like '.xlsx').
+ """
+ extension = None
# Table flags
"""
@@ -326,22 +341,21 @@ class CSVScraper(CanadianScraper):
def header_converter(self, s):
"""
- Normalizes a column header name. By default, lowercases it and replaces
- underscores with spaces (e.g. because Esri fields can't contain spaces).
- """
+ Normalize a column header name.
+ By default, lowercase it and replace underscores with spaces (e.g. because Esri fields can't contain spaces).
+ """
header = clean_string(s.lower().replace("_", " "))
if hasattr(self, "locale"):
return self.column_headers[self.locale].get(header, header)
- else:
- return header
+ return header
def is_valid_row(self, row):
"""
- Returns whether the row should be imported. By default, skips empty rows
- and rows in which a name component is "Vacant".
- """
+ Return whether the row should be imported.
+ By default, skip empty rows and rows in which a name component is "Vacant".
+ """
empty = ("", "Vacant")
if not any(row.values()):
return False
@@ -352,7 +366,7 @@ def is_valid_row(self, row):
def scrape(self):
seat_numbers = defaultdict(lambda: defaultdict(int))
- extension = os.path.splitext(self.csv_url)[1]
+ extension = self.extension if self.extension else os.path.splitext(self.csv_url)[1]
if extension in (".xls", ".xlsx"):
data = StringIO()
binary = BytesIO(self.get(self.csv_url).content)
@@ -391,7 +405,7 @@ def scrape(self):
# ca_qc_laval: "maire et president du comite executif", "conseiller et membre du comite executif"
# ca_qc_montreal: "Conseiller de la ville; Membre…", "Maire d'arrondissement\nMembre…"
if row.get("primary role"):
- row["primary role"] = re.split(r"(?: (?:et)\b|[;\n])", row["primary role"], 1)[0].strip()
+ row["primary role"] = re.split(r"(?: (?:et)\b|[;\n])", row["primary role"], maxsplit=1)[0].strip()
if not self.is_valid_row(row):
continue
@@ -438,7 +452,7 @@ def scrape(self):
if self.many_posts_per_area and role not in self.unique_roles:
seat_numbers[role][district] += 1
- district = "{} (seat {})".format(district, seat_numbers[role][district])
+ district = f"{district} (seat {seat_numbers[role][district]})"
lines = []
if row.get("address line 1"):
@@ -467,11 +481,8 @@ def scrape(self):
# District name,District ID,…
# Toronto Centre,,…
# ,3520005,…
- if not row.get("district name") and row.get("district id"):
- if len(row["district id"]) == 7:
- p._related[0].extras["boundary_url"] = "/boundaries/census-subdivisions/{}/".format(
- row["district id"]
- )
+ if not row.get("district name") and row.get("district id") and len(row["district id"]) == 7:
+ p._related[0].extras["boundary_url"] = "/boundaries/census-subdivisions/{}/".format(row["district id"])
if row.get("district name") in self.district_name_to_boundary_url:
p._related[0].extras["boundary_url"] = self.district_name_to_boundary_url[row["district name"]]
@@ -515,9 +526,7 @@ def scrape(self):
class CanadianJurisdiction(Jurisdiction):
- """
- Whether to create posts whose labels match division names or type IDs.
- """
+ """Whether to create posts whose labels match division names or type IDs."""
use_type_id = False
"""
@@ -590,28 +599,21 @@ def get_organizations(self):
if valid_through and valid_through < datetime.now().strftime("%Y-%m-%d"):
continue
- if self.use_type_id:
- label = child.id.rsplit("/", 1)[1].capitalize().replace(":", " ")
- else:
- label = child.name
+ label = child.id.rsplit("/", 1)[1].capitalize().replace(":", " ") if self.use_type_id else child.name
# Yield posts to allow ca_on_toronto to make changes.
post = Post(role=member_role, label=label, division_id=child.id, organization_id=organization._id)
yield post
if not children and parent.attrs["posts_count"]:
for i in range(1, int(parent.attrs["posts_count"])): # exclude Mayor
- organization.add_post(
- role=member_role, label="{} (seat {})".format(parent.name, i), division_id=parent.id
- )
+ organization.add_post(role=member_role, label=f"{parent.name} (seat {i})", division_id=parent.id)
yield organization
class CanadianPerson(Person):
def __init__(self, *, name, district, role, **kwargs):
- """
- Cleans a person's name, district, role and any other attributes.
- """
+ """Clean a person's name, district, role and any other attributes."""
name = clean_name(name)
district = clean_string(district).replace("&", "and")
role = clean_string(role)
@@ -625,9 +627,7 @@ def __init__(self, *, name, district, role, **kwargs):
super().__init__(name=name, district=district, role=role, **kwargs)
def __setattr__(self, name, value):
- """
- Corrects gender values.
- """
+ """Correct gender values."""
if name == "gender":
value = value.lower()
if value == "m":
@@ -637,20 +637,16 @@ def __setattr__(self, name, value):
super().__setattr__(name, value)
def add_link(self, url, *, note=""):
- """
- Corrects links without schemes or domains.
- """
+ """Correct links without schemes or domains."""
url = url.strip()
if url.startswith("www."):
- url = "http://{}".format(url)
+ url = f"http://{url}"
if re.match(r"\A@[A-Za-z]+\Z", url):
- url = "https://twitter.com/{}".format(url[1:])
+ url = f"https://twitter.com/{url[1:]}"
self.links.append({"note": note, "url": url})
def add_contact(self, type, value, note="", area_code=None):
- """
- Cleans and adds a contact detail to the person's membership.
- """
+ """Clean and add a contact detail to the person's membership."""
if type:
type = clean_string(type)
if note:
@@ -673,9 +669,7 @@ def add_contact(self, type, value, note="", area_code=None):
self._related[0].add_contact_detail(type=type, value=value, note=note)
def clean_telephone_number(self, s, area_code=None):
- """
- @see http://www.btb.termiumplus.gc.ca/tpv2guides/guides/favart/index-eng.html?lang=eng&lettr=indx_titls&page=9N6fM9QmOwCE.html
- """
+ """@see http://www.btb.termiumplus.gc.ca/tpv2guides/guides/favart/index-eng.html?lang=eng&lettr=indx_titls&page=9N6fM9QmOwCE.html."""
splits = re.split(r"(?:\b \(|/|x|ext[.:]?|p\.|poste)[\s-]?(?=\b|\d)", s, flags=re.IGNORECASE)
digits = re.sub(r"\D", "", splits[0])
@@ -688,16 +682,11 @@ def clean_telephone_number(self, s, area_code=None):
digits = re.sub(r"\A(\d)(\d{3})(\d{3})(\d{4})\Z", r"\1 \2 \3-\4", digits)
if len(splits) == 2:
return "{} x{}".format(digits, splits[1].rstrip(")"))
- else:
- return digits
- else:
- return s
+ return digits
+ return s
def clean_address(self, s):
- """
- Corrects the postal code, abbreviates the province or territory name, and
- formats the last line of the address.
- """
+ """Correct the postal code, abbreviate the province or territory name, and format the last line of the address."""
# The letter "O" instead of the numeral "0" is a common mistake.
s = re.sub(
r"\b[A-Z][O0-9][A-Z]\s?[O0-9][A-Z][O0-9]\b", lambda x: x.group(0).replace("O", "0"), clean_string(s)
@@ -719,16 +708,16 @@ def clean_address(self, s):
)
-whitespace_re = re.compile(r"\s+", flags=re.U)
-whitespace_and_newline_re = re.compile(r"[^\S\n]+", flags=re.U)
+whitespace_re = re.compile(r"\s+", flags=re.UNICODE)
+whitespace_and_newline_re = re.compile(r"[^\S\n]+", flags=re.UNICODE)
honorific_prefix_re = re.compile(r"\A(?:Councillor|Dr|Hon|M|Mayor|Mme|Mr|Mrs|Ms|Miss)\.? ")
honorific_suffix_re = re.compile(r", (?:Ph\.D, Q\.C\.)\Z")
province_or_territory_abbreviation_memo = {}
table = {
- ord(""): " ", # zero-width space
+ ord("\u200b"): " ", # zero-width space
ord("’"): "'",
- ord("\xc2"): " ", # non-breaking space if mixing ISO-8869-1 into UTF-8
+ ord("\xc2"): "\xa0", # non-breaking space if mixing ISO-8859-1 into UTF-8
}
@@ -759,9 +748,8 @@ def clean_type_id(type_id):
# "Spaces should be converted to underscores."
type_id = re.sub(r" ", "_", type_id)
# "All invalid characters should be converted to tilde (~)."
- type_id = re.sub(r"[^\w.~-]", "~", type_id, re.UNICODE)
- return type_id
+ return re.sub(r"[^\w.~-]", "~", type_id, flags=re.UNICODE)
def clean_french_prepositions(s):
- return re.sub(r"\b(?:d'|de (?:l'|la )?|du |des |l')", "", clean_string(s), flags=re.I)
+ return re.sub(r"\b(?:d'|de (?:l'|la )?|du |des |l')", "", clean_string(s), flags=re.IGNORECASE)