diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 00000000..12301490 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,6 @@ +version: 2 +updates: + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "daily" diff --git a/.github/workflows/automerge.yml b/.github/workflows/automerge.yml new file mode 100644 index 00000000..55365732 --- /dev/null +++ b/.github/workflows/automerge.yml @@ -0,0 +1,35 @@ +# The pull_request_target workflow trigger is dangerous. Do not add unrelated logic to this workflow. +# https://securitylab.github.com/research/github-actions-preventing-pwn-requests/ +# https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#pull_request_target +name: Auto-merge +on: pull_request_target +permissions: + pull-requests: write # to approve the PR + contents: write # to merge the PR +jobs: + dependabot: + if: ${{ github.event.pull_request.user.login == 'dependabot[bot]' }} + runs-on: ubuntu-latest + steps: + - id: dependabot-metadata + uses: dependabot/fetch-metadata@v2 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + - if: ${{ steps.dependabot-metadata.outputs.update-type != 'version-update:semver-major' || steps.dependabot-metadata.outputs.package-ecosystem == 'github_actions' }} + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: gh pr review --approve ${{ github.event.pull_request.html_url }} + - if: ${{ steps.dependabot-metadata.outputs.update-type != 'version-update:semver-major' || steps.dependabot-metadata.outputs.package-ecosystem == 'github_actions' }} + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: gh pr merge --auto --squash ${{ github.event.pull_request.html_url }} + precommit: + if: ${{ github.event.pull_request.user.login == 'pre-commit-ci[bot]' }} + runs-on: ubuntu-latest + steps: + - env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: gh pr review --approve ${{ github.event.pull_request.html_url }} + - env: + GITHUB_TOKEN: 
${{ secrets.GITHUB_TOKEN }} + run: gh pr merge --auto --squash ${{ github.event.pull_request.html_url }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bba16ab1..2aea3241 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,16 +1,17 @@ ci: autoupdate_schedule: quarterly + skip: [pip-compile] +default_language_version: + python: python3.10 repos: - - repo: https://github.com/psf/black - rev: 24.3.0 + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.6.9 hooks: - - id: black - - repo: https://github.com/pycqa/flake8 - rev: 7.0.0 + - id: ruff + - id: ruff-format + - repo: https://github.com/astral-sh/uv-pre-commit + rev: 0.4.18 hooks: - - id: flake8 - additional_dependencies: [flake8-comprehensions] - - repo: https://github.com/pycqa/isort - rev: 5.13.2 - hooks: - - id: isort + - id: pip-compile + name: pip-compile requirements.in + args: [requirements.in, -o, requirements.txt] diff --git a/ca/people.py b/ca/people.py index 6cf7b5f4..49d55060 100644 --- a/ca/people.py +++ b/ca/people.py @@ -59,7 +59,7 @@ def scrape_people(self, rows, gender): photo_response = self.get(photo) if ( photo_response.status_code == 200 - and hashlib.sha1(photo_response.content).hexdigest() not in IMAGE_PLACEHOLDER_SHA1 + and hashlib.sha1(photo_response.content).hexdigest() not in IMAGE_PLACEHOLDER_SHA1 # noqa: S324 # non-cryptographic ): m.image = photo @@ -119,7 +119,7 @@ def scrape_people(self, rows, gender): ): note = "constituency" if i: - note += " ({})".format(i + 1) + note += f" ({i + 1})" address = constituency_office_el.xpath("./p[1]")[0] address = address.text_content().strip().splitlines() diff --git a/ca_ab/people.py b/ca_ab/people.py index 601fa525..93696d59 100644 --- a/ca_ab/people.py +++ b/ca_ab/people.py @@ -24,7 +24,7 @@ def get_party(abbr): - """Return full party name from abbreviation""" + """Return full party name from abbreviation.""" return PARTIES[abbr] @@ -59,8 +59,8 @@ def scrape(self): field_names = 
next(reader) for name in OFFICE_FIELDS: assert field_names.count(name) == 2 - field_names[field_names.index(name)] = "{} 1".format(name) - field_names[field_names.index(name)] = "{} 2".format(name) + field_names[field_names.index(name)] = f"{name} 1" + field_names[field_names.index(name)] = f"{name} 2" rows = [dict(zip_longest(field_names, row)) for row in reader] assert len(rows), "No members found" for mla in rows: @@ -76,8 +76,8 @@ def scrape(self): row_xpath = '//td[normalize-space()="{}"]/..'.format( mla["Constituency Name"], ) - (detail_url,) = index.xpath("{}//a/@href".format(row_xpath)) - (photo_url,) = index.xpath("{}//img/@src".format(row_xpath)) + (detail_url,) = index.xpath(f"{row_xpath}//a/@href") + (photo_url,) = index.xpath(f"{row_xpath}//img/@src") district = mla["Constituency Name"] if district == "Calgary-Bhullar-McCall": district = "Calgary-McCall" @@ -108,10 +108,10 @@ def scrape(self): for suffix, note in addresses: for key, contact_type in (("Phone", "voice"), ("Fax", "fax")): - value = mla["{} Number {}".format(key, suffix)] + value = mla[f"{key} Number {suffix}"] if value and value != "Pending": p.add_contact(contact_type, value, note) - address = ", ".join(filter(bool, [mla["{} {}".format(field, suffix)] for field in ADDRESS_FIELDS])) + address = ", ".join(filter(bool, [mla[f"{field} {suffix}"] for field in ADDRESS_FIELDS])) if address: p.add_contact("address", address, note) diff --git a/ca_ab_grande_prairie/__init__.py b/ca_ab_grande_prairie/__init__.py index 67a6f1e5..42329fcc 100644 --- a/ca_ab_grande_prairie/__init__.py +++ b/ca_ab_grande_prairie/__init__.py @@ -17,7 +17,7 @@ def get_organizations(self): for seat_number in range(1, 9): organization.add_post( role="Councillor", - label="{} (seat {})".format(self.division_name, seat_number), + label=f"{self.division_name} (seat {seat_number})", division_id=self.division_id, ) diff --git a/ca_ab_grande_prairie/people.py b/ca_ab_grande_prairie/people.py index fce739e2..9ff05187 100644 --- 
a/ca_ab_grande_prairie/people.py +++ b/ca_ab_grande_prairie/people.py @@ -1,7 +1,30 @@ -from utils import CSVScraper +from utils import CanadianPerson as Person +from utils import CanadianScraper +COUNCIL_PAGE = "https://cityofgp.com/city-government/mayor-city-council/council-members" -class GrandePrairiePersonScraper(CSVScraper): - # https://data.cityofgp.com/Community/City-Council-Contact-Information/vcfc-gi78 - csv_url = "https://data.cityofgp.com/api/views/vcfc-gi78/rows.csv?accessType=DOWNLOAD" - many_posts_per_area = True + +class GrandePrairiePersonScraper(CanadianScraper): + def scrape(self): + seat_number = 1 + page = self.lxmlize(COUNCIL_PAGE) + councillors = page.xpath('//div[contains(@class, "council-bios")]//div[@class="views-row"]') + + assert len(councillors), "No councillors found" + for councillor in councillors: + role, name = councillor.xpath(".//h3")[0].text_content().split(" ", 1) + if role == "Councillor": + district = f"Grande Prairie (seat {seat_number})" + seat_number += 1 + else: + district = " Grande Prairie" + email = self.get_email(councillor) + phone = self.get_phone(councillor) + image = councillor.xpath(".//img/@src")[0] + + p = Person(primary_org="legislature", name=name, district=district, role=role, image=image) + p.add_contact("email", email) + p.add_contact("voice", phone, "legislature") + p.add_source(COUNCIL_PAGE) + + yield p diff --git a/ca_ab_grande_prairie_county_no_1/__init__.py b/ca_ab_grande_prairie_county_no_1/__init__.py index 632cdb9d..fc7fd5da 100644 --- a/ca_ab_grande_prairie_county_no_1/__init__.py +++ b/ca_ab_grande_prairie_county_no_1/__init__.py @@ -16,8 +16,8 @@ def get_organizations(self): for division_number in range(1, 10): organization.add_post( role="Councillor", - label="Division {}".format(division_number), - division_id="{}/division:{}".format(self.division_id, division_number), + label=f"Division {division_number}", + division_id=f"{self.division_id}/division:{division_number}", ) yield organization 
diff --git a/ca_ab_lethbridge/__init__.py b/ca_ab_lethbridge/__init__.py index 40d32197..d4e4c9c6 100644 --- a/ca_ab_lethbridge/__init__.py +++ b/ca_ab_lethbridge/__init__.py @@ -17,7 +17,7 @@ def get_organizations(self): for seat_number in range(1, 9): organization.add_post( role="Councillor", - label="{} (seat {})".format(self.division_name, seat_number), + label=f"{self.division_name} (seat {seat_number})", division_id=self.division_id, ) diff --git a/ca_ab_lethbridge/people.py b/ca_ab_lethbridge/people.py index 74f91e0e..808c98e5 100644 --- a/ca_ab_lethbridge/people.py +++ b/ca_ab_lethbridge/people.py @@ -8,7 +8,7 @@ class LethbridgePersonScraper(CanadianScraper): def scrape_mayor(self): page = self.lxmlize(MAYOR_PAGE) - paragraph = page.xpath("//p[1]")[0].text_content().split() + paragraph = page.xpath("//h4[contains(., 'Mayor')]/following-sibling::p")[0].text_content().split() name = " ".join([paragraph[0], paragraph[1]]) p = Person(primary_org="legislature", name=name, district="Lethbridge", role="Mayor") @@ -24,7 +24,7 @@ def scrape_person(self, url, seat_number): p = Person( primary_org="legislature", name=name, - district="Lethbridge (seat {})".format(seat_number + 1), + district=f"Lethbridge (seat {seat_number + 1})", role="Councillor", ) diff --git a/ca_ab_wood_buffalo/__init__.py b/ca_ab_wood_buffalo/__init__.py index 40ae2c69..91eee478 100644 --- a/ca_ab_wood_buffalo/__init__.py +++ b/ca_ab_wood_buffalo/__init__.py @@ -17,16 +17,16 @@ def get_organizations(self): for seat_number in range(1, 7): organization.add_post( role="Councillor", - label="Ward 1 (seat {})".format(seat_number), - division_id="{}/ward:1".format(self.division_id), + label=f"Ward 1 (seat {seat_number})", + division_id=f"{self.division_id}/ward:1", ) for seat_number in range(1, 3): organization.add_post( role="Councillor", - label="Ward 2 (seat {})".format(seat_number), - division_id="{}/ward:2".format(self.division_id), + label=f"Ward 2 (seat {seat_number})", + 
division_id=f"{self.division_id}/ward:2", ) - organization.add_post(role="Councillor", label="Ward 3", division_id="{}/ward:3".format(self.division_id)) - organization.add_post(role="Councillor", label="Ward 4", division_id="{}/ward:4".format(self.division_id)) + organization.add_post(role="Councillor", label="Ward 3", division_id=f"{self.division_id}/ward:3") + organization.add_post(role="Councillor", label="Ward 4", division_id=f"{self.division_id}/ward:4") yield organization diff --git a/ca_ab_wood_buffalo/people.py b/ca_ab_wood_buffalo/people.py index 2760aedd..b53bc03c 100644 --- a/ca_ab_wood_buffalo/people.py +++ b/ca_ab_wood_buffalo/people.py @@ -33,13 +33,13 @@ def scrape(self): for ward in wards: area = ward.text_content().split("–", 1)[1].strip() councillors = ward.xpath("./following-sibling::table[1]/tbody/tr/td/h3") - assert len(councillors), "No councillors found for {}".format(area) + assert len(councillors), f"No councillors found for {area}" for councillor in councillors: name = councillor.text_content() if area in ("Ward 1", "Ward 2"): seat_numbers[area] += 1 - district = "{} (seat {})".format(area, seat_numbers[area]) + district = f"{area} (seat {seat_numbers[area]})" else: district = area diff --git a/ca_bc_abbotsford/people.py b/ca_bc_abbotsford/people.py index 5d29da2f..db72003b 100644 --- a/ca_bc_abbotsford/people.py +++ b/ca_bc_abbotsford/people.py @@ -19,12 +19,12 @@ def scrape(self): ] assert len(councillors), "No councillors found" - assert len(councillors) == len(contact_data), "Expected {}, got {}".format(len(councillors), len(contact_data)) + assert len(councillors) == len(contact_data), f"Expected {len(councillors)}, got {len(contact_data)}" for councillor, contact in zip(councillors, contact_data): text = councillor.xpath(".//h3/a")[0].text_content() if text.startswith("Councill"): role = "Councillor" - district = "Abbotsford (seat {})".format(councillor_seat_number) + district = f"Abbotsford (seat {councillor_seat_number})" 
councillor_seat_number += 1 else: role = "Mayor" diff --git a/ca_bc_burnaby/people.py b/ca_bc_burnaby/people.py index 981607ae..856a1b34 100644 --- a/ca_bc_burnaby/people.py +++ b/ca_bc_burnaby/people.py @@ -12,16 +12,6 @@ def scrape(self): councillors = page.xpath("//a[@class='biography__link']/@href") assert len(councillors), "No councillors found" for person_url in councillors: - - def decode_email(e): - de = "" - k = int(e[:2], 16) - - for i in range(2, len(e) - 1, 2): - de += chr(int(e[i : i + 2], 16) ^ k) - - return de - page = self.lxmlize(person_url) role, name = page.xpath("//h1/span")[0].text_content().strip().split(" ", 1) @@ -29,22 +19,20 @@ def decode_email(e): contact_node = page.xpath('//div[@class="contact"]')[0] - email = page.xpath('//div[@class = "contact__detail contact__detail--email"]/a/@href')[0] - decoded_email = decode_email(email.split("#", 1)[1]) # cloudflare encrypts the email data - + email = self.get_email(contact_node) phone = self.get_phone(contact_node, area_codes=[604, 778]) if role == "Mayor": district = "Burnaby" else: - district = "Burnaby (seat {})".format(councillor_seat_number) + district = f"Burnaby (seat {councillor_seat_number})" councillor_seat_number += 1 p = Person(primary_org="legislature", name=name, district=district, role=role, image=photo_url) p.add_source(COUNCIL_PAGE) p.add_source(person_url) if email: - p.add_contact("email", decoded_email) + p.add_contact("email", email) if phone: p.add_contact("voice", phone, "legislature") yield p diff --git a/ca_bc_coquitlam/people.py b/ca_bc_coquitlam/people.py index e43370f4..4f7200e2 100644 --- a/ca_bc_coquitlam/people.py +++ b/ca_bc_coquitlam/people.py @@ -7,18 +7,16 @@ class CoquitlamPersonScraper(CanadianScraper): - def scrape(self): def build_email(script): w = re.findall(r'w = "(.*?)"', script)[0] x = re.findall(r'x = "(.*?)"', script)[0] - email = w + "@" + x - return email + return w + "@" + x councillor_seat_number = 1 page = self.lxmlize(COUNCIL_PAGE, 
user_agent="Mozilla/5.0") - councillors = page.xpath('//table[@id="cityDirectoryDepartmentDetails"]/tr') + councillors = page.xpath('//table[contains(@id, "cityDirectoryDepartmentDetails")]/tr') assert len(councillors), "No councillors found" for councillor in councillors: name = " ".join( @@ -36,7 +34,7 @@ def build_email(script): if role == "Mayor": district = "Coquitlam" else: - district = "Coquitlam (seat {})".format(councillor_seat_number) + district = f"Coquitlam (seat {councillor_seat_number})" councillor_seat_number += 1 p = Person(primary_org="legislature", name=name, district=district, role=role) diff --git a/ca_bc_langley/people.py b/ca_bc_langley/people.py index f6863abb..b453cdfe 100644 --- a/ca_bc_langley/people.py +++ b/ca_bc_langley/people.py @@ -15,7 +15,7 @@ def scrape(self): page = self.lxmlize(url) name = page.xpath("//h1")[0].text_content().strip() - district = "Langley (seat {})".format(seat_number) + district = f"Langley (seat {seat_number})" seat_number += 1 email = self.get_email(page) phone = self.get_phone(page) @@ -34,7 +34,7 @@ def scrape(self): address_block = page.xpath('//p/a[@rel="noopener noreferrer"]/parent::p')[0].text_content() line1 = address_block[address_block.find("Facility") + 8 : address_block.find("Langley,")] line2 = address_block[address_block.find("Langley,") : address_block.find("Phone") - 1] - address = ", ".join([line1, line2]) + address = f"{line1}, {line2}" p = Person(primary_org="legislature", name=name, role="Mayor", district="Langley") p.add_contact("email", email) p.add_contact("voice", phone, "legislature") diff --git a/ca_bc_langley_city/people.py b/ca_bc_langley_city/people.py index e7d88464..0db77a03 100644 --- a/ca_bc_langley_city/people.py +++ b/ca_bc_langley_city/people.py @@ -1,9 +1,7 @@ -import re - from utils import CanadianPerson as Person from utils import CanadianScraper -COUNCIL_PAGE = "http://www.city.langley.bc.ca/index.php/city-hall/city-council" +COUNCIL_PAGE = 
"https://city.langley.bc.ca/cityhall/city-council/council-members" class LangleyPersonScraper(CanadianScraper): @@ -12,60 +10,35 @@ def scrape(self): page = self.lxmlize(COUNCIL_PAGE) - councillors = page.xpath('//div[@class="menuitems"]/ul//li/a[contains(text(), "Councillor")]/@href') - mayor = page.xpath('//div[@class="menuitems"]/ul//li/a[contains(text(), "Mayor")]/@href')[0] + councillors = page.xpath( + '//div[@class="field field--name-field-ec-section-title field--type-string field--label-hidden field__item"]' + )[:-1] assert len(councillors), "No councillors found" - for url in councillors: - district = "Langley (seat {})".format(councillor_seat_number) - councillor_seat_number += 1 - yield self.scrape_person(url, district) - - yield self.scrape_mayor(mayor) - - def scrape_person(self, url, district): - infos_page = self.lxmlize(url) - infos = infos_page.xpath('//div[@class="item-page"]')[0] - - name = " ".join(infos.xpath("p[2]/text()")[0].split(" ")[1:3]) - lname = name.lower() - email = lname.split(" ")[0][0] + lname.split(" ")[1] + "@langleycity.ca" - photo_url = infos.xpath("p[1]/img/@src")[0] - - p = Person(primary_org="legislature", name=name, district=district, role="Councillor", image=photo_url) - p.add_source(COUNCIL_PAGE) - p.add_source(url) - p.add_contact("email", email) - - personal_infos = infos.xpath("p[last()]/text()") - - if "Residence" in personal_infos[0]: - phone = re.findall(r"(Phone|Res)(:?) 
(.*)", "\n".join(personal_infos))[0][2] - address = re.findall(r"Address: (.*) (Phone|Res)", " ".join(personal_infos))[0][0] - p.add_contact("address", address, "residence") - p.add_contact("voice", phone, "residence") - - return p - - def scrape_mayor(self, url): - infos_page = self.lxmlize(url) - infos = infos_page.xpath('//div[@class="item-page"]')[0] - - name = " ".join(infos.xpath("p[2]/text()")[0].split(" ")[2:4]) - lname = name.lower() - email = lname.split(" ")[0][0] + lname.split(" ")[1] + "@langleycity.ca" - photo_url = infos.xpath("p[1]/img/@src")[0] - - p = Person(primary_org="legislature", name=name, district="Langley", role="Mayor", image=photo_url) - p.add_source(COUNCIL_PAGE) - p.add_source(url) - p.add_contact("email", email) - - personal_infos = infos.xpath("p[last()]/text()") - - phone = re.findall(r"Phone(:?) (.*)", "\n".join(personal_infos))[0][1] - address = re.findall(r"Address: (.*) Phone", " ".join(personal_infos))[0] - p.add_contact("address", address, "office") - p.add_contact("voice", phone, "office") - - return p + for councillor in councillors: + role, name = councillor.text_content().split(" ", 1) + if role == "Mayor": + district = "Langley" + phone_div = councillor.xpath('..//p[contains(., "Phone:")]')[0] + phone = self.get_phone(phone_div) + else: + district = f"Langley (seat {councillor_seat_number})" + phone = ( + "604 514 2800" # According to their site, all councillors can be contacted at this phone number + ) + councillor_seat_number += 1 + email = ( + councillor.xpath('..//p[contains(., "Email:")]')[0] + .text_content() + .split("Email:", 1)[1] + .strip() + .replace("(at)", "@") + ) + image = councillor.xpath("..//img/@src")[0] + + p = Person(primary_org="legislature", name=name, district=district, role=role, image=image) + p.add_contact("voice", phone, "legislature") + p.add_contact("email", email) + p.add_source(COUNCIL_PAGE) + + yield p diff --git a/ca_bc_new_westminster/people.py b/ca_bc_new_westminster/people.py index 
44e96727..a6f4a8a0 100644 --- a/ca_bc_new_westminster/people.py +++ b/ca_bc_new_westminster/people.py @@ -15,7 +15,7 @@ def scrape(self): assert len(councillors), "No councillors found" for councillor in councillors: name = councillor.xpath(".//a[@name]")[0].text_content() - district = "New Westminster (seat {})".format(seat_number) + district = f"New Westminster (seat {seat_number})" seat_number += 1 p = Person(primary_org="legislature", name=name, role="Councillor", district=district) photo = councillor.xpath("//img/@src")[0] diff --git a/ca_bc_richmond/people.py b/ca_bc_richmond/people.py index 795637b3..c90d664b 100644 --- a/ca_bc_richmond/people.py +++ b/ca_bc_richmond/people.py @@ -21,7 +21,7 @@ def scrape(self): if role == "Mayor": district = "Richmond" else: - district = "Richmond (seat {})".format(councillor_seat_number) + district = f"Richmond (seat {councillor_seat_number})" councillor_seat_number += 1 p = Person(primary_org="legislature", name=name, district=district, role=role) diff --git a/ca_bc_saanich/people.py b/ca_bc_saanich/people.py index e7088e99..d3ea7da2 100644 --- a/ca_bc_saanich/people.py +++ b/ca_bc_saanich/people.py @@ -26,7 +26,7 @@ def scrape(self): district = "Saanich" else: role = "Councillor" - district = "Saanich (seat {})".format(councillor_seat_number) + district = f"Saanich (seat {councillor_seat_number})" councillor_seat_number += 1 p = Person(primary_org="legislature", name=name, district=district, role=role) diff --git a/ca_bc_surrey/people.py b/ca_bc_surrey/people.py index b0240acd..bf877b91 100644 --- a/ca_bc_surrey/people.py +++ b/ca_bc_surrey/people.py @@ -12,9 +12,8 @@ def scrape(self): assert len(members), "No members found" seat_number = 1 for member in members: - role, name = member.xpath('.//a[@class="teaser__link"]/h4')[0].text_content().split(" ", 1) - district = "Surrey (seat {})".format(seat_number) + district = f"Surrey (seat {seat_number})" seat_number += 1 photo_url = member.xpath(".//figure//img/@src")[0] diff 
--git a/ca_bc_vancouver/__init__.py b/ca_bc_vancouver/__init__.py index f07c572d..3fa273f6 100644 --- a/ca_bc_vancouver/__init__.py +++ b/ca_bc_vancouver/__init__.py @@ -17,13 +17,13 @@ def get_organizations(self): for seat_number in range(1, 11): organization.add_post( role="Councillor", - label="{} (seat {})".format(self.division_name, seat_number), + label=f"{self.division_name} (seat {seat_number})", division_id=self.division_id, ) for seat_number in range(1, 8): organization.add_post( role="Commissioner", - label="{} (seat {})".format(self.division_name, seat_number), + label=f"{self.division_name} (seat {seat_number})", division_id=self.division_id, ) diff --git a/ca_bc_victoria/people.py b/ca_bc_victoria/people.py index b6c05cc8..9796b6ca 100644 --- a/ca_bc_victoria/people.py +++ b/ca_bc_victoria/people.py @@ -20,7 +20,7 @@ def scrape(self): phone = self.get_phone(councillor) url = councillor.xpath(".//h3/a/@href")[0] - district = "Victoria (seat {})".format(seat_number) + district = f"Victoria (seat {seat_number})" seat_number += 1 p = Person(primary_org="legislature", name=name, district=district, role=role) diff --git a/ca_mb_winnipeg/people.py b/ca_mb_winnipeg/people.py index 16e2af66..c3a2daa7 100644 --- a/ca_mb_winnipeg/people.py +++ b/ca_mb_winnipeg/people.py @@ -1,7 +1,5 @@ import json -import requests - from utils import CanadianPerson as Person from utils import CanadianScraper @@ -12,7 +10,7 @@ class WinnipegPersonScraper(CanadianScraper): def scrape(self): # from https://data.winnipeg.ca/Council-Services/Council-Data/r4tk-7dip/about_data api_url = "https://data.winnipeg.ca/resource/r4tk-7dip.json" - data = json.loads(requests.get(api_url).content) + data = json.loads(self.get(api_url).content) assert len(data), "No councillors found via API" page = self.lxmlize(COUNCIL_PAGE) diff --git a/ca_nb_fredericton/people.py b/ca_nb_fredericton/people.py index f5977c6c..6ade8cb9 100644 --- a/ca_nb_fredericton/people.py +++ b/ca_nb_fredericton/people.py @@ 
-1,3 +1,5 @@ +import re + from utils import CanadianPerson as Person from utils import CanadianScraper @@ -17,9 +19,9 @@ def scrape(self): text = councillor.xpath('.//div[@class="views-field views-field-field-councillor-title"]/div')[ 0 ].text_content() - ward_start = text.find("Ward") - if ward_start + 1: - district = text[ward_start : ward_start + 7].strip() + ward = re.findall(r"Ward \d+", text) + if ward: + district = ward[0] role = "Councillor" else: district = "Fredericton" diff --git a/ca_nb_moncton/__init__.py b/ca_nb_moncton/__init__.py index c4a931d0..5d2abbe6 100644 --- a/ca_nb_moncton/__init__.py +++ b/ca_nb_moncton/__init__.py @@ -17,15 +17,15 @@ def get_organizations(self): for seat_number in range(1, 3): organization.add_post( role="Councillor at Large", - label="{} (seat {})".format(self.division_name, seat_number), + label=f"{self.division_name} (seat {seat_number})", division_id=self.division_id, ) for ward_number in range(1, 5): for seat_number in range(1, 3): organization.add_post( role="Councillor", - label="Ward {} (seat {})".format(ward_number, seat_number), - division_id="{}/ward:{}".format(self.division_id, ward_number), + label=f"Ward {ward_number} (seat {seat_number})", + division_id=f"{self.division_id}/ward:{ward_number}", ) yield organization diff --git a/ca_nb_moncton/people.py b/ca_nb_moncton/people.py index f94e09db..aa47ed0f 100644 --- a/ca_nb_moncton/people.py +++ b/ca_nb_moncton/people.py @@ -1,8 +1,6 @@ import json from collections import defaultdict -import requests - from utils import CanadianPerson as Person from utils import CanadianScraper @@ -13,7 +11,7 @@ class MonctonPersonScraper(CanadianScraper): def scrape(self): seat_numbers = defaultdict(int) - data = json.loads(requests.get(API_URL).content)["features"] + data = json.loads(self.get(API_URL).content)["features"] assert len(data), "No councillors found" for item in data: @@ -24,7 +22,7 @@ def scrape(self): role = councillor["Primary_role"] if role != "Mayor": 
seat_numbers[ward] += 1 - district = ward + " (seat {})".format(seat_numbers[ward]) + district = ward + f" (seat {seat_numbers[ward]})" else: district = ward name = councillor["Name"] diff --git a/ca_nb_saint_john/__init__.py b/ca_nb_saint_john/__init__.py index 6372f776..407b9322 100644 --- a/ca_nb_saint_john/__init__.py +++ b/ca_nb_saint_john/__init__.py @@ -18,15 +18,15 @@ def get_organizations(self): for seat_number in range(1, 3): organization.add_post( role="Councillor", - label="{} (seat {})".format(self.division_name, seat_number), + label=f"{self.division_name} (seat {seat_number})", division_id=self.division_id, ) for ward_number in range(1, 5): for seat_number in range(1, 3): organization.add_post( role="Councillor", - label="Ward {} (seat {})".format(ward_number, seat_number), - division_id="{}/ward:{}".format(self.division_id, ward_number), + label=f"Ward {ward_number} (seat {seat_number})", + division_id=f"{self.division_id}/ward:{ward_number}", ) yield organization diff --git a/ca_nl/people.py b/ca_nl/people.py index a75bec04..3a86fcef 100644 --- a/ca_nl/people.py +++ b/ca_nl/people.py @@ -1,9 +1,8 @@ import json import re -from utils import CUSTOM_USER_AGENT +from utils import CUSTOM_USER_AGENT, CanadianScraper from utils import CanadianPerson as Person -from utils import CanadianScraper COUNCIL_PAGE = "https://www.assembly.nl.ca/js/members-index.js" @@ -26,9 +25,7 @@ def scrape(self): page = self.get(COUNCIL_PAGE) members = re.search( r"members = (\[(.+)\]);", page.content.decode().replace("[Member-elect]", ""), re.DOTALL - ).groups()[ - 0 - ] # extract javascript array + ).groups()[0] # extract javascript array members = re.sub("", "", members) # remove comments members = re.sub("", "", members).replace("", "") # tags members = members.replace('"', r"\"") # escape double quotes @@ -37,10 +34,8 @@ def scrape(self): assert len(members), "No members found" for member in json.loads(members): if not member["name"].strip(): - print("Skipping blank 
member: {}".format(member)) continue if member["name"] == "Vacant": - print("Skipping vacant 'member': {}".format(member)) continue name = " ".join(reversed(member["name"].split(","))).strip() district = ( @@ -60,7 +55,8 @@ def scrape(self): ) if member.get("email"): p.add_contact( - "email", member["email"].replace("@gov.nl.ca@gov.nl.ca", "@gov.nl.ca") # seriously guys?! + "email", + member["email"].replace("@gov.nl.ca@gov.nl.ca", "@gov.nl.ca"), # seriously guys?! ) p.add_source(COUNCIL_PAGE) diff --git a/ca_nl_st_john_s/__init__.py b/ca_nl_st_john_s/__init__.py index 5b8632a8..c3fbca30 100644 --- a/ca_nl_st_john_s/__init__.py +++ b/ca_nl_st_john_s/__init__.py @@ -18,14 +18,14 @@ def get_organizations(self): for seat_number in range(1, 5): organization.add_post( role="Councillor at Large", - label="St. John's (seat {})".format(seat_number), + label=f"St. John's (seat {seat_number})", division_id=self.division_id, ) for ward_number in range(1, 6): organization.add_post( role="Councillor", - label="Ward {}".format(ward_number), - division_id="{}/ward:{}".format(self.division_id, ward_number), + label=f"Ward {ward_number}", + division_id=f"{self.division_id}/ward:{ward_number}", ) yield organization diff --git a/ca_nl_st_john_s/people.py b/ca_nl_st_john_s/people.py index ee1f41e2..657fd8e9 100644 --- a/ca_nl_st_john_s/people.py +++ b/ca_nl_st_john_s/people.py @@ -23,9 +23,9 @@ def scrape(self): district = description[index : index + 6] else: district = "St. John's" - if role != "Mayor" and role != "Deputy Mayor": + if role not in ("Mayor", "Deputy Mayor"): role = "Councillor at Large" - district = "St. John's (seat {})".format(councillor_seat_number) + district = f"St. 
John's (seat {councillor_seat_number})" councillor_seat_number += 1 email = self.get_email(page) diff --git a/ca_ns/people.py b/ca_ns/people.py index 4013d33f..97763d39 100644 --- a/ca_ns/people.py +++ b/ca_ns/people.py @@ -18,7 +18,7 @@ def scrape(self): page = self.lxmlize(COUNCIL_PAGE) members = page.xpath( '//div[contains(@class, "view-display-id-page_mlas_current_tiles")]//div[contains(@class, "views-row-")]' - ) # noqa + ) assert len(members), "No members found" for member in members: district = member.xpath('.//div[contains(@class, "views-field-field-constituency")]/div/text()')[0] @@ -66,16 +66,14 @@ def scrape(self): if len(mailing_address) > 0: address = mailing_address - else: - if len(civic_address) > 0 or len(civic_address_alt) > 0: - if len(civic_address_alt) > 0: - address = civic_address_alt - else: - address = civic_address - address.remove(address[0]) # remove civic address + elif len(civic_address) > 0 or len(civic_address_alt) > 0: + if len(civic_address_alt) > 0: + address = civic_address_alt else: - if len(business_address) > 0: - address = business_address + address = civic_address + address.remove(address[0]) # remove civic address + elif len(business_address) > 0: + address = business_address address = list(map(str.strip, address)) p.add_contact("address", "\n".join(address), "constituency") diff --git a/ca_ns_cape_breton/people.py b/ca_ns_cape_breton/people.py index dad984d0..9d9272a5 100644 --- a/ca_ns_cape_breton/people.py +++ b/ca_ns_cape_breton/people.py @@ -1,9 +1,8 @@ import html import re -from utils import CUSTOM_USER_AGENT +from utils import CUSTOM_USER_AGENT, CanadianScraper from utils import CanadianPerson as Person -from utils import CanadianScraper COUNCIL_PAGE = "http://www.cbrm.ns.ca/mayor-council-2.html" MAYOR_PAGE = "http://www.cbrm.ns.ca/mayor" @@ -14,8 +13,7 @@ def scrape(self): def decode_email(script): raw_address = re.findall(r"(?<=addy).*?;\s*addy", script) local_part = html.unescape(raw_address[0]).split("= ", 
1)[1].split(";", 1)[0] - email = re.sub(r"['\s+]", "", local_part) + "cbrm.ns.ca" - return email + return re.sub(r"['\s+]", "", local_part) + "cbrm.ns.ca" page = self.lxmlize(COUNCIL_PAGE, user_agent=CUSTOM_USER_AGENT) @@ -55,7 +53,7 @@ def decode_email(script): councillor_url = councillor.xpath(".//a/@href")[0] p.add_source(councillor_url) page = self.lxmlize(councillor_url, user_agent=CUSTOM_USER_AGENT) - image = page.xpath('//img[contains(@title, "{0}")]/@src'.format(name)) + image = page.xpath(f'//img[contains(@title, "{name}")]/@src') if image: p.image = image[0] yield p diff --git a/ca_ns_halifax/people.py b/ca_ns_halifax/people.py index c8cf9e7b..06064166 100644 --- a/ca_ns_halifax/people.py +++ b/ca_ns_halifax/people.py @@ -4,52 +4,43 @@ from utils import CanadianScraper COUNCIL_PAGE = "https://www.halifax.ca/city-hall/districts-councillors" -MAYOR_PAGE = "https://www.halifax.ca/city-hall/mayor-mike-savage" -MAYOR_CONTACT_URL = "http://www.halifax.ca/mayor/contact.php" class HalifaxPersonScraper(CanadianScraper): def scrape(self): - page = self.lxmlize(COUNCIL_PAGE) - councillors = page.xpath('//div[@id = "block-districtdistrictindex"]/ul/li')[1:] + page = self.lxmlize(COUNCIL_PAGE, user_agent="Mozilla/5.0") + councillors = page.xpath('//div[@id = "block-districtdistrictindex"]//ul/li') assert len(councillors), "No councillors found" for councillor in councillors: photo_div = councillor.xpath("./a/div[1]")[0] info_div = councillor.xpath("./a/div[2]")[0] district = re.sub(r"\s*[–—-]\s*", "—", "—".join(info_div.xpath("./p/text()"))) - # FIXME: we special-case one malformed district name. 
If you're editing this file, - # try removing these lines - if district.startswith("District 16 "): - district = district[len("District 16 ") :] + # District name different than in database + if "Westphal" in district: + district = "Cole Harbour—Westphal" name = info_div.xpath("./strong/p/text()")[0].replace("Councillor ", "").replace("Deputy Mayor ", "") + if "Mayor" in name: + role = "Mayor" + name = name.replace("Mayor ", "") + district = "Halifax" + else: + role = "Councillor" + if name != "To be determined": photo = photo_div.xpath(".//img/@src")[0] url = councillor.xpath("./a/@href")[0] - councillor_page = self.lxmlize(url) + councillor_page = self.lxmlize(url, user_agent="Mozilla/5.0") - contact_node = councillor_page.xpath('//div[@id = "block-districtdistrictprofile"]')[0] - phone = self.get_phone(contact_node, area_codes=[902]) - email = self.get_email(contact_node) + phone = self.get_phone(councillor_page, area_codes=[902]) + email = self.get_email(councillor_page) - p = Person(primary_org="legislature", name=name, district=district, role="Councillor") + p = Person(primary_org="legislature", name=name, district=district, role=role) p.add_source(COUNCIL_PAGE) p.add_source(url) p.add_contact("voice", phone, "legislature") p.add_contact("email", email) p.image = photo yield p - - mayor_page = self.lxmlize(MAYOR_PAGE, "iso-8859-1") - name = " ".join(mayor_page.xpath("//h1/text()")).replace("Mayor", "").strip() - contact_div = mayor_page.xpath('//aside[contains(@class, "layout-sidebar-second")]/section/div[1]')[0] - phone = self.get_phone(contact_div.xpath("./p[2]")[0]) - email = self.get_email(contact_div.xpath("./p[2]")[0]) - - p = Person(primary_org="legislature", name=name, district="Halifax", role="Mayor") - p.add_source(MAYOR_PAGE) - p.add_contact("email", email) - p.add_contact("voice", phone, "legislature") - yield p diff --git a/ca_nt/people.py b/ca_nt/people.py index d9460a78..d04fb9db 100644 --- a/ca_nt/people.py +++ b/ca_nt/people.py @@ -1,3 +1,5 
@@ +import contextlib + from utils import CanadianPerson as Person from utils import CanadianScraper @@ -24,10 +26,8 @@ def scrape(self): p = Person(primary_org="legislature", name=name, district=district, role="MLA") p.add_source(COUNCIL_PAGE) p.add_source(url) - try: + with contextlib.suppress(IndexError): p.image = page.xpath('//div[contains(@class, "field--name-field-media-image")]/img/@src')[0] - except IndexError: - pass contact = page.xpath('//*[contains(@class, "paragraph--type--office")]')[0] if len(contact.xpath('./div[contains(@class, "office-address-wrapper")]')) == 0: @@ -35,7 +35,7 @@ def scrape(self): else: address_section = contact - def handle_address(contact, address_type): + def handle_address(p, contact, address_type): address_lines = [] po_box_line = ( "PO Box " @@ -56,7 +56,7 @@ def handle_address(contact, address_type): address_type, ) - def handle_phone(lines, phone_type): + def handle_phone(p, lines, phone_type): first_phone_added = False for line in lines: if "Assistant" in line.strip(): @@ -71,8 +71,8 @@ def handle_phone(lines, phone_type): first_phone_added = True contact_lines = contact.xpath(".//text()") - handle_address(address_section, "legislature") - handle_phone(contact_lines, "legislature") + handle_address(p, address_section, "legislature") + handle_phone(p, contact_lines, "legislature") email_elements = page.xpath( '//*[contains(@class, "field--paragraph--field-email")]/div[@class="field__item"]' diff --git a/ca_nu/people.py b/ca_nu/people.py index 363f2c98..38b1c8eb 100644 --- a/ca_nu/people.py +++ b/ca_nu/people.py @@ -1,3 +1,5 @@ +import contextlib + from utils import CanadianPerson as Person from utils import CanadianScraper @@ -22,17 +24,15 @@ def scrape(self): p = Person(primary_org="legislature", name=name, district=district, role="MLA", party=party) p.add_source(COUNCIL_PAGE) p.add_source(url) - try: + with contextlib.suppress(IndexError): p.image = page.xpath('//div[contains(@class, 
"field--name-field-member-photo")]/div[2]/img/@src')[0] - except IndexError: - pass contact = page.xpath('//div[contains(@class, "field--name-field-member-constituency")]/div[2]/div/p')[0] website = contact.xpath("./div[3]/div[3]/div[2]/a") if website: p.add_link(website[0].text_content()) - def handle_address(lines, address_type): + def handle_address(p, lines, address_type): address_lines = [] for line in lines: if ":" in line.strip(): # Room:, Phone:, Fax: @@ -45,15 +45,15 @@ def handle_address(lines, address_type): address_type, ) - def handle_phone(lines, phone_type): + def handle_phone(p, lines, phone_type): for line in lines: if "Phone:" in line: number = line.replace("Phone: (867) ", "") p.add_contact("voice", number, phone_type, area_code=867) address_lines = contact.xpath("./text()") - handle_address(address_lines, "legislature") - handle_phone(address_lines, "legislature") + handle_address(p, address_lines, "legislature") + handle_phone(p, address_lines, "legislature") email = self.get_email(contact, error=False) if email: diff --git a/ca_on/people.py b/ca_on/people.py index 82865e66..885aa199 100644 --- a/ca_on/people.py +++ b/ca_on/people.py @@ -42,7 +42,7 @@ def scrape(self): '//div[@block="block-views-block-member-current-party-block"]//div[@class="view-content"]//text()' ) - party = [item for item in party if item.strip()][0] + party = next(item for item in party if item.strip()) p = Person(primary_org="legislature", name=name, district=district, role="MPP", party=party) p.add_source(COUNCIL_PAGE) p.add_source(url) @@ -58,7 +58,7 @@ def scrape(self): p.extras["constituency_email"] = emails.pop(0) for heading, note in headings.items(): - office = node.xpath('//h3[contains(., "{}")]'.format(heading)) + office = node.xpath(f'//h3[contains(., "{heading}")]') if office: try: office_info = office[0].xpath( diff --git a/ca_on_ajax/__init__.py b/ca_on_ajax/__init__.py index 5f8340f8..ddbda0dc 100644 --- a/ca_on_ajax/__init__.py +++ b/ca_on_ajax/__init__.py 
@@ -15,10 +15,8 @@ def get_organizations(self): organization.add_post(role="Mayor", label=self.division_name, division_id=self.division_id) for ward_number in range(1, 4): - division_id = "{}/ward:{}".format(self.division_id, ward_number) - organization.add_post( - role="Regional Councillor", label="Ward {}".format(ward_number), division_id=division_id - ) - organization.add_post(role="Councillor", label="Ward {}".format(ward_number), division_id=division_id) + division_id = f"{self.division_id}/ward:{ward_number}" + organization.add_post(role="Regional Councillor", label=f"Ward {ward_number}", division_id=division_id) + organization.add_post(role="Councillor", label=f"Ward {ward_number}", division_id=division_id) yield organization diff --git a/ca_on_belleville/__init__.py b/ca_on_belleville/__init__.py index 6894249e..2c46ecb3 100644 --- a/ca_on_belleville/__init__.py +++ b/ca_on_belleville/__init__.py @@ -18,8 +18,8 @@ def get_organizations(self): for seat_number in range(1, stop): organization.add_post( role="Councillor", - label="Ward {} (seat {})".format(ward_number, seat_number), - division_id="{}/ward:{}".format(self.division_id, ward_number), + label=f"Ward {ward_number} (seat {seat_number})", + division_id=f"{self.division_id}/ward:{ward_number}", ) yield organization diff --git a/ca_on_belleville/people.py b/ca_on_belleville/people.py index fce0386d..3aebe5c6 100644 --- a/ca_on_belleville/people.py +++ b/ca_on_belleville/people.py @@ -36,7 +36,7 @@ def scrape(self): councillors = ward.xpath("./following-sibling::*[img]") for councillor in councillors: self.seat_numbers[ward_name] += 1 - district = "{} (seat {})".format(ward_name, self.seat_numbers[ward_name]) + district = f"{ward_name} (seat {self.seat_numbers[ward_name]})" role = "Councillor" name = councillor.xpath("./following-sibling::p")[0].text_content() diff --git a/ca_on_brampton/__init__.py b/ca_on_brampton/__init__.py index 1c6f28d4..ca0cf1aa 100644 --- a/ca_on_brampton/__init__.py +++ 
b/ca_on_brampton/__init__.py @@ -15,10 +15,8 @@ def get_organizations(self): organization.add_post(role="Mayor", label=self.division_name, division_id=self.division_id) for ward_number in range(1, 11): - division_id = "{}/ward:{}".format(self.division_id, ward_number) - organization.add_post( - role="Regional Councillor", label="Ward {}".format(ward_number), division_id=division_id - ) - organization.add_post(role="Councillor", label="Ward {}".format(ward_number), division_id=division_id) + division_id = f"{self.division_id}/ward:{ward_number}" + organization.add_post(role="Regional Councillor", label=f"Ward {ward_number}", division_id=division_id) + organization.add_post(role="Councillor", label=f"Ward {ward_number}", division_id=division_id) yield organization diff --git a/ca_on_brantford/__init__.py b/ca_on_brantford/__init__.py index 86cb306a..5bac19e3 100644 --- a/ca_on_brantford/__init__.py +++ b/ca_on_brantford/__init__.py @@ -18,8 +18,8 @@ def get_organizations(self): for seat_number in range(1, 3): organization.add_post( role="Councillor", - label="Ward {} (seat {})".format(ward_number, seat_number), - division_id="{}/ward:{}".format(self.division_id, ward_number), + label=f"Ward {ward_number} (seat {seat_number})", + division_id=f"{self.division_id}/ward:{ward_number}", ) yield organization diff --git a/ca_on_caledon/people.py b/ca_on_caledon/people.py index 03ec332d..837019c1 100644 --- a/ca_on_caledon/people.py +++ b/ca_on_caledon/people.py @@ -1,7 +1,5 @@ import re -import requests - from utils import CanadianPerson as Person from utils import CanadianScraper @@ -33,10 +31,8 @@ def scrape(self): # phone numbers populated by JS request contact_num = page.xpath('//div[@class="contactBody"]/div/@id')[0].replace("contactEntry_", "") - contact_data = requests.get( - "https://www.caledon.ca//Modules/Contact/services/GetContactHTML.ashx?isMobile=false&param={}&lang=en".format( - contact_num - ) + contact_data = self.get( +
f"https://www.caledon.ca//Modules/Contact/services/GetContactHTML.ashx?isMobile=false&param={contact_num}&lang=en" ).text voice = re.findall(r"(?<=tel://)\d+(?=\">)", contact_data) @@ -46,7 +42,7 @@ def scrape(self): if "&" in district: # Councillor for multiple wards wards = re.findall(r"\d", district) for ward_num in wards: - p = Person(primary_org="legislature", name=name, district="Ward {}".format(ward_num), role=role) + p = Person(primary_org="legislature", name=name, district=f"Ward {ward_num}", role=role) if voice: p.add_contact("voice", voice[0], "legislature") p.image = image diff --git a/ca_on_cambridge/__init__.py b/ca_on_cambridge/__init__.py index a3b13617..cbaa6a01 100644 --- a/ca_on_cambridge/__init__.py +++ b/ca_on_cambridge/__init__.py @@ -17,14 +17,14 @@ def get_organizations(self): for seat_number in range(1, 3): organization.add_post( role="Regional Councillor", - label="{} (seat {})".format(self.division_name, seat_number), + label=f"{self.division_name} (seat {seat_number})", division_id=self.division_id, ) for ward_number in range(1, 9): organization.add_post( role="Councillor", - label="Ward {}".format(ward_number), - division_id="{}/ward:{}".format(self.division_id, ward_number), + label=f"Ward {ward_number}", + division_id=f"{self.division_id}/ward:{ward_number}", ) yield organization diff --git a/ca_on_chatham_kent/__init__.py b/ca_on_chatham_kent/__init__.py index 86be75b6..0a696434 100644 --- a/ca_on_chatham_kent/__init__.py +++ b/ca_on_chatham_kent/__init__.py @@ -18,8 +18,8 @@ def get_organizations(self): for seat_number in range(1, stop): organization.add_post( role="Councillor", - label="Ward {} (seat {})".format(ward_number, seat_number), - division_id="{}/ward:{}".format(self.division_id, ward_number), + label=f"Ward {ward_number} (seat {seat_number})", + division_id=f"{self.division_id}/ward:{ward_number}", ) yield organization diff --git a/ca_on_chatham_kent/people.py b/ca_on_chatham_kent/people.py index 290940d8..c5a02d6a 100644
--- a/ca_on_chatham_kent/people.py +++ b/ca_on_chatham_kent/people.py @@ -1,7 +1,6 @@ import re from collections import defaultdict -import requests from lxml import etree from utils import CanadianPerson as Person @@ -19,8 +18,8 @@ def scrape(self): headers = {"content-type": "text/xml"} body = 'councillorsByWard50' - response = requests.post(url=COUNCIL_DATA_URL, data=body, headers=headers) - page = etree.fromstring(response.content) + response = self.post(url=COUNCIL_DATA_URL, data=body, headers=headers) + page = etree.fromstring(response.content) # noqa: S320 namespace = {"z": "#RowsetSchema", "rs": "urn:schemas-microsoft-com:rowset"} councillors = page.findall(".//z:row", namespace) @@ -30,7 +29,7 @@ def scrape(self): ward, name = re.split(r"(?<=\d)\s", title) name.replace("Councillor ", "") seat_numbers[ward] += 1 - district = "{} (seat {})".format(ward, seat_numbers[ward]) + district = f"{ward} (seat {seat_numbers[ward]})" url = councillor.xpath("./@ows_URL")[0].split(",")[0] page = self.lxmlize(url, user_agent="Mozilla/5.0") diff --git a/ca_on_clarington/__init__.py b/ca_on_clarington/__init__.py index c8732d1e..6c2b2c7f 100644 --- a/ca_on_clarington/__init__.py +++ b/ca_on_clarington/__init__.py @@ -17,6 +17,6 @@ def get_organizations(self): organization.add_post(role="Regional Councillor", label="Wards 1 and 2") organization.add_post(role="Regional Councillor", label="Wards 3 and 4") for ward_number in range(1, 5): - organization.add_post(role="Councillor", label="Ward {}".format(ward_number)) + organization.add_post(role="Councillor", label=f"Ward {ward_number}") yield organization diff --git a/ca_on_clarington/people.py b/ca_on_clarington/people.py index 036d40c2..01869853 100644 --- a/ca_on_clarington/people.py +++ b/ca_on_clarington/people.py @@ -3,26 +3,33 @@ from utils import CanadianPerson as Person from utils import CanadianScraper -COUNCIL_PAGE = "http://www.clarington.net/index.php?content=townhall/council" +COUNCIL_PAGE = 
"https://www.clarington.net/en/town-hall/Meet-Your-Councillors.aspx" +MAYOR_PAGE = "https://www.clarington.net/en/town-hall/mayor.aspx" class ClaringtonPersonScraper(CanadianScraper): def scrape(self): page = self.lxmlize(COUNCIL_PAGE) - councillors = page.xpath("//h2") + councillors = page.xpath("//td[@data-name='accParent']") assert len(councillors), "No councillors found" - for person_header_elem in councillors: - role, name_post = person_header_elem.text.split(" - ") - try: - name, caps_post = re.match(r"(.+) \((.+)\)", name_post).groups() - post = caps_post.title() - except AttributeError: - name = name_post - post = "Clarington" - email = person_header_elem.xpath("./following-sibling::a[1]/@href")[0][len("mailto:") :] - photo_url = person_header_elem.xpath("./following-sibling::img[1]/@src")[0] - p = Person(primary_org="legislature", name=name, district=post, role=role, image=photo_url) + for councillor in councillors: + name, role_district = councillor.text_content().split(" - ") + role, district = re.split(r"(?<=Councillor) ", role_district, maxsplit=1) + content_node = councillor.xpath("../following-sibling::tr")[0] + email = self.get_email(content_node) + photo_url = content_node.xpath(".//img/@src")[0] + p = Person(primary_org="legislature", name=name, district=district, role=role, image=photo_url) p.add_source(COUNCIL_PAGE) p.add_contact("email", email) yield p + + page = self.lxmlize(MAYOR_PAGE).xpath('//div[@id="mainContent"]')[0] + name = page.xpath(".//img/@alt")[0].replace("Mayor", "").strip() + photo_url = page.xpath(".//img/@src")[0] + email = self.get_email(page) + + p = Person(primary_org="legislature", name=name, district="Clarington", role="Mayor", image=photo_url) + p.add_contact("email", email) + p.add_source(MAYOR_PAGE) + yield p diff --git a/ca_on_fort_erie/__init__.py b/ca_on_fort_erie/__init__.py index 6bc1d6fb..8d016a6f 100644 --- a/ca_on_fort_erie/__init__.py +++ b/ca_on_fort_erie/__init__.py @@ -15,6 +15,6 @@ def 
get_organizations(self): organization.add_post(role="Mayor", label=self.division_name, division_id=self.division_id) for ward_number in range(1, 7): - organization.add_post(role="Councillor", label="Ward {}".format(ward_number), division_id=self.division_id) + organization.add_post(role="Councillor", label=f"Ward {ward_number}", division_id=self.division_id) yield organization diff --git a/ca_on_georgina/__init__.py b/ca_on_georgina/__init__.py index 3dcef56c..ed2903dd 100644 --- a/ca_on_georgina/__init__.py +++ b/ca_on_georgina/__init__.py @@ -20,7 +20,7 @@ def get_organizations(self): # organization.add_post(role='Councillor', label='Ward {}'.format(ward_number), division_id=self.division_id) organization.add_post( role="Councillor", - label="{} (seat {})".format(self.division_name, ward_number), + label=f"{self.division_name} (seat {ward_number})", division_id=self.division_id, ) diff --git a/ca_on_grimsby/__init__.py b/ca_on_grimsby/__init__.py index 3ce8ccb9..abf7b183 100644 --- a/ca_on_grimsby/__init__.py +++ b/ca_on_grimsby/__init__.py @@ -18,8 +18,8 @@ def get_organizations(self): for seat_number in range(1, 3): organization.add_post( role="Councillor", - label="Ward {} (seat {})".format(ward_number, seat_number), - division_id="{}/ward:{}".format(self.division_id, ward_number), + label=f"Ward {ward_number} (seat {seat_number})", + division_id=f"{self.division_id}/ward:{ward_number}", ) yield organization diff --git a/ca_on_grimsby/people.py b/ca_on_grimsby/people.py index 65f0572e..1a384ad2 100644 --- a/ca_on_grimsby/people.py +++ b/ca_on_grimsby/people.py @@ -11,19 +11,19 @@ class GrimsbyPersonScraper(CanadianScraper): def scrape(self): page = self.lxmlize(COUNCIL_PAGE) - wards = page.xpath("//div[@id='printAreaContent']//tbody/tr[td/h4]") + wards = page.xpath("//p[@class='tab ']") assert len(wards), "No wards found" for ward in wards: - area = ward.xpath(".//h4")[0].text_content() - councillors_node = ward.xpath("./following-sibling::tr/td")[0] + area = 
ward.xpath(".//a")[0].text_content().strip() + councillors_node = ward.xpath("./following-sibling::div")[0] for i in range(2): name_node = councillors_node.xpath( './/h5[contains(./strong, "Councillor")]|.//h5[contains(., "Councillor")]' )[i] - name = re.split(r"\s", name_node.text_content(), 1)[1] - district = "{} (seat {})".format(area, i + 1) + name = re.split(r"\s", name_node.text_content(), maxsplit=1)[1] + district = f"{area} (seat {i + 1})" phone = self.get_phone(name_node.xpath('./following-sibling::*[contains(., "Phone")]')[0]) email = self.get_email(name_node.xpath("./following-sibling::p[contains(., 'Email')]")[0]) image = councillors_node.xpath(".//@src")[i] @@ -39,8 +39,8 @@ def scrape(self): role, name = page.xpath("//h3")[0].text_content().split(" ", 1) email = self.get_email(page) - phone = self.get_phone(page.xpath("//div[@id='printAreaContent']/p[contains(., '905')]")[0]) - image = page.xpath("//h3//@src")[0] + phone = self.get_phone(page.xpath("//div[contains(@class, 'left')]//p[contains(., '905')]")[0]) + image = page.xpath("//p//@src")[0] p = Person(primary_org="legislature", name=name, district="Grimsby", role=role, image=image) p.add_contact("email", email) diff --git a/ca_on_guelph/__init__.py b/ca_on_guelph/__init__.py index cfb78e38..4b265924 100644 --- a/ca_on_guelph/__init__.py +++ b/ca_on_guelph/__init__.py @@ -19,8 +19,8 @@ def get_organizations(self): for seat_number in range(1, 3): organization.add_post( role="Councillor", - label="Ward {} (seat {})".format(ward_number, seat_number), - division_id="{}/ward:{}".format(self.division_id, ward_number), + label=f"Ward {ward_number} (seat {seat_number})", + division_id=f"{self.division_id}/ward:{ward_number}", ) yield organization diff --git a/ca_on_huron/__init__.py b/ca_on_huron/__init__.py index 793a2a6c..389fc94d 100644 --- a/ca_on_huron/__init__.py +++ b/ca_on_huron/__init__.py @@ -56,7 +56,7 @@ def get_organizations(self): for seat_number in range(1, division["count"] + 1): 
organization.add_post( role="Councillor", - label="{} (seat {})".format(division_name, seat_number), + label=f"{division_name} (seat {seat_number})", division_id=division_id, ) diff --git a/ca_on_kawartha_lakes/people.py b/ca_on_kawartha_lakes/people.py index 767970f1..ad2d33db 100644 --- a/ca_on_kawartha_lakes/people.py +++ b/ca_on_kawartha_lakes/people.py @@ -3,34 +3,34 @@ from utils import CanadianPerson as Person from utils import CanadianScraper -COUNCIL_PAGE = "http://www.city.kawarthalakes.on.ca/city-hall/mayor-council/members-of-council" +COUNCIL_PAGE = "https://www.kawarthalakes.ca/en/municipal-services/contact-a-council-member.aspx" class KawarthaLakesPersonScraper(CanadianScraper): def scrape(self): page = self.lxmlize(COUNCIL_PAGE) - councillors = page.xpath('//p[@class="WSIndent"]/a') + councillors = page.xpath("//tr[.//h2]") assert len(councillors), "No councillors found" for councillor in councillors: - district = re.findall(r"(Ward [0-9]{1,2})", councillor.text_content()) + district = re.findall(r"(Ward \d)", councillor.text_content()) if district: district = district[0] - name = councillor.text_content().replace(district, "").strip() + name = re.sub(r"Ward \d|Councillor|Deputy Mayor|-", "", councillor.text_content()).strip() role = "Councillor" else: district = "Kawartha Lakes" name = councillor.text_content().replace("Mayor", "").strip() role = "Mayor" - url = councillor.attrib["href"] - page = self.lxmlize(url) - email = self.get_email(page) - image = page.xpath('//img[@class="image-right"]/@src')[0] + info_node = councillor.xpath("./following-sibling::*")[0] + email = self.get_email(info_node) + phone = self.get_phone(info_node) + image = info_node.xpath("//img/@src")[0] p = Person(primary_org="legislature", name=name, district=district, role=role) p.add_source(COUNCIL_PAGE) - p.add_source(url) + p.add_contact("voice", phone, "legislature") p.add_contact("email", email) p.image = image yield p diff --git a/ca_on_lambton/__init__.py 
b/ca_on_lambton/__init__.py index 57d42f26..eeaf9aae 100644 --- a/ca_on_lambton/__init__.py +++ b/ca_on_lambton/__init__.py @@ -18,7 +18,7 @@ def get_organizations(self): # @todo Fix labels along the lines of the regions for seat_number in range(1, 16): organization.add_post( role="Councillor", - label="{} (seat {})".format(self.division_name, seat_number), + label=f"{self.division_name} (seat {seat_number})", division_id=self.division_id, ) diff --git a/ca_on_lambton/people.py b/ca_on_lambton/people.py index 8cb9dc7d..2757d6d7 100644 --- a/ca_on_lambton/people.py +++ b/ca_on_lambton/people.py @@ -24,7 +24,7 @@ def scrape(self): else: role = "Councillor" name = text.replace("Councillor ", "") - district = "Lambton (seat {})".format(councillor_seat_number) + district = f"Lambton (seat {councillor_seat_number})" councillor_seat_number += 1 p = Person(primary_org="legislature", name=name, district=district, role=role) diff --git a/ca_on_lasalle/__init__.py b/ca_on_lasalle/__init__.py index fa036878..003ae973 100644 --- a/ca_on_lasalle/__init__.py +++ b/ca_on_lasalle/__init__.py @@ -18,7 +18,7 @@ def get_organizations(self): for seat_number in range(1, 6): organization.add_post( role="Councillor", - label="{} (seat {})".format(self.division_name, seat_number), + label=f"{self.division_name} (seat {seat_number})", division_id=self.division_id, ) diff --git a/ca_on_lasalle/people.py b/ca_on_lasalle/people.py index 0930fc12..d0af6041 100644 --- a/ca_on_lasalle/people.py +++ b/ca_on_lasalle/people.py @@ -3,7 +3,7 @@ from utils import CanadianPerson as Person from utils import CanadianScraper -COUNCIL_PAGE = "http://www.town.lasalle.on.ca/en/town-hall/LaSalle-Council.asp" +COUNCIL_PAGE = "https://www.lasalle.ca/en/town-hall/town-of-lasalle-council.aspx" class LaSallePersonScraper(CanadianScraper): @@ -12,39 +12,22 @@ def scrape(self): page = self.lxmlize(COUNCIL_PAGE) - councillors = page.xpath('//table[@id="Table1table"]//td/p') + councillors = 
page.xpath('//div[@class="fbg-row lb-imageBox cm-datacontainer"]') assert len(councillors), "No councillors found" for councillor in councillors: - if not councillor.text_content().strip(): - continue - name = councillor.xpath("./font/b/text()") - if not name: - name = councillor.xpath("./font/text()") - if "email" in name[0]: - name = councillor.xpath("./b/font/text()") - name = name[0] - role = "Councillor" - if "Mayor" in name: - name = name.replace("Mayor", "") - role = "Mayor" - district = "LaSalle" - else: - district = "LaSalle (seat {})".format(councillor_seat_number) - councillor_seat_number += 1 - - p = Person(primary_org="legislature", name=name, district=district, role=role) + role, name = re.split( + r"(?<=Mayor)|(?<=Councillor)", councillor.xpath(".//a/div")[0].text_content(), maxsplit=1 + ) + district = "LaSalle" if "Mayor" in role else f"LaSalle (seat {councillor_seat_number})" + image = councillor.xpath(".//img/@src")[0] + voice = re.search(r"\d{3}-\d{3}-\d{4} ext. \d+", councillor.text_content()) + cell = re.search(r"\d{3}-\d{3}-\d{4}(?! 
ext)", councillor.text_content()) + + p = Person(primary_org="legislature", name=name, role=role, district=district, image=image) p.add_source(COUNCIL_PAGE) + if voice: + p.add_contact("voice", voice.group(0), "legislature") + if cell: + p.add_contact("cell", cell.group(0), "legislature") - photo_url = councillor.xpath("./parent::td//img/@src")[0] - p.image = photo_url - - email = self.get_email(councillor) - p.add_contact("email", email) - - phone = re.findall(r"(?<=phone:)(.*)(?=home)", councillor.text_content(), flags=re.DOTALL) - if phone: - p.add_contact("voice", phone[0].strip(), "legislature") - - home_phone = re.findall(r"(?<=home phone:)(.*)", councillor.text_content(), flags=re.DOTALL)[0] - p.add_contact("voice", home_phone.strip(), "residence") yield p diff --git a/ca_on_lincoln/__init__.py b/ca_on_lincoln/__init__.py index 3f3bba06..7624ad1c 100644 --- a/ca_on_lincoln/__init__.py +++ b/ca_on_lincoln/__init__.py @@ -18,8 +18,8 @@ def get_organizations(self): for seat_number in range(1, 3): organization.add_post( role="Councillor", - label="Ward {} (seat {})".format(ward_number, seat_number), - division_id="{}/ward:{}".format(self.division_id, ward_number), + label=f"Ward {ward_number} (seat {seat_number})", + division_id=f"{self.division_id}/ward:{ward_number}", ) yield organization diff --git a/ca_on_markham/__init__.py b/ca_on_markham/__init__.py index f229f92a..26c51504 100644 --- a/ca_on_markham/__init__.py +++ b/ca_on_markham/__init__.py @@ -18,14 +18,14 @@ def get_organizations(self): for seat_number in range(1, 4): organization.add_post( role="Regional Councillor", - label="{} (seat {})".format(self.division_name, seat_number), + label=f"{self.division_name} (seat {seat_number})", division_id=self.division_id, ) for ward_number in range(1, 9): organization.add_post( role="Councillor", - label="Ward {}".format(ward_number), - division_id="{}/ward:{}".format(self.division_id, ward_number), + label=f"Ward {ward_number}", + 
division_id=f"{self.division_id}/ward:{ward_number}", ) yield organization diff --git a/ca_on_markham/people.py b/ca_on_markham/people.py index d2f73fa4..2b01dfd7 100644 --- a/ca_on_markham/people.py +++ b/ca_on_markham/people.py @@ -26,7 +26,7 @@ def scrape(self): role = "Councillor" elif "Regional" in district: role = "Regional Councillor" - district = "Markham (seat {})".format(regional_councillor_seat_number) + district = f"Markham (seat {regional_councillor_seat_number})" regional_councillor_seat_number += 1 else: role = district diff --git a/ca_on_milton/__init__.py b/ca_on_milton/__init__.py index a1247215..09f6e79f 100644 --- a/ca_on_milton/__init__.py +++ b/ca_on_milton/__init__.py @@ -15,10 +15,8 @@ def get_organizations(self): organization.add_post(role="Mayor", label=self.division_name, division_id=self.division_id) for ward_number in range(1, 5): - division_id = "{}/ward:{}".format(self.division_id, ward_number) - organization.add_post( - role="Regional Councillor", label="Ward {}".format(ward_number), division_id=division_id - ) - organization.add_post(role="Councillor", label="Ward {}".format(ward_number), division_id=division_id) + division_id = f"{self.division_id}/ward:{ward_number}" + organization.add_post(role="Regional Councillor", label=f"Ward {ward_number}", division_id=division_id) + organization.add_post(role="Councillor", label=f"Ward {ward_number}", division_id=division_id) yield organization diff --git a/ca_on_newmarket/__init__.py b/ca_on_newmarket/__init__.py index 82f8dbe4..3e8bc0e0 100644 --- a/ca_on_newmarket/__init__.py +++ b/ca_on_newmarket/__init__.py @@ -18,8 +18,8 @@ def get_organizations(self): for ward_number in range(1, 8): organization.add_post( role="Councillor", - label="Ward {}".format(ward_number), - division_id="{}/ward:{}".format(self.division_id, ward_number), + label=f"Ward {ward_number}", + division_id=f"{self.division_id}/ward:{ward_number}", ) yield organization diff --git a/ca_on_niagara/__init__.py 
b/ca_on_niagara/__init__.py index 7c6bc4e9..bc5cc9dc 100644 --- a/ca_on_niagara/__init__.py +++ b/ca_on_niagara/__init__.py @@ -70,7 +70,7 @@ def get_organizations(self): organization.add_post(role="Mayor", label=division_name, division_id=division_id) for seat_number in range(1, division["count"] + 1): organization.add_post( - role="Councillor", label="{} (seat {})".format(division_name, seat_number), division_id=division_id + role="Councillor", label=f"{division_name} (seat {seat_number})", division_id=division_id ) yield organization diff --git a/ca_on_niagara_on_the_lake/__init__.py b/ca_on_niagara_on_the_lake/__init__.py index 92dd00b7..d195985e 100644 --- a/ca_on_niagara_on_the_lake/__init__.py +++ b/ca_on_niagara_on_the_lake/__init__.py @@ -18,7 +18,7 @@ def get_organizations(self): for seat_number in range(1, 9): organization.add_post( role="Councillor", - label="{} (seat {})".format(self.division_name, seat_number), + label=f"{self.division_name} (seat {seat_number})", division_id=self.division_id, ) diff --git a/ca_on_north_dumfries/people.py b/ca_on_north_dumfries/people.py index 9573eaa8..ffe19fc5 100644 --- a/ca_on_north_dumfries/people.py +++ b/ca_on_north_dumfries/people.py @@ -23,10 +23,7 @@ def scrape(self): role = match.group(2) name = match.group(3) - if role == "Mayor": - district = "North Dumfries" - else: - district = "Ward {}".format(word_to_number[match.group(1)]) + district = "North Dumfries" if role == "Mayor" else f"Ward {word_to_number[match.group(1)]}" p = Person(primary_org="legislature", name=name, district=district, role=role) p.add_source(COUNCIL_PAGE) diff --git a/ca_on_oakville/__init__.py b/ca_on_oakville/__init__.py index 9c0373c2..113d8b9b 100644 --- a/ca_on_oakville/__init__.py +++ b/ca_on_oakville/__init__.py @@ -15,10 +15,8 @@ def get_organizations(self): organization.add_post(role="Mayor", label=self.division_name, division_id=self.division_id) for ward_number in range(1, 8): - division_id = 
"{}/ward:{}".format(self.division_id, ward_number) - organization.add_post( - role="Regional Councillor", label="Ward {}".format(ward_number), division_id=division_id - ) - organization.add_post(role="Councillor", label="Ward {}".format(ward_number), division_id=division_id) + division_id = f"{self.division_id}/ward:{ward_number}" + organization.add_post(role="Regional Councillor", label=f"Ward {ward_number}", division_id=division_id) + organization.add_post(role="Councillor", label=f"Ward {ward_number}", division_id=division_id) yield organization diff --git a/ca_on_oakville/people.py b/ca_on_oakville/people.py index e51f9ef1..c7362962 100644 --- a/ca_on_oakville/people.py +++ b/ca_on_oakville/people.py @@ -19,11 +19,8 @@ def scrape(self): district = "Oakville" role = district_role else: - district, role = re.split(r"(?<=\d)\s+", district_role, 1) - if "Regional" in role: - role = "Regional Councillor" - else: - role = "Councillor" + district, role = re.split(r"(?<=\d)\s+", district_role, maxsplit=1) + role = "Regional Councillor" if "Regional" in role else "Councillor" name = councillor.xpath(".//div[@class='user-name']/text()")[0] email = self.get_email(councillor) diff --git a/ca_on_oshawa/__init__.py b/ca_on_oshawa/__init__.py index eae140e7..14e62cd1 100644 --- a/ca_on_oshawa/__init__.py +++ b/ca_on_oshawa/__init__.py @@ -15,10 +15,8 @@ def get_organizations(self): organization.add_post(role="Mayor", label=self.division_name, division_id=self.division_id) for ward_number in range(1, 6): - division_id = "{}/ward:{}".format(self.division_id, ward_number) - organization.add_post( - role="Regional Councillor", label="Ward {}".format(ward_number), division_id=division_id - ) - organization.add_post(role="Councillor", label="Ward {}".format(ward_number), division_id=division_id) + division_id = f"{self.division_id}/ward:{ward_number}" + organization.add_post(role="Regional Councillor", label=f"Ward {ward_number}", division_id=division_id) + 
organization.add_post(role="Councillor", label=f"Ward {ward_number}", division_id=division_id) yield organization diff --git a/ca_on_oshawa/people.py b/ca_on_oshawa/people.py index 7105e8e2..df02029e 100644 --- a/ca_on_oshawa/people.py +++ b/ca_on_oshawa/people.py @@ -20,12 +20,9 @@ def scrape(self): district = "Oshawa" name = info.replace("Mayor ", "") else: - district, role_name = re.split(r"(?<=\d)\s", info, 1) - if "Regional" in role_name: - role = "Regional Councillor" - else: - role = "Councillor" - name = re.split(r"Councillor\s", role_name, 1)[1] + district, role_name = re.split(r"(?<=\d)\s", info, maxsplit=1) + role = "Regional Councillor" if "Regional" in role_name else "Councillor" + name = re.split(r"Councillor\s", role_name, maxsplit=1)[1] photo_url = councillor.xpath(".//img/@src")[0] phone = self.get_phone(councillor) diff --git a/ca_on_ottawa/people.py b/ca_on_ottawa/people.py index 5d1054ee..9db7f422 100644 --- a/ca_on_ottawa/people.py +++ b/ca_on_ottawa/people.py @@ -2,11 +2,7 @@ class OttawaPersonScraper(CSVScraper): - # http://data.ottawa.ca/dataset/elected-officials - csv_url = "http://data.ottawa.ca/dataset/fd26ae83-fe1a-40d8-8951-72df40021c82/resource/3cd1b14d-cb45-4c4d-b22a-a607946e2ec2/download/elected-officials-2018-2022.csv" - encoding = "utf-8-sig" - corrections = { - "district name": { - "Orl\u0082ans": "Orléans", - }, - } + # https://open.ottawa.ca/documents/ottawa::elected-officials-2022-2026/about + csv_url = "https://www.arcgis.com/sharing/rest/content/items/a5e9dc2425274bb796d3ded47b0d7b00/data" + fallbacks = {"district name": "ward name"} + extension = ".xls" diff --git a/ca_on_peel/__init__.py b/ca_on_peel/__init__.py index b22abfc0..4f2a0754 100644 --- a/ca_on_peel/__init__.py +++ b/ca_on_peel/__init__.py @@ -20,21 +20,21 @@ def get_organizations(self): for ward_number in range(1, 7): organization.add_post( role="Councillor", - label="Caledon Ward {} (seat 1)".format(ward_number), - 
division_id="ocd-division/country:ca/csd:3521024/ward:{}".format(ward_number), + label=f"Caledon Ward {ward_number} (seat 1)", + division_id=f"ocd-division/country:ca/csd:3521024/ward:{ward_number}", ) for ward_number in range(1, 11): for seat_number in range(1, 3 if ward_number <= 6 else 2): organization.add_post( role="Councillor", - label="Brampton Ward {} (seat {})".format(ward_number, seat_number), - division_id="ocd-division/country:ca/csd:3521010/ward:{}".format(ward_number), + label=f"Brampton Ward {ward_number} (seat {seat_number})", + division_id=f"ocd-division/country:ca/csd:3521010/ward:{ward_number}", ) for ward_number in range(1, 12): organization.add_post( role="Councillor", - label="Mississauga Ward {} (seat 1)".format(ward_number), - division_id="ocd-division/country:ca/csd:3521005/ward:{}".format(ward_number), + label=f"Mississauga Ward {ward_number} (seat 1)", + division_id=f"ocd-division/country:ca/csd:3521005/ward:{ward_number}", ) yield organization diff --git a/ca_on_pickering/__init__.py b/ca_on_pickering/__init__.py index 60739797..0acff44d 100644 --- a/ca_on_pickering/__init__.py +++ b/ca_on_pickering/__init__.py @@ -15,7 +15,7 @@ def get_organizations(self): organization.add_post(role="Mayor", label=self.division_name, division_id=self.division_id) for ward_number in range(1, 4): - organization.add_post(role="Regional Councillor", label="Ward {}".format(ward_number)) - organization.add_post(role="Councillor", label="Ward {}".format(ward_number)) + organization.add_post(role="Regional Councillor", label=f"Ward {ward_number}") + organization.add_post(role="Councillor", label=f"Ward {ward_number}") yield organization diff --git a/ca_on_pickering/people.py b/ca_on_pickering/people.py index 78bfdd3f..420b3ede 100644 --- a/ca_on_pickering/people.py +++ b/ca_on_pickering/people.py @@ -21,7 +21,7 @@ def scrape(self): if "Councillor" in name: name = name.replace("Councillor", "").strip() role_ward = councillor.xpath(".//text()")[1] - role, ward = 
re.split(r"\s(?=Ward)", role_ward, 1) + role, ward = re.split(r"\s(?=Ward)", role_ward, maxsplit=1) else: name = name.replace("Mayor", "") role = "Mayor" diff --git a/ca_on_richmond_hill/__init__.py b/ca_on_richmond_hill/__init__.py index fd8c9735..32482ccc 100644 --- a/ca_on_richmond_hill/__init__.py +++ b/ca_on_richmond_hill/__init__.py @@ -17,14 +17,14 @@ def get_organizations(self): for seat_number in range(1, 3): organization.add_post( role="Regional Councillor", - label="{} (seat {})".format(self.division_name, seat_number), + label=f"{self.division_name} (seat {seat_number})", division_id=self.division_id, ) for ward_number in range(1, 7): organization.add_post( role="Councillor", - label="Ward {}".format(ward_number), - division_id="{}/ward:{}".format(self.division_id, ward_number), + label=f"Ward {ward_number}", + division_id=f"{self.division_id}/ward:{ward_number}", ) yield organization diff --git a/ca_on_richmond_hill/people.py b/ca_on_richmond_hill/people.py index 9f92f22c..f6163c4a 100644 --- a/ca_on_richmond_hill/people.py +++ b/ca_on_richmond_hill/people.py @@ -14,7 +14,7 @@ def scrape(self): urls = page.xpath('//h3[contains(text(), "Regional and Local Councillors")]/following-sibling::p[1]//@href') assert len(urls), "No regional councillors found" for index, url in enumerate(urls, 1): - yield self.process(url, "Richmond Hill (seat {})".format(index), "Regional Councillor") + yield self.process(url, f"Richmond Hill (seat {index})", "Regional Councillor") councillors = page.xpath('//h3[text()="Local Councillors"]/following-sibling::p') assert len(councillors), "No councillors found" diff --git a/ca_on_sault_ste_marie/__init__.py b/ca_on_sault_ste_marie/__init__.py index 7bee6834..e329e1e8 100644 --- a/ca_on_sault_ste_marie/__init__.py +++ b/ca_on_sault_ste_marie/__init__.py @@ -1,3 +1,5 @@ +from pupa.scrape import Organization + from utils import CanadianJurisdiction @@ -7,3 +9,17 @@ class SaultSteMarie(CanadianJurisdiction): division_name = "Sault 
Ste. Marie" name = "Sault Ste. Marie City Council" url = "http://www.city.sault-ste-marie.on.ca" + + def get_organizations(self): + organization = Organization(self.name, classification=self.classification) + + organization.add_post(role="Mayor", label=self.division_name, division_id=self.division_id) + for ward_number in range(1, 6): + for seat_number in range(1, 3): + organization.add_post( + role="Councillor", + label=f"Ward {ward_number} (seat {seat_number})", + division_id=f"{self.division_id}/ward:{ward_number}", + ) + + yield organization diff --git a/ca_on_sault_ste_marie/people.py b/ca_on_sault_ste_marie/people.py index ffc890fb..5ff2672c 100644 --- a/ca_on_sault_ste_marie/people.py +++ b/ca_on_sault_ste_marie/people.py @@ -1,57 +1,49 @@ -from urllib.parse import urljoin +import re +from collections import defaultdict from utils import CanadianPerson as Person from utils import CanadianScraper -COUNCIL_PAGE = "http://www.city.sault-ste-marie.on.ca/Open_Page.aspx?ID=174&deptid=1" - - -def word_to_number(word): - words = ("one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten") - return words.index(word.lower()) + 1 - - -def district_name_using_number(name): - district_split = name.split() - return " ".join([district_split[0], str(word_to_number(district_split[1]))]) +COUNCIL_PAGE = "https://saultstemarie.ca/Government/City-Council.aspx" class SaultSteMariePersonScraper(CanadianScraper): def scrape(self): page = self.lxmlize(COUNCIL_PAGE) - table_data = page.xpath('//div[@id="litcontentDiv"]//tr') - council_data = table_data[2:-1] - - mayor_row = table_data[0] - - photo_url_rel = mayor_row.xpath("string(.//img/@src)") # can be empty - photo_url = urljoin(COUNCIL_PAGE, photo_url_rel) - contact_node = mayor_row.xpath("./td")[1] - name = contact_node.xpath(".//font[1]/text()")[0] - email = self.get_email(contact_node) - - p = Person(primary_org="legislature", name=name, district="Sault Ste. 
Marie", role="Mayor") - p.add_source(COUNCIL_PAGE) - p.add_contact("email", email) - p.image = photo_url - yield p - - # alternate between a row represneting a ward name and councilors - assert len(council_data), "No councillors found" - for ward_row, data_row in zip(*[iter(council_data)] * 2): - district = ward_row.xpath('.//text()[contains(., "Ward")]')[0] - district_num = district_name_using_number(district) - for councillor_node in data_row.xpath("./td"): - name = councillor_node.xpath(".//strong/text()|.//font[1]/text()")[0] - email = self.get_email(councillor_node) - photo_url_rel = councillor_node.xpath("string(.//img/@src)") # can be empty - photo_url = urljoin(COUNCIL_PAGE, photo_url_rel) - # address and phone are brittle, inconsistent - - p = Person(primary_org="legislature", name=name, district=district_num, role="Councillor") - p.add_source(COUNCIL_PAGE) - if email: - p.add_contact("email", email) - p.image = photo_url - - yield p + seat_numbers = defaultdict(int) + + councillors = page.xpath('//div[@class="mb-2"]//@href') + assert len(councillors), "No councillors found" + + for link in councillors: + page = self.lxmlize(link) + title = page.xpath("//h1")[0].text_content() + if "Mayor" in title: + role = "Mayor" + name = title.replace("Mayor ", "") + district = "Sault Ste. 
Marie" + image = None # No image on the Mayor's page at the moment + contact_node = page.xpath('//div[@id="mainContent_contactUs"]')[0] + phone_numbers = re.findall(r"\d{3}-\d{3}-\d{4}", contact_node.text_content()) + phone = phone_numbers[0] + fax = phone_numbers[1] + else: + role = "Councillor" + area, name = title.split(" Councillor ") + seat_numbers[area] += 1 + district = f"{area} (seat {seat_numbers[area]})" + image = page.xpath(".//h3/img/@src")[0] + contact_node = page.xpath('//div[@id="mainContent_left"]')[0] + phone = self.get_phone(contact_node) + email = self.get_email(contact_node) + + p = Person(primary_org="legislature", name=name, district=district, role=role) + if image: + p.image = image + if fax: + p.add_contact("fax", fax, "legislature") + p.add_contact("email", email) + p.add_contact("voice", phone, "legislature") + p.add_source(COUNCIL_PAGE) + p.add_source(link) + yield p diff --git a/ca_on_school_boards_english_public/__init__.py b/ca_on_school_boards_english_public/__init__.py deleted file mode 100644 index bc76eb9f..00000000 --- a/ca_on_school_boards_english_public/__init__.py +++ /dev/null @@ -1,25 +0,0 @@ -from opencivicdata.divisions import Division -from pupa.scrape import Organization - -from utils import CanadianJurisdiction - - -class OntarioEnglishPublicSchoolBoards(CanadianJurisdiction): - classification = "school" # just to avoid clash - division_id = "ocd-division/country:ca/province:on" - division_name = 'Ontario English Public School Board boundary"' - name = "Ontario English Public School Boards" - url = "http://www.edu.gov.on.ca/eng/sbinfo/boardList.html" - - def get_organizations(self): - organization = Organization(self.name, classification="committee") - organization.add_source(self.url) - - for division in Division.get(self.division_id).children("school_district"): - organization.add_post(role="Chair", label=division.name, division_id=division.id) - for i in range(0, 22): # XXX made-up number - organization.add_post( - 
role="Trustee", label="{} (seat {})".format(division.name, i), division_id=division.id - ) - - yield organization diff --git a/ca_on_school_boards_english_public/people.py b/ca_on_school_boards_english_public/people.py deleted file mode 100644 index 2ee7369a..00000000 --- a/ca_on_school_boards_english_public/people.py +++ /dev/null @@ -1,18 +0,0 @@ -from datetime import date - -from utils import CSVScraper - - -class OntarioEnglishPublicSchoolBoardsPersonScraper(CSVScraper): - # CSV source: https://docs.google.com/spreadsheets/d/1smXFR3nB9lovc6bWWcLvr621wb6E5b2TZKqUtxRTUtE/edit#gid=785048945 - csv_url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vTbnQN0j_2Ky56MeRQsNTYXnt9Q6f_vFgH_KyAZ3O96QhjLqMK_Fzrjz2lI8ympE1FU0lkKgbGEvjW0/pub?gid=785048945&single=true&output=csv" - updated_at = date(2019, 9, 13) - contact_person = "andrew@newmode.net" - many_posts_per_area = True - unique_roles = ["Chair"] - encoding = "utf-8" - corrections = {"district name": {}} - organization_classification = "committee" - - def is_valid_row(self, row): - return any(row.values()) and row["last name"] and row["first name"] diff --git a/ca_on_st_catharines/__init__.py b/ca_on_st_catharines/__init__.py index 40fe57d7..ce5f9aa4 100644 --- a/ca_on_st_catharines/__init__.py +++ b/ca_on_st_catharines/__init__.py @@ -18,8 +18,8 @@ def get_organizations(self): for seat_number in range(1, 3): organization.add_post( role="Councillor", - label="{} (seat {})".format(ward_name, seat_number), - division_id="{}/ward:{}".format(self.division_id, clean_type_id(ward_name)), + label=f"{ward_name} (seat {seat_number})", + division_id=f"{self.division_id}/ward:{clean_type_id(ward_name)}", ) yield organization diff --git a/ca_on_thunder_bay/__init__.py b/ca_on_thunder_bay/__init__.py index 0f573abf..c94012f6 100644 --- a/ca_on_thunder_bay/__init__.py +++ b/ca_on_thunder_bay/__init__.py @@ -17,14 +17,14 @@ def get_organizations(self): for seat_number in range(1, 6): organization.add_post( role="Councillor at 
Large", - label="{} (seat {})".format(self.division_name, seat_number), + label=f"{self.division_name} (seat {seat_number})", division_id=self.division_id, ) for ward_number, ward_name in enumerate( ("Current River", "Red River", "McKellar", "McIntyre", "Northwood", "Westfort", "Neebing"), 1 ): organization.add_post( - role="Councillor", label=ward_name, division_id="{}/ward:{}".format(self.division_id, ward_number) + role="Councillor", label=ward_name, division_id=f"{self.division_id}/ward:{ward_number}" ) yield organization diff --git a/ca_on_thunder_bay/people.py b/ca_on_thunder_bay/people.py index f0123518..0c09dcac 100644 --- a/ca_on_thunder_bay/people.py +++ b/ca_on_thunder_bay/people.py @@ -1,7 +1,7 @@ import requests +from utils import DEFAULT_USER_AGENT, CanadianScraper from utils import CanadianPerson as Person -from utils import CanadianScraper COUNCIL_PAGE = "https://www.thunderbay.ca/en/city-hall/mayor-and-council-profiles.aspx" @@ -29,7 +29,7 @@ def scrape(self): ].text_content() if "At Large" in district: role = "Councillor at Large" - district = "Thunder Bay (seat {})".format(seat_number) + district = f"Thunder Bay (seat {seat_number})" seat_number += 1 elif "Mayor" in district: district = "Thunder Bay" @@ -43,6 +43,6 @@ def scrape(self): yield p - def lxmlize(self, url, encoding=None, user_agent=requests.utils.default_user_agent(), cookies=None, xml=False): + def lxmlize(self, url, encoding=None, *, user_agent=DEFAULT_USER_AGENT, cookies=None, xml=False): requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS += ":HIGH:!DH:!aNULL" # site uses a weak DH key return super().lxmlize(url, encoding, user_agent, cookies, xml) diff --git a/ca_on_toronto/people.py b/ca_on_toronto/people.py index 010a9867..02661961 100644 --- a/ca_on_toronto/people.py +++ b/ca_on_toronto/people.py @@ -12,3 +12,6 @@ class TorontoPersonScraper(CSVScraper): "councillor_ mckelvie@toronto.ca": "councillor_mckelvie@toronto.ca", }, } + + def is_valid_row(self, row): + return 
row["first name"] != "None" and row["last name"] != "None" diff --git a/ca_on_uxbridge/__init__.py b/ca_on_uxbridge/__init__.py index f429091e..f0144483 100644 --- a/ca_on_uxbridge/__init__.py +++ b/ca_on_uxbridge/__init__.py @@ -16,6 +16,6 @@ def get_organizations(self): organization.add_post(role="Mayor", label=self.division_name, division_id=self.division_id) organization.add_post(role="Regional Councillor", label=self.division_name, division_id=self.division_id) for ward_number in range(1, 6): - organization.add_post(role="Councillor", label="Ward {}".format(ward_number), division_id=self.division_id) + organization.add_post(role="Councillor", label=f"Ward {ward_number}", division_id=self.division_id) yield organization diff --git a/ca_on_vaughan/__init__.py b/ca_on_vaughan/__init__.py index fc17bf88..77cf3f69 100644 --- a/ca_on_vaughan/__init__.py +++ b/ca_on_vaughan/__init__.py @@ -18,14 +18,14 @@ def get_organizations(self): for seat_number in range(1, 5): organization.add_post( role="Regional Councillor", - label="{} (seat {})".format(self.division_name, seat_number), + label=f"{self.division_name} (seat {seat_number})", division_id=self.division_id, ) for ward_number in range(1, 6): organization.add_post( role="Councillor", - label="Ward {}".format(ward_number), - division_id="{}/ward:{}".format(self.division_id, ward_number), + label=f"Ward {ward_number}", + division_id=f"{self.division_id}/ward:{ward_number}", ) yield organization diff --git a/ca_on_vaughan/people.py b/ca_on_vaughan/people.py index 9d25f274..75adb737 100644 --- a/ca_on_vaughan/people.py +++ b/ca_on_vaughan/people.py @@ -22,7 +22,7 @@ def scrape(self): district, name = title.split("Councillor") if "Regional" in district: role = "Regional Councillor" - district = "Vaughan (seat {})".format(regional_councillor_seat_number) + district = f"Vaughan (seat {regional_councillor_seat_number})" regional_councillor_seat_number += 1 elif "Ward" in district: role = "Councillor" diff --git 
a/ca_on_waterloo_region/__init__.py b/ca_on_waterloo_region/__init__.py index 0c5ad0a4..fca132eb 100644 --- a/ca_on_waterloo_region/__init__.py +++ b/ca_on_waterloo_region/__init__.py @@ -29,19 +29,19 @@ def get_organizations(self): for seat_number in range(1, 4): organization.add_post( role="Regional Councillor", - label="Cambridge (seat {})".format(seat_number), + label=f"Cambridge (seat {seat_number})", division_id="ocd-division/country:ca/csd:3530010", ) for seat_number in range(1, 6): organization.add_post( role="Regional Councillor", - label="Kitchener (seat {})".format(seat_number), + label=f"Kitchener (seat {seat_number})", division_id="ocd-division/country:ca/csd:3530013", ) for seat_number in range(1, 4): organization.add_post( role="Regional Councillor", - label="Waterloo (seat {})".format(seat_number), + label=f"Waterloo (seat {seat_number})", division_id="ocd-division/country:ca/csd:3530016", ) diff --git a/ca_on_waterloo_region/people.py b/ca_on_waterloo_region/people.py index ca0cdeb3..639006da 100644 --- a/ca_on_waterloo_region/people.py +++ b/ca_on_waterloo_region/people.py @@ -20,7 +20,7 @@ def scrape(self): area = re.sub(r"(?:City|Region|Township) of ", "", area) councillors = municipality.xpath("./following-sibling::tr[1]//a[not(@target)]") - assert len(councillors), "No councillors found for {}".format(area) + assert len(councillors), f"No councillors found for {area}" for councillor in councillors: name = councillor.text_content() @@ -29,7 +29,7 @@ def scrape(self): if re.search("Waterloo|Cambridge|Kitchener", area): seat_numbers[area] += 1 - district = "{} (seat {})".format(area, seat_numbers[area]) + district = f"{area} (seat {seat_numbers[area]})" else: district = area if "Regional Council" in area: diff --git a/ca_on_welland/__init__.py b/ca_on_welland/__init__.py index 5b8df234..5d388c54 100644 --- a/ca_on_welland/__init__.py +++ b/ca_on_welland/__init__.py @@ -18,8 +18,8 @@ def get_organizations(self): for seat_number in range(1, 3): 
organization.add_post( role="Councillor", - label="Ward {} (seat {})".format(ward_number, seat_number), - division_id="{}/ward:{}".format(self.division_id, ward_number), + label=f"Ward {ward_number} (seat {seat_number})", + division_id=f"{self.division_id}/ward:{ward_number}", ) yield organization diff --git a/ca_on_wellesley/people.py b/ca_on_wellesley/people.py index 8d189e6a..1514047b 100644 --- a/ca_on_wellesley/people.py +++ b/ca_on_wellesley/people.py @@ -15,10 +15,10 @@ def scrape(self): page = self.lxmlize(COUNCIL_PAGE) members = [ el - for el in page.xpath('//div[@id="printAreaContent"]//td') + for el in page.xpath('//div//td[@data-name="accChild"]') if el.text_content().strip().lower().split()[0] in ["mayor", "councillor"] - ][1:] - assert len(members) == 5 + ] + assert len(members), "No councillors found" for member in members: position = member.text_content().split()[0] @@ -26,12 +26,12 @@ def scrape(self): name = srch.group(1).strip() district = srch.group(2).strip() phone = self.get_phone(member) - if position == "Mayor": - district = "Wellesley" - else: - district = post_number(district) + email = self.get_email(member, error=False) + district = "Wellesley" if position == "Mayor" else post_number(district) p = Person(primary_org="legislature", name=name, district=district, role=position) p.add_contact("voice", phone, "legislature") + if email: + p.add_contact("email", email) p.add_source(COUNCIL_PAGE) yield p diff --git a/ca_on_whitby/__init__.py b/ca_on_whitby/__init__.py index dc57319f..7a3d88b7 100644 --- a/ca_on_whitby/__init__.py +++ b/ca_on_whitby/__init__.py @@ -17,14 +17,14 @@ def get_organizations(self): for seat_number in range(1, 5): organization.add_post( role="Regional Councillor", - label="{} (seat {})".format(self.division_name, seat_number), + label=f"{self.division_name} (seat {seat_number})", division_id=self.division_id, ) for ward_number, ward_name in enumerate(("North", "West", "Centre", "East"), 1): organization.add_post( 
role="Councillor", - label="{} Ward".format(ward_name), - division_id="{}/ward:{}".format(self.division_id, ward_number), + label=f"{ward_name} Ward", + division_id=f"{self.division_id}/ward:{ward_number}", ) yield organization diff --git a/ca_on_whitby/people.py b/ca_on_whitby/people.py index 24f08ab5..b0c5dd81 100644 --- a/ca_on_whitby/people.py +++ b/ca_on_whitby/people.py @@ -22,7 +22,7 @@ def scrape(self): else: name, role = name.split(", ") if role == "Regional Councillor": - district = "Whitby (seat {})".format(regional_councillor_seat_number) + district = f"Whitby (seat {regional_councillor_seat_number})" regional_councillor_seat_number += 1 else: district = role.split(" – ")[1] diff --git a/ca_on_whitchurch_stouffville/__init__.py b/ca_on_whitchurch_stouffville/__init__.py index 1e4c6e35..b48db3ee 100644 --- a/ca_on_whitchurch_stouffville/__init__.py +++ b/ca_on_whitchurch_stouffville/__init__.py @@ -19,7 +19,7 @@ def get_organizations(self): # organization.add_post(role='Councillor', label='Ward {}'.format(ward_number), division_id=self.division_id) organization.add_post( role="Councillor", - label="{} (seat {})".format(self.division_name, ward_number), + label=f"{self.division_name} (seat {ward_number})", division_id=self.division_id, ) diff --git a/ca_on_wilmot/people.py b/ca_on_wilmot/people.py index 8c5898b6..d9676cfe 100644 --- a/ca_on_wilmot/people.py +++ b/ca_on_wilmot/people.py @@ -1,51 +1,27 @@ -import re - from utils import CanadianPerson as Person from utils import CanadianScraper -COUNCIL_PAGE = "http://www.wilmot.ca/current-council.php" +COUNCIL_PAGE = "https://www.wilmot.ca/Modules/contact/search.aspx?s=EFHOVXSi8AOIMKMStZMNvAeQuAleQuAl" class WilmotPersonScraper(CanadianScraper): def scrape(self): page = self.lxmlize(COUNCIL_PAGE) - councillors = page.xpath('//table[@id="Main Content"]//td[@colspan="3"]//td/p/b') + councillors = page.xpath('//table[@class="contactList"]//tr') assert len(councillors), "No councillors found" for councillor in 
councillors: - district, name = councillor.xpath("./text()")[0].split(":") - if "Mayor" in district: + name, role_district = councillor.xpath(".//button/text()")[0].split(" - ", 1) + if "Mayor" in role_district: yield scrape_mayor(councillor, name) continue + role, district = role_district.split(" - ") - p = Person(primary_org="legislature", name=name, district=district, role="Councillor") + p = Person(primary_org="legislature", name=name, district=district, role=role) p.add_source(COUNCIL_PAGE) - base_info = councillor.xpath("./parent::p/text()") - for info in councillor.xpath("./parent::p/following-sibling::p"): - if info.xpath(".//b"): - break - base_info = base_info + info.xpath("./text()") - - address = "" - complete = False - while not complete: - address = address + " " + base_info.pop(0) - if re.search(r"[A-Z][0-9A-Z][A-Z] \d[A-Z]\d", address): - complete = True - p.add_contact("address", address, "legislature") - - base_info.pop(-1) - base_info = " ".join(base_info).split() - for i, contact in enumerate(base_info): - if re.match(r"[0-9]", contact): - continue - if "fax" in contact: - p.add_contact("fax", base_info[i + 1], "legislature") - else: - p.add_contact(contact, base_info[i + 1], contact) - email = self.get_email(councillor, "./parent::p/following-sibling::p") - p.add_contact("email", email) + phone = self.get_phone(councillor).replace("/", "") + p.add_contact("voice", phone, "legislature") yield p @@ -53,14 +29,11 @@ def scrape_mayor(div, name): p = Person(primary_org="legislature", name=name, district="Wilmot", role="Mayor") p.add_source(COUNCIL_PAGE) - info = div.xpath("./parent::p//text()") - info.pop(0) - address = " ".join(info[:3]) - phone = info[3].split()[1] - fax = info[4].split()[1] - email = info[-1] + address = div.xpath('.//div[@class="contactListAddress"]')[0].text_content() + phone = div.xpath('.//div[@class="contactListMainNumber"]/a/text()')[0] + other_phone = div.xpath('.//div[@class="contactListPhNumber"]/a/text()')[0] 
p.add_contact("address", address, "legislature") p.add_contact("voice", phone, "legislature") - p.add_contact("fax", fax, "legislature") - p.add_contact("email", email) + p.add_contact("voice", other_phone, "office") + return p diff --git a/ca_on_windsor/people.py b/ca_on_windsor/people.py index 62b21688..7ee6649c 100644 --- a/ca_on_windsor/people.py +++ b/ca_on_windsor/people.py @@ -1,14 +1,27 @@ +import json + from utils import CanadianPerson as Person from utils import CanadianScraper -COUNCIL_PAGE = "https://www.citywindsor.ca/mayor-and-council/city-councillors" -MAYOR_PAGE = "https://www.citywindsor.ca/mayor-and-council/mayor-drew-dilkens" +COUNCIL_PAGE = "https://www.citywindsor.ca/mayor-and-council" class WindsorPersonScraper(CanadianScraper): def scrape(self): page = self.lxmlize(COUNCIL_PAGE) - + data_url = page.xpath('//comment()[contains(., "SITE JS")]/following-sibling::script/@src')[0] + data = json.loads(self.get(data_url).text.split(" = ")[1]) + nav_items = [] + for item in data: + if item["RollupType"] == "SidebarNavigation": + nav_items = item["RollupFields"] + for item in nav_items: + if item["Title"].startswith("Mayor") and item["Parent"] == "Mayor and City Council": + mayor_url = "https://www.citywindsor.ca" + item["RelativeURL"] + if "Councillors" in item["Title"]: + councillors_url = "https://www.citywindsor.ca" + item["RelativeURL"] + + page = self.lxmlize(councillors_url, user_agent="Mozilla/5.0") councillors = page.xpath("//h2") assert len(councillors), "No councillors found" for councillor in councillors: @@ -28,12 +41,12 @@ def scrape(self): yield p - page = self.lxmlize(MAYOR_PAGE) + page = self.lxmlize(mayor_url) title = page.xpath("//h1")[0].text_content() name = title.replace("Mayor ", "") image = page.xpath('//img[contains(./@alt, "Mayor")]/@src')[0] p = Person(primary_org="legislature", name=name, district="Windsor", role="Mayor", image=image) - p.add_source(MAYOR_PAGE) + p.add_source(mayor_url) yield p diff --git 
a/ca_on_woolwich/__init__.py b/ca_on_woolwich/__init__.py index 8a4ba24f..e072a68f 100644 --- a/ca_on_woolwich/__init__.py +++ b/ca_on_woolwich/__init__.py @@ -1,3 +1,5 @@ +from pupa.scrape import Organization + from utils import CanadianJurisdiction @@ -7,3 +9,17 @@ class Woolwich(CanadianJurisdiction): division_name = "Woolwich" name = "Woolwich Township Council" url = "http://www.woolwich.ca" + + def get_organizations(self): + organization = Organization(self.name, classification=self.classification) + + organization.add_post(role="Mayor", label=self.division_name, division_id=self.division_id) + for ward_number, stop in enumerate((3, 2, 3), 1): + for seat_number in range(1, stop): + organization.add_post( + role="Councillor", + label=f"Ward {ward_number} (seat {seat_number})", + division_id=f"{self.division_id}/ward:{ward_number}", + ) + + yield organization diff --git a/ca_on_woolwich/people.py b/ca_on_woolwich/people.py index 66992769..e1baf6d5 100644 --- a/ca_on_woolwich/people.py +++ b/ca_on_woolwich/people.py @@ -1,4 +1,5 @@ import re +from collections import defaultdict from utils import CanadianPerson as Person from utils import CanadianScraper @@ -8,29 +9,32 @@ class WoolwichPersonScraper(CanadianScraper): def scrape(self): + seat_numbers = defaultdict(int) page = self.lxmlize(COUNCIL_PAGE) - councillors = page.xpath('//div[@id="printArea"]//strong') + councillors = page.xpath('//td[@data-name="accParent"]/h2') assert len(councillors), "No councillors found" for councillor in councillors: - info = councillor.xpath("./parent::p/text()") - if not info: - info = councillor.xpath("./parent::div/text()") - info = [x for x in info if x.strip()] - district = re.sub(r"(?<=Ward \d).+", "", info.pop(0)) - if "Mayor" in district: + role, name = re.split(r"\s", councillor.text_content(), maxsplit=1) + area = re.search(r"Ward \d", name) + if not area: district = "Woolwich" - role = "Mayor" else: - district = district.replace("Councillor", "").strip() - role = 
"Councillor" + seat_numbers[area] += 1 + district = area.group(0) + f" (seat {seat_numbers[area]})" + if "(" in name: + name = name.split(" (")[0] + info = councillor.xpath("./ancestor::tr[1]/following-sibling::tr")[0].text_content() + office = re.search(r"(?<=Office: )\d{3}-\d{3}-\d{4}", info).group(0) + voice = ( + re.search(r"(?<=Toll Free: )(1-)?\d{3}-\d{3}-\d{4}( extension \d{4})?", info) + .group(0) + .replace("extension", "x") + ) - p = Person(primary_org="legislature", name=councillor.text_content(), district=district, role=role) + p = Person(primary_org="legislature", name=name, district=district, role=role) p.add_source(COUNCIL_PAGE) - p.image = councillor.xpath("./img/@src")[0] + p.add_contact("voice", office, "office") + p.add_contact("voice", voice, "legislature") - for contact in info: - note, num = contact.split(":") - num = num.strip().replace("(", "").replace(") ", "-").replace("extension ", "x") - p.add_contact(note, num, note) yield p diff --git a/ca_pe_charlottetown/people.py b/ca_pe_charlottetown/people.py index 4bdc4c5d..ba216571 100644 --- a/ca_pe_charlottetown/people.py +++ b/ca_pe_charlottetown/people.py @@ -8,15 +8,6 @@ class CharlottetownPersonScraper(CanadianScraper): def scrape(self): - def decode_email(e): - de = "" - k = int(e[:2], 16) - - for i in range(2, len(e) - 1, 2): - de += chr(int(e[i : i + 2], 16) ^ k) - - return de - page = self.lxmlize(COUNCIL_PAGE, user_agent="Mozilla/5.0") nodes = page.xpath('//div[@id="ctl00_ContentPlaceHolder1_ctl13_divContent"]/*') @@ -52,22 +43,13 @@ def decode_email(e): p.image = image - for node in group: - email_node = node.xpath("//a[span/@data-cfemail]") - if email_node: - email = email_node[0].xpath("./@href")[0].split("#")[1] - break - - decoded_email = decode_email(email).split("?")[0] - p.add_contact("email", decoded_email) + email = self.get_email(para) + p.add_contact("email", email) for text in para.xpath('.//strong[contains(., "Phone")]/following-sibling::text()'): if re.search(r"\d", 
text): match = re.search(r"(.+) \((.+)\)", text) - if match.group(2) == "Fax": - contact_type = "fax" - else: - contact_type = "voice" + contact_type = "fax" if match.group(2) == "Fax" else "voice" p.add_contact(contact_type, match.group(1), match.group(2)) yield p diff --git a/ca_pe_stratford/people.py b/ca_pe_stratford/people.py index 253b8d0a..158caf15 100644 --- a/ca_pe_stratford/people.py +++ b/ca_pe_stratford/people.py @@ -1,66 +1,42 @@ import re from collections import defaultdict -from utils import CUSTOM_USER_AGENT from utils import CanadianPerson as Person from utils import CanadianScraper -COUNCIL_PAGE = "http://www.townofstratford.ca/town-hall/government/town-council/" +COUNCIL_PAGE = "https://www.townofstratford.ca/government/about_our_government/mayor_council" class StratfordPersonScraper(CanadianScraper): def scrape(self): seat_numbers = defaultdict(int) - page = self.lxmlize(COUNCIL_PAGE, user_agent=CUSTOM_USER_AGENT) + page = self.lxmlize(COUNCIL_PAGE, user_agent="Mozilla/5.0") - yield self.scrape_mayor(page) - - councillors = page.xpath( - '//div[@id="street-container"]//strong[contains(text(), "Councillor")]/parent::p|//div[@id="street-container"]//b[contains(text(), "Councillor")]/parent::p' - ) + councillors = page.xpath("//tr") assert len(councillors), "No councillors found" for councillor in councillors: - name = councillor.xpath("./strong/text()|./b/text()")[0].replace("Councillor", "").strip() - post = re.findall(r"(?<=Ward \d, ).*", councillor.text_content())[0].strip() - - seat_numbers[post] += 1 - post = "{} (seat {})".format(post, seat_numbers[post]) - - p = Person(primary_org="legislature", name=name, district=post, role="Councillor") + name = councillor.xpath(".//strong/text()")[0] + if re.search(r"(?).+(?=<)", child["children"]["fr"]).group(0) + if child["parent"] == parent_id and "Conseill" not in text: + name = text.replace(" ", "") + elif not phone: + phone_pattern = re.search(r"\d{3} \d{3}-\d{4}(, poste \d{4})?", text) + if 
phone_pattern: + phone = phone_pattern.group(0) - index = [i for i, link in enumerate(emails) if name in link.text_content().replace("\u2019", "'")][0] - email = emails[index + 1] - p.add_contact("email", re.match("mailto:(.+@brossard.ca)", email.attrib["href"]).group(1)) - phone = email.xpath('./preceding-sibling::text()[contains(., "450")]') - phone = phone[-1] + p = Person(primary_org="legislature", name=name, district=district, role="Conseiller", image=photo) + p.add_contact("email", email) p.add_contact("voice", phone, "legislature") + p.add_source(COUNCIL_PAGE) yield p + + for element in elements.values(): + if ( + isinstance(element.get("children"), dict) + and re.search(r"MAIRE", element.get("children").get("fr")) + and not element.get("children").get("en") + ): + mayor = element + parent_id = mayor["parent"] + children = get_children(parent_id, elements) + phone = None + for id in children: + child = elements[id] + if child["tag"] == "Image": + photo = "https://www.brossard.ca/in/rest/public/AttachmentThumb?id=" + child["children"]["fr"] + elif child["tag"] == "TextBox": + if not isinstance(child["children"], dict) or "MAIRE" in child["children"]["fr"]: + continue + text = re.search(r"(?<=>).+(?=<)", child["children"]["fr"]).group(0) + if child["parent"] == parent_id: + name = text.replace(" ", "") + elif not phone: + phone_pattern = re.search(r"\d{3} \d{3}-\d{4}(, poste \d{4})?", text) + if phone_pattern: + phone = phone_pattern.group(0) + p = Person(primary_org="legislature", name=name, district="Brossard", role="Maire", image=photo) + p.add_contact("voice", phone, "legislature") + p.add_source(COUNCIL_PAGE) + yield p diff --git a/ca_qc_cote_saint_luc/people.py b/ca_qc_cote_saint_luc/people.py index 011b005e..9670ea1c 100644 --- a/ca_qc_cote_saint_luc/people.py +++ b/ca_qc_cote_saint_luc/people.py @@ -1,21 +1,11 @@ -from utils import CUSTOM_USER_AGENT +from utils import CUSTOM_USER_AGENT, CanadianScraper from utils import CanadianPerson as Person -from 
utils import CanadianScraper COUNCIL_PAGE = "https://cotesaintluc.org/fr/affaires-municipales/membres-du-conseil/" class CoteSaintLucPersonScraper(CanadianScraper): def scrape(self): - def decode_email(e): - de = "" - k = int(e[:2], 16) - - for i in range(2, len(e) - 1, 2): - de += chr(int(e[i : i + 2], 16) ^ k) - - return de - page = self.lxmlize(COUNCIL_PAGE, user_agent=CUSTOM_USER_AGENT) councillors = page.xpath('//div/div[contains(@class, "gb-container gb-container-") and .//img]') assert len(councillors), "No councillors found" @@ -39,13 +29,11 @@ def decode_email(e): blog = councillor.xpath( './/p[contains(.,"Blog")]//@href[not(contains(., "twitter") or contains(., "facebook"))]' ) - encrypted_email = councillor.xpath('.//@href[contains(., "email")]')[0].split("#")[1] - email = decode_email(encrypted_email) p = Person(primary_org="legislature", name=name, district=district, role=role) p.add_source(COUNCIL_PAGE) - p.add_contact("email", email) + p.add_contact("email", self.get_email(councillor)) p.add_contact("voice", self.get_phone(councillor, area_codes=[514]), "legislature") p.image = image if twitter: diff --git a/ca_qc_gatineau/people.py b/ca_qc_gatineau/people.py index 34a9c91e..60522652 100644 --- a/ca_qc_gatineau/people.py +++ b/ca_qc_gatineau/people.py @@ -4,7 +4,6 @@ from utils import CanadianScraper COUNCIL_PAGE = "http://www.gatineau.ca/portail/default.aspx?p=guichet_municipal%2fconseil_municipal" -MAYOR_CONTACT_PAGE = "http://www.gatineau.ca/portail/default.aspx?p=la_ville/conseil_municipal/maire" class GatineauPersonScraper(CanadianScraper): @@ -12,32 +11,34 @@ def scrape(self): page = self.lxmlize(COUNCIL_PAGE) # it's all javascript rendered on the client... wow. 
- js = page.xpath('string(//div[@id="contenu-principal-centre-contenu-index"]/script[2])') # allow string() - districts = re.findall(r'arrayDistricts\[a.+"(.+)"', js) - names = re.findall(r'arrayMembres\[a.+"(.+)"', js) - urls = re.findall(r'arrayLiens\[a.+"(.+)"', js) - # first item in list is mayor - p = Person(primary_org="legislature", name=names[0], district="Gatineau", role="Maire") - p.add_source(COUNCIL_PAGE) - p.add_source(MAYOR_CONTACT_PAGE) - email = "maire@gatineau.ca" # hardcoded - p.add_contact("email", email) - yield p + js = page.xpath('string(//div[@id="contenu-principal-centre-contenu-index"]/script[1])') # allow string() + roles = re.findall(r'arrayMembres\[.+?"(.+?)"', js) + districts = re.findall(r'arrayMembres\[.+?, "(.*?)"', js) + names = re.findall(r'arrayMembres\[.+?,.+?, "(.*?)"', js) + urls = re.findall(r'arrayMembres\[.+"(.*?)",', js) - councillors = list(zip(districts, names, urls))[1:] + councillors = list(zip(roles, districts, names, urls)) assert len(councillors), "No councillors found" - for raw_district, name, url in councillors: - if name == "Vacant": + for role, raw_district, name, url in councillors: + if name == "Vacant" or "(de " in role: continue - profile_url = COUNCIL_PAGE + "/" + url.split("/")[-1] profile_page = self.lxmlize(profile_url) - photo_url = profile_page.xpath('//div[@class="colonnes-2"]//img/@src')[0] - district = "District " + re.search(r"\d+", raw_district).group(0) - email = self.get_email(profile_page) - p = Person(primary_org="legislature", name=name, district=district, role="Conseiller") + photo_url = profile_page.xpath('//div[@class="colonnes-3"]//img/@src')[0] + if raw_district: + district = "District " + re.search(r"\d+", raw_district).group(0) + role = "Conseiller" + else: + district = "Gatineau" + role = "Maire" + email = self.get_email(profile_page, error=False) + phone = self.get_phone(profile_page, error=False) + p = Person(primary_org="legislature", name=name, district=district, role=role) 
p.add_source(COUNCIL_PAGE) p.add_source(profile_url) p.image = photo_url - p.add_contact("email", email) + if email: + p.add_contact("email", email) + if phone: + p.add_contact("voice", phone, "legislature") yield p diff --git a/ca_qc_kirkland/people.py b/ca_qc_kirkland/people.py index 3425e57c..3f0bab4b 100644 --- a/ca_qc_kirkland/people.py +++ b/ca_qc_kirkland/people.py @@ -8,15 +8,6 @@ class KirklandPersonScraper(CanadianScraper): def scrape(self): - def decode_email(e): - de = "" - k = int(e[:2], 16) - - for i in range(2, len(e) - 1, 2): - de += chr(int(e[i : i + 2], 16) ^ k) - - return de - page = self.lxmlize(COUNCIL_PAGE) councillors = page.xpath('//div[@class="container_content"]//tbody/tr') @@ -39,8 +30,7 @@ def decode_email(e): .replace(".", ",") # correcting a typo .replace(",-#-", " x") ) - encrypted_email = councillor.xpath('.//@href[contains(., "email")]')[0].split("#")[1] - email = decode_email(encrypted_email) + email = self.get_email(councillor) p = Person(primary_org="legislature", name=name, district=district, role=role) p.add_source(COUNCIL_PAGE) diff --git a/ca_qc_longueuil/__init__.py b/ca_qc_longueuil/__init__.py index c0c6b143..2a9425d9 100644 --- a/ca_qc_longueuil/__init__.py +++ b/ca_qc_longueuil/__init__.py @@ -21,7 +21,7 @@ def get_organizations(self): for seat_number in range(1, 4): organization.add_post( role="Conseiller", - label="{} (siège {})".format(division.name, seat_number), + label=f"{division.name} (siège {seat_number})", division_id=division.id, ) else: diff --git a/ca_qc_longueuil/people.py b/ca_qc_longueuil/people.py index 44d0c031..7227dbe1 100644 --- a/ca_qc_longueuil/people.py +++ b/ca_qc_longueuil/people.py @@ -20,7 +20,7 @@ def scrape(self): district = tr.xpath('.//p[contains(./strong, "District")]/a/text()')[0] if "Greenfield Park" in district: - district = "Greenfield Park (siège {})".format(seat_number) + district = f"Greenfield Park (siège {seat_number})" seat_number += 1 district = { @@ -46,7 +46,7 @@ def 
scrape(self): def scrape_mayor(self): page = self.lxmlize(MAYOR_PAGE) name = page.xpath("//h1[not(@class)]/text()")[0] - img = page.xpath('//img[contains(./@alt, "{}")]/@src'.format(name))[0] + img = page.xpath(f'//img[contains(./@alt, "{name}")]/@src')[0] p = Person(primary_org="legislature", name=name, district="Longueuil", role="Maire") p.add_source(COUNCIL_PAGE) p.add_source(MAYOR_PAGE) diff --git a/ca_qc_mercier/people.py b/ca_qc_mercier/people.py index e66dd79a..1d879f34 100644 --- a/ca_qc_mercier/people.py +++ b/ca_qc_mercier/people.py @@ -1,31 +1,40 @@ -import re - -from utils import CUSTOM_USER_AGENT from utils import CanadianPerson as Person from utils import CanadianScraper -COUNCIL_PAGE = "http://www.ville.mercier.qc.ca/02_viedemocratique/default.asp" +COUNCIL_PAGE = "https://www.ville.mercier.qc.ca/affaires-municipales/conseil-municipal/membres-du-conseil/" class MercierPersonScraper(CanadianScraper): def scrape(self): - page = self.lxmlize(COUNCIL_PAGE, user_agent=CUSTOM_USER_AGENT, encoding="windows-1252") + page = self.lxmlize(COUNCIL_PAGE) - councillors = page.xpath('//table[@width="800"]/tr') + councillors = page.xpath('//div[@class="wp-block-team-member"]') assert len(councillors), "No councillors found" for councillor in councillors: - if councillor == councillors[0]: - name = councillor.xpath(".//strong/text()")[0].replace("Monsieur", "").replace("Madame", "").strip() - role = "Maire" - district = "Mercier" - else: - name = councillor.xpath(".//strong/text()")[0].replace("Monsieur", "").replace("Madame", "").strip() - role = "Conseiller" - district = "District {}".format(re.search(r"(\d)", councillor.xpath(".//text()")[3]).group(1)) + name = councillor.xpath(".//h4/text()")[0] + district = councillor.xpath(".//h5/text()")[0].split(" – ")[1] email = self.get_email(councillor) + phone = self.get_phone(councillor) + image = councillor.xpath(".//img/@src")[0] - p = Person(primary_org="legislature", name=name, district=district, role=role) + p = 
Person(primary_org="legislature", name=name, district=district, role="Conseiller", image=image) p.add_source(COUNCIL_PAGE) p.add_contact("email", email) + p.add_contact("voice", phone, "legislature") + yield p + + mayor_node = page.xpath('//div[@class="wp-block-media-text alignwide is-stacked-on-mobile"]')[0] + name = mayor_node.xpath(".//h1")[0].text_content() + + email = self.get_email(mayor_node) + phone = self.get_phone(mayor_node) + image = mayor_node.xpath(".//img/@src")[0] + + p = Person(primary_org="legislature", name=name, district="Mercier", role="Maire", image=image) + p.add_source(COUNCIL_PAGE) + p.add_contact("email", email) + p.add_contact("voice", phone, "legislature") + + yield p diff --git a/ca_qc_montreal/__init__.py b/ca_qc_montreal/__init__.py index d881c7dd..dfa79166 100644 --- a/ca_qc_montreal/__init__.py +++ b/ca_qc_montreal/__init__.py @@ -17,7 +17,7 @@ class Montreal(CanadianJurisdiction): {"name": "Projet Montréal - Équipe Valérie Plante"}, {"name": "Vrai changement pour Montréal"}, {"name": "Équipe Anjou"}, - {"name": "Équipe Barbe Team"}, + {"name": "Équipe LaSalle Team"}, {"name": "Équipe Dauphin Lachine"}, {"name": "Équipe Denis Coderre pour Montréal"}, ] diff --git a/ca_qc_montreal/people.py b/ca_qc_montreal/people.py index 7bf5eeb1..046e95e0 100644 --- a/ca_qc_montreal/people.py +++ b/ca_qc_montreal/people.py @@ -3,15 +3,19 @@ class MontrealPersonScraper(CSVScraper): # http://donnees.ville.montreal.qc.ca/dataset/listes-des-elus-de-la-ville-de-montreal - csv_url = "http://donnees.ville.montreal.qc.ca/dataset/381d74ca-dadd-459f-95c9-db255b5f4480/resource/ce1315a3-50ee-48d0-a0f0-9bcc15f65643/download/listeelusmontreal.csv" + csv_url = "https://donnees.montreal.ca/dataset/381d74ca-dadd-459f-95c9-db255b5f4480/resource/ce1315a3-50ee-48d0-a0f0-9bcc15f65643/download/liste_elus_montreal.csv" encoding = "utf-8" locale = "fr" corrections = { "primary role": { # Normalize to masculine role descriptor. 
"Conseillère de la ville": "Conseiller de la ville", + "Conseiller(ère) de la ville": "Conseiller de la ville", + "Conseiller(ère) de la Ville": "Conseiller de la ville", "Mairesse d'arrondissement": "Maire d'arrondissement", + "Maire(sse) d'arrondissement": "Maire d'arrondissement", "Mairesse de la Ville de Montréal": "Maire de la Ville de Montréal", + "Maire(sse)": "Maire de la Ville de Montréal", "Mairesse suppl\u00e9ante d'arrondissement": "Conseiller de la ville", }, "arrondissement": { @@ -23,18 +27,23 @@ class MontrealPersonScraper(CSVScraper): "Rivière-des-Prairies - Pointe-aux-Trembles": "Rivière-des-Prairies—Pointe-aux-Trembles", "Rosemont-La Petite-Patrie": "Rosemont—La Petite-Patrie", "Villeray - Saint-Michel - Parc-Extension": "Villeray—Saint-Michel—Parc-Extension", + # Name. + "Ville de Montr\u00e9al": "Montr\u00e9al", }, "district name": { "Champlain—L'Île-des-Sœurs": "Champlain—L'Île-des-Soeurs", "De Lorimier": "DeLorimier", - "Saint-Henri-Est-Petite-Bourgogne-Pointe-Saint-Charles-Griffintown": "Saint-Henri—Petite-Bourgogne—Pointe-Saint-Charles", - "Saint-Paul-Émard-Saint-Henri-Ouest": "Saint-Paul—Émard", + "Saint-Henri-Est–Petite-\nBourgogne–Pointe-Saint-\nCharles–Griffintown": "Saint-Henri—Petite-Bourgogne—Pointe-Saint-Charles", + "Saint-Paul–Émard– \nSaint-Henri-Ouest": "Saint-Paul—Émard", # Hyphens. 
"Maisonneuve-Longue-Pointe": "Maisonneuve—Longue-Pointe", "Norman McLaren": "Norman-McLaren", + "Saint-Léonard Ouest": "Saint-Léonard-Ouest", + "Saint-Léonard Est": "Saint-Léonard-Est", }, "party name": { "Indépendante": "Indépendant", + "Ind\u00e9pendant(e)": "Indépendant", }, "gender": { "Madame": "female", @@ -58,4 +67,8 @@ def header_converter(self, s): }.get(s, s) def is_valid_row(self, row): - return row["primary role"] not in ("Conseiller d'arrondissement", "Conseillère d'arrondissement") + return row["primary role"] not in ( + "Conseiller d'arrondissement", + "Conseillère d'arrondissement", + "Conseiller(\u00e8re) d'arrondissement", + ) diff --git a/ca_qc_montreal_est/people.py b/ca_qc_montreal_est/people.py index 9dd0c650..8d7fb6d4 100644 --- a/ca_qc_montreal_est/people.py +++ b/ca_qc_montreal_est/people.py @@ -1,28 +1,26 @@ from utils import CanadianPerson as Person from utils import CanadianScraper -COUNCIL_PAGE = "http://ville.montreal-est.qc.ca/la-ville/conseil-municipal/conseils-municipaux/" +COUNCIL_PAGE = "https://ville.montreal-est.qc.ca/vie-democratique/conseil-municipal/" class MontrealEstPersonScraper(CanadianScraper): def scrape(self): page = self.lxmlize(COUNCIL_PAGE) - - councillors = page.xpath("//table") + councillors = page.xpath("//div[contains(@id, 'membres-conseil-block_')]") assert len(councillors), "No councillors found" for councillor in councillors: - name = councillor.xpath(".//h3")[0].text_content() + name, role_district = councillor.xpath(".//span[@class='h3 d-block']")[0].text_content().split(" – ", 1) - if "maire" in name: - name = name.split(" ", 2)[-1] + if "Maire" in role_district: district = "Montréal-Est" role = "Maire" else: - district = "District {}".format(councillor.xpath(".//h3")[1].text_content()[-1]) + district = f"District {role_district[-1]}" role = "Conseiller" p = Person(primary_org="legislature", name=name, district=district, role=role) - p.image = councillor.xpath(".//@src")[0] + p.image = 
councillor.xpath(".//@data-lazy-src")[0] p.add_contact("email", self.get_email(councillor)) p.add_source(COUNCIL_PAGE) yield p diff --git a/ca_qc_pointe_claire/people.py b/ca_qc_pointe_claire/people.py index c32c6d39..910d4915 100644 --- a/ca_qc_pointe_claire/people.py +++ b/ca_qc_pointe_claire/people.py @@ -23,10 +23,10 @@ def scrape(self): elif district: district = district[0].text_content().split(" – ")[0].strip() else: - assert False, "error parsing district" + raise AssertionError("error parsing district") p = Person(primary_org="legislature", name=name, district=district, role=role) - p.image = councillor.xpath(".//@src")[0] + p.image = councillor.xpath(".//@data-src")[0] p.add_contact("email", self.get_email(councillor)) p.add_contact("voice", self.get_phone(councillor, area_codes=[514]), "legislature") p.add_source(COUNCIL_PAGE) diff --git a/ca_qc_quebec/people.py b/ca_qc_quebec/people.py index 129a8f1b..653a834d 100644 --- a/ca_qc_quebec/people.py +++ b/ca_qc_quebec/people.py @@ -1,5 +1,7 @@ import re +from django.template.defaultfilters import slugify + from utils import CanadianPerson as Person from utils import CanadianScraper @@ -25,18 +27,37 @@ def scrape(self): role = "Maire" else: district = councillor.xpath('./p[@itemprop="jobTitle"]/a/text()')[0] - district = re.search(r"\ADistrict (?:de(?: la)?|du|des) ([\w —–-]+)", district, flags=re.U).group( - 1 + district = ( + re.search(r"\ADistrict (?:de(?: la)?|du|des) ([\w —–-]+)", district, flags=re.UNICODE) + .group(1) + .strip() ) role = "Conseiller" - if district == "Saules": + if district == "Saules–Les Méandres": district = "Les Saules" + elif district == "Neufch\u00e2tel\u2013Lebourgneuf": + district = "Neufchâtel-Lebourgneuf" + elif district == "Loretteville\u2013Les Ch\u00e2tels": + district = "Loretteville-Les Ch\u00e2tels" else: district = re.sub(r"–", "—", district) # n-dash, m-dash - p = Person(primary_org="legislature", name=name, district=district, role=role) - p.add_source(COUNCIL_PAGE) - 
p.image = councillor.xpath("./figure//@src")[0] - p.add_contact("voice", self.get_phone(councillor, area_codes=[418]), "legislature") - yield p + districts = [district] + + borough = None + borough_strings = councillor.xpath('.//p[@itemprop = "affiliation"]/text()') + for string in borough_strings: + borough = re.findall(r"Présidente? de l’arrondissement (.*)$", string) + if borough: + borough = borough[0].replace("des", "Les").replace("de ", "") + districts.append(borough) + + for i, district in enumerate(districts): + p = Person(primary_org="legislature", name=name, district=district, role=role) + p.add_source(COUNCIL_PAGE) + p.image = councillor.xpath("./figure//@src")[0] + p.add_contact("voice", self.get_phone(councillor, area_codes=[418]), "legislature") + if i: + p._related[0].extras["boundary_url"] = f"/boundaries/quebec-boroughs/{slugify(district)}/" + yield p diff --git a/ca_qc_saguenay/people.py b/ca_qc_saguenay/people.py index 5cf7e813..2979234c 100644 --- a/ca_qc_saguenay/people.py +++ b/ca_qc_saguenay/people.py @@ -1,3 +1,5 @@ +from django.template.defaultfilters import slugify + from utils import CanadianPerson as Person from utils import CanadianScraper @@ -10,14 +12,12 @@ class SaguenayPersonScraper(CanadianScraper): def scrape(self): mayor_page = self.lxmlize(MAYOR_PAGE) contact_page = self.lxmlize(CONTACT_PAGE) - - name = mayor_page.xpath('//span/text()[contains(., "maire")]')[0].split(", ", 1)[0] + name = mayor_page.xpath('//a[contains(., "maire")]/span/text()')[0] p = Person(primary_org="legislature", name=name, district="Saguenay", role="Maire") p.add_source(MAYOR_PAGE) p.add_source(CONTACT_PAGE) node = contact_page.xpath('//h2[contains(., "Coordonnées du cabinet")]/following-sibling::p')[1] p.add_contact("voice", self.get_phone(node, area_codes=[418]), "legislature") - p.add_contact("email", self.get_email(node)) yield p page = self.lxmlize(COUNCIL_PAGE) @@ -26,6 +26,20 @@ def scrape(self): for councillor in councillors: district = 
councillor.xpath("./h3/text()")[0].replace("#", "") name = councillor.xpath(".//p/text()")[0] + borough = None + borough_node = councillor.xpath(".//p/strong") + if borough_node: + text = borough_node[0].text_content() + if "Président" in text: + borough = text.replace("Président de l'arrondissement de ", "") + + if borough: + p = Person(primary_org="legislature", name=name, district=borough, role="Conseiller") + p.add_source(COUNCIL_PAGE) + p.add_contact("voice", self.get_phone(councillor), "legislature") + p.add_contact("email", self.get_email(councillor)) + p._related[0].extras["boundary_url"] = f"/boundaries/saguenay-boroughs/{slugify(borough)}/" + yield p p = Person(primary_org="legislature", name=name, district=district, role="Conseiller") p.add_source(COUNCIL_PAGE) diff --git a/ca_qc_saint_jerome/people.py b/ca_qc_saint_jerome/people.py index 9defa48d..23a6f19d 100644 --- a/ca_qc_saint_jerome/people.py +++ b/ca_qc_saint_jerome/people.py @@ -21,13 +21,14 @@ def scrape(self): role = "Conseiller" image = councillor.xpath('.//div[@class="portrait_single"]/img/@data-lazy-src')[0] - contact = councillor.xpath('.//div[contains(@class,"phone")]/text()')[0] + phone = self.get_phone(councillor, error=False) p = Person(primary_org="legislature", name=name, district=district, role=role) p.add_source(COUNCIL_PAGE) p.image = image - p.add_contact("voice", contact, "legislature") + if phone: + p.add_contact("voice", phone, "legislature") p.add_contact("email", self.get_email(councillor)) yield p diff --git a/ca_qc_senneville/people.py b/ca_qc_senneville/people.py index 6233f7da..2d0d3948 100644 --- a/ca_qc_senneville/people.py +++ b/ca_qc_senneville/people.py @@ -1,20 +1,23 @@ from utils import CanadianPerson as Person from utils import CanadianScraper -COUNCIL_PAGE = "http://www.villagesenneville.qc.ca/fr/7/conseil-municipal" +COUNCIL_PAGE = "https://www.senneville.ca/municipalite/vie-democratique/conseil-municipal/" class SennevillePersonScraper(CanadianScraper): def 
scrape(self): page = self.lxmlize(COUNCIL_PAGE) - councillors = page.xpath('//section[@class="block text"][./header/h2][position() > 1]') + councillors = page.xpath('//div[@class="wp-block-media-text is-stacked-on-mobile"]') assert len(councillors), "No councillors found" for councillor in councillors: - role_and_district, name = councillor.xpath(".//h2/text()")[0].split("-") - role, district = role_and_district.split(" ", 1) - if role == "Maire": + role_and_district, name = councillor.xpath(".//h2")[0].text_content().split(" – ") + if "Maire" in role_and_district: + role = "Maire" district = "Senneville" + else: + role, district = role_and_district.split(" ", 1) + email = self.get_email(councillor) p = Person(primary_org="legislature", name=name, district=district, role=role) p.add_source(COUNCIL_PAGE) diff --git a/ca_qc_sherbrooke/people.py b/ca_qc_sherbrooke/people.py index 9048e2d4..ecfa1bb3 100644 --- a/ca_qc_sherbrooke/people.py +++ b/ca_qc_sherbrooke/people.py @@ -1,44 +1,75 @@ +import json + +import lxml.html +from django.template.defaultfilters import slugify + from utils import CanadianPerson as Person from utils import CanadianScraper, clean_french_prepositions -COUNCIL_PAGE = "http://www.ville.sherbrooke.qc.ca/mairie-et-vie-democratique/conseil-municipal/elus-municipaux/" +COUNCIL_PAGE = "https://www.sherbrooke.ca/fr/vie-municipale/elues-et-elus-municipaux" class SherbrookePersonScraper(CanadianScraper): def scrape(self): - page = self.lxmlize(COUNCIL_PAGE) + districts = [] + + # The whole site is rendered with Javascript, but has part of the html documents in the scripts + def get_content(url): + page = self.lxmlize(url) + script = page.xpath(".//script[not(@type)]")[0].text_content() + data = script.split(" = ", 1)[1] + data = json.loads(data) + content = data["value"]["selected"]["content"]["fr"] + return lxml.html.fromstring(content) - councillors = page.xpath('//div[@id="c2087"]//a') + page = get_content(COUNCIL_PAGE) + councillors = 
page.xpath("//a[.//h3]") assert len(councillors), "No councillors found" for councillor in councillors: - name = councillor.text_content() - url = councillor.attrib["href"] - page = self.lxmlize(url) + districts = [] + name = councillor.xpath(".//h3")[0].text_content() + role = councillor.xpath('.//div[@class="poste"]')[0].text_content() - if "Maire" in page.xpath("//h2/text()")[0]: - district = "Sherbrooke" + if "Maire" in role: role = "Maire" + district = "Sherbrooke" else: - district = page.xpath('//div[@class="csc-default"]//a[contains(@href, "fileadmin")]/text()')[0] - district = clean_french_prepositions(district).replace("district", "").strip() role = "Conseiller" + district = councillor.xpath('.//div[@class="district"]')[0].text_content() + district = clean_french_prepositions(district).replace("District", "").strip() + if district == "Lac-Magog": + district = "Lac Magog" + + districts.append(district) + + if "président" in role: + borough = councillor.xpath('.//div[@class="bloc_bas"]/p')[0].text_content() + borough = clean_french_prepositions(borough).replace("Arrondissement", "").strip() + + if borough == "Brompton-Rock Forest-Saint-\u00c9lie-Deauville": + borough = "Brompton–Rock Forest–Saint-Élie–Deauville" # N-dashes + if borough != district: # Lennoxville + districts.append(borough) + + url = "https://www.sherbrooke.ca" + councillor.xpath("./@href")[0] + page = get_content(url) + + phone = self.get_phone(page, error=False) + email = self.get_email(page, error=False) + image = councillor.xpath(".//@src")[0] + if "https://" not in image: + image = "https://contenu.maruche.ca" + image + + for i, district in enumerate(districts): + p = Person(primary_org="legislature", name=name, district=district, role=role) + p.add_source(COUNCIL_PAGE) + p.add_source(url) + p.image = image - if district == "Lennoxville": - district = "Arrondissement 3" - - p = Person(primary_org="legislature", name=name, district=district, role=role) - p.add_source(COUNCIL_PAGE) - 
p.add_source(url) - p.image = page.xpath('//div[@class="csc-textpic-image csc-textpic-last"]//img/@src')[0] - parts = page.xpath('//li[contains(text(), "phone")]/text()')[0].split(":") - note = parts[0] - phone = parts[1] - p.add_contact(note, phone, note) - email = self.get_email(page) - if email: - p.add_contact("email", email) - if district == "Brompton": - p._related[0].extras["boundary_url"] = "/boundaries/sherbrooke-boroughs/brompton/" - elif district == "Lennoxville": - p._related[0].extras["boundary_url"] = "/boundaries/sherbrooke-boroughs/lennoxville/" - yield p + if email: + p.add_contact("email", email) + if phone: + p.add_contact("voice", phone, "legislature") + if i: + p._related[0].extras["boundary_url"] = f"/boundaries/sherbrooke-boroughs/{slugify(district)}/" + yield p diff --git a/ca_qc_terrebonne/people.py b/ca_qc_terrebonne/people.py index 023b0066..40b87c9e 100644 --- a/ca_qc_terrebonne/people.py +++ b/ca_qc_terrebonne/people.py @@ -1,34 +1,33 @@ from utils import CanadianPerson as Person from utils import CanadianScraper -COUNCIL_PAGE = "http://www.ville.terrebonne.qc.ca/fr/10/Conseil_municipal" +COUNCIL_PAGE = "https://terrebonne.ca/membres-du-conseil-municipal/" class TerrebonnePersonScraper(CanadianScraper): def scrape(self): page = self.lxmlize(COUNCIL_PAGE, "utf-8") - councillors = page.xpath('//div[contains(@class, "member-box member-box--")]') + councillors = page.xpath('//div[contains(@class, "member-card jsBlockLink")]') assert len(councillors), "No councillors found" for councillor in councillors: - name = councillor.xpath('.//div[@class="fiche__name"]/text()')[0] - phone = councillor.xpath('.//div[@class="fiche__social"]/span/text()')[0].split("T")[1] - email_mailto = councillor.xpath('.//div[@class="fiche__social"]/a[contains(@href, "mailto")]/@href') - photo_url = councillor.xpath(".//img")[0].attrib["src"] - - page = self.lxmlize(councillor.xpath('.//a[@class="member-box__calltoaction"]/@href')[0]) - district = 
page.xpath('.//div[@class="fiche__category"]/text()')[0] - - if district == "Maire": - district = "Terrebonne" + name = councillor.xpath('.//a[@class="name"]/text()')[0] + district = councillor.xpath('.//p[@class="district"]/text()')[0] + if "Maire" in district: role = "Maire" + district = "Terrebonne" else: - district = "District {}".format(district) role = "Conseiller" + district = district.split(" - ")[0] + + photo_url = councillor.xpath(".//noscript/img/@src")[0] + url = councillor.xpath(".//@href")[0] + + page = self.lxmlize(url) + email = self.get_email(page) + phone = self.get_phone(page) p = Person(primary_org="legislature", name=name, district=district, role=role, image=photo_url) p.add_source(COUNCIL_PAGE) p.add_contact("voice", phone, "legislature") - if email_mailto: - email = email_mailto[0].split("mailto:")[1] - p.add_contact("email", email) + p.add_contact("email", email) yield p diff --git a/ca_qc_trois_rivieres/people.py b/ca_qc_trois_rivieres/people.py index f0c2ea31..b80f295a 100644 --- a/ca_qc_trois_rivieres/people.py +++ b/ca_qc_trois_rivieres/people.py @@ -11,17 +11,16 @@ class TroisRivieresPersonScraper(CanadianScraper): def scrape(self): page = self.lxmlize(COUNCIL_PAGE) - members = page.xpath('//div[@class="photos_conseillers"]//figure') + members = page.xpath('//div[contains(@class, "photos_conseillers")]//figure') assert len(members), "No councillors found" for member in members: photo_url = member.xpath(".//a//img/@src")[0] url = member.xpath(".//figcaption//a/@href")[0] - email = self.lxmlize(url).xpath('//div[@class="content-page"]//a[starts-with(@href, "mailto:")]/@href')[0] + email = self.get_email(self.lxmlize(url)) - email = re.sub("^mailto:", "", email) name, district = [x.strip() for x in member.xpath(".//figcaption//text()")] - district = re.sub(r"\A(?:de|des|du) ", lambda match: match.group(0).lower(), district, flags=re.I) + district = re.sub(r"\A(?:de|des|du) ", lambda match: match.group(0).lower(), district, 
flags=re.IGNORECASE) role = "Conseiller" if "Maire" in district: diff --git a/ca_qc_westmount/people.py b/ca_qc_westmount/people.py index a047d864..9ed24012 100644 --- a/ca_qc_westmount/people.py +++ b/ca_qc_westmount/people.py @@ -21,11 +21,14 @@ def scrape(self): role = "Conseiller" district = councillor.xpath(".//li//text()")[0] + email = self.get_email(councillor, error=False) + p = Person(primary_org="legislature", name=name, district=district, role=role) p.add_source(COUNCIL_PAGE) p.image = councillor.xpath(".//@src")[0] p.add_contact("voice", self.get_phone(councillor), "legislature") - p.add_contact("email", self.get_email(councillor)) + if email: + p.add_contact("email", email) yield p diff --git a/ca_sk/people.py b/ca_sk/people.py index 6f46fe52..c3ecfc0f 100644 --- a/ca_sk/people.py +++ b/ca_sk/people.py @@ -1,3 +1,6 @@ +import contextlib +import re + from utils import CanadianPerson as Person from utils import CanadianScraper @@ -8,71 +11,61 @@ class SaskatchewanPersonScraper(CanadianScraper): def scrape(self): page = self.lxmlize(COUNCIL_PAGE) - members = page.xpath('//table[@id="MLAs"]//tr')[1:] + members = page.xpath('//table[@id="mla-table"]//tr')[1:] assert len(members), "No members found" for member in members: - if "Vacant" not in member.xpath("./td")[0].text_content(): - name = member.xpath("./td")[0].text_content().split(". ", 1)[1] - district = member.xpath("./td")[2].text_content() - url = member.xpath("./td[1]/a/@href")[0] - page = self.lxmlize(url) - party = page.xpath('//span[@id="ContentContainer_MainContent_ContentBottom_Property4"]' "/span")[ - 0 - ].text - - p = Person(primary_org="legislature", name=name, district=district, role="MLA", party=party) - p.add_source(COUNCIL_PAGE) - p.add_source(url) - try: - p.image = page.xpath('//div[contains(@class, "mla-image-cell")]/img/@src')[0] - except IndexError: - pass + if "Vacant" in member.xpath("./td")[1].text_content(): + continue + name = member.xpath("./td")[0].text_content().split(". 
", 1)[1].strip() + district = member.xpath("./td")[2].text_content().strip() + url = member.xpath("./td[1]/a/@href")[0] + page = self.lxmlize(url) + party = page.xpath('//div[contains(@class, "mla-header")]')[0].text.split(" - ")[1].strip() - contact = page.xpath('//div[@id="mla-contact"]/div[2]')[0] - website = contact.xpath("./div[3]/div[3]/div[2]/a") - if website: - p.add_link(website[0].text_content()) + p = Person(primary_org="legislature", name=name, district=district, role="MLA", party=party) + p.add_source(COUNCIL_PAGE) + p.add_source(url) + with contextlib.suppress(IndexError): + p.image = page.xpath('//div[contains(@class, "mla-image-cell")]/img/@src')[0] - def handle_address(lines, address_type): - address_lines = [] - for line in lines: - if line.endswith(":"): # Room:, Phone:, Fax: - break - address_lines.append(line) - if address_lines: - p.add_contact( - "address", - " ".join(address_lines), - address_type, - ) + def handle_address(p, lines, address_type): + address_lines = [] + for line in lines: + if re.match(r"(Room|Phone|Fax)\:", line): + break + address_lines.append(line) + if address_lines: + p.add_contact( + "address", + " ".join(address_lines), + address_type, + ) - def handle_phone(lines, phone_type): - if "Phone:" in lines: - next_line = lines[lines.index("Phone:") + 1] - if next_line.endswith(":"): - return - number = None - if "/" in next_line: - for fragment in next_line.split("/"): - if fragment.strip().startswith("306-"): - number = fragment.strip() - break - else: - number = next_line - p.add_contact("voice", number, phone_type, area_code=306) + def handle_phone(p, lines, phone_type): + matches = re.findall(r"Phone\:\s*(306-[\d\-]+)", "\n".join(lines)) + if len(matches) == 1: + p.add_contact("voice", matches[0], phone_type, area_code=306) - legislature_lines = contact.xpath('.//div[@class="col-md-4"][1]/div//text()') - assert legislature_lines[0] == "Legislative Building Address" - handle_address(legislature_lines[1:], "legislature") 
- handle_phone(legislature_lines[1:], "legislature") + for address in page.xpath('//div[@class="col-md-3"]'): + lines = address.xpath("./div//text()") + address_type = None + if lines[0] == "Legislative Building Address": + address_type = "legislature" + elif lines[0] == "Constituency Address": + address_type = "constituency" + else: + raise AssertionError(f"Unexpected address type: {lines[0]}") + handle_address(p, lines[1:], address_type) + handle_phone(p, lines[1:], address_type) - constituency_lines = contact.xpath('.//div[@class="col-md-4"][2]/div//text()') - assert constituency_lines[0] == "Constituency Address" - handle_address(constituency_lines[1:], "constituency") - handle_phone(constituency_lines[1:], "constituency") + email = self.get_email(page.xpath('//div[@id="content"]')[0], error=False) + if email: + p.add_contact("email", email) - email = self.get_email(contact, error=False) - if email: - p.add_contact("email", email) + websites = re.findall( + r"Website:\s*(http\S+)", " ".join(page.xpath('//div[@class="col-md-4"]/div//text()')) + ) + if len(websites) == 1: + p.add_link(websites[0]) - yield p + yield p diff --git a/ca_sk_regina/people.py b/ca_sk_regina/people.py index adfb1714..13565418 100644 --- a/ca_sk_regina/people.py +++ b/ca_sk_regina/people.py @@ -1,11 +1,10 @@ -import re from urllib.parse import urljoin from utils import CanadianPerson as Person from utils import CanadianScraper COUNCIL_PAGE = "https://www.regina.ca/city-government/city-council" -MAYOR_CONTACT_URL = "https://www.regina.ca/city-government/city-council/mayors-office" +MAYOR_CONTACT_URL = "https://www.regina.ca/city-government/city-council/mayors-office/contact-mayor/" class ReginaPersonScraper(CanadianScraper): @@ -26,7 +25,6 @@ def scrape(self): def councillor_data(self, url, name, ward): page = self.lxmlize(url) - # sadly, email is a form on a separate page photo_url_rel = page.xpath('//div[@class="councillor__image"]//img/@src')[0] photo_url = urljoin(url, photo_url_rel) 
@@ -34,12 +32,8 @@ def councillor_data(self, url, name, ward): m.add_source(COUNCIL_PAGE) m.add_source(url) - # Scrape and add phone. - phone_path = page.xpath('//div[@class="councillor__contact"]//ul/li/a/@href[contains(., "306")]')[0] - phone_string = phone_path.rsplit("/", 1)[-1] - phone = re.sub("[^0-9]", "", phone_string) - if phone: - m.add_contact("voice", phone, "legislature") + m.add_contact("voice", self.get_phone(page), "legislature") + m.add_contact("email", self.get_email(page)) m.image = photo_url yield m @@ -57,11 +51,8 @@ def mayor_data(self, url): m.add_source(url) m.image = photo_url - # Scrape and add phone. - phone_path = page.xpath('//div[@class="councillor__contact"]//ul/li/a/@href[contains(., "306")]')[0] - phone_string = phone_path.rsplit("/", 1)[-1] - phone = re.sub("[^0-9]", "", phone_string) - if phone: - m.add_contact("voice", phone, "legislature") + page = self.lxmlize(MAYOR_CONTACT_URL) + m.add_contact("voice", self.get_phone(page), "legislature") + m.add_contact("email", self.get_email(page)) return m diff --git a/ca_yt/__init__.py b/ca_yt/__init__.py index afb558fa..c80defd9 100644 --- a/ca_yt/__init__.py +++ b/ca_yt/__init__.py @@ -5,6 +5,6 @@ class Yukon(CanadianJurisdiction): classification = "legislature" division_id = "ocd-division/country:ca/territory:yt" division_name = "Yukon" - name = "Yukon Legislative Assembly" + name = "Legislative Assembly of Yukon" url = "https://yukonassembly.ca" parties = [{"name": "Yukon Liberal Party"}, {"name": "Yukon Party"}, {"name": "New Democratic Party"}] diff --git a/ca_yt/people.py b/ca_yt/people.py index d7f95eee..b6dd62f4 100644 --- a/ca_yt/people.py +++ b/ca_yt/people.py @@ -1,3 +1,5 @@ +import contextlib + from utils import CanadianPerson as Person from utils import CanadianScraper @@ -25,17 +27,15 @@ def scrape(self): p = Person(primary_org="legislature", name=name, district=district, role="MLA", party=party) p.add_source(COUNCIL_PAGE) p.add_source(url) - try: + with 
contextlib.suppress(IndexError): p.image = page.xpath('//article[contains(@class, "member")]/p/img/@src')[0] - except IndexError: - pass contact = page.xpath('//article[contains(@class, "members-sidebar")]')[0] website = contact.xpath("./div[3]/div[3]/div[2]/a") if website: p.add_link(website[0].text_content()) - def handle_address(lines, address_type): + def handle_address(p, lines, address_type): address_lines = [] for line in lines: if line.endswith(":"): # Room:, Phone:, Fax: @@ -48,7 +48,7 @@ def handle_address(lines, address_type): address_type, ) - def handle_phone(lines, phone_type): + def handle_phone(p, lines, phone_type): if "Phone:" in lines: next_line = lines[lines.index("Phone:") + 1] if next_line.endswith(":"): @@ -66,8 +66,8 @@ def handle_phone(lines, phone_type): address_lines = contact.xpath("//address//text()") contact_lines = contact.xpath("//p[2]//text()") assert address_lines[0].strip() == "Yukon Legislative Assembly" - handle_address(address_lines[1:], "legislature") - handle_phone(contact_lines[1:], "legislature") + handle_address(p, address_lines[1:], "legislature") + handle_phone(p, contact_lines[1:], "legislature") email = self.get_email(contact, error=False) if email: diff --git a/country-ca.csv b/country-ca.csv index 3dbb69c6..0734bb82 100644 --- a/country-ca.csv +++ b/country-ca.csv @@ -2550,10 +2550,10 @@ ocd-division/country:ca/csd:2442110/district:4,District 4,,,,,,,,,,,,,, ocd-division/country:ca/csd:2442110/district:5,District 5,,,,,,,,,,,,,, ocd-division/country:ca/csd:2442110/district:6,District 6,,,,,,,,,,,,,, ocd-division/country:ca/csd:2443027,Sherbrooke,,,,,V,Y,Sherbrooke,,Ville de Sherbrooke,,24,,, -ocd-division/country:ca/csd:2443027/borough:1,Arrondissement 1,,,,,,,,,,,,,, -ocd-division/country:ca/csd:2443027/borough:2,Arrondissement 2,,,,,,,,,,,,,, -ocd-division/country:ca/csd:2443027/borough:3,Arrondissement 3,,,,,,,,,,,,,, -ocd-division/country:ca/csd:2443027/borough:4,Arrondissement 4,,,,,,,,,,,,,, 
+ocd-division/country:ca/csd:2443027/borough:1,Brompton–Rock Forest–Saint-Élie–Deauville,,,,,,,,,,,,,, +ocd-division/country:ca/csd:2443027/borough:2,Fleurimont,,,,,,,,,,,,,, +ocd-division/country:ca/csd:2443027/borough:3,Lennoxville,,,,,,,,,,,,,, +ocd-division/country:ca/csd:2443027/borough:4,Nations,,,,,,,,,,,,,, ocd-division/country:ca/csd:2443027/district:1.1,Lac Magog,,,,,,,,,,,,,, ocd-division/country:ca/csd:2443027/district:1.2,Rock Forest,,,,,,,,,,,,,, ocd-division/country:ca/csd:2443027/district:1.3,Saint-Élie,,,,,,,,,,,,,, @@ -4148,24 +4148,25 @@ ocd-division/country:ca/csd:2480135,Duhamel,,,,,MÉ,N,Duhamel,,,,7,,, ocd-division/country:ca/csd:2480140,Val-des-Bois,,,,,MÉ,N,Val-des-Bois,,,,7,,, ocd-division/country:ca/csd:2480145,Bowman,,,,,MÉ,N,Bowman,,,,7,,, ocd-division/country:ca/csd:2481017,Gatineau,,,,,V,Y,Gatineau,,Ville de Gatineau,,19,,, -ocd-division/country:ca/csd:2481017/district:1,d'Aylmer,,,,,,,,,,,,,, -ocd-division/country:ca/csd:2481017/district:10,de Touraine,,,,,,,,,,,,,, -ocd-division/country:ca/csd:2481017/district:11,de Pointe-Gatineau,,,,,,,,,,,,,, -ocd-division/country:ca/csd:2481017/district:12,du Carrefour-de-l'Hôpital,,,,,,,,,,,,,, -ocd-division/country:ca/csd:2481017/district:13,du Versant,,,,,,,,,,,,,, -ocd-division/country:ca/csd:2481017/district:14,de Bellevue,,,,,,,,,,,,,, -ocd-division/country:ca/csd:2481017/district:15,du Lac-Beauchamp,,,,,,,,,,,,,, -ocd-division/country:ca/csd:2481017/district:16,de la Rivière-Blanche,,,,,,,,,,,,,, -ocd-division/country:ca/csd:2481017/district:17,de Masson-Angers,,,,,,,,,,,,,, -ocd-division/country:ca/csd:2481017/district:18,de Buckingham,,,,,,,,,,,,,, -ocd-division/country:ca/csd:2481017/district:2,de Lucerne,,,,,,,,,,,,,, -ocd-division/country:ca/csd:2481017/district:3,de Deschênes,,,,,,,,,,,,,, -ocd-division/country:ca/csd:2481017/district:4,du Plateau—Manoir-des-Trembles,,,,,,,,,,,,,, -ocd-division/country:ca/csd:2481017/district:5,de Wright—Parc-de-la-Montagne,,,,,,,,,,,,,, 
-ocd-division/country:ca/csd:2481017/district:6,de l'Orée-du-Parc,,,,,,,,,,,,,, -ocd-division/country:ca/csd:2481017/district:7,de Saint-Raymond—Vanier,,,,,,,,,,,,,, -ocd-division/country:ca/csd:2481017/district:8,de Hull—Val-Tétreau,,,,,,,,,,,,,, -ocd-division/country:ca/csd:2481017/district:9,de Limbour,,,,,,,,,,,,,, +ocd-division/country:ca/csd:2481017/district:1,Aylmer,,,,,,,,,,,,,, +ocd-division/country:ca/csd:2481017/district:10,Limbour,,,,,,,,,,,,,, +ocd-division/country:ca/csd:2481017/district:11,Touraine,,,,,,,,,,,,,, +ocd-division/country:ca/csd:2481017/district:12,Pointe-Gatineau,,,,,,,,,,,,,, +ocd-division/country:ca/csd:2481017/district:13,Carrefour-de-l'Hôpital,,,,,,,,,,,,,, +ocd-division/country:ca/csd:2481017/district:14,Versant,,,,,,,,,,,,,, +ocd-division/country:ca/csd:2481017/district:15,Bellevue,,,,,,,,,,,,,, +ocd-division/country:ca/csd:2481017/district:16,Lac-Beauchamp,,,,,,,,,,,,,, +ocd-division/country:ca/csd:2481017/district:17,Rivière-Blanche,,,,,,,,,,,,,, +ocd-division/country:ca/csd:2481017/district:18,Masson-Angers,,,,,,,,,,,,,, +ocd-division/country:ca/csd:2481017/district:19,Buckingham,,,,,,,,,,,,,, +ocd-division/country:ca/csd:2481017/district:2,Lucerne,,,,,,,,,,,,,, +ocd-division/country:ca/csd:2481017/district:3,Deschênes,,,,,,,,,,,,,, +ocd-division/country:ca/csd:2481017/district:4,Plateau,,,,,,,,,,,,,, +ocd-division/country:ca/csd:2481017/district:5,Mitigomijokan,,,,,,,,,,,,,, +ocd-division/country:ca/csd:2481017/district:6,Manoir-des-Trembles—Val-Tétreau,,,,,,,,,,,,,, +ocd-division/country:ca/csd:2481017/district:7,Hull–Wright,,,,,,,,,,,,,, +ocd-division/country:ca/csd:2481017/district:8,Parc-de-la-Montagne—Saint-Raymond,,,,,,,,,,,,,, +ocd-division/country:ca/csd:2481017/district:9,Orée-du-Parc,,,,,,,,,,,,,, ocd-division/country:ca/csd:2482005,L'Ange-Gardien,,,,,MÉ,Y,L'Ange-Gardien,,,,7,,, ocd-division/country:ca/csd:2482005/district:1,du Lièvre,,,,,,,,,,,,,, ocd-division/country:ca/csd:2482005/district:2,Lac 
Donaldson,,,,,,,,,,,,,, @@ -4722,7 +4723,7 @@ ocd-division/country:ca/csd:3502036,Clarence-Rockland,,,,,C,,Clarence-Rockland,, ocd-division/country:ca/csd:3502044,Casselman,,,,,VL,,Casselman,,Village of Casselman,ocd-division/country:ca/cd:3502,,,, ocd-division/country:ca/csd:3502048,Russell,,,,,TP,,Russell,,Township of Russell,ocd-division/country:ca/cd:3502,,,, ocd-division/country:ca/csd:3506008,Ottawa,,,,,CV,,Ottawa,,City of Ottawa,,,,, -ocd-division/country:ca/csd:3506008/ward:1,Orléans,,,,,,,,,,,,,, +ocd-division/country:ca/csd:3506008/ward:1,Orléans East-Cumberland,,,,,,,,,,,,,, ocd-division/country:ca/csd:3506008/ward:10,Gloucester-Southgate,,,,,,,,,,,,,, ocd-division/country:ca/csd:3506008/ward:11,Beacon Hill-Cyrville,,,,,,,,,,,,,, ocd-division/country:ca/csd:3506008/ward:12,Rideau-Vanier,,,,,,,,,,,,,, @@ -4732,13 +4733,14 @@ ocd-division/country:ca/csd:3506008/ward:15,Kitchissippi,,,,,,,,,,,,,, ocd-division/country:ca/csd:3506008/ward:16,River,,,,,,,,,,,,,, ocd-division/country:ca/csd:3506008/ward:17,Capital,,,,,,,,,,,,,, ocd-division/country:ca/csd:3506008/ward:18,Alta Vista,,,,,,,,,,,,,, -ocd-division/country:ca/csd:3506008/ward:19,Cumberland,,,,,,,,,,,,,, -ocd-division/country:ca/csd:3506008/ward:2,Innes,,,,,,,,,,,,,, +ocd-division/country:ca/csd:3506008/ward:19,Orléans South-Navan,,,,,,,,,,,,,, +ocd-division/country:ca/csd:3506008/ward:2,Orléans West-Innes,,,,,,,,,,,,,, ocd-division/country:ca/csd:3506008/ward:20,Osgoode,,,,,,,,,,,,,, -ocd-division/country:ca/csd:3506008/ward:21,Rideau-Goulbourn,,,,,,,,,,,,,, -ocd-division/country:ca/csd:3506008/ward:22,Gloucester-South Nepean,,,,,,,,,,,,,, +ocd-division/country:ca/csd:3506008/ward:21,Rideau-Jock,,,,,,,,,,,,,, +ocd-division/country:ca/csd:3506008/ward:22,Riverside South-Findlay Creek,,,,,,,,,,,,,, ocd-division/country:ca/csd:3506008/ward:23,Kanata South,,,,,,,,,,,,,, -ocd-division/country:ca/csd:3506008/ward:3,Barrhaven,,,,,,,,,,,,,, +ocd-division/country:ca/csd:3506008/ward:24,Barrhaven 
East,,,,,,,,,,,,,, +ocd-division/country:ca/csd:3506008/ward:3,Barrhaven West,,,,,,,,,,,,,, ocd-division/country:ca/csd:3506008/ward:4,Kanata North,,,,,,,,,,,,,, ocd-division/country:ca/csd:3506008/ward:5,West Carleton-March,,,,,,,,,,,,,, ocd-division/country:ca/csd:3506008/ward:6,Stittsville,,,,,,,,,,,,,, @@ -4834,13 +4836,13 @@ ocd-division/country:ca/csd:3515037,North Kawartha,,,,,TP,,North Kawartha,,Towns ocd-division/country:ca/csd:3515044,Trent Lakes,,,,,MU,,Trent Lakes,,Municipality of Trent Lakes,ocd-division/country:ca/cd:3515,,,, ocd-division/country:ca/csd:3516010,Kawartha Lakes,,,,,CY,,Kawartha Lakes,,City of Kawartha Lakes,,,,, ocd-division/country:ca/csd:3516010/ward:1,Ward 1,,,,,,,,,,,,,, -ocd-division/country:ca/csd:3516010/ward:10,Ward 10,,,,,,,,,,,,,, -ocd-division/country:ca/csd:3516010/ward:11,Ward 11,,,,,,,,,,,,,, -ocd-division/country:ca/csd:3516010/ward:12,Ward 12,,,,,,,,,,,,,, -ocd-division/country:ca/csd:3516010/ward:13,Ward 13,,,,,,,,,,,,,, -ocd-division/country:ca/csd:3516010/ward:14,Ward 14,,,,,,,,,,,,,, -ocd-division/country:ca/csd:3516010/ward:15,Ward 15,,,,,,,,,,,,,, -ocd-division/country:ca/csd:3516010/ward:16,Ward 16,,,,,,,,,,,,,, +ocd-division/country:ca/csd:3516010/ward:10,Ward 10,,2018-12-01,,,,,,,,,,,, +ocd-division/country:ca/csd:3516010/ward:11,Ward 11,,2018-12-01,,,,,,,,,,,, +ocd-division/country:ca/csd:3516010/ward:12,Ward 12,,2018-12-01,,,,,,,,,,,, +ocd-division/country:ca/csd:3516010/ward:13,Ward 13,,2018-12-01,,,,,,,,,,,, +ocd-division/country:ca/csd:3516010/ward:14,Ward 14,,2018-12-01,,,,,,,,,,,, +ocd-division/country:ca/csd:3516010/ward:15,Ward 15,,2018-12-01,,,,,,,,,,,, +ocd-division/country:ca/csd:3516010/ward:16,Ward 16,,2018-12-01,,,,,,,,,,,, ocd-division/country:ca/csd:3516010/ward:2,Ward 2,,,,,,,,,,,,,, ocd-division/country:ca/csd:3516010/ward:3,Ward 3,,,,,,,,,,,,,, ocd-division/country:ca/csd:3516010/ward:4,Ward 4,,,,,,,,,,,,,, @@ -4848,7 +4850,7 @@ ocd-division/country:ca/csd:3516010/ward:5,Ward 5,,,,,,,,,,,,,, 
ocd-division/country:ca/csd:3516010/ward:6,Ward 6,,,,,,,,,,,,,, ocd-division/country:ca/csd:3516010/ward:7,Ward 7,,,,,,,,,,,,,, ocd-division/country:ca/csd:3516010/ward:8,Ward 8,,,,,,,,,,,,,, -ocd-division/country:ca/csd:3516010/ward:9,Ward 9,,,,,,,,,,,,,, +ocd-division/country:ca/csd:3516010/ward:9,Ward 9,,2018-12-01,,,,,,,,,,,, ocd-division/country:ca/csd:3518001,Pickering,,,,,CY,,Pickering,,City of Pickering,ocd-division/country:ca/cd:3518,,,, ocd-division/country:ca/csd:3518001/ward:1,Ward 1,,,,,,,,,,,,,, ocd-division/country:ca/csd:3518001/ward:2,Ward 2,,,,,,,,,,,,,, @@ -5564,7 +5566,7 @@ ocd-division/country:ca/csd:3557061/ward:2,Ward 2,,,,,,,,,,,,,, ocd-division/country:ca/csd:3557061/ward:3,Ward 3,,,,,,,,,,,,,, ocd-division/country:ca/csd:3557061/ward:4,Ward 4,,,,,,,,,,,,,, ocd-division/country:ca/csd:3557061/ward:5,Ward 5,,,,,,,,,,,,,, -ocd-division/country:ca/csd:3557061/ward:6,Ward 6,,,,,,,,,,,,,, +ocd-division/country:ca/csd:3557061/ward:6,Ward 6,,2018-10-21,,,,,,,,,,,, ocd-division/country:ca/csd:3557066,Prince,,,,,TP,,Prince,,Township of Prince,,,,, ocd-division/country:ca/csd:3557071,Sagamok,,,,,IRI,N,Sagamok,,,,,,, ocd-division/country:ca/csd:3557072,Serpent River 7,,,,,IRI,N,Serpent River 7,,,,,,, diff --git a/disabled/ca_bc_municipalities/people.py b/disabled/ca_bc_municipalities/people.py index 6e505ebb..f4100405 100644 --- a/disabled/ca_bc_municipalities/people.py +++ b/disabled/ca_bc_municipalities/people.py @@ -92,7 +92,7 @@ def scrape(self): if division_id in exclude_divisions: continue if division_id in processed_ids: - raise Exception("unhandled collision: {}".format(division_id)) + raise Exception(f"unhandled collision: {division_id}") division = Division.get(division_id) processed_divisions.add(division_name) diff --git a/disabled/ca_bc_municipalities_candidates/people.py b/disabled/ca_bc_municipalities_candidates/people.py index 109e4aa3..dab5c42b 100644 --- a/disabled/ca_bc_municipalities_candidates/people.py +++ 
b/disabled/ca_bc_municipalities_candidates/people.py @@ -116,11 +116,8 @@ def scrape(self): role = row["primary role"] if role not in expected_roles: - raise Exception("unexpected role: {}".format(role)) - if row["district id"]: - district = format(division_id) - else: - district = division_name + raise Exception(f"unexpected role: {role}") + district = format(division_id) if row["district id"] else division_name organization.add_post(role=role, label=district, division_id=division_id) diff --git a/disabled/ca_mb_municipalities/people.py b/disabled/ca_mb_municipalities/people.py index 674b114f..87c2923a 100644 --- a/disabled/ca_mb_municipalities/people.py +++ b/disabled/ca_mb_municipalities/people.py @@ -15,10 +15,7 @@ def scrape(self): districts = page.xpath('//div[@id="ctl00_PublicContent_divSearchContent"]//tr')[5::3] for district in districts: title = district.xpath(".//td//text()") - if len(title[0]) > 1: - title = title[0] - else: - title = "".join(title[:2]) + title = title[0] if len(title[0]) > 1 else "".join(title[:2]) # @todo Need to distinguish between, e.g., R.M. 
and Town title = title.title() diff --git a/disabled/ca_municipalities/people.py b/disabled/ca_municipalities/people.py index c20c3878..e7537a4e 100644 --- a/disabled/ca_municipalities/people.py +++ b/disabled/ca_municipalities/people.py @@ -61,7 +61,7 @@ def scrape(self): if self.many_posts_per_area and role not in self.unique_roles: seat_numbers[role][district] += 1 - district = "{} (seat {})".format(district, seat_numbers[role][district]) + district = f"{district} (seat {seat_numbers[role][district]})" p = Person( primary_org=organization_classification, @@ -111,6 +111,5 @@ def scrape(self): p.validate() yield p - except Exception as e: - print(repr(e)) - continue + except Exception: + pass diff --git a/disabled/ca_nb_municipalities/people.py b/disabled/ca_nb_municipalities/people.py index c0c95580..a4f42954 100644 --- a/disabled/ca_nb_municipalities/people.py +++ b/disabled/ca_nb_municipalities/people.py @@ -52,7 +52,7 @@ def scrape(self): if division.attrs["classification"] == "P": continue if division.name in names_to_ids: - raise Exception("unhandled collision: {}".format(division.name)) + raise Exception(f"unhandled collision: {division.name}") else: names_to_ids[division.name] = division.id @@ -79,11 +79,11 @@ def scrape(self): if division_id in exclude_divisions: continue if division_id in seen: - raise Exception("unhandled collision: {}".format(division_id)) + raise Exception(f"unhandled collision: {division_id}") seen.add(division_id) division_name = Division.get(division_id).name - organization_name = "{} {} Council".format(division_name, classifications[list_link.text]) + organization_name = f"{division_name} {classifications[list_link.text]} Council" organization = Organization(name=organization_name, classification="government") organization.add_source(detail_url) @@ -104,7 +104,7 @@ def scrape(self): for p in groups: role = p.xpath("./b/text()")[0].rstrip("s") if role not in expected_roles: - raise Exception("unexpected role: {}".format(role)) + 
raise Exception(f"unexpected role: {role}") councillors = p.xpath("./text()") assert len(councillors), "No councillors found" @@ -112,10 +112,7 @@ def scrape(self): if "vacant" in name.lower(): continue - if role in unique_roles: - district = division_name - else: - district = "{} (seat {})".format(division_name, seat_number) + district = division_name if role in unique_roles else f"{division_name} (seat {seat_number})" organization.add_post(role=role, label=district, division_id=division_id) diff --git a/disabled/ca_nl_municipalities/people.py b/disabled/ca_nl_municipalities/people.py index 3fbd9c6e..499b0e73 100644 --- a/disabled/ca_nl_municipalities/people.py +++ b/disabled/ca_nl_municipalities/people.py @@ -1,7 +1,7 @@ import os import re import subprocess -from urllib.request import urlopen +import tempfile from pupa.scrape import Organization @@ -16,15 +16,14 @@ def scrape(self): page = self.lxmlize(COUNCIL_PAGE) url = page.xpath('//a[contains(text(),"Municipal Directory")]/@href')[0] - response = urlopen(url).read() - pdf = open("/tmp/nl.pdf", "w") - pdf.write(response) - pdf.close() + response = self.get(url).content + with tempfile.NamedTemporaryFile(delete_on_close=False) as pdf: + pdf.write(response) - data = subprocess.check_output(["pdftotext", "-layout", "/tmp/nl.pdf", "-"]) + data = subprocess.check_output(["pdftotext", "-layout", pdf.name, "-"]) # noqa: S603,S607 pages = data.split("Municipal Directory")[1:] for page in pages: - page = page.splitlines(True) + page = page.splitlines(keepends=True) column_index = {} for line in page: if "Official Name" in line: @@ -81,4 +80,5 @@ def scrape(self): if address: membership.add_contact_detail("address", address, "legislature") yield p - os.system("rm /tmp/nl.pdf") + + os.unlink(pdf.name) diff --git a/disabled/ca_ns_municipalities/people.py b/disabled/ca_ns_municipalities/people.py index 3533a463..2d8b9fbe 100644 --- a/disabled/ca_ns_municipalities/people.py +++ b/disabled/ca_ns_municipalities/people.py @@ 
-1,7 +1,7 @@ import os import re import subprocess -from urllib.request import urlopen +import tempfile from pupa.scrape import Organization @@ -13,16 +13,15 @@ class NovaScotiaMunicipalitiesPersonScraper(CanadianScraper): def scrape(self): - response = urlopen(COUNCIL_PAGE).read() - pdf = open("/tmp/ns.pdf", "w") - pdf.write(response) - pdf.close() + response = self.get(COUNCIL_PAGE).content + with tempfile.NamedTemporaryFile(delete_on_close=False) as pdf: + pdf.write(response) - data = subprocess.check_output(["pdftotext", "/tmp/ns.pdf", "-"]) + data = subprocess.check_output(["pdftotext", pdf.name, "-"]) # noqa: S603,S607 emails = re.findall(r"(?<=E-mail: ).+", data) data = re.split(r"Mayor |Warden ", data)[1:] for i, mayor in enumerate(data): - lines = mayor.splitlines(True) + lines = mayor.splitlines(keepends=True) name = lines.pop(0).strip() if name == "Jim Smith": continue @@ -61,9 +60,9 @@ def scrape(self): for i, email in enumerate(emails): regex = name.split()[-1].lower() + "|" + "|".join(district.split()[-2:]).replace("of", "").lower() regex = regex.replace("||", "|") - matches = re.findall(r"{}".format(regex), email) + matches = re.findall(rf"{regex}", email) if matches: membership.add_contact_detail("email", emails.pop(i)) yield p - os.system("rm /tmp/ns.pdf") + os.unlink(pdf.name) diff --git a/disabled/ca_pe_municipalities/people.py b/disabled/ca_pe_municipalities/people.py index 3eb0154f..f39591bb 100644 --- a/disabled/ca_pe_municipalities/people.py +++ b/disabled/ca_pe_municipalities/people.py @@ -44,7 +44,7 @@ def scrape(self): councillors = page.xpath( '//div[@style="WIDTH:750"]/dl/dt[contains(text(), "Elected Officials")]/parent::dl/dd/pre/text()' - )[0].splitlines(True) + )[0].splitlines(keepends=True) for councillor in councillors: name = ( councillor.replace("(Mayor)", "") diff --git a/disabled/ca_sk_municipalities/people.py b/disabled/ca_sk_municipalities/people.py index c0f67240..b162995d 100644 --- a/disabled/ca_sk_municipalities/people.py 
+++ b/disabled/ca_sk_municipalities/people.py @@ -1,7 +1,7 @@ import os import re import subprocess -from urllib.request import urlopen +import tempfile from pupa.scrape import Organization @@ -14,14 +14,13 @@ class SaskatchewanMunicipalitiesPersonScraper(CanadianScraper): def scrape(self): - response = urlopen(COUNCIL_PAGE).read() - pdf = open("/tmp/sk.pdf", "w") - pdf.write(response) - pdf.close() + response = self.get(COUNCIL_PAGE).read() + with tempfile.NamedTemporaryFile(delete_on_close=False) as pdf: + pdf.write(response) - data = subprocess.check_output(["pdftotext", "-layout", "/tmp/sk.pdf", "-"]) + data = subprocess.check_output(["pdftotext", "-layout", pdf.name, "-"]) # noqa: S603,S607 - data = data.splitlines(True) + data = data.splitlines(keepends=True) pages = [] page = [] for line in data: @@ -34,10 +33,7 @@ def scrape(self): districts = [] for page in pages: index = re.search(r"(\s{6,})", page[0]) - if index: - index = index.end() - 1 - else: - index = -1 + index = index.end() - 1 if index else -1 dist1 = [] dist2 = [] for line in page: @@ -99,4 +95,5 @@ def scrape(self): for key, value in contacts.items(): membership.add_contact_detail(key, value, "" if key == "email" else "legislature") yield p - os.system("rm /tmp/sk.pdf") + + os.unlink(pdf.name) diff --git a/disabled/ca_yt_municipalities/people.py b/disabled/ca_yt_municipalities/people.py index 130aa33c..7c7ba0af 100644 --- a/disabled/ca_yt_municipalities/people.py +++ b/disabled/ca_yt_municipalities/people.py @@ -1,7 +1,7 @@ import os import re import subprocess -from urllib.request import urlopen +import tempfile from pupa.scrape import Organization @@ -13,12 +13,11 @@ class YukonMunicipalitiesPersonScraper(CanadianScraper): def scrape(self): - response = urlopen(COUNCIL_PAGE).read() - pdf = open("/tmp/yt.pdf", "w") - pdf.write(response) - pdf.close() + response = self.get(COUNCIL_PAGE).content + with tempfile.NamedTemporaryFile(delete_on_close=False) as pdf: + pdf.write(response) - data = 
subprocess.check_output(["pdftotext", "-layout", "/tmp/yt.pdf", "-"]) + data = subprocess.check_output(["pdftotext", "-layout", pdf.name, "-"]) # noqa: S603,S607 data = re.split(r"\n\s*\n", data) for municipality in data: if "Councillors" not in municipality: @@ -81,4 +80,4 @@ def scrape(self): p.add_link(website) yield p - os.system("rm /tmp/yt.pdf") + os.unlink(pdf.name) diff --git a/patch.py b/patch.py index 2d6482a0..6d6312e8 100644 --- a/patch.py +++ b/patch.py @@ -27,9 +27,9 @@ (r"\A1 \d{3} \d{3}-\d{4}(?: x\d+)?\Z", lambda x: x["type"] in ("text", "voice", "fax", "cell", "video", "pager")), ] # Validate the format of contact_details[].note. -_contact_details["items"]["properties"]["note"][ - "pattern" -] = r"\A(?:constituency|legislature|office|residence|)(?: \(\d\))?\Z" +_contact_details["items"]["properties"]["note"]["pattern"] = ( + r"\A(?:constituency|legislature|office|residence|)(?: \(\d\))?\Z" +) # contact_details[] must not include unexpected properties. _contact_details["items"]["additionalProperties"] = False @@ -57,7 +57,7 @@ social_re = re.compile( r"(?:facebook|fb|instagram|linkedin|twitter|youtube)\.com|conservative\.ca" -) # XXX ca_candidates +) # special case: ca_candidates facebook_re = re.compile(r"facebook\.com") instagram_re = re.compile(r"instagram\.com") linkedin_re = re.compile(r"linkedin\.com") @@ -70,15 +70,15 @@ (1, lambda x: x["type"] == "email", "Membership has many emails"), ] -for type in ("address", "cell", "fax", "voice"): - for note in ("constituency", "legislature", "office", "residence"): - matchers.append( - ( - 1, - lambda x, type=type, note=note: x["type"] == type and x["note"] == note, - "Membership has contact_details with same type and note", - ) - ) +matchers.extend( + ( + 1, + lambda x, type=type, note=note: x["type"] == type and x["note"] == note, + "Membership has contact_details with same type and note", + ) + for type in ("address", "cell", "fax", "voice") + for note in ("constituency", "legislature", "office", 
"residence") +) # A membership should not have notes on emails, should have notes on non-emails, # should have at most one email, and should, in most cases, have at most one of @@ -133,7 +133,7 @@ r"\A" r"(?!(?:Chair|Commissioner|Conseiller|Councillor|Deputy|Dr|Hon|M|Maire|Mayor|Miss|Mme|Mr|Mrs|Ms|Regional|Warden)\b)" r"(?:" + name_fragment + r"(?:'|-| - | )" - r")+" + name_fragment + r"\Z" # noqa: W504 + r")+" + name_fragment + r"\Z" ) person_schema["properties"]["gender"]["enum"] = ["male", "female", ""] # @note https://github.com/opennorth/represent-canada-images checks whether an @@ -147,7 +147,7 @@ organization_schema["properties"]["classification"]["enum"] += ["government"] -def validate_conditionalPattern(self, x, fieldname, schema, path, arguments=None): +def validate_conditionalPattern(self, x, fieldname, schema, path, arguments=None): # noqa: N802 value = x.get(fieldname) if isinstance(value, str): for pattern, method in arguments: @@ -158,7 +158,7 @@ def validate_conditionalPattern(self, x, fieldname, schema, path, arguments=None DatetimeValidator.validate_conditionalPattern = validate_conditionalPattern -def validate_maxMatchingItems(self, x, fieldname, schema, path, arguments=None): +def validate_maxMatchingItems(self, x, fieldname, schema, path, arguments=None): # noqa: N802 value = x.get(fieldname) if isinstance(value, list): for length, method, message in arguments: diff --git a/pyproject.toml b/pyproject.toml index 8656c702..28cf62c2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,40 @@ -[tool.black] +[project] +name = "scrapers_ca" +version = "0.0.1" + +[tool.ruff] line-length = 119 +target-version = "py39" + +[tool.ruff.lint] +select = ["ALL"] +ignore = [ + "ANN", "C901", "COM812", "D203", "D212", "D415", "EM", "ISC001", "PERF203", "PLR091", "Q000", + "D1", "D205", + "DTZ", + "E501", + "ERA001", # commented-out code + "PLR2004", # magic + "PLW2901", + "PTH", + "RUF012", + "S101", # assert + "S113", # timeout + "TRY003", # errors + + # 
To fix: + "BLE001", # except Exception + "S110", # except pass + "TRY002", # raise Exception +] +allowed-confusables = ["’", "–"] + +[tool.ruff.lint.flake8-builtins] +builtins-ignorelist = ["id", "type"] + +[tool.ruff.lint.flake8-self] +extend-ignore-names = ["_ElementUnicodeResult", "_id", "_related", "_type"] -[tool.isort] -profile = 'black' -line_length = 119 +[tool.ruff.lint.per-file-ignores] +"patch.py" = ["ARG001"] +"tasks.py" = ["T201"] diff --git a/requirements.in b/requirements.in new file mode 100644 index 00000000..3cb35058 --- /dev/null +++ b/requirements.in @@ -0,0 +1,11 @@ +# 0.9.0 uses jsonschema instead of validictory, so we use a commit after 0.8.0 that adds Django 2.0 support. +git+https://github.com/opencivicdata/pupa@f0791f7de07574039eff10d804e4683399a16ec5 +agate +agate-excel +django<5 +invoke +lxml +opencivicdata +regex +requests[security] +unidecode diff --git a/requirements.txt b/requirements.txt index c14d43a1..080af981 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,15 +1,83 @@ -# 0.9.0 uses jsonschema instead of validictory, so we use a commit after 0.8.0 that adds Django 2.0 support. 
--e git+https://github.com/opencivicdata/pupa.git@f0791f7de07574039eff10d804e4683399a16ec5#egg=pupa -opencivicdata==3.3.1 -Django==2.2.28 - -# Scrapers -agate -agate-excel -lxml==4.9.1 -regex==2014.04.10 -requests[security]==2.32.0 - -# Maintenance +# This file was autogenerated by uv via the following command: +# uv pip compile requirements.in -o requirements.txt +agate==1.12.0 + # via + # -r requirements.in + # agate-excel +agate-excel==0.4.1 + # via -r requirements.in +asgiref==3.8.1 + # via django +babel==2.16.0 + # via agate +certifi==2024.8.30 + # via requests +charset-normalizer==3.3.2 + # via requests +dj-database-url==0.3.0 + # via pupa +django==4.2.16 + # via + # -r requirements.in + # opencivicdata + # pupa +et-xmlfile==1.1.0 + # via openpyxl +idna==3.10 + # via requests invoke==0.11.1 -Unidecode==0.04.14 + # via -r requirements.in +isodate==0.6.1 + # via agate +leather==0.4.0 + # via agate +lxml==4.9.1 + # via -r requirements.in +olefile==0.47 + # via agate-excel +opencivicdata==3.3.1 + # via + # -r requirements.in + # pupa +openpyxl==3.1.5 + # via agate-excel +parsedatetime==2.6 + # via agate +psycopg2==2.9.9 + # via pupa +psycopg2-binary==2.9.9 + # via opencivicdata +pupa @ git+https://github.com/opencivicdata/pupa@f0791f7de07574039eff10d804e4683399a16ec5 + # via -r requirements.in +python-slugify==8.0.4 + # via agate +pytimeparse==1.1.8 + # via agate +pytz==2024.2 + # via pupa +regex==2014.4.10 + # via -r requirements.in +requests==2.32.3 + # via + # -r requirements.in + # scrapelib +scrapelib==2.3.0 + # via pupa +six==1.16.0 + # via isodate +sqlparse==0.5.1 + # via django +text-unidecode==1.3 + # via python-slugify +typing-extensions==4.12.2 + # via asgiref +unidecode==0.4.14 + # via -r requirements.in +urllib3==1.26.20 + # via + # requests + # scrapelib +validictory==1.1.3 + # via pupa +xlrd==2.0.1 + # via agate-excel diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index cfb0df10..00000000 --- a/setup.cfg +++ /dev/null @@ -1,2 +0,0 @@ 
-[flake8] -extend-ignore = E203,E501 diff --git a/setup.py b/setup.py deleted file mode 100644 index 15719185..00000000 --- a/setup.py +++ /dev/null @@ -1,25 +0,0 @@ -# @see https://pythonhosted.org/an_example_pypi_project/setuptools.html -# @see https://pythonhosted.org/setuptools/setuptools.html -import os - -from setuptools import find_packages, setup - - -def read(fname): - return open(os.path.join(os.path.dirname(__file__), fname)).read() - - -setup( - name="scrapers_ca", - version="0.0.1", - author="Open North", - author_email="info@opennorth.ca", - description="Canadian legislative scrapers", - license="MIT", - url="https://github.com/opencivicdata/scrapers-ca", - packages=find_packages(), - long_description=read("README.md"), - install_requires=[ - "lxml", - ], -) diff --git a/tasks.py b/tasks.py index d0581c57..004737c1 100644 --- a/tasks.py +++ b/tasks.py @@ -22,37 +22,29 @@ def module_names(): - """ - Returns all module names. - """ + """Return all module names.""" for module_name in os.listdir("."): if os.path.isfile(os.path.join(module_name, "__init__.py")): yield module_name def modules_and_module_names_and_classes(): - """ - Returns modules, module names, and person scraper classes. - """ + """Return modules, module names, and person scraper classes.""" for module_name in module_names(): - module = importlib.import_module("{}.people".format(module_name)) - class_name = next(key for key in module.__dict__.keys() if "PersonScraper" in key) + module = importlib.import_module(f"{module_name}.people") + class_name = next(key for key in module.__dict__ if "PersonScraper" in key) yield (module, module_name, module.__dict__[class_name]) def csv_dict_reader(url, encoding="utf-8"): - """ - Reads a remote CSV file. - """ + """Read a remote CSV file.""" response = requests.get(url) response.encoding = encoding return csv.DictReader(StringIO(response.text)) def slug(name): - """ - Slugifies a division name. 
- """ + """Slugify a division name.""" return unidecode( str(name) .lower() @@ -78,16 +70,12 @@ def province_or_territory_abbreviation(code): def type_id(id): - """ - Returns an OCD identifier's type ID. - """ + """Return an OCD identifier's type ID.""" return id.rsplit(":", 1)[1] -def get_definition(division_id, aggregation=False): - """ - Returns the expected configuration for a given division. - """ +def get_definition(division_id, *, aggregation=False): + """Return the expected configuration for a given division.""" if not ocdid_to_type_name_map: # Map census division type codes to names. census_division_type_names = {} @@ -104,7 +92,7 @@ def get_definition(division_id, aggregation=False): requests.get("https://www12.statcan.gc.ca/census-recensement/2016/ref/dict/tab/t1_5-eng.cfm").content ) for text in document.xpath("//table//th[@headers]/text()"): - code, name = text.split(" – ", 1) # non-breaking space + code, name = text.split("\xa0– ", 1) census_subdivision_type_names[code] = name.split(" / ", 1)[0] # Map OCD identifiers to census types. 
@@ -129,32 +117,28 @@ def get_definition(division_id, aggregation=False): pattern = "ca_{}_municipalities" if aggregation else "ca_{}" expected["module_name"] = pattern.format(ocd_type_id) if aggregation: - expected["name"] = "{} Municipalities".format(division.name) + expected["name"] = f"{division.name} Municipalities" elif ocd_type_id in ("nl", "ns"): - expected["name"] = "{} House of Assembly".format(division.name) + expected["name"] = f"{division.name} House of Assembly" elif ocd_type_id == "qc": expected["name"] = "Assemblée nationale du Québec" else: - expected["name"] = "Legislative Assembly of {}".format(division.name) + expected["name"] = f"Legislative Assembly of {division.name}" elif division._type == "cd": - expected["module_name"] = "ca_{}_{}".format( - province_or_territory_abbreviation(division.id), slug(division.name) - ) + expected["module_name"] = f"ca_{province_or_territory_abbreviation(division.id)}_{slug(division.name)}" name_infix = ocdid_to_type_name_map[division.id] if name_infix == "Regional municipality": name_infix = "Regional" - expected["name"] = "{} {} Council".format(division.name, name_infix) + expected["name"] = f"{division.name} {name_infix} Council" elif division._type == "csd": - expected["module_name"] = "ca_{}_{}".format( - province_or_territory_abbreviation(division.id), slug(division.name) - ) + expected["module_name"] = f"ca_{province_or_territory_abbreviation(division.id)}_{slug(division.name)}" if ocd_type_id[:2] == "24": if division.name[0] in vowels: - expected["name"] = "Conseil municipal d'{}".format(division.name) + expected["name"] = f"Conseil municipal d'{division.name}" else: - expected["name"] = "Conseil municipal de {}".format(division.name) + expected["name"] = f"Conseil municipal de {division.name}" else: name_infix = ocdid_to_type_name_map[division.id] if name_infix in ("Municipality", "Specialized municipality"): @@ -163,21 +147,21 @@ def get_definition(division_id, aggregation=False): name_infix = 
"District" elif name_infix == "Regional municipality": name_infix = "Regional" - expected["name"] = "{} {} Council".format(division.name, name_infix) + expected["name"] = f"{division.name} {name_infix} Council" elif division._type == "arrondissement": - expected["module_name"] = "ca_{}_{}_{}".format( - province_or_territory_abbreviation(division.parent.id), slug(division.parent.name), slug(division.name) + expected["module_name"] = ( + f"ca_{province_or_territory_abbreviation(division.parent.id)}_{slug(division.parent.name)}_{slug(division.name)}" ) if division.name[0] in vowels: - expected["name"] = "Conseil d'arrondissement d'{}".format(division.name) + expected["name"] = f"Conseil d'arrondissement d'{division.name}" elif division.name[:3] == "Le ": - expected["name"] = "Conseil d'arrondissement du {}".format(division.name[3:]) + expected["name"] = f"Conseil d'arrondissement du {division.name[3:]}" else: - expected["name"] = "Conseil d'arrondissement de {}".format(division.name) + expected["name"] = f"Conseil d'arrondissement de {division.name}" else: - raise Exception("{}: Unrecognized OCD type {}".format(division.id, division._type)) + raise Exception(f"{division.id}: Unrecognized OCD type {division._type}") # Determine the class name. class_name_parts = re.split("[ -]", re.sub("[—–]", "-", re.sub("['.]", "", division.name))) @@ -198,46 +182,37 @@ def get_definition(division_id, aggregation=False): @task def council_pages(): - """ - Prints scrapers' council page, or warns if it is missing or unneeded. 
- """ + """Print scrapers' council page, or warn if it is missing or unneeded.""" for module, module_name, klass in modules_and_module_names_and_classes(): if klass.__bases__[0].__name__ == "CSVScraper": if hasattr(module, "COUNCIL_PAGE"): - print("{:<60} Delete COUNCIL_PAGE".format(module_name)) + print(f"{module_name:<60} Delete COUNCIL_PAGE") + elif hasattr(module, "COUNCIL_PAGE"): + print(f"{module_name:<60} {module.COUNCIL_PAGE}") else: - if hasattr(module, "COUNCIL_PAGE"): - print("{:<60} {}".format(module_name, module.COUNCIL_PAGE)) - else: - print("{:<60} Missing COUNCIL_PAGE".format(module_name)) + print(f"{module_name:<60} Missing COUNCIL_PAGE") @task def csv_list(): - """ - Lists scrapers with CSV data. - """ - for module, module_name, klass in modules_and_module_names_and_classes(): + """List scrapers with CSV data.""" + for _module, module_name, klass in modules_and_module_names_and_classes(): if hasattr(klass, "csv_url"): - print("{}: {}".format(module_name, klass.csv_url)) + print(f"{module_name}: {klass.csv_url}") @task def csv_stale(): - """ - Lists scrapers with stale manual CSV data. - """ - for module, module_name, klass in modules_and_module_names_and_classes(): + """List scrapers with stale manual CSV data.""" + for _module, module_name, klass in modules_and_module_names_and_classes(): if hasattr(klass, "updated_at") and klass.updated_at < date.today() - timedelta(days=365): - print("{}: Created on {} by {}".format(module_name, klass.updated_at, klass.contact_person)) + print(f"{module_name}: Created on {klass.updated_at} by {klass.contact_person}") @task def csv_error(): - """ - Notes corrections that CSV publishers should make. 
- """ - for module, module_name, klass in modules_and_module_names_and_classes(): + """Note corrections that CSV publishers should make.""" + for _module, module_name, klass in modules_and_module_names_and_classes(): if klass.__bases__[0].__name__ == "CSVScraper": if "_candidates" in module_name and hasattr(klass, "updated_at"): continue @@ -263,23 +238,19 @@ def csv_error(): keys -= {"encoding"} if keys: - print("\n{}\n{}".format(module_name, klass.csv_url)) + print(f"\n{module_name}\n{klass.csv_url}") extra_keys = keys - {"corrections", "encoding", "header_converter"} if extra_keys: print("- Manually check the configuration of: {}".format(", ".join(extra_keys))) if "encoding" in keys: - print( - "- The CSV file should be encoded as 'utf-8' or 'windows-1252', not '{}'".format( - klass.encoding - ) - ) + print(f"- The CSV file should be encoded as 'utf-8' or 'windows-1252', not '{klass.encoding}'") if "corrections" in keys: for key, values in klass.corrections.items(): for actual, expected in values.items(): - print("- Change '{}' to '{}' in {}".format(actual, expected, key)) + print(f"- Change '{actual}' to '{expected}' in {key}") if "header_converter" in keys: print("- Correct column headers according to:") @@ -288,17 +259,13 @@ def csv_error(): @task def tidy(): - """ - Checks that modules are configured correctly. - """ + """Check that modules are configured correctly.""" # Map OCD identifiers to styles of address. 
leader_styles = {} member_styles = {} for gid in range(3): reader = csv_dict_reader( - "https://docs.google.com/spreadsheets/d/11qUKd5bHeG5KIzXYERtVgs3hKcd9yuZlt-tCTLBFRpI/pub?single=true&gid={}&output=csv".format( - gid - ) + f"https://docs.google.com/spreadsheets/d/11qUKd5bHeG5KIzXYERtVgs3hKcd9yuZlt-tCTLBFRpI/pub?single=true&gid={gid}&output=csv" ) for row in reader: key = row["Identifier"] @@ -308,7 +275,7 @@ def tidy(): division_ids = set() jurisdiction_ids = set() for module_name in module_names(): - if module_name.endswith("_candidates") or module_name.endswith("_municipalities"): + if module_name.endswith(("_candidates", "_municipalities")): continue metadata = module_name_to_metadata(module_name) @@ -316,29 +283,29 @@ def tidy(): # Ensure division_id is unique. division_id = metadata["division_id"] if division_id in division_ids: - print("{:<60} Duplicate division_id {}".format(module_name, division_id)) + print(f"{module_name:<60} Duplicate division_id {division_id}") else: division_ids.add(division_id) # Ensure jurisdiction_id is unique. jurisdiction_id = metadata["jurisdiction_id"] if jurisdiction_id in jurisdiction_ids: - print("{:<60} Duplicate jurisdiction_id {}".format(module_name, jurisdiction_id)) + print(f"{module_name:<60} Duplicate jurisdiction_id {jurisdiction_id}") else: jurisdiction_ids.add(jurisdiction_id) - expected = get_definition(division_id, bool(module_name.endswith("_municipalities"))) + expected = get_definition(division_id, aggregation=bool(module_name.endswith("_municipalities"))) # Ensure presence of url and styles of address. 
if division_id not in member_styles: - print("{:<60} Missing member style of address: {}".format(module_name, division_id)) + print(f"{module_name:<60} Missing member style of address: {division_id}") if division_id not in leader_styles: - print("{:<60} Missing leader style of address: {}".format(module_name, division_id)) + print(f"{module_name:<60} Missing leader style of address: {division_id}") url = metadata["url"] if url and not expected["url"]: parsed = urlsplit(url) if parsed.scheme not in ("http", "https") or parsed.path or parsed.query or parsed.fragment: - print("{:<60} Check: {}".format(module_name, url)) + print(f"{module_name:<60} Check: {url}") # Warn if the name or classification may be incorrect. name = metadata["name"] @@ -346,7 +313,7 @@ def tidy(): print("{:<60} Expected {}".format(name, expected["name"])) classification = metadata["classification"] if classification != "legislature": - print("{:<60} Expected legislature".format(classification)) + print(f"{classification:<60} Expected legislature") # Name the classes correctly. class_name = metadata["class_name"] @@ -379,9 +346,7 @@ def tidy(): @task def sources_and_assertions(): - """ - Checks that sources are attributed and assertions are made. 
- """ + """Check that sources are attributed and assertions are made.""" for module_name in module_names(): path = os.path.join(module_name, "people.py") with codecs.open(path, "r", "utf-8") as f: @@ -390,17 +355,15 @@ def sources_and_assertions(): source_count = content.count("add_source") request_count = content.count("lxmlize") + content.count("self.get(") + content.count("requests.get") if source_count < request_count: - print("Expected {} sources after {} requests {}".format(source_count, request_count, path)) + print(f"Expected {source_count} sources after {request_count} requests {path}") if "CSVScraper" not in content and "assert len(" not in content: - print("Expected an assertion like: assert len(councillors), 'No councillors found' {}".format(path)) + print(f"Expected an assertion like: assert len(councillors), 'No councillors found' {path}") @task def validate_spreadsheet(url, identifier_header, geographic_name_header): - """ - Validates the identifiers, geographic names and geographic types in a spreadsheet. 
- """ + """Validate the identifiers, geographic names and geographic types in a spreadsheet.""" sgc_to_id = {} for division in Division.all("ca", from_csv=ocd_division_csv): @@ -413,19 +376,17 @@ def validate_spreadsheet(url, identifier_header, geographic_name_header): if len(identifier) == 2: identifier = sgc_to_id[identifier] elif len(identifier) == 4: - identifier = "ocd-division/country:ca/cd:{}".format(identifier) + identifier = f"ocd-division/country:ca/cd:{identifier}" elif len(identifier) == 7: - identifier = "ocd-division/country:ca/csd:{}".format(identifier) + identifier = f"ocd-division/country:ca/csd:{identifier}" division = Division.get(identifier) if row[geographic_name_header] != division.name: - print("{}: name: {} not {}".format(identifier, division.name, row[geographic_name_header])) + print(f"{identifier}: name: {division.name} not {row[geographic_name_header]}") def module_name_to_metadata(module_name): - """ - Copied from `reports.utils`. - """ + # Copied from reports.utils module = importlib.import_module(module_name) for obj in module.__dict__.values(): division_id = getattr(obj, "division_id", None) @@ -442,3 +403,4 @@ def module_name_to_metadata(module_name): getattr(obj, "classification", "legislature"), ), } + return None diff --git a/tox.ini b/tox.ini deleted file mode 100644 index 2c8b2524..00000000 --- a/tox.ini +++ /dev/null @@ -1,5 +0,0 @@ -[flake8] -exclude=disabled -ignore=E501,E731 -# E501 line too long (X > 79 characters) -# E731 do not assign a lambda expression, use a def diff --git a/utils.py b/utils.py index 93916f03..3a5ded50 100644 --- a/utils.py +++ b/utils.py @@ -9,7 +9,7 @@ from zipfile import ZipFile import agate -import agateexcel # noqa +import agateexcel # noqa: F401 import lxml.html import requests from lxml import etree @@ -17,11 +17,12 @@ from pupa.scrape import Jurisdiction, Organization, Person, Post, Scraper from requests.packages.urllib3.exceptions import InsecureRequestWarning -import patch # patch patches 
validictory # noqa +import patch # patch patches validictory # noqa: F401 requests.packages.urllib3.disable_warnings(InsecureRequestWarning) CUSTOM_USER_AGENT = "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)" +DEFAULT_USER_AGENT = requests.utils.default_user_agent() CONTACT_DETAIL_TYPE_MAP = { "Address": "address", @@ -82,10 +83,7 @@ "Voice Mail": "legislature", "Work": "legislature", } -if os.getenv("SSL_VERIFY", False): - SSL_VERIFY = "/usr/lib/ssl/certs/ca-certificates.crt" -else: - SSL_VERIFY = bool(os.getenv("SSL_VERIFY", False)) +SSL_VERIFY = "/usr/lib/ssl/certs/ca-certificates.crt" if os.getenv("SSL_VERIFY", "") else True email_re = re.compile(r"([A-Za-z0-9._-]+@(?:[A-Za-z0-9-]+\.)+[A-Za-z]{2,})") @@ -93,9 +91,7 @@ styles_of_address = {} for gid in range(3): response = requests.get( - "https://docs.google.com/spreadsheets/d/11qUKd5bHeG5KIzXYERtVgs3hKcd9yuZlt-tCTLBFRpI/pub?single=true&gid={}&output=csv".format( - gid - ), + f"https://docs.google.com/spreadsheets/d/11qUKd5bHeG5KIzXYERtVgs3hKcd9yuZlt-tCTLBFRpI/pub?single=true&gid={gid}&output=csv", verify=SSL_VERIFY, ) if response.status_code == 200: @@ -115,35 +111,51 @@ def get_email(self, node, expression=".", *, error=True): Make sure that the node/expression is narrow enough to not capture a generic email address in the footer of the page, for example. """ - - matches = [] # If the text would be split across multiple sub-tags. - for match in node.xpath('{}//*[contains(text(), "@")]'.format(expression)): - matches.append(match.text_content()) + matches = [match.text_content() for match in node.xpath(f'{expression}//*[contains(text(), "@")]')] # The text version is more likely to be correct, as it is more visible, # e.g. ca_bc has one `href` of `mailto:first.last.mla@leg.bc.ca`. 
- for match in node.xpath('{}//a[contains(@href, "mailto:")]'.format(expression)): - matches.append(unquote(match.attrib["href"])) + matches.extend( + unquote(match.attrib["href"]) for match in node.xpath(f'{expression}//a[contains(@href, "mailto:")]') + ) + # Some emails are obfuscated by Cloudflare. + matches.extend( + self._cloudflare_decode(match) + for match in node.xpath(f'{expression}//@href[contains(., "cdn-cgi/l/email-protection")]') + ) # If the node has no sub-tags. if not matches: - for match in node.xpath('{}//text()[contains(., "@")]'.format(expression)): - matches.append(match) + matches = list(node.xpath(f'{expression}//text()[contains(., "@")]')) if matches: for match in matches: match = email_re.search(match) if match: return match.group(1) if error: - raise Exception("No email pattern in {}".format(matches)) - elif error: - raise Exception("No email node in {}".format(etree.tostring(node))) + raise Exception(f"No email pattern in {matches}") + return None + if error: + raise Exception(f"No email node in {etree.tostring(node)}") + return None + + # Helper function for self.get_email + def _cloudflare_decode(self, link): + hex_email = link.split("#", 1)[1] + decoded_email = "" + key = int(hex_email[:2], 16) - def get_phone(self, node, *, area_codes=[], error=True): + for i in range(2, len(hex_email) - 1, 2): + decoded_email += chr(int(hex_email[i : i + 2], 16) ^ key) + + return decoded_email + + def get_phone(self, node, *, area_codes=None, error=True): """ Don't use if multiple telephone numbers are present, e.g. voice and fax. If writing a new scraper, check that extensions are captured. 
""" - + if area_codes is None: + area_codes = [] if isinstance(node, etree._ElementUnicodeResult): match = re.search( r"(?:\A|\D)(\(?\d{3}\)?\D?\d{3}\D?\d{4}(?:\s*(?:/|x|ext[.:]?|poste)[\s-]?\d+)?)(?:\D|\Z)", node @@ -169,14 +181,16 @@ def get_phone(self, node, *, area_codes=[], error=True): if match: return match.group(1) if error: - raise Exception("No phone pattern in {}".format(node.text_content())) + raise Exception(f"No phone pattern in {node.text_content()}") + return None def get_link(self, node, substring, *, error=True): - match = node.xpath('.//a[contains(@href,"{}")]/@href'.format(substring)) + match = node.xpath(f'.//a[contains(@href,"{substring}")]/@href') if match: return match[0] if error: - raise Exception("No link matching {}".format(substring)) + raise Exception(f"No link matching {substring}") + return None def get(self, *args, **kwargs): return super().get(*args, verify=SSL_VERIFY, **kwargs) @@ -184,7 +198,7 @@ def get(self, *args, **kwargs): def post(self, *args, **kwargs): return super().post(*args, verify=SSL_VERIFY, **kwargs) - def lxmlize(self, url, encoding=None, user_agent=requests.utils.default_user_agent(), cookies=None, xml=False): + def lxmlize(self, url, encoding=None, *, user_agent=DEFAULT_USER_AGENT, cookies=None, xml=False): self.user_agent = user_agent response = self.get(url, cookies=cookies) @@ -194,31 +208,30 @@ def lxmlize(self, url, encoding=None, user_agent=requests.utils.default_user_age try: text = response.text if xml: - text = text.replace('', "") # XXX ca_bc - page = etree.fromstring(text) + text = text.replace('', "") # special case: ca_bc + page = etree.fromstring(text) # noqa: S320 else: page = lxml.html.fromstring(text) - except etree.ParserError: - raise etree.ParserError("Document is empty {}".format(url)) + except etree.ParserError as e: + raise etree.ParserError(f"Document is empty {url}") from e meta = page.xpath('//meta[@http-equiv="refresh"]') if meta: _, url = meta[0].attrib["content"].split("=", 1) return 
self.lxmlize(url, encoding) - elif xml: - return page - else: - page.make_links_absolute(url) + if xml: return page + page.make_links_absolute(url) + return page - def csv_reader(self, url, delimiter=",", header=False, encoding=None, skip_rows=0, data=None, **kwargs): + def csv_reader(self, url, *, delimiter=",", header=False, encoding=None, skip_rows=0, data=None, **kwargs): if not data: result = urlparse(url) if result.scheme == "ftp": data = StringIO() - ftp = FTP(result.hostname) + ftp = FTP(result.hostname) # noqa: S321 ftp.login(result.username, result.password) - ftp.retrbinary("RETR {}".format(result.path), lambda block: data.write(block.decode("utf-8"))) + ftp.retrbinary(f"RETR {result.path}", lambda block: data.write(block.decode("utf-8"))) ftp.quit() data.seek(0) else: @@ -234,15 +247,13 @@ def csv_reader(self, url, delimiter=",", header=False, encoding=None, skip_rows= data.readline() if header: return csv.DictReader(data, delimiter=delimiter) - else: - return csv.reader(data, delimiter=delimiter) + return csv.reader(data, delimiter=delimiter) class CSVScraper(CanadianScraper): # File flags - """ - Set the CSV file's delimiter. - """ + """Set the CSV file's delimiter.""" + delimiter = "," """ Set the CSV file's encoding, like 'windows-1252' ('utf-8' by default). @@ -252,6 +263,10 @@ class CSVScraper(CanadianScraper): If `csv_url` is a ZIP file, set the compressed file to read. """ filename = None + """ + If `csv_url` is an XLS, XLSX or ZIP file, but has no extension, set the extension (like '.xlsx'). + """ + extension = None # Table flags """ @@ -326,22 +341,21 @@ class CSVScraper(CanadianScraper): def header_converter(self, s): """ - Normalizes a column header name. By default, lowercases it and replaces - underscores with spaces (e.g. because Esri fields can't contain spaces). - """ + Normalize a column header name. + By default, lowercase it and replace underscores with spaces (e.g. because Esri fields can't contain spaces). 
+ """ header = clean_string(s.lower().replace("_", " ")) if hasattr(self, "locale"): return self.column_headers[self.locale].get(header, header) - else: - return header + return header def is_valid_row(self, row): """ - Returns whether the row should be imported. By default, skips empty rows - and rows in which a name component is "Vacant". - """ + Return whether the row should be imported. + By default, skip empty rows and rows in which a name component is "Vacant". + """ empty = ("", "Vacant") if not any(row.values()): return False @@ -352,7 +366,7 @@ def is_valid_row(self, row): def scrape(self): seat_numbers = defaultdict(lambda: defaultdict(int)) - extension = os.path.splitext(self.csv_url)[1] + extension = self.extension if self.extension else os.path.splitext(self.csv_url)[1] if extension in (".xls", ".xlsx"): data = StringIO() binary = BytesIO(self.get(self.csv_url).content) @@ -391,7 +405,7 @@ def scrape(self): # ca_qc_laval: "maire et president du comite executif", "conseiller et membre du comite executif" # ca_qc_montreal: "Conseiller de la ville; Membre…", "Maire d'arrondissement\nMembre…" if row.get("primary role"): - row["primary role"] = re.split(r"(?: (?:et)\b|[;\n])", row["primary role"], 1)[0].strip() + row["primary role"] = re.split(r"(?: (?:et)\b|[;\n])", row["primary role"], maxsplit=1)[0].strip() if not self.is_valid_row(row): continue @@ -438,7 +452,7 @@ def scrape(self): if self.many_posts_per_area and role not in self.unique_roles: seat_numbers[role][district] += 1 - district = "{} (seat {})".format(district, seat_numbers[role][district]) + district = f"{district} (seat {seat_numbers[role][district]})" lines = [] if row.get("address line 1"): @@ -467,11 +481,8 @@ def scrape(self): # District name,District ID,… # Toronto Centre,,… # ,3520005,… - if not row.get("district name") and row.get("district id"): - if len(row["district id"]) == 7: - p._related[0].extras["boundary_url"] = "/boundaries/census-subdivisions/{}/".format( - row["district 
id"] - ) + if not row.get("district name") and row.get("district id") and len(row["district id"]) == 7: + p._related[0].extras["boundary_url"] = "/boundaries/census-subdivisions/{}/".format(row["district id"]) if row.get("district name") in self.district_name_to_boundary_url: p._related[0].extras["boundary_url"] = self.district_name_to_boundary_url[row["district name"]] @@ -515,9 +526,7 @@ def scrape(self): class CanadianJurisdiction(Jurisdiction): - """ - Whether to create posts whose labels match division names or type IDs. - """ + """Whether to create posts whose labels match division names or type IDs.""" use_type_id = False """ @@ -590,28 +599,21 @@ def get_organizations(self): if valid_through and valid_through < datetime.now().strftime("%Y-%m-%d"): continue - if self.use_type_id: - label = child.id.rsplit("/", 1)[1].capitalize().replace(":", " ") - else: - label = child.name + label = child.id.rsplit("/", 1)[1].capitalize().replace(":", " ") if self.use_type_id else child.name # Yield posts to allow ca_on_toronto to make changes. post = Post(role=member_role, label=label, division_id=child.id, organization_id=organization._id) yield post if not children and parent.attrs["posts_count"]: for i in range(1, int(parent.attrs["posts_count"])): # exclude Mayor - organization.add_post( - role=member_role, label="{} (seat {})".format(parent.name, i), division_id=parent.id - ) + organization.add_post(role=member_role, label=f"{parent.name} (seat {i})", division_id=parent.id) yield organization class CanadianPerson(Person): def __init__(self, *, name, district, role, **kwargs): - """ - Cleans a person's name, district, role and any other attributes. 
- """ + """Clean a person's name, district, role and any other attributes.""" name = clean_name(name) district = clean_string(district).replace("&", "and") role = clean_string(role) @@ -625,9 +627,7 @@ def __init__(self, *, name, district, role, **kwargs): super().__init__(name=name, district=district, role=role, **kwargs) def __setattr__(self, name, value): - """ - Corrects gender values. - """ + """Correct gender values.""" if name == "gender": value = value.lower() if value == "m": @@ -637,20 +637,16 @@ def __setattr__(self, name, value): super().__setattr__(name, value) def add_link(self, url, *, note=""): - """ - Corrects links without schemes or domains. - """ + """Correct links without schemes or domains.""" url = url.strip() if url.startswith("www."): - url = "http://{}".format(url) + url = f"http://{url}" if re.match(r"\A@[A-Za-z]+\Z", url): - url = "https://twitter.com/{}".format(url[1:]) + url = f"https://twitter.com/{url[1:]}" self.links.append({"note": note, "url": url}) def add_contact(self, type, value, note="", area_code=None): - """ - Cleans and adds a contact detail to the person's membership. 
- """ + """Clean and add a contact detail to the person's membership.""" if type: type = clean_string(type) if note: @@ -673,9 +669,7 @@ def add_contact(self, type, value, note="", area_code=None): self._related[0].add_contact_detail(type=type, value=value, note=note) def clean_telephone_number(self, s, area_code=None): - """ - @see http://www.btb.termiumplus.gc.ca/tpv2guides/guides/favart/index-eng.html?lang=eng&lettr=indx_titls&page=9N6fM9QmOwCE.html - """ + """@see http://www.btb.termiumplus.gc.ca/tpv2guides/guides/favart/index-eng.html?lang=eng&lettr=indx_titls&page=9N6fM9QmOwCE.html.""" splits = re.split(r"(?:\b \(|/|x|ext[.:]?|p\.|poste)[\s-]?(?=\b|\d)", s, flags=re.IGNORECASE) digits = re.sub(r"\D", "", splits[0]) @@ -688,16 +682,11 @@ def clean_telephone_number(self, s, area_code=None): digits = re.sub(r"\A(\d)(\d{3})(\d{3})(\d{4})\Z", r"\1 \2 \3-\4", digits) if len(splits) == 2: return "{} x{}".format(digits, splits[1].rstrip(")")) - else: - return digits - else: - return s + return digits + return s def clean_address(self, s): - """ - Corrects the postal code, abbreviates the province or territory name, and - formats the last line of the address. - """ + """Correct the postal code, abbreviate the province or territory name, and format the last line of the address.""" # The letter "O" instead of the numeral "0" is a common mistake. s = re.sub( r"\b[A-Z][O0-9][A-Z]\s?[O0-9][A-Z][O0-9]\b", lambda x: x.group(0).replace("O", "0"), clean_string(s) @@ -719,16 +708,16 @@ def clean_address(self, s): ) -whitespace_re = re.compile(r"\s+", flags=re.U) -whitespace_and_newline_re = re.compile(r"[^\S\n]+", flags=re.U) +whitespace_re = re.compile(r"\s+", flags=re.UNICODE) +whitespace_and_newline_re = re.compile(r"[^\S\n]+", flags=re.UNICODE) honorific_prefix_re = re.compile(r"\A(?:Councillor|Dr|Hon|M|Mayor|Mme|Mr|Mrs|Ms|Miss)\.? 
") honorific_suffix_re = re.compile(r", (?:Ph\.D, Q\.C\.)\Z") province_or_territory_abbreviation_memo = {} table = { - ord("​"): " ", # zero-width space + ord("\u200b"): " ", # zero-width space ord("’"): "'", - ord("\xc2"): " ", # non-breaking space if mixing ISO-8869-1 into UTF-8 + ord("\xc2"): "\xa0", # non-breaking space if mixing ISO-8869-1 into UTF-8 } @@ -759,9 +748,8 @@ def clean_type_id(type_id): # "Spaces should be converted to underscores." type_id = re.sub(r" ", "_", type_id) # "All invalid characters should be converted to tilde (~)." - type_id = re.sub(r"[^\w.~-]", "~", type_id, re.UNICODE) - return type_id + return re.sub(r"[^\w.~-]", "~", type_id, flags=re.UNICODE) def clean_french_prepositions(s): - return re.sub(r"\b(?:d'|de (?:l'|la )?|du |des |l')", "", clean_string(s), flags=re.I) + return re.sub(r"\b(?:d'|de (?:l'|la )?|du |des |l')", "", clean_string(s), flags=re.IGNORECASE)