diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 47434be2..5e135c6f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,16 +1,17 @@ ci: autoupdate_schedule: quarterly + skip: [pip-compile] +default_language_version: + python: python3.10 repos: - - repo: https://github.com/psf/black - rev: 24.4.2 + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.6.3 hooks: - - id: black - - repo: https://github.com/pycqa/flake8 - rev: 7.1.0 + - id: ruff + - id: ruff-format + - repo: https://github.com/astral-sh/uv-pre-commit + rev: 0.4.4 hooks: - - id: flake8 - additional_dependencies: [flake8-comprehensions] - - repo: https://github.com/pycqa/isort - rev: 5.13.2 - hooks: - - id: isort + - id: pip-compile + name: pip-compile requirements.in + args: [requirements.in, -o, requirements.txt] diff --git a/ca_bc_coquitlam/people.py b/ca_bc_coquitlam/people.py index 51e98652..e8147c56 100644 --- a/ca_bc_coquitlam/people.py +++ b/ca_bc_coquitlam/people.py @@ -7,7 +7,6 @@ class CoquitlamPersonScraper(CanadianScraper): - def scrape(self): def build_email(script): w = re.findall(r'w = "(.*?)"', script)[0] diff --git a/ca_bc_surrey/people.py b/ca_bc_surrey/people.py index b0240acd..f9654b73 100644 --- a/ca_bc_surrey/people.py +++ b/ca_bc_surrey/people.py @@ -12,7 +12,6 @@ def scrape(self): assert len(members), "No members found" seat_number = 1 for member in members: - role, name = member.xpath('.//a[@class="teaser__link"]/h4')[0].text_content().split(" ", 1) district = "Surrey (seat {})".format(seat_number) seat_number += 1 diff --git a/ca_nl/people.py b/ca_nl/people.py index a75bec04..b9f38932 100644 --- a/ca_nl/people.py +++ b/ca_nl/people.py @@ -1,9 +1,8 @@ import json import re -from utils import CUSTOM_USER_AGENT +from utils import CUSTOM_USER_AGENT, CanadianScraper from utils import CanadianPerson as Person -from utils import CanadianScraper COUNCIL_PAGE = "https://www.assembly.nl.ca/js/members-index.js" @@ -26,9 +25,7 @@ def scrape(self): page = self.get(COUNCIL_PAGE) members = re.search( r"members = (\[(.+)\]);", page.content.decode().replace("[Member-elect]", ""), re.DOTALL - ).groups()[ - 0 - ] # extract javascript array + ).groups()[0] # extract javascript array members = re.sub("", "", members) # remove comments members = re.sub("", "", members).replace("", "") # tags members = members.replace('"', r"\"") # escape double quotes @@ -60,7 +57,8 @@ def scrape(self): ) if member.get("email"): p.add_contact( - "email", member["email"].replace("@gov.nl.ca@gov.nl.ca", "@gov.nl.ca") # seriously guys?! + "email", + member["email"].replace("@gov.nl.ca@gov.nl.ca", "@gov.nl.ca"), # seriously guys?! ) p.add_source(COUNCIL_PAGE) diff --git a/ca_ns_cape_breton/people.py b/ca_ns_cape_breton/people.py index dad984d0..6774bd7e 100644 --- a/ca_ns_cape_breton/people.py +++ b/ca_ns_cape_breton/people.py @@ -1,9 +1,8 @@ import html import re -from utils import CUSTOM_USER_AGENT +from utils import CUSTOM_USER_AGENT, CanadianScraper from utils import CanadianPerson as Person -from utils import CanadianScraper COUNCIL_PAGE = "http://www.cbrm.ns.ca/mayor-council-2.html" MAYOR_PAGE = "http://www.cbrm.ns.ca/mayor" diff --git a/ca_qc_cote_saint_luc/people.py b/ca_qc_cote_saint_luc/people.py index 56a1f225..9670ea1c 100644 --- a/ca_qc_cote_saint_luc/people.py +++ b/ca_qc_cote_saint_luc/people.py @@ -1,6 +1,5 @@ -from utils import CUSTOM_USER_AGENT +from utils import CUSTOM_USER_AGENT, CanadianScraper from utils import CanadianPerson as Person -from utils import CanadianScraper COUNCIL_PAGE = "https://cotesaintluc.org/fr/affaires-municipales/membres-du-conseil/" diff --git a/patch.py b/patch.py index 2d6482a0..8acf2c0c 100644 --- a/patch.py +++ b/patch.py @@ -27,9 +27,9 @@ (r"\A1 \d{3} \d{3}-\d{4}(?: x\d+)?\Z", lambda x: x["type"] in ("text", "voice", "fax", "cell", "video", "pager")), ] # Validate the format of contact_details[].note. -_contact_details["items"]["properties"]["note"][ - "pattern" -] = r"\A(?:constituency|legislature|office|residence|)(?: \(\d\))?\Z" +_contact_details["items"]["properties"]["note"]["pattern"] = ( + r"\A(?:constituency|legislature|office|residence|)(?: \(\d\))?\Z" +) # contact_details[] must not include unexpected properties. _contact_details["items"]["additionalProperties"] = False diff --git a/pyproject.toml b/pyproject.toml index 8656c702..059d331a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,11 @@ -[tool.black] +[project] +name = "scrapers_ca" +version = "0.0.1" + +[tool.ruff] line-length = 119 +target-version = "py310" -[tool.isort] -profile = 'black' -line_length = 119 +[tool.ruff.lint] +select = ["C4", "E", "F", "I", "W"] +ignore = ["E501"] diff --git a/requirements.in b/requirements.in new file mode 100644 index 00000000..3cb35058 --- /dev/null +++ b/requirements.in @@ -0,0 +1,11 @@ +# 0.9.0 uses jsonschema instead of validictory, so we use a commit after 0.8.0 that adds Django 2.0 support. +git+https://github.com/opencivicdata/pupa@f0791f7de07574039eff10d804e4683399a16ec5 +agate +agate-excel +django<5 +invoke +lxml +opencivicdata +regex +requests[security] +unidecode diff --git a/requirements.txt b/requirements.txt index c14d43a1..080af981 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,15 +1,83 @@ -# 0.9.0 uses jsonschema instead of validictory, so we use a commit after 0.8.0 that adds Django 2.0 support. --e git+https://github.com/opencivicdata/pupa.git@f0791f7de07574039eff10d804e4683399a16ec5#egg=pupa -opencivicdata==3.3.1 -Django==2.2.28 - -# Scrapers -agate -agate-excel -lxml==4.9.1 -regex==2014.04.10 -requests[security]==2.32.0 - -# Maintenance +# This file was autogenerated by uv via the following command: +# uv pip compile requirements.in -o requirements.txt +agate==1.12.0 + # via + # -r requirements.in + # agate-excel +agate-excel==0.4.1 + # via -r requirements.in +asgiref==3.8.1 + # via django +babel==2.16.0 + # via agate +certifi==2024.8.30 + # via requests +charset-normalizer==3.3.2 + # via requests +dj-database-url==0.3.0 + # via pupa +django==4.2.16 + # via + # -r requirements.in + # opencivicdata + # pupa +et-xmlfile==1.1.0 + # via openpyxl +idna==3.10 + # via requests invoke==0.11.1 -Unidecode==0.04.14 + # via -r requirements.in +isodate==0.6.1 + # via agate +leather==0.4.0 + # via agate +lxml==4.9.1 + # via -r requirements.in +olefile==0.47 + # via agate-excel +opencivicdata==3.3.1 + # via + # -r requirements.in + # pupa +openpyxl==3.1.5 + # via agate-excel +parsedatetime==2.6 + # via agate +psycopg2==2.9.9 + # via pupa +psycopg2-binary==2.9.9 + # via opencivicdata +pupa @ git+https://github.com/opencivicdata/pupa@f0791f7de07574039eff10d804e4683399a16ec5 + # via -r requirements.in +python-slugify==8.0.4 + # via agate +pytimeparse==1.1.8 + # via agate +pytz==2024.2 + # via pupa +regex==2014.4.10 + # via -r requirements.in +requests==2.32.3 + # via + # -r requirements.in + # scrapelib +scrapelib==2.3.0 + # via pupa +six==1.16.0 + # via isodate +sqlparse==0.5.1 + # via django +text-unidecode==1.3 + # via python-slugify +typing-extensions==4.12.2 + # via asgiref +unidecode==0.4.14 + # via -r requirements.in +urllib3==1.26.20 + # via + # requests + # scrapelib +validictory==1.1.3 + # via pupa +xlrd==2.0.1 + # via agate-excel diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index cfb0df10..00000000 --- a/setup.cfg +++ /dev/null @@ -1,2 +0,0 @@ -[flake8] -extend-ignore = E203,E501 diff --git a/setup.py b/setup.py deleted file mode 100644 index 15719185..00000000 --- a/setup.py +++ /dev/null @@ -1,25 +0,0 @@ -# @see https://pythonhosted.org/an_example_pypi_project/setuptools.html -# @see https://pythonhosted.org/setuptools/setuptools.html -import os - -from setuptools import find_packages, setup - - -def read(fname): - return open(os.path.join(os.path.dirname(__file__), fname)).read() - - -setup( - name="scrapers_ca", - version="0.0.1", - author="Open North", - author_email="info@opennorth.ca", - description="Canadian legislative scrapers", - license="MIT", - url="https://github.com/opencivicdata/scrapers-ca", - packages=find_packages(), - long_description=read("README.md"), - install_requires=[ - "lxml", - ], -) diff --git a/tox.ini b/tox.ini deleted file mode 100644 index 2c8b2524..00000000 --- a/tox.ini +++ /dev/null @@ -1,5 +0,0 @@ -[flake8] -exclude=disabled -ignore=E501,E731 -# E501 line too long (X > 79 characters) -# E731 do not assign a lambda expression, use a def diff --git a/utils.py b/utils.py index 99fef627..030728bb 100644 --- a/utils.py +++ b/utils.py @@ -256,6 +256,7 @@ class CSVScraper(CanadianScraper): """ Set the CSV file's delimiter. """ + delimiter = "," """ Set the CSV file's encoding, like 'windows-1252' ('utf-8' by default).