diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index edc295e..5d47485 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -15,4 +15,4 @@ jobs:
     steps:
     - uses: actions/checkout@v1
     - name: Build containers and run tests
-      run: docker-compose -f docker-compose.yml -f tests/docker-compose.yml run --rm scrapers
+      run: docker compose -f docker-compose.yml -f tests/docker-compose.yml run --rm scrapers
diff --git a/.github/workflows/publish_docs.yml b/.github/workflows/publish_docs.yml
new file mode 100644
index 0000000..475228f
--- /dev/null
+++ b/.github/workflows/publish_docs.yml
@@ -0,0 +1,34 @@
+on:
+  workflow_dispatch:
+  push:
+    branches:
+      - main
+
+jobs:
+  build-and-deploy:
+    name: Build and publish documentation
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write
+    steps:
+      - name: Check out repository
+        uses: actions/checkout@v4
+
+      - uses: tj-actions/changed-files@v45
+        id: docs-changed
+        with:
+          files: |
+            docs/*
+
+      - name: Set up Quarto
+        if: steps.docs-changed.outputs.any_changed == 'true'
+        uses: quarto-dev/quarto-actions/setup@v2
+
+      - name: Render and Publish
+        if: steps.docs-changed.outputs.any_changed == 'true'
+        uses: quarto-dev/quarto-actions/publish@v2
+        with:
+          path: docs
+          target: gh-pages
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/README.md b/README.md
index 1197c6f..3d4d858 100644
--- a/README.md
+++ b/README.md
@@ -1,26 +1,22 @@
-scrapers-lametro
-=====================
+# scrapers-lametro
 
 DataMade's source for municipal scrapers feeding [boardagendas.metro.net](https://boardagendas.metro.net).
 
-## Development
+For guides to development, debugging, deployment, and more, [consult the documentation](https://metro-records.github.io/scrapers-lametro/)!
 
-### Making changes to this repository
+## Updating the documentation
 
-Make your changes to the scraper code here.
-
-Merge your PR to push to `main` and publish a `main` tag of the scraper image.
-
-To publish a `production` tag of the scraper image, sync the `main` branch with the
-`deploy` branch:
+To make changes to the documentation, [install Quarto](https://quarto.org/docs/get-started/).
+Then, run the following in your terminal:
 
 ```bash
-git push origin main:deploy
+quarto preview docs
 ```
 
-### Scheduling
+Make your changes to the `.qmd` files in the `docs/` directory. They will be automatically
+reflected in your local version of the docs.
+
+For more on authoring docs with Quarto, see [their Getting Started guide](https://quarto.org/docs/get-started/authoring/text-editor.html) and [documentation](https://quarto.org/docs/guide/).
 
-The LA Metro scrapers are scheduled via Airflow. The production Airflow instance
-is located at [la-metro-dashboard.datamade.us](https://la-metro-dashboard.datamade.us/).
-DataMade staff can find login credentials under the Metro support email in
-LastPass. The underlying code is in the [`datamade/la-metro-dashboard` repository](https://github.com/datamade/la-metro-dashboard).
+The GitHub Pages site will rebuild automatically when your documentation changes are
+merged into `main`.
\ No newline at end of file
diff --git a/docker-compose.councilmatic.yml b/docker-compose.councilmatic.yml
new file mode 100644
index 0000000..ab5a9b7
--- /dev/null
+++ b/docker-compose.councilmatic.yml
@@ -0,0 +1,26 @@
+version: '2.4'
+
+services:
+  scrapers:
+    image: scrapers-lametro
+    container_name: scrapers-lametro
+    build: .
+    stdin_open: true
+    tty: true
+    volumes:
+      - .:/app
+    environment:
+      # Populate the local Councilmatic database
+      DATABASE_URL: postgresql://postgres:postgres@postgres:5432/lametro
+      DJANGO_SETTINGS_MODULE: pupa.settings
+      OCD_DIVISION_CSV: "/app/lametro/divisions.csv"
+    command: pupa update lametro
+    # Connect the scraper container to the app network
+    networks:
+      - app_net
+
+networks:
+  # Define connection to the app's Docker network
+  app_net:
+    name: la-metro-councilmatic_default
+    external: true
diff --git a/docs/.gitignore b/docs/.gitignore
new file mode 100644
index 0000000..67f9f12
--- /dev/null
+++ b/docs/.gitignore
@@ -0,0 +1,2 @@
+/.quarto/
+/_site/
\ No newline at end of file
diff --git a/docs/_quarto.yml b/docs/_quarto.yml
new file mode 100644
index 0000000..bc6bc73
--- /dev/null
+++ b/docs/_quarto.yml
@@ -0,0 +1,27 @@
+project:
+  type: website
+
+website:
+  title: "LA Metro Scrapers Documentation"
+  sidebar:
+    style: "docked"
+    search: true
+    contents: auto
+  tools:
+    - icon: github
+      menu:
+        - text: Source Code
+          href: https://github.com/Metro-Records/scrapers-lametro
+        - text: Issue Tracker
+          href: https://github.com/Metro-Records/scrapers-lametro/issues
+        - text: Legacy Issue Tracker
+          href: https://github.com/opencivicdata/scrapers-us-municipal/issues?q=is%3Aissue+metro
+
+format:
+  html:
+    theme: litera
+    css: styles.css
+    toc: true
+
+
+
diff --git a/docs/debugging.qmd b/docs/debugging.qmd
new file mode 100644
index 0000000..b7b87e1
--- /dev/null
+++ b/docs/debugging.qmd
@@ -0,0 +1,224 @@
+---
+title: "Debugging"
+order: 1
+---
+
+# Debugging data issues
+
+### DON'T PANIC!
+
+Many issues can arise in the Metro galaxy, from the shallowest part of the frontend to the deepest depths of the backend. However, these issues generally fall into [two broad categories](#categories-of-failure):
+
+- [Data is missing](#data-is-missing)
+- [Data is incorrect](#data-is-incorrect)
+- Appendices
+  - [More on Airflow](#more-on-airflow)
+  - [View an entity in the Councilmatic database](#view-an-entity-in-the-councilmatic-database)
+  - [Inspect the scraper logs](#inspect-the-scraper-logs)
+
+## Categories of failure
+
+This section expounds on common culprits in our two categories of failure: missing data and incorrect data. The culprits are ordered from most to least likely, so we suggest moving through the category relevant to your problem in order until you identify the issue.
+
+### Data is missing
+
+#### Scrape failures
+
+Our scrapers are brittle by design, i.e., they will generally fail if they try to ingest data that is formatted incorrectly. If there has been a scrape failure, there should be a corresponding exception in [the `scrapers-lametro` project in Sentry](https://datamade.sentry.io/projects/scrapers-lametro/?project=4504447849201664).
+
+Common exceptions include `OperationalError`, which indicates that something went awry with the server (e.g., not enough disk space), and `ScraperError`, which indicates a particular issue with one of the scrapers.
+
+If you don't see an issue in Sentry, but you have reason to believe there was an error (or that our integration with Sentry is faulty), you can [follow our documentation for finding errors in the scraper logs](#inspect-the-scraper-logs).
+
+If there is not an error in Sentry or in the scraper logs, then the scrape ran.
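+
+As a minimal, hypothetical sketch of what "brittle by design" means in practice (this is
+not the actual scraper code, and the expected status values are invented for the example),
+a parse step raises rather than guessing when source data looks wrong, so the problem
+surfaces in Sentry and the logs instead of landing silently in the database:
+
+```python
+# Hypothetical sketch of the fail-fast pattern described above.
+def parse_agenda_status(api_event: dict) -> str:
+    expected = {"Final", "Draft"}  # illustrative values only
+    status = api_event.get("EventAgendaStatusName")
+    if status not in expected:
+        # Raising here fails the scrape loudly, producing the kind of
+        # exception you would then find in Sentry or the scraper logs.
+        raise ValueError(
+            f"Unexpected agenda status {status!r} for event "
+            f"{api_event.get('EventId')}"
+        )
+    return status
+```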
+
+#### Inaccurate timestamps in the Legistar API
+
+If scrapes are running, by far the most common source of issues is the scraper failing to capture changes to a bill or event. The root issue is that we rely on timestamps that should indicate an update to determine which bills and events to scrape. In reality, these timestamps do not always update when a change is made to an event or bill in Legistar. We have a couple of strategies to get around this:
+
+1. Generally, Metro staff post agendas the Friday before meetings occur. Thus, [on Fridays](https://github.com/datamade/la-metro-dashboard/blob/ac416e5e03f6a97fb9b0c6112093c679cefb0d1c/dags/constants.py#L41-L59), we scrape all events and bills at the top of every hour.
+2. When we run windowed scrapes, we scrape all events and bills with timestamps within _or after_ the given window. This is because upcoming events and bills are the ones that are most likely to change. For example, a scrape with a one-hour window will scrape any events that have changed in the past hour, plus any events with a future start date. Here is [a complete list of the timestamps we consider](https://docs.google.com/document/d/1LjZ61g4s-eiP-aWo4GVoOv27SF7iZ8npLV6Gg9b_BsI/edit?usp=sharing) when scraping events and bills.
+
+Taken together, these strategies _should_ ensure that any change appears on the Metro site within an hour of being made; however, edge cases can happen!
+
+To determine whether timestamps are causing the problem, follow [our documentation for viewing an entity](#view-an-entity-in-the-councilmatic-database) to ascertain when an entity was last updated in the Councilmatic database and compare it to [the timestamps we consider](https://docs.google.com/document/d/1LjZ61g4s-eiP-aWo4GVoOv27SF7iZ8npLV6Gg9b_BsI/edit?usp=sharing) in windowed scrapes. If the entity has not been updated in Councilmatic since the latest timestamp in the Legistar API, trigger a broader scrape. (A sketch of this comparison follows at the end of this section.)
+
+#### Deeper problems
+
+If the scrapes and ETL pipeline are running as expected, but data is missing, then there is a deeper issue. A good first question: What are the most recent changes in the scraper codebase? Could this have caused unusual behavior?
+
+The scrapers work through the cooperation of several repos, and the bug fix may require investigating one or more of these repos.
+
+- [`Metro-Records/scrapers-lametro`](https://github.com/Metro-Records/scrapers-lametro) contains Metro-specific code for the `Bill`, `Event`, and `Person` scrapers. If you need to patch the scraper code, create a PR against this repo.
+- [`Metro-Records/la-metro-dashboard`](https://github.com/Metro-Records/la-metro-dashboard) is the Airflow app that schedules scrapes and the scripts that define them. If you need to change the scheduling of scrapes, create a PR against this repo.
+- [`opencivicdata/python-legistar-scraper/tree/master/legistar`](https://github.com/opencivicdata/python-legistar-scraper/tree/master/legistar) contains the `LegistarScraper` and `LegistarAPIScraper` variants, from which the Metro scrapers inherit. If you need to patch the Legistar scraping code, create a PR against this repo.
+- All scrapers depend on [the `pupa` framework](https://github.com/opencivicdata/pupa) for scraping and importing data using the OCD standard. In the unlikely event that you need to patch `pupa`, create a fork, then submit a PR against this repo.
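+
+Here is a minimal sketch of that timestamp comparison, runnable in the Councilmatic Django
+shell (see the appendices below for how to open one). It assumes the `requests` library is
+available, and `EventLastModifiedUtc` is our best guess at the relevant field name in the
+Legistar payload; check the entity's "api" source for the exact fields.
+
+```python
+# Hypothetical sketch: compare when Councilmatic last saved an event
+# against when Legistar last modified it.
+import requests
+
+from lametro.models import LAMetroEvent
+
+event = LAMetroEvent.objects.get(slug='regular-board-meeting-036b08c9a3f3')
+api_source = event.sources.get(note='api')
+
+legistar_event = requests.get(api_source.url).json()
+
+print('Councilmatic last updated:', event.updated_at)
+print('Legistar last modified:', legistar_event['EventLastModifiedUtc'])
+
+# If Legistar's timestamp is newer than Councilmatic's last update, the
+# windowed scrape likely missed the change, and a broader scrape is in order.
+```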
+
+### Data is incorrect
+
+#### Legistar contains the wrong data
+
+Data issues can occur when the Legistar API or web interface displays the wrong information. This generally happens when Metro staff enters information that is incorrect or is organized differently than our scraper expects.
+
+If Metro reports that data is incorrect, follow [our documentation for viewing an entity](#view-an-entity-in-the-councilmatic-database) to inspect the problematic object and view its sources in the Legistar API. If the erroneous data matches the API sources, report the error to Metro and wait for them to resolve the issue or clarify how to interpret the data.
+
+#### Metro has deleted data from Legistar
+
+`pupa`, our scraping framework, [doesn't know how to identify information that has been deleted](https://github.com/opencivicdata/pupa/issues/295). Sometimes, we scrape information that Metro later removes.
+
+If this happens for a bill, event, or membership, shell into the server and remove the erroneous entity through
+the ORM. N.b., it's easiest to get at this through the relevant person, e.g.,
+
+```python
+>>> from lametro.models import *
+>>> LAMetroPerson.objects.get(family_name='Mitchell')
+<LAMetroPerson: ...>
+>>> LAMetroPerson.objects.get(family_name='Mitchell').memberships.filter(organization__name__endswith='Committee')
+<QuerySet [<Membership: ...>, <Membership: ...>]>
+>>> LAMetroPerson.objects.get(family_name='Mitchell').memberships.filter(organization__name__endswith='Committee').count()
+2
+>>> LAMetroPerson.objects.get(family_name='Mitchell').memberships.filter(organization__name__endswith='Committee').delete()
+(2, {'lametro.Membership': 2})
+```
+
+#### Deeper issues
+
+Sometimes, there is not a problem with the data, but rather there is an error in how it is displayed in the Metro Councilmatic instance. This class of issues is generally very rare. As with deeper issues with the scrapers, a helpful first question is: What are the most recent changes, and could they be the source of the problem you're seeing?
+
+Base models and view logic are defined in [django-councilmatic](https://github.com/datamade/django-councilmatic/tree/2.5). Metro generally uses the most recent release of the 2.5.x series, so be sure you're consulting the `2.5` branch of `django-councilmatic` when you start spelunking.
+
+Metro makes a number of customizations to these models and views in [the `la-metro-councilmatic` repository](https://github.com/datamade/la-metro-councilmatic). Consult [the most recent release](https://github.com/datamade/la-metro-councilmatic/releases) to view the code that's deployed to production.
+
+## Inspecting data and logs
+
+### More on Airflow
+
+The Metro data pipeline (both scrapes and the management commands that perform subsequent ETL) is scheduled and run by our Metro Airflow instance, located at [https://la-metro-dashboard-heroku-prod.datamade.us](https://la-metro-dashboard-heroku-prod.datamade.us). Consult the DataMade BitWarden for login credentials. (Search `metro-support@datamade.us` in our shared folder!)
+
+The dashboard lives on GitHub, [here](https://github.com/Metro-Records/la-metro-dashboard).
+
+Apache maintains thorough documentation of [core concepts](http://airflow.apache.org/docs/stable/concepts.html), as well as [navigating the UI](http://airflow.apache.org/docs/stable/ui.html). If you've never used Airflow before, these are great resources to start with!
+
+### Get links to source data from Legistar
+
+If there is an issue with a particular entity, view that entity's detail page on the board
+agendas site ([https://boardagendas.metro.net](https://boardagendas.metro.net)).
+Links to source data from Legistar will be logged to your browser's developer console.
+
+### View an entity in the Councilmatic database
+
+#### Retrieve the entity in the Django shell
+
+Shell into a running instance of LA Metro Councilmatic using either the Heroku CLI:
+
+```bash
+heroku login
+heroku ps:exec --app=lametro-councilmatic-production
+```
+
+or Heroku's web-based console, which you can open from the app's dashboard on Heroku.
+
+Once you have a shell, start the Django shell with `python manage.py shell`.
+
+Retrieve the problematic entity using its slug, which you can find in the URL of
+its detail page.
+
+```python
+# In the Django shell
+>>> from lametro.models import *
+>>> entity = LAMetroEvent.objects.get(slug='regular-board-meeting-036b08c9a3f3')
+```
+
+You can use the same ORM query to retrieve any entity. Simply swap out `LAMetroEvent` for the correct model and, of course, update the slug.
+
+| Entity | Model |
+| -- | -- |
+| Person | LAMetroPerson |
+| Committee | LAMetroOrganization |
+| Bill | LAMetroBill |
+| Event | LAMetroEvent |
+
+#### View useful information
+
+Assuming you have retrieved the entity as illustrated in the previous step, you can view its last updated date like this:
+
+```python
+>>> entity.updated_at
+datetime.datetime(2020, 3, 25, 0, 47, 3, 471572, tzinfo=<UTC>)
+```
+
+You can also view its sources like this:
+
+```python
+>>> import pprint
+>>> pprint.pprint([(source.note, source.url) for source in entity.sources.all()])
+[('api', 'http://webapi.legistar.com/v1/metro/events/1384'),
+ ('api (sap)', 'http://webapi.legistar.com/v1/metro/events/1493'),
+ ('web',
+  'https://metro.legistar.com/MeetingDetail.aspx?LEGID=1384&GID=557&G=A5FAA737-A54D-4A6C-B1E8-FF70F765FA94'),
+ ('web (sap)',
+  'https://metro.legistar.com/MeetingDetail.aspx?LEGID=1493&GID=557&G=A5FAA737-A54D-4A6C-B1E8-FF70F765FA94')]
+```
+
+Events have Spanish-language sources (e.g., "api (sap)") as well. In initial debugging, focus on the "api" and "web" sources by visiting these links and checking that the data in Legistar appears as expected.
+
+### Inspect the scraper logs
+
+::: {.callout-warning}
+The timestamps in the scraper logs are in Chicago local time!
+:::
+
+If you have reason to believe the scrape has failed, but you don't see an exception in Sentry, you can make doubly sure by consulting the logs for the scraping DAGs [in the dashboard](https://la-metro-dashboard-heroku-prod.datamade.us).
+
+#### Confirming the scrape ran
+
+To confirm the scrape ran, click the name of a DAG to pull up the tree view, which will display the status of the last 25 DAG runs.
+
+![windowed_bill_scraping DAG tree view](https://i.imgur.com/9NDdEpy.png)
+
+Note that the scraping DAGs employ [branch operators](https://airflow.apache.org/docs/apache-airflow/stable/core-concepts/dags.html#concepts-branching) to determine what kind of scraping task to run. Be sure to look closely to verify that the task you're expecting is green (succeeded), not pink (skipped).
+
+#### Verifying whether an error occurred
+
+The dashboard has DAGs corresponding to the full overnight scrape, windowed scrapes, fast full scrapes, and hourly processing. To view the logs associated with tasks in a particular DAG, click the name of the DAG to go to the tree view, which shows you the last 25 runs of the DAG. Then, click the box associated with the task for which you'd like to view the logs. This will pop open a window containing, among other things, a link to the logs. Click the link to view the logs!
+
+![View the logs for the most recent `convert_attachment_text` run](https://i.imgur.com/MSTc4O0.gif)
+
+If there has been an error, the logs for the implicated task should contain something like this:
+
+```
+import memberships...
+Traceback (most recent call last):
+  File "/home/datamade/.virtualenvs/opencivicdata/bin/pupa", line 8, in <module>
+    sys.exit(main())
+  File "/home/datamade/.virtualenvs/opencivicdata/lib/python3.5/site-packages/pupa/cli/__main__.py", line 68, in main
+    subcommands[args.subcommand].handle(args, other)
+  File "/home/datamade/.virtualenvs/opencivicdata/lib/python3.5/site-packages/pupa/cli/commands/update.py", line 278, in handle
+    return self.do_handle(args, other, juris)
+  File "/home/datamade/.virtualenvs/opencivicdata/lib/python3.5/site-packages/pupa/cli/commands/update.py", line 329, in do_handle
+    report['import'] = self.do_import(juris, args)
+  File "/home/datamade/.virtualenvs/opencivicdata/lib/python3.5/site-packages/pupa/cli/commands/update.py", line 216, in do_import
+    report.update(membership_importer.import_directory(datadir))
+  File "/home/datamade/.virtualenvs/opencivicdata/lib/python3.5/site-packages/pupa/importers/base.py", line 197, in import_directory
+    return self.import_data(json_stream())
+  File "/home/datamade/.virtualenvs/opencivicdata/lib/python3.5/site-packages/pupa/importers/base.py", line 234, in import_data
+    obj_id, what = self.import_item(data)
+  File "/home/datamade/.virtualenvs/opencivicdata/lib/python3.5/site-packages/pupa/importers/base.py", line 258, in import_item
+    obj = self.get_object(data)
+  File "/home/datamade/.virtualenvs/opencivicdata/lib/python3.5/site-packages/pupa/importers/memberships.py", line 36, in get_object
+    return self.model_class.objects.get(**spec)
+  File "/home/datamade/.virtualenvs/opencivicdata/lib/python3.5/site-packages/django/db/models/manager.py", line 82, in manager_method
+    return getattr(self.get_queryset(), name)(*args, **kwargs)
+  File "/home/datamade/.virtualenvs/opencivicdata/lib/python3.5/site-packages/django/db/models/query.py", line 412, in get
+    (self.model._meta.object_name, num)
+opencivicdata.core.models.people_orgs.MultipleObjectsReturned: get() returned more than one Membership -- it returned 2!
+Sentry is attempting to send 1 pending error messages
+Waiting up to 10 seconds
+Press Ctrl-C to quit
+05/02/2020 00:40:09 INFO pupa: save jurisdiction New York City as jurisdiction_ocd-jurisdiction-country:us-state:ny-place:new_york-government.json
+05/02/2020 00:40:09 INFO pupa: save organization New York City Council as organization_62bd3c8a-8c37-11ea-b678-122a3d729da3.json
+05/02/2020 00:40:09 INFO pupa: save post District 1 as post_62bdb9c6-8c37-11ea-b678-122a3d729da3.json
+```
+
+The timestamps at the bottom correspond to the scrape _after_ the error occurred, so you can use them to determine when the broken scrape occurred and whether it's relevant to the missing data.
diff --git a/docs/deployment.qmd b/docs/deployment.qmd
new file mode 100644
index 0000000..0c8dc24
--- /dev/null
+++ b/docs/deployment.qmd
@@ -0,0 +1,51 @@
+---
+title: "Deployment"
+order: 3
+---
+
+We deploy the scrapers as tagged Docker images. Tagged images are built
+automatically and [published to the GitHub Container Registry](https://github.com/Metro-Records/scrapers-lametro/pkgs/container/scrapers-lametro)
+by [the `publish_image.yml` workflow](https://github.com/Metro-Records/scrapers-lametro/blob/main/.github/workflows/publish_image.yml).
+
+Deployment can be summarized as follows:
+
+| On push to... | Build tag... | In use by... |
+| - | - | - |
+| `main` | `main` | [https://la-metro-dashboard-heroku.datamade.us/home](https://la-metro-dashboard-heroku.datamade.us/home) |
+| `deploy` | `deploy` | [https://la-metro-dashboard-heroku-prod.datamade.us/home](https://la-metro-dashboard-heroku-prod.datamade.us/home) |
+
+## When should I deploy the scrapers?
+
+A `main` tag will be built automatically when a pull request is merged.
+
+::: {.callout-caution}
+If you have made a significant change to a scraper, it is recommended that you run that
+scrape on the staging Airflow instance before deploying the change to the production
+Airflow instance.
+:::
+
+A `deploy` tag will be built on pushes to the `deploy` branch. Once you are ready to
+deploy your change to production, run:
+
+```bash
+git push origin main:deploy
+```
+
+### A note on dependencies
+
+We install our `pupa` and `legistar` dependencies from the `master` branch of their
+respective repositories, i.e., `pip` will not automatically recognize changes to
+these libraries.
+
+::: {.callout-warning}
+If you have made a change to `pupa` or `legistar`, you must rebuild the scraper
+images to deploy it.
+:::
+
+To rebuild the scraper images without making a change to the scraper code, check out
+the main branch, then run:
+
+```bash
+git commit --allow-empty -m "Rebuild scrapers"
+git push origin main && git push origin main:deploy
+```
diff --git a/docs/index.qmd b/docs/index.qmd
new file mode 100644
index 0000000..7007750
--- /dev/null
+++ b/docs/index.qmd
@@ -0,0 +1,41 @@
+---
+title: "LA Metro Scrapers Documentation"
+---
+
+Welcome to the documentation for the LA Metro Scrapers! Here, you'll find
+information about local development and deployment, as well as an overview of
+each scraper (and the decisions that we've made about them).
+
+## How do they work?
+
+At a high level, the scrapers retrieve information from Metro instances of
+[the Legistar interface, also known as InSite](https://metro.legistar.com/Legislation.aspx),
+and [the Legistar API](https://webapi.legistar.com/) (endpoints at `https://webapi.legistar.com/v1/metro/*`).
+
+See the relevant scraper documentation for more detail about where information
+comes from and how it is parsed.
+
+## How are they run?
+
+The scrapers are run by Airflow and populate LA Metro Councilmatic instances,
+outlined below.
+
+| Scraper image tag | Airflow instance | Metro instance |
+| - | - | - |
+| `main` | [https://la-metro-dashboard-heroku.datamade.us/home](https://la-metro-dashboard-heroku.datamade.us/home) | [https://la-metro-councilmatic-staging.herokuapp.com/](https://la-metro-councilmatic-staging.herokuapp.com/) |
+| `deploy` | [https://la-metro-dashboard-heroku-prod.datamade.us/home](https://la-metro-dashboard-heroku-prod.datamade.us/home) | [https://boardagendas.metro.net](https://boardagendas.metro.net) |
+
+See [Deployment](deployment.qmd) for more on how scraper image tags are built.
+
+## What do they depend on?
+
+The scrapers have a couple of key dependencies.
+
+- [`pupa`](https://github.com/opencivicdata/pupa) is the framework for scraping and
+organizing data according to the Open Civic Data standard. Our scrapers are subclasses of
+`pupa.Scraper`, and we use the `pupa` CLI to run scrapes.
+  - See [Useful pupa commands](local-development.qmd#useful-pupa-commands)
+    for more on the CLI.
+- [`python-legistar-scraper`](https://github.com/opencivicdata/python-legistar-scraper/) is
+a Python wrapper for InSite and the Legistar API that we use to retrieve data.
+Our scrapers also inherit from the relevant `LegistarScraper` subclasses in this library.
\ No newline at end of file
diff --git a/docs/local-development.qmd b/docs/local-development.qmd
new file mode 100644
index 0000000..b98c5b2
--- /dev/null
+++ b/docs/local-development.qmd
@@ -0,0 +1,134 @@
+---
+title: "Local development"
+order: 2
+---
+
+## Running the scrapers
+
+The scrapers are bundled with a `docker-compose.yml` file that will allow
+you to run them on your machine.
+
+To scrape all recently updated data, run:
+
+```bash
+docker compose run --rm scrapers
+```
+
+To run a particular scrape or pass arguments to `pupa`, append your command
+to the end of the previous command, like:
+
+```bash
+# Scrape board reports from the last week
+docker compose run --rm scrapers pupa update lametro bills window=7
+```
+
+### Populate a local Councilmatic instance
+
+If you'd like to scrape data into a Councilmatic instance for easy viewing, first
+run your local instance of LA Metro Councilmatic as normal.
+
+Then, in your local scraper repository, run your scrapes using the `docker-compose.councilmatic.yml` file:
+
+```bash
+docker compose -f docker-compose.councilmatic.yml run --rm scrapers
+```
+
+### Useful pupa commands
+
+#### pupa update
+
+```bash
+usage: pupa update [-h] [--scrape] [--import] [--nonstrict] [--fastmode] [--datadir SCRAPED_DATA_DIR] [--cachedir CACHE_DIR]
+                   [-r SCRAPELIB_RPM] [--timeout SCRAPELIB_TIMEOUT] [--no-verify] [--retries SCRAPELIB_RETRIES]
+                   [--retry_wait SCRAPELIB_RETRY_WAIT_SECONDS]
+                   module
+
+update pupa data
+
+positional arguments:
+  module                path to scraper module
+
+options:
+  -h, --help            show this help message and exit
+  --scrape              only run scrape post-scrape step
+  --import              only run import post-scrape step
+  --nonstrict           skip validation on save
+  --fastmode            use cache and turn off throttling
+  --datadir SCRAPED_DATA_DIR
+                        data directory
+  --cachedir CACHE_DIR  cache directory
+  -r SCRAPELIB_RPM, --rpm SCRAPELIB_RPM
+                        scraper rpm
+  --timeout SCRAPELIB_TIMEOUT
+                        scraper timeout
+  --no-verify           skip tls verification
+  --retries SCRAPELIB_RETRIES
+                        scraper retries
+  --retry_wait SCRAPELIB_RETRY_WAIT_SECONDS
+                        scraper retry wait
+```
+
+::: {.callout-tip}
+Running a scrape with `--fastmode` will disable request throttling, resulting
+in a faster scrape. Great for local development, especially for narrow scrapes,
+e.g.,
+
+```bash
+pupa update --fastmode lametro events window=1
+```
+:::
+
+##### Additional arguments
+
+- `bills`
+  - `window` (default: 28) - How far back to scrape, in days. Scrapes all matters
+    if 0.
+  - `matter_ids` (default: None) - Comma-separated list of MatterIds from the
+    Legistar API. Scrapes all matters updated within the window if None.
+- `events`
+  - `window` (default: None) - How far back to scrape, in days.
+
+##### Examples
+
+```bash
+# Scrape board reports from the past week
+pupa update lametro bills window=7
+
+# Scrape specific board reports
+pupa update lametro bills matter_ids=10340,10084
+
+# Scrape events from past 30 days
+pupa update lametro events window=30
+```
+
+#### pupa clean
+
+```bash
+usage: pupa clean [-h] [--window WINDOW] [--max MAX] [--report] [--yes]
+
+Removes database objects that haven't been seen in recent scrapes
+
+options:
+  -h, --help       show this help message and exit
+  --window WINDOW  objects not seen in this many days will be deleted from the database
+  --max MAX        max number of objects to delete without triggering failsafe
+  --report         generate a report of what objects this command would delete without making any changes to the database
+  --yes            assumes an answer of 'yes' to all interactive prompts
+```
+
+##### Examples
+
+```bash
+# Log which objects will be deleted without making changes to the database
+pupa clean --report
+
+# Remove objects that haven't been seen for 30 days
+pupa clean --window 30
+
+# Remove a maximum of 100 objects
+pupa clean --max 100
+```
+
+## Writing tests
+
+tktktk
diff --git a/docs/scrapers/bills.qmd b/docs/scrapers/bills.qmd
new file mode 100644
index 0000000..d936104
--- /dev/null
+++ b/docs/scrapers/bills.qmd
@@ -0,0 +1,28 @@
+---
+title: "Bill Scraper"
+order: 3
+---
+
+- Located in `lametro/bills.py`
+- Creates board reports and their associated attachments, sponsorships, votes,
+relations, and topics
+- Further reading: [https://open-civic-data.readthedocs.io/en/latest/proposals/0006.html](https://open-civic-data.readthedocs.io/en/latest/proposals/0006.html)
+
+::: {.callout-tip}
+The following terms all refer to the same thing:
+
+- Matter (Legistar API)
+- Bill (Open Civic Data universe)
+- Board report (Legistar UI, Councilmatic universe)
+:::
+
+## Key issues
+
+- Board reports are added to the Legistar API before they are published to InSite. We
+need to determine which are "private" in order to suppress them in Councilmatic.
+  - [Why do we need to scrape private bills?](https://github.com/Metro-Records/la-metro-councilmatic/pull/394#issuecomment-461922234)
+  - [Limit private scrape](https://github.com/opencivicdata/scrapers-us-municipal/pull/262)
+- Our windowed scrapes rely on timestamps from the Legistar API; however, the timestamps
+are not always updated when changes are made.
+  - [Metro scraper did not scrape EventAgendaStatusName](https://github.com/opencivicdata/scrapers-us-municipal/issues/239)
+  - [Metro: Scrape matters updated within a window, as well as matters related to events updated within a window](https://github.com/opencivicdata/scrapers-us-municipal/pull/344)
\ No newline at end of file
diff --git a/docs/scrapers/events.qmd b/docs/scrapers/events.qmd
new file mode 100644
index 0000000..3b37c05
--- /dev/null
+++ b/docs/scrapers/events.qmd
@@ -0,0 +1,29 @@
+---
+title: "Event Scraper"
+order: 4
+---
+
+- Located in `lametro/events.py`
+- Creates events
+
+::: {.callout-tip}
+The following terms all refer to the same thing:
+
+- Event (Legistar API, Open Civic Data universe, Councilmatic models)
+- Meeting (Legistar UI, Councilmatic UI)
+:::
+
+## Key issues
+
+- Metro streams audio in both English and Spanish. They cannot associate multiple
+broadcast links with one event in Legistar, so they create two nearly identical events that
+we merge during scrapes.
+  - [Import and display the Spanish language audio](https://github.com/Metro-Records/la-metro-councilmatic/issues/263)
+  - ["Ver en español" link not visible](https://github.com/Metro-Records/la-metro-councilmatic/issues/393)
+- The board approves minutes for their previous meeting each time they meet. Sometimes,
+these minutes are explicitly associated with the event. When they aren't, we try to guess
+the approved minutes file.
+  - [Find approved minutes](https://github.com/opencivicdata/scrapers-us-municipal/pull/291)
+  - [Upgrades to the LA Metro scraper](https://github.com/opencivicdata/scrapers-us-municipal/pull/299)
+  - [Event scrape: If there is more than one minutes file, which one should we use?](https://github.com/Metro-Records/la-metro-councilmatic/issues/742)
+  - [Account for multiple meeting minutes](https://github.com/Metro-Records/scrapers-lametro/issues/16)
\ No newline at end of file
diff --git a/docs/scrapers/jurisdiction.qmd b/docs/scrapers/jurisdiction.qmd
new file mode 100644
index 0000000..a5b4011
--- /dev/null
+++ b/docs/scrapers/jurisdiction.qmd
@@ -0,0 +1,52 @@
+---
+title: "Jurisdiction Scraper"
+order: 1
+---
+
+- Located in `lametro/__init__.py`
+- Creates the jurisdiction, organizations, and posts
+  - Specifically, creates organizations that are not coded as the board itself or a committee
+    in the Legistar API, as bodies meeting these parameters are created in the people scraper
+- Creates legislative sessions
+
+## Core concepts
+
+### Division
+
+- Political geography
+- "Jurisdictions exist within a division, while Posts can represent a division"
+- Divisions relevant to LA Metro:
+  - City of Los Angeles
+  - [Los Angeles County supervisorial districts](https://experience.arcgis.com/experience/159e4b53494e47fe82bacd8016065843)
+  - [Statutorily defined transit sectors](https://boardagendas.metro.net/static/pdf/CitySelectionCommitte-MTA-Rules-and-Regulations.pdf)
+  - [Caltrans (California Department of Transportation) District](https://dot.ca.gov/caltrans-near-me/district-7)
+- Further reading: [https://open-civic-data.readthedocs.io/en/latest/proposals/0002.html](https://open-civic-data.readthedocs.io/en/latest/proposals/0002.html)
+
+### Jurisdiction
+
+- Logical unit of governance
+- Example: LA Metro
+- Further reading: [https://open-civic-data.readthedocs.io/en/latest/proposals/0003.html](https://open-civic-data.readthedocs.io/en/latest/proposals/0003.html)
+
+### Post
+
+- Position in organization
+- Posts define the core positions within an organization, and can optionally be associated with a Division, i.e., the political geography they represent
+- Examples:
+  - Post associated with a division: Appointee of the Mayor of the City of Los Angeles on the Board of Directors representing the City of Los Angeles
+  - Post not associated with a division: Chair of the Board of Directors
+- Further reading: [https://open-civic-data.readthedocs.io/en/latest/proposals/0005.html](https://open-civic-data.readthedocs.io/en/latest/proposals/0005.html)
+
+### Membership
+
+- "A relationship between a Person and an Organization, possibly including a Post"
+  - Committee Memberships are not associated with a Post because committees do not have a defined membership structure
+- Memberships are created by the [people scraper](/scrapers/people.html)
+- Further reading: [https://open-civic-data.readthedocs.io/en/latest/proposals/0005.html](https://open-civic-data.readthedocs.io/en/latest/proposals/0005.html)
+
+## Key issues
+
+- Legislative sessions must be added manually every June. The scraper breaks if they are not
+added before legislation is created in the new session.
+  - [Write a self-sufficient method for detecting LA Metro legislative session](https://github.com/opencivicdata/scrapers-us-municipal/issues/161)
+  - [Update legislative session automatically](https://github.com/Metro-Records/scrapers-lametro/issues/22)
diff --git a/docs/scrapers/people.qmd b/docs/scrapers/people.qmd
new file mode 100644
index 0000000..2b8cc69
--- /dev/null
+++ b/docs/scrapers/people.qmd
@@ -0,0 +1,43 @@
+---
+title: "Board Member and Legislative Body Scrapers"
+order: 2
+---
+
+- Located in `lametro/people.py`
+- Creates the board and its committees (that is, bodies of type "Committee" or
+"Independent Taxpayer Oversight Committee")
+- Creates members of the above bodies and membership objects for each of
+their terms
+- Further reading: [https://open-civic-data.readthedocs.io/en/latest/proposals/0005.html](https://open-civic-data.readthedocs.io/en/latest/proposals/0005.html)
+
+::: {.callout-tip}
+The following terms all refer to the same thing:
+
+**Board of Directors, Committee**
+
+- Body (Legistar API)
+- Meeting body (Legistar UI)
+- Organization (Open Civic Data universe, Councilmatic models)
+- Committee (Legistar UI, Councilmatic UI)
+
+**Board member**
+
+- Member (Legistar UI)
+- Person/s (Legistar API, Open Civic Data universe, Councilmatic models)
+- Board member (Councilmatic UI)
+
+**Membership**
+
+- Office record (Legistar API)
+- Membership (Open Civic Data models, Councilmatic models)
+- Term (pupa)
+:::
+
+## Key issues
+
+- Memberships are tricky! Sometimes, they are updated in ways that don't resolve correctly,
+introducing duplicates. Other times, they are deleted from Legistar, which is a challenge for
+us to detect.
+  - [Consider start date in addition to, or instead of, end date when importing memberships](https://github.com/opencivicdata/pupa/issues/303)
+  - [Committees: Old Committee members showing](https://github.com/Metro-Records/la-metro-councilmatic/issues/746)
+  - [Find a mechanism to handle data deleted from the source system](https://github.com/opencivicdata/pupa/issues/295)
diff --git a/docs/styles.css b/docs/styles.css
new file mode 100644
index 0000000..2ddf50c
--- /dev/null
+++ b/docs/styles.css
@@ -0,0 +1 @@
+/* css styles */