From 39f0b636c2c53e3ab7bec01a4bc8579ccc32d9e0 Mon Sep 17 00:00:00 2001 From: Serdar Tumgoren Date: Wed, 10 Apr 2024 08:34:10 -0700 Subject: [PATCH] Rework to use Site class per agency. #3 #4 --- clean/ca/san_diego_pd.py | 134 +++++++++++++++++++++------------------ clean/cli.py | 94 +++++++++++++++++++++++++-- clean/runner.py | 33 ++++++++-- clean/utils.py | 16 +++-- 4 files changed, 197 insertions(+), 80 deletions(-) diff --git a/clean/ca/san_diego_pd.py b/clean/ca/san_diego_pd.py index 99a9fe5f..77696bac 100644 --- a/clean/ca/san_diego_pd.py +++ b/clean/ca/san_diego_pd.py @@ -1,72 +1,87 @@ import time from pathlib import Path -import requests from bs4 import BeautifulSoup from .. import utils from ..cache import Cache -def scrape(data_dir=utils.CLEAN_DATA_DIR, cache_dir=utils.CLEAN_CACHE_DIR, throttle=0): - """Scrape San Diego Police Department for SB16/SB1421/AB748 data.""" - cache = Cache(cache_dir) - # This module - mod = Path(__file__) - state_postal = mod.parent.stem - # Use module to construct agency slug, which we'll use downstream - # to create a subdir inside the main cache directory to stash files for this agency - cache_suffix = f"{state_postal}_{mod.stem}" # ca_san_diego_pd - # Page with links to all the SB16/SB1421/AB748 "child" pages containing videos and documents - base_url = "https://www.sandiego.gov/police/data-transparency/mandated-disclosures/sb16-sb1421-ab748" - current_page = 0 - page_count = None # which we don't know until we get the first page - # This will be a list of paths to HTML pages that we cache locally - index_pages = download_index_pages( - base_url, cache, cache_suffix, throttle, page_count, current_page - ) - # TODO: Get the child pages and, you know, actually scrape them - return index_pages - - -def download_index_pages( - base_url, cache, cache_suffix, throttle, page_count, current_page, index_pages=[] -): - """Download index pages for SB16/SB1421/AB748. - - Index pages link to child pages containing videos and - other files related to use-of-force and disciplinary incidents. - - Returns: - List of path to cached index pages +class Site: + """Scrape file metadata and download files for the San Diego Police Department for SB16/SB1421/AB748 data. + + Attributes: + name (str): The official name of the agency """ - # Pause between requests - time.sleep(throttle) - file_stem = base_url.split("/")[-1] - base_file = f"{cache_suffix}/{file_stem}_page{current_page}.html" - # Download the page (if it's not already cached) - cache_path = cache.download(base_file, base_url, "utf-8") - # Add the path to the list of index pages - index_pages.append(cache_path) - # If there's no page_count, we're on first page, so... - if not page_count: - # Extract page count from the initial page - html = cache.read(base_file) - soup = BeautifulSoup(html, "html.parser") - page_count = int( - soup.find_all("li", class_="pager__item")[-1] # last
-            # <li> in the pager
-            .a.attrs["href"]  # the <a> tag inside the <li>
  • # will be ?page=X - .split("=")[-1] # get the X - ) - if current_page != page_count: - # Recursively call this function to get the next page - next_page = current_page + 1 - download_index_pages( - base_url, cache, cache_suffix, throttle, page_count, next_page - ) - return index_pages + name = "San Diego Police Department" + + def __init__(self, data_dir=utils.CLEAN_DATA_DIR, cache_dir=utils.CLEAN_CACHE_DIR): + """Initialize a new instance. + + Args: + data_dir (Path): The directory where downstream processed files/data will be saved + cache_dir (Path): The directory where files will be cached + """ + # Start page contains list of "detail"/child pages with links to the SB16/SB1421/AB748 videos and files + # along with additional index pages + self.base_url = "https://www.sandiego.gov/police/data-transparency/mandated-disclosures/sb16-sb1421-ab748" + self.data_dir = data_dir + self.cache_dir = cache_dir + self.cache = Cache(cache_dir) + # Use module path to construct agency slug, which we'll use downstream + mod = Path(__file__) + state_postal = mod.parent.stem + # to create a subdir inside the main cache directory to stash files for this agency + self.cache_suffix = f"{state_postal}_{mod.stem}" # ca_san_diego_pd + + def scrape_meta(self, throttle=0): + """Gather metadata on downloadable files (videos, etc.).""" + current_page = 0 + page_count = None # which we don't know until we get the first page + # This will be a list of paths to HTML pages that we cache locally + index_pages = self._download_index_pages(throttle, page_count, current_page) + # TODO: Get the child pages and, you know, actually scrape file metadata + return index_pages + + # Helper functions + def _download_index_pages(self, throttle, page_count, current_page, index_pages=[]): + """Download index pages for SB16/SB1421/AB748. + + Index pages link to child pages containing videos and + other files related to use-of-force and disciplinary incidents. + Returns: + List of path to cached index pages + """ + # Pause between requests + time.sleep(throttle) + file_stem = self.base_url.split("/")[-1] + base_file = f"{self.cache_suffix}/{file_stem}_index_page{current_page}.html" + # Construct URL: pages, including start page, have a page GET parameter + target_url = f"{self.base_url}?page={current_page}" + # Download the page (if it's not already cached) + cache_path = self.cache.download(base_file, target_url, "utf-8") + # Add the path to the list of index pages + index_pages.append(cache_path) + # If there's no page_count, we're on first page, so... + if not page_count: + # Extract page count from the initial page + html = self.cache.read(base_file) + soup = BeautifulSoup(html, "html.parser") + page_count = int( + soup.find_all("li", class_="pager__item")[-1] # last
+                # <li> in the pager
+                .a.attrs["href"]  # the <a> tag inside the <li>
  • # will be ?page=X + .split("=")[-1] # get the X + ) + if current_page != page_count: + # Recursively call this function to get the next page + next_page = current_page + 1 + self._download_index_pages(throttle, page_count, next_page, index_pages) + return index_pages + + +""" # LEGACY CODE BELOW # def _scrape_list_page(cache, top_level_urls, base_url, throttle): second_level_urls = {} @@ -105,7 +120,4 @@ def _download_case_files(base_url, second_level_urls): all_case_content_links.append(text) print("_______________________") return - - -if __name__ == "__main__": - scrape() +""" diff --git a/clean/cli.py b/clean/cli.py index 8341ae22..fc5c330b 100644 --- a/clean/cli.py +++ b/clean/cli.py @@ -16,19 +16,98 @@ def cli(): def list_agencies(): """List all available agencies and their slugs. - Agency slugs can then used to with the scrape subcommand + Agency slugs can then used with the scrape-meta and scrape subcommands """ - for state, agency_slugs in utils.get_all_scrapers().items(): - click.echo(f"\n{state.upper()}:") - for slug in sorted(agency_slugs): - click.echo(f" - {state}_{slug}") + for state, agencies in utils.get_all_scrapers().items(): + click.echo(f"{state.upper()}:") + for record in sorted(agencies, key=lambda x: x["slug"]): + click.echo(f" - {record['slug']} ({record['agency']})") message = ( - "\nTo scrape an agency, pass an agency slug (e.g. ca_san_diego_pd) as the " - "argument to the scrape command:\n\n\tclean-scraper scrape ca_san_diego_pd\n\n" + "\nTo scrape an agency's file metadata or download files, pass an " + "agency slug (e.g. ca_san_diego_pd) as the argument to the scrape-meta or scrape subcommands: \n\n" + "\tclean-scraper scrape-meta ca_san_diego_pd\n" + "\tclean-scraper scrape ca_san_diego_pd\n" ) click.echo(message) +@click.command() +@click.argument("agency") +@click.option( + "--data-dir", + default=utils.CLEAN_DATA_DIR, + type=click.Path(), + help="The Path were the results will be saved", +) +@click.option( + "--cache-dir", + default=utils.CLEAN_CACHE_DIR, + type=click.Path(), + help="The Path where results can be cached", +) +@click.option( + "--delete/--no-delete", + default=False, + help="Delete generated files from the cache", +) +@click.option( + "--log-level", + "-l", + default="INFO", + type=click.Choice( + ("DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"), case_sensitive=False + ), + help="Set the logging level", +) +@click.option( + "--throttle", + "-t", + default=0, + help="Set throttle on scraping in seconds. Default is no delay on file downloads.", +) +def scrape_meta( + agency: str, + data_dir: Path, + cache_dir: Path, + delete: bool, + log_level: str, + throttle: int, +): + """ + Command-line interface for generating metadata CSV about CLEAN files. + + The metadata CSV includes the file's name, URL, size, etc. + This file is required for downstream uage by the 'scrape' command, which + relies on it to download the files (in particular the URL for videos and other files). + + AGENCY -- An agency slug (e.g. ca_san_diego_pd) + + Use the 'list' command to see available agencies and their slugs. 
+ + clean-scraper list + """ + # Set higher log-level on third-party libs that use DEBUG logging, + # In order to limit debug logging to our library + logging.getLogger("urllib3").setLevel(logging.ERROR) + + # Local logging config + logging.basicConfig(level=log_level, format="%(asctime)s - %(name)s - %(message)s") + logger = logging.getLogger(__name__) + + # Runner config + data_dir = Path(data_dir) + cache_dir = Path(cache_dir) + runner = Runner(data_dir, cache_dir, throttle) + + # Delete files, if asked + if delete: + logger.info("Deleting files generated from previous scraper run.") + runner.delete() + + # Try running the scraper + runner.scrape_meta(agency) + + @click.command() @click.argument("agency") @click.option( @@ -103,6 +182,7 @@ def scrape( cli.add_command(list_agencies) +cli.add_command(scrape_meta) cli.add_command(scrape) if __name__ == "__main__": diff --git a/clean/runner.py b/clean/runner.py index bccc32dc..0fed57a3 100644 --- a/clean/runner.py +++ b/clean/runner.py @@ -18,8 +18,8 @@ class Runner: The data_dir and cache_dir arguments can specify any location, but it's not a bad idea to have them as sibling directories: - /tmp/CLEAN/working # ETL files - /tmp/CLEAN/exports # Final, polished data e.g CSVs for analysis + /tmp/CLEAN/cache # source files (HTML, videos, CSV of metadata for downloaded files, etc.) + /tmp/CLEAN/exports # transformed files Args: data_dir (str): Path where final output files are saved. @@ -39,13 +39,35 @@ def __init__( self.cache_dir = cache_dir self.throttle = throttle + def scrape_meta(self, agency_slug: str) -> Path: + """Scrape metadata for the provided agency. + + Args: + agency_slug (str): Unique scraper slug composed of two-letter state postal code and agency slug: e.g. ca_san_diego_pd + + Returns: a Path object leading to a CSV file. + """ + # Get the module + state = agency_slug[:2].strip().lower() + slug = agency_slug[3:].strip().lower() + state_mod = import_module(f"clean.{state}.{slug}") + # Run the scrape method + logger.info(f"Scraping {agency_slug}") + site = state_mod.Site(self.data_dir, self.cache_dir) + data_path = site.scrape_meta(throttle=self.throttle) + # Run the path to the data file + logger.info(f"Generated {data_path}") + return data_path + def scrape(self, agency_slug: str) -> Path: """Run the scraper for the provided agency. + This method will operate on the metadata generated by the scrape_meta method. + Args: agency_slug (str): Unique scraper slug composed of two-letter state postal code and agency slug: e.g. ca_san_diego_pd - Returns: a Path object leading to the CSV file. + Returns: a Path object leading to directory where downloaded files are stored. 
""" # Get the module state = agency_slug[:2].strip().lower() @@ -53,9 +75,8 @@ def scrape(self, agency_slug: str) -> Path: state_mod = import_module(f"clean.{state}.{slug}") # Run the scrape method logger.info(f"Scraping {agency_slug}") - data_path = state_mod.scrape( - self.data_dir, self.cache_dir, throttle=self.throttle - ) + site = state_mod.Site(self.data_dir, self.cache_dir) + data_path = site.scrape(throttle=self.throttle) # Run the path to the data file logger.info(f"Generated {data_path}") return data_path diff --git a/clean/utils.py b/clean/utils.py index 72bba260..ff94e721 100644 --- a/clean/utils.py +++ b/clean/utils.py @@ -1,4 +1,5 @@ import csv +import importlib import logging import os from pathlib import Path @@ -135,19 +136,22 @@ def write_dict_rows_to_csv(output_path, headers, rows, mode="w", extrasaction="r def get_all_scrapers(): """Get all the agencies that have scrapers. - Returns: Dictionary of agency slugs grouped by state postal. + Returns: List of dicts containing agency slug and name """ - # Filter out anything not in a state folder - abbrevs = [state.abbr.lower() for state in us.states.STATES] # Get all folders in dir folders = [p for p in Path(__file__).parent.iterdir() if p.is_dir()] + # Filter out anything not in a state folder + abbrevs = [state.abbr.lower() for state in us.states.STATES] state_folders = [p for p in folders if p.stem in abbrevs] scrapers = {} for state_folder in state_folders: state = state_folder.stem - for mod in state_folder.iterdir(): - if not mod.stem.startswith("__init"): - scrapers.setdefault(state, []).append(mod.stem) + for mod_path in state_folder.iterdir(): + if not mod_path.stem.startswith("__init"): + agency_mod = importlib.import_module(f"clean.{state}.{mod_path.stem}") + scrapers.setdefault(state, []).append( + {"slug": f"{state}_{mod_path.stem}", "agency": agency_mod.Site.name} + ) return scrapers