Rework to use Site class per agency. #3 #4
zstumgoren committed Apr 10, 2024
1 parent d208169 commit 39f0b63
Showing 4 changed files with 197 additions and 80 deletions.
134 changes: 73 additions & 61 deletions clean/ca/san_diego_pd.py
@@ -1,72 +1,87 @@
import time
from pathlib import Path

import requests
from bs4 import BeautifulSoup

from .. import utils
from ..cache import Cache


def scrape(data_dir=utils.CLEAN_DATA_DIR, cache_dir=utils.CLEAN_CACHE_DIR, throttle=0):
"""Scrape San Diego Police Department for SB16/SB1421/AB748 data."""
cache = Cache(cache_dir)
# This module
mod = Path(__file__)
state_postal = mod.parent.stem
# Use module to construct agency slug, which we'll use downstream
# to create a subdir inside the main cache directory to stash files for this agency
cache_suffix = f"{state_postal}_{mod.stem}" # ca_san_diego_pd
# Page with links to all the SB16/SB1421/AB748 "child" pages containing videos and documents
base_url = "https://www.sandiego.gov/police/data-transparency/mandated-disclosures/sb16-sb1421-ab748"
current_page = 0
page_count = None # which we don't know until we get the first page
# This will be a list of paths to HTML pages that we cache locally
index_pages = download_index_pages(
base_url, cache, cache_suffix, throttle, page_count, current_page
)
# TODO: Get the child pages and, you know, actually scrape them
return index_pages


def download_index_pages(
base_url, cache, cache_suffix, throttle, page_count, current_page, index_pages=[]
):
"""Download index pages for SB16/SB1421/AB748.
Index pages link to child pages containing videos and
other files related to use-of-force and disciplinary incidents.
Returns:
List of path to cached index pages
class Site:
"""Scrape file metadata and download files for the San Diego Police Department for SB16/SB1421/AB748 data.
Attributes:
name (str): The official name of the agency
"""
# Pause between requests
time.sleep(throttle)
file_stem = base_url.split("/")[-1]
base_file = f"{cache_suffix}/{file_stem}_page{current_page}.html"
# Download the page (if it's not already cached)
cache_path = cache.download(base_file, base_url, "utf-8")
# Add the path to the list of index pages
index_pages.append(cache_path)
# If there's no page_count, we're on first page, so...
if not page_count:
# Extract page count from the initial page
html = cache.read(base_file)
soup = BeautifulSoup(html, "html.parser")
page_count = int(
soup.find_all("li", class_="pager__item")[-1] # last <li> in the pager
.a.attrs["href"] # the <a> tag inside the <li> # will be ?page=X
.split("=")[-1] # get the X
)
if current_page != page_count:
# Recursively call this function to get the next page
next_page = current_page + 1
download_index_pages(
base_url, cache, cache_suffix, throttle, page_count, next_page
)
return index_pages

name = "San Diego Police Department"

def __init__(self, data_dir=utils.CLEAN_DATA_DIR, cache_dir=utils.CLEAN_CACHE_DIR):
"""Initialize a new instance.
Args:
data_dir (Path): The directory where downstream processed files/data will be saved
cache_dir (Path): The directory where files will be cached
"""
# Start page contains list of "detail"/child pages with links to the SB16/SB1421/AB748 videos and files
# along with additional index pages
self.base_url = "https://www.sandiego.gov/police/data-transparency/mandated-disclosures/sb16-sb1421-ab748"
self.data_dir = data_dir
self.cache_dir = cache_dir
self.cache = Cache(cache_dir)
# Use module path to construct agency slug, which we'll use downstream
mod = Path(__file__)
state_postal = mod.parent.stem
# to create a subdir inside the main cache directory to stash files for this agency
self.cache_suffix = f"{state_postal}_{mod.stem}" # ca_san_diego_pd

def scrape_meta(self, throttle=0):
"""Gather metadata on downloadable files (videos, etc.)."""
current_page = 0
page_count = None # which we don't know until we get the first page
# This will be a list of paths to HTML pages that we cache locally
index_pages = self._download_index_pages(throttle, page_count, current_page)
# TODO: Get the child pages and, you know, actually scrape file metadata
return index_pages

# Helper functions
def _download_index_pages(self, throttle, page_count, current_page, index_pages=[]):
"""Download index pages for SB16/SB1421/AB748.
Index pages link to child pages containing videos and
other files related to use-of-force and disciplinary incidents.
Returns:
List of path to cached index pages
"""
# Pause between requests
time.sleep(throttle)
file_stem = self.base_url.split("/")[-1]
base_file = f"{self.cache_suffix}/{file_stem}_index_page{current_page}.html"
# Construct URL: pages, including start page, have a page GET parameter
target_url = f"{self.base_url}?page={current_page}"
# Download the page (if it's not already cached)
cache_path = self.cache.download(base_file, target_url, "utf-8")
# Add the path to the list of index pages
index_pages.append(cache_path)
# If there's no page_count, we're on first page, so...
if not page_count:
# Extract page count from the initial page
html = self.cache.read(base_file)
soup = BeautifulSoup(html, "html.parser")
page_count = int(
soup.find_all("li", class_="pager__item")[-1] # last <li> in the pager
.a.attrs["href"] # the <a> tag inside the <li> # will be ?page=X
.split("=")[-1] # get the X
)
if current_page != page_count:
# Recursively call this function to get the next page
next_page = current_page + 1
self._download_index_pages(throttle, page_count, next_page, index_pages)
return index_pages
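
The reworked Site class above can also be exercised on its own, outside the CLI and Runner. A minimal sketch, assuming writable scratch directories — the paths below are illustrative, not values from this commit:

# Hypothetical usage of the new Site class; directory paths are made up for illustration.
from pathlib import Path

from clean.ca.san_diego_pd import Site

site = Site(data_dir=Path("/tmp/CLEAN/exports"), cache_dir=Path("/tmp/CLEAN/cache"))

# scrape_meta() pauses `throttle` seconds between requests and, for now,
# returns the list of locally cached index-page paths (see the TODO above).
index_pages = site.scrape_meta(throttle=2)
for page in index_pages:
    print(page)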


"""
# LEGACY CODE BELOW #
def _scrape_list_page(cache, top_level_urls, base_url, throttle):
second_level_urls = {}
@@ -105,7 +120,4 @@ def _download_case_files(base_url, second_level_urls):
all_case_content_links.append(text)
print("_______________________")
return


if __name__ == "__main__":
scrape()
"""
94 changes: 87 additions & 7 deletions clean/cli.py
@@ -16,19 +16,98 @@ def cli():
def list_agencies():
"""List all available agencies and their slugs.
Agency slugs can then used to with the scrape subcommand
Agency slugs can then be used with the scrape-meta and scrape subcommands
"""
for state, agency_slugs in utils.get_all_scrapers().items():
click.echo(f"\n{state.upper()}:")
for slug in sorted(agency_slugs):
click.echo(f" - {state}_{slug}")
for state, agencies in utils.get_all_scrapers().items():
click.echo(f"{state.upper()}:")
for record in sorted(agencies, key=lambda x: x["slug"]):
click.echo(f" - {record['slug']} ({record['agency']})")
message = (
"\nTo scrape an agency, pass an agency slug (e.g. ca_san_diego_pd) as the "
"argument to the scrape command:\n\n\tclean-scraper scrape ca_san_diego_pd\n\n"
"\nTo scrape an agency's file metadata or download files, pass an "
"agency slug (e.g. ca_san_diego_pd) as the argument to the scrape-meta or scrape subcommands: \n\n"
"\tclean-scraper scrape-meta ca_san_diego_pd\n"
"\tclean-scraper scrape ca_san_diego_pd\n"
)
click.echo(message)


@click.command()
@click.argument("agency")
@click.option(
"--data-dir",
default=utils.CLEAN_DATA_DIR,
type=click.Path(),
help="The Path where the results will be saved",
)
@click.option(
"--cache-dir",
default=utils.CLEAN_CACHE_DIR,
type=click.Path(),
help="The Path where results can be cached",
)
@click.option(
"--delete/--no-delete",
default=False,
help="Delete generated files from the cache",
)
@click.option(
"--log-level",
"-l",
default="INFO",
type=click.Choice(
("DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"), case_sensitive=False
),
help="Set the logging level",
)
@click.option(
"--throttle",
"-t",
default=0,
help="Set throttle on scraping in seconds. Default is no delay on file downloads.",
)
def scrape_meta(
agency: str,
data_dir: Path,
cache_dir: Path,
delete: bool,
log_level: str,
throttle: int,
):
"""
Command-line interface for generating metadata CSV about CLEAN files.
The metadata CSV includes the file's name, URL, size, etc.
This file is required for downstream usage by the 'scrape' command, which
relies on it to download the files (in particular the URL for videos and other files).
AGENCY -- An agency slug (e.g. ca_san_diego_pd)
Use the 'list-agencies' command to see available agencies and their slugs.
clean-scraper list-agencies
"""
# Set a higher log level on third-party libs that use DEBUG logging
# in order to limit debug output to our library
logging.getLogger("urllib3").setLevel(logging.ERROR)

# Local logging config
logging.basicConfig(level=log_level, format="%(asctime)s - %(name)s - %(message)s")
logger = logging.getLogger(__name__)

# Runner config
data_dir = Path(data_dir)
cache_dir = Path(cache_dir)
runner = Runner(data_dir, cache_dir, throttle)

# Delete files, if asked
if delete:
logger.info("Deleting files generated from previous scraper run.")
runner.delete()

# Try running the scraper
runner.scrape_meta(agency)


@click.command()
@click.argument("agency")
@click.option(
@@ -103,6 +182,7 @@ def scrape(


cli.add_command(list_agencies)
cli.add_command(scrape_meta)
cli.add_command(scrape)

if __name__ == "__main__":
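
As a quick sanity check of the new command wiring, a hedged sketch that drives the CLI in-process with click's test runner rather than the installed clean-scraper entry point; the dashed command names assume click's default underscore-to-dash conversion:

# Sketch: invoke the CLI in-process via click's CliRunner.
from click.testing import CliRunner

from clean.cli import cli

runner = CliRunner()

# List available agency slugs and names.
result = runner.invoke(cli, ["list-agencies"])
print(result.output)

# Gather file metadata for San Diego PD, throttling requests by 2 seconds.
result = runner.invoke(cli, ["scrape-meta", "ca_san_diego_pd", "--throttle", "2"])
print(result.exit_code)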
33 changes: 27 additions & 6 deletions clean/runner.py
@@ -18,8 +18,8 @@ class Runner:
The data_dir and cache_dir arguments can specify any
location, but it's not a bad idea to have them as sibling directories:
/tmp/CLEAN/working # ETL files
/tmp/CLEAN/exports # Final, polished data e.g CSVs for analysis
/tmp/CLEAN/cache # source files (HTML, videos, CSV of metadata for downloaded files, etc.)
/tmp/CLEAN/exports # transformed files
Args:
data_dir (str): Path where final output files are saved.
@@ -39,23 +39,44 @@ def __init__(
self.cache_dir = cache_dir
self.throttle = throttle

def scrape_meta(self, agency_slug: str) -> Path:
"""Scrape metadata for the provided agency.
Args:
agency_slug (str): Unique scraper slug composed of two-letter state postal code and agency slug: e.g. ca_san_diego_pd
Returns: a Path object leading to a CSV file.
"""
# Get the module
state = agency_slug[:2].strip().lower()
slug = agency_slug[3:].strip().lower()
state_mod = import_module(f"clean.{state}.{slug}")
# Run the scrape method
logger.info(f"Scraping {agency_slug}")
site = state_mod.Site(self.data_dir, self.cache_dir)
data_path = site.scrape_meta(throttle=self.throttle)
# Log the path to the generated data file
logger.info(f"Generated {data_path}")
return data_path

def scrape(self, agency_slug: str) -> Path:
"""Run the scraper for the provided agency.
This method will operate on the metadata generated by the scrape_meta method.
Args:
agency_slug (str): Unique scraper slug composed of two-letter state postal code and agency slug: e.g. ca_san_diego_pd
Returns: a Path object leading to the CSV file.
Returns: a Path object leading to the directory where downloaded files are stored.
"""
# Get the module
state = agency_slug[:2].strip().lower()
slug = agency_slug[3:].strip().lower()
state_mod = import_module(f"clean.{state}.{slug}")
# Run the scrape method
logger.info(f"Scraping {agency_slug}")
data_path = state_mod.scrape(
self.data_dir, self.cache_dir, throttle=self.throttle
)
site = state_mod.Site(self.data_dir, self.cache_dir)
data_path = site.scrape(throttle=self.throttle)
# Log the path to the generated data file
logger.info(f"Generated {data_path}")
return data_path
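
For programmatic use, the Runner's new scrape_meta method can be called directly; a small sketch, with directories following the sibling layout suggested in the class docstring (locations are illustrative):

# Sketch: call Runner.scrape_meta directly; it imports clean.ca.san_diego_pd
# and delegates to Site.scrape_meta under the hood.
from pathlib import Path

from clean.runner import Runner

runner = Runner(Path("/tmp/CLEAN/exports"), Path("/tmp/CLEAN/cache"), throttle=2)

# Currently returns whatever Site.scrape_meta returns (the cached index pages, for now).
meta_path = runner.scrape_meta("ca_san_diego_pd")
print(meta_path)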
16 changes: 10 additions & 6 deletions clean/utils.py
@@ -1,4 +1,5 @@
import csv
import importlib
import logging
import os
from pathlib import Path
@@ -135,19 +136,22 @@ def write_dict_rows_to_csv(output_path, headers, rows, mode="w", extrasaction="r
def get_all_scrapers():
"""Get all the agencies that have scrapers.
Returns: Dictionary of agency slugs grouped by state postal.
Returns: Dictionary mapping state postal codes to lists of dicts with agency slug and name
"""
# Filter out anything not in a state folder
abbrevs = [state.abbr.lower() for state in us.states.STATES]
# Get all folders in dir
folders = [p for p in Path(__file__).parent.iterdir() if p.is_dir()]
# Filter out anything not in a state folder
abbrevs = [state.abbr.lower() for state in us.states.STATES]
state_folders = [p for p in folders if p.stem in abbrevs]
scrapers = {}
for state_folder in state_folders:
state = state_folder.stem
for mod in state_folder.iterdir():
if not mod.stem.startswith("__init"):
scrapers.setdefault(state, []).append(mod.stem)
for mod_path in state_folder.iterdir():
if not mod_path.stem.startswith("__init"):
agency_mod = importlib.import_module(f"clean.{state}.{mod_path.stem}")
scrapers.setdefault(state, []).append(
{"slug": f"{state}_{mod_path.stem}", "agency": agency_mod.Site.name}
)
return scrapers


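
To illustrate the reworked return shape of get_all_scrapers, a short sketch that mirrors the CLI's list-agencies loop; the exact slugs and agency names depend on which scraper modules are present in the package:

# Sketch: get_all_scrapers() now returns a dict keyed by state postal code,
# whose values are lists of {"slug": ..., "agency": ...} dicts.
from clean import utils

for state, agencies in utils.get_all_scrapers().items():
    print(state.upper())
    for record in sorted(agencies, key=lambda x: x["slug"]):
        # e.g. ca_san_diego_pd (San Diego Police Department)
        print(f"  - {record['slug']} ({record['agency']})")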
