Rework to use Site class per agency. #3 #4
zstumgoren committed Apr 10, 2024
1 parent d208169 commit 39f0b63
Showing 4 changed files with 197 additions and 80 deletions.
134 changes: 73 additions & 61 deletions clean/ca/san_diego_pd.py
@@ -1,72 +1,87 @@
import time
from pathlib import Path

import requests
from bs4 import BeautifulSoup

from .. import utils
from ..cache import Cache


def scrape(data_dir=utils.CLEAN_DATA_DIR, cache_dir=utils.CLEAN_CACHE_DIR, throttle=0):
"""Scrape San Diego Police Department for SB16/SB1421/AB748 data."""
cache = Cache(cache_dir)
# This module
mod = Path(__file__)
state_postal = mod.parent.stem
# Use module to construct agency slug, which we'll use downstream
# to create a subdir inside the main cache directory to stash files for this agency
cache_suffix = f"{state_postal}_{mod.stem}" # ca_san_diego_pd
# Page with links to all the SB16/SB1421/AB748 "child" pages containing videos and documents
base_url = "https://www.sandiego.gov/police/data-transparency/mandated-disclosures/sb16-sb1421-ab748"
current_page = 0
page_count = None # which we don't know until we get the first page
# This will be a list of paths to HTML pages that we cache locally
index_pages = download_index_pages(
base_url, cache, cache_suffix, throttle, page_count, current_page
)
# TODO: Get the child pages and, you know, actually scrape them
return index_pages


def download_index_pages(
base_url, cache, cache_suffix, throttle, page_count, current_page, index_pages=[]
):
"""Download index pages for SB16/SB1421/AB748.
Index pages link to child pages containing videos and
other files related to use-of-force and disciplinary incidents.
Returns:
List of path to cached index pages
class Site:
"""Scrape file metadata and download files for the San Diego Police Department for SB16/SB1421/AB748 data.
Attributes:
name (str): The official name of the agency
"""
# Pause between requests
time.sleep(throttle)
file_stem = base_url.split("/")[-1]
base_file = f"{cache_suffix}/{file_stem}_page{current_page}.html"
# Download the page (if it's not already cached)
cache_path = cache.download(base_file, base_url, "utf-8")
# Add the path to the list of index pages
index_pages.append(cache_path)
# If there's no page_count, we're on first page, so...
if not page_count:
# Extract page count from the initial page
html = cache.read(base_file)
soup = BeautifulSoup(html, "html.parser")
page_count = int(
soup.find_all("li", class_="pager__item")[-1] # last <li> in the pager
.a.attrs["href"] # the <a> tag inside the <li> # will be ?page=X
.split("=")[-1] # get the X
)
if current_page != page_count:
# Recursively call this function to get the next page
next_page = current_page + 1
download_index_pages(
base_url, cache, cache_suffix, throttle, page_count, next_page
)
return index_pages

name = "San Diego Police Department"

def __init__(self, data_dir=utils.CLEAN_DATA_DIR, cache_dir=utils.CLEAN_CACHE_DIR):
"""Initialize a new instance.
Args:
data_dir (Path): The directory where downstream processed files/data will be saved
cache_dir (Path): The directory where files will be cached
"""
# Start page contains list of "detail"/child pages with links to the SB16/SB1421/AB748 videos and files
# along with additional index pages
self.base_url = "https://www.sandiego.gov/police/data-transparency/mandated-disclosures/sb16-sb1421-ab748"
self.data_dir = data_dir
self.cache_dir = cache_dir
self.cache = Cache(cache_dir)
# Use module path to construct agency slug, which we'll use downstream
mod = Path(__file__)
state_postal = mod.parent.stem
# to create a subdir inside the main cache directory to stash files for this agency
self.cache_suffix = f"{state_postal}_{mod.stem}" # ca_san_diego_pd

def scrape_meta(self, throttle=0):
"""Gather metadata on downloadable files (videos, etc.)."""
current_page = 0
page_count = None # which we don't know until we get the first page
# This will be a list of paths to HTML pages that we cache locally
index_pages = self._download_index_pages(throttle, page_count, current_page)
# TODO: Get the child pages and, you know, actually scrape file metadata
return index_pages

# Helper functions
def _download_index_pages(self, throttle, page_count, current_page, index_pages=[]):
"""Download index pages for SB16/SB1421/AB748.
Index pages link to child pages containing videos and
other files related to use-of-force and disciplinary incidents.
Returns:
List of path to cached index pages
"""
# Pause between requests
time.sleep(throttle)
file_stem = self.base_url.split("/")[-1]
base_file = f"{self.cache_suffix}/{file_stem}_index_page{current_page}.html"
# Construct URL: pages, including start page, have a page GET parameter
target_url = f"{self.base_url}?page={current_page}"
# Download the page (if it's not already cached)
cache_path = self.cache.download(base_file, target_url, "utf-8")
# Add the path to the list of index pages
index_pages.append(cache_path)
# If there's no page_count, we're on first page, so...
if not page_count:
# Extract page count from the initial page
html = self.cache.read(base_file)
soup = BeautifulSoup(html, "html.parser")
page_count = int(
soup.find_all("li", class_="pager__item")[-1] # last <li> in the pager
.a.attrs["href"] # the <a> tag inside the <li> # will be ?page=X
.split("=")[-1] # get the X
)
if current_page != page_count:
# Recursively call this function to get the next page
next_page = current_page + 1
self._download_index_pages(throttle, page_count, next_page, index_pages)
return index_pages
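
The reworked Site class above can also be exercised on its own, outside the CLI and Runner. A minimal sketch, assuming writable scratch directories — the paths below are illustrative, not values from this commit:

# Hypothetical usage of the new Site class; directory paths are made up for illustration.
from pathlib import Path

from clean.ca.san_diego_pd import Site

site = Site(data_dir=Path("/tmp/CLEAN/exports"), cache_dir=Path("/tmp/CLEAN/cache"))

# scrape_meta() pauses `throttle` seconds between requests and, for now,
# returns the list of locally cached index-page paths (see the TODO above).
index_pages = site.scrape_meta(throttle=2)
for page in index_pages:
    print(page)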


"""
# LEGACY CODE BELOW #
def _scrape_list_page(cache, top_level_urls, base_url, throttle):
second_level_urls = {}
@@ -105,7 +120,4 @@ def _download_case_files(base_url, second_level_urls):
all_case_content_links.append(text)
print("_______________________")
return


if __name__ == "__main__":
scrape()
"""
94 changes: 87 additions & 7 deletions clean/cli.py
@@ -16,19 +16,98 @@ def cli():
def list_agencies():
"""List all available agencies and their slugs.
Agency slugs can then used to with the scrape subcommand
Agency slugs can then be used with the scrape-meta and scrape subcommands
"""
for state, agency_slugs in utils.get_all_scrapers().items():
click.echo(f"\n{state.upper()}:")
for slug in sorted(agency_slugs):
click.echo(f" - {state}_{slug}")
for state, agencies in utils.get_all_scrapers().items():
click.echo(f"{state.upper()}:")
for record in sorted(agencies, key=lambda x: x["slug"]):
click.echo(f" - {record['slug']} ({record['agency']})")
message = (
"\nTo scrape an agency, pass an agency slug (e.g. ca_san_diego_pd) as the "
"argument to the scrape command:\n\n\tclean-scraper scrape ca_san_diego_pd\n\n"
"\nTo scrape an agency's file metadata or download files, pass an "
"agency slug (e.g. ca_san_diego_pd) as the argument to the scrape-meta or scrape subcommands: \n\n"
"\tclean-scraper scrape-meta ca_san_diego_pd\n"
"\tclean-scraper scrape ca_san_diego_pd\n"
)
click.echo(message)


@click.command()
@click.argument("agency")
@click.option(
"--data-dir",
default=utils.CLEAN_DATA_DIR,
type=click.Path(),
help="The Path where the results will be saved",
)
@click.option(
"--cache-dir",
default=utils.CLEAN_CACHE_DIR,
type=click.Path(),
help="The Path where results can be cached",
)
@click.option(
"--delete/--no-delete",
default=False,
help="Delete generated files from the cache",
)
@click.option(
"--log-level",
"-l",
default="INFO",
type=click.Choice(
("DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"), case_sensitive=False
),
help="Set the logging level",
)
@click.option(
"--throttle",
"-t",
default=0,
help="Set throttle on scraping in seconds. Default is no delay on file downloads.",
)
def scrape_meta(
agency: str,
data_dir: Path,
cache_dir: Path,
delete: bool,
log_level: str,
throttle: int,
):
"""
Command-line interface for generating metadata CSV about CLEAN files.
The metadata CSV includes the file's name, URL, size, etc.
This file is required for downstream usage by the 'scrape' command, which
relies on it to download the files (in particular the URL for videos and other files).
AGENCY -- An agency slug (e.g. ca_san_diego_pd)
Use the 'list-agencies' command to see available agencies and their slugs.
clean-scraper list-agencies
"""
# Set a higher log level on third-party libs that use DEBUG logging
# in order to limit debug output to our library
logging.getLogger("urllib3").setLevel(logging.ERROR)

# Local logging config
logging.basicConfig(level=log_level, format="%(asctime)s - %(name)s - %(message)s")
logger = logging.getLogger(__name__)

# Runner config
data_dir = Path(data_dir)
cache_dir = Path(cache_dir)
runner = Runner(data_dir, cache_dir, throttle)

# Delete files, if asked
if delete:
logger.info("Deleting files generated from previous scraper run.")
runner.delete()

# Try running the scraper
runner.scrape_meta(agency)


@click.command()
@click.argument("agency")
@click.option(
@@ -103,6 +182,7 @@ def scrape(


cli.add_command(list_agencies)
cli.add_command(scrape_meta)
cli.add_command(scrape)

if __name__ == "__main__":
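
As a quick sanity check of the new command wiring, a hedged sketch that drives the CLI in-process with click's test runner rather than the installed clean-scraper entry point; the dashed command names assume click's default underscore-to-dash conversion:

# Sketch: invoke the CLI in-process via click's CliRunner.
from click.testing import CliRunner

from clean.cli import cli

runner = CliRunner()

# List available agency slugs and names.
result = runner.invoke(cli, ["list-agencies"])
print(result.output)

# Gather file metadata for San Diego PD, throttling requests by 2 seconds.
result = runner.invoke(cli, ["scrape-meta", "ca_san_diego_pd", "--throttle", "2"])
print(result.exit_code)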
33 changes: 27 additions & 6 deletions clean/runner.py
@@ -18,8 +18,8 @@ class Runner:
The data_dir and cache_dir arguments can specify any
location, but it's not a bad idea to have them as sibling directories:
/tmp/CLEAN/working # ETL files
/tmp/CLEAN/exports # Final, polished data e.g CSVs for analysis
/tmp/CLEAN/cache # source files (HTML, videos, CSV of metadata for downloaded files, etc.)
/tmp/CLEAN/exports # transformed files
Args:
data_dir (str): Path where final output files are saved.
@@ -39,23 +39,44 @@ def __init__(
self.cache_dir = cache_dir
self.throttle = throttle

def scrape_meta(self, agency_slug: str) -> Path:
"""Scrape metadata for the provided agency.
Args:
agency_slug (str): Unique scraper slug composed of two-letter state postal code and agency slug: e.g. ca_san_diego_pd
Returns: a Path object leading to a CSV file.
"""
# Get the module
state = agency_slug[:2].strip().lower()
slug = agency_slug[3:].strip().lower()
state_mod = import_module(f"clean.{state}.{slug}")
# Run the scrape method
logger.info(f"Scraping {agency_slug}")
site = state_mod.Site(self.data_dir, self.cache_dir)
data_path = site.scrape_meta(throttle=self.throttle)
# Log the path to the generated data file
logger.info(f"Generated {data_path}")
return data_path

def scrape(self, agency_slug: str) -> Path:
"""Run the scraper for the provided agency.
This method will operate on the metadata generated by the scrape_meta method.
Args:
agency_slug (str): Unique scraper slug composed of two-letter state postal code and agency slug: e.g. ca_san_diego_pd
Returns: a Path object leading to the CSV file.
Returns: a Path object leading to the directory where downloaded files are stored.
"""
# Get the module
state = agency_slug[:2].strip().lower()
slug = agency_slug[3:].strip().lower()
state_mod = import_module(f"clean.{state}.{slug}")
# Run the scrape method
logger.info(f"Scraping {agency_slug}")
data_path = state_mod.scrape(
self.data_dir, self.cache_dir, throttle=self.throttle
)
site = state_mod.Site(self.data_dir, self.cache_dir)
data_path = site.scrape(throttle=self.throttle)
# Log the path to the generated data file
logger.info(f"Generated {data_path}")
return data_path
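
For programmatic use, the Runner's new scrape_meta method can be called directly; a small sketch, with directories following the sibling layout suggested in the class docstring (locations are illustrative):

# Sketch: call Runner.scrape_meta directly; it imports clean.ca.san_diego_pd
# and delegates to Site.scrape_meta under the hood.
from pathlib import Path

from clean.runner import Runner

runner = Runner(Path("/tmp/CLEAN/exports"), Path("/tmp/CLEAN/cache"), throttle=2)

# Currently returns whatever Site.scrape_meta returns (the cached index pages, for now).
meta_path = runner.scrape_meta("ca_san_diego_pd")
print(meta_path)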
16 changes: 10 additions & 6 deletions clean/utils.py
@@ -1,4 +1,5 @@
import csv
import importlib
import logging
import os
from pathlib import Path
@@ -135,19 +136,22 @@ def write_dict_rows_to_csv(output_path, headers, rows, mode="w", extrasaction="r
def get_all_scrapers():
"""Get all the agencies that have scrapers.
Returns: Dictionary of agency slugs grouped by state postal.
Returns: Dictionary mapping state postal codes to lists of dicts with agency slug and name
"""
# Filter out anything not in a state folder
abbrevs = [state.abbr.lower() for state in us.states.STATES]
# Get all folders in dir
folders = [p for p in Path(__file__).parent.iterdir() if p.is_dir()]
# Filter out anything not in a state folder
abbrevs = [state.abbr.lower() for state in us.states.STATES]
state_folders = [p for p in folders if p.stem in abbrevs]
scrapers = {}
for state_folder in state_folders:
state = state_folder.stem
for mod in state_folder.iterdir():
if not mod.stem.startswith("__init"):
scrapers.setdefault(state, []).append(mod.stem)
for mod_path in state_folder.iterdir():
if not mod_path.stem.startswith("__init"):
agency_mod = importlib.import_module(f"clean.{state}.{mod_path.stem}")
scrapers.setdefault(state, []).append(
{"slug": f"{state}_{mod_path.stem}", "agency": agency_mod.Site.name}
)
return scrapers


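
To illustrate the reworked return shape of get_all_scrapers, a short sketch that mirrors the CLI's list-agencies loop; the exact slugs and agency names depend on which scraper modules are present in the package:

# Sketch: get_all_scrapers() now returns a dict keyed by state postal code,
# whose values are lists of {"slug": ..., "agency": ...} dicts.
from clean import utils

for state, agencies in utils.get_all_scrapers().items():
    print(state.upper())
    for record in sorted(agencies, key=lambda x: x["slug"]):
        # e.g. ca_san_diego_pd (San Diego Police Department)
        print(f"  - {record['slug']} ({record['agency']})")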
