From bcfba73686cbb9bb6ef4139b71d2f54d6889e211 Mon Sep 17 00:00:00 2001
From: Nauman <157045300+naumansharifwork@users.noreply.github.com>
Date: Thu, 31 Oct 2024 14:01:36 +0500
Subject: [PATCH] added scrape-meta for sacramento sheriff

---
 clean/ca/sacramento_sheriffs.py | 91 +++++++++++++++++++++++++++++++++
 1 file changed, 91 insertions(+)
 create mode 100644 clean/ca/sacramento_sheriffs.py

diff --git a/clean/ca/sacramento_sheriffs.py b/clean/ca/sacramento_sheriffs.py
new file mode 100644
index 0000000..77e2044
--- /dev/null
+++ b/clean/ca/sacramento_sheriffs.py
@@ -0,0 +1,91 @@
+import time
+from pathlib import Path
+
+from bs4 import BeautifulSoup
+
+from .. import utils
+from ..cache import Cache
+
+
+class Site:
+    """Scrape file metadata for the Sacramento County Sheriff's Office.
+
+    Attributes:
+        name (str): The official name of the agency
+    """
+
+    name = "Sacramento County Sheriff's Office"
+
+    def __init__(
+        self,
+        data_dir: Path = utils.CLEAN_DATA_DIR,
+        cache_dir: Path = utils.CLEAN_CACHE_DIR,
+    ):
+        """Initialize a new instance.
+
+        Args:
+            data_dir (Path): The directory where downstream processed files/data will be saved
+            cache_dir (Path): The directory where files will be cached
+        """
+        self.base_url = "https://www.sacsheriff.com/pages/released_cases.php"
+        self.data_dir = data_dir
+        self.cache_dir = cache_dir
+        self.cache = Cache(cache_dir)
+
+    @property
+    def agency_slug(self) -> str:
+        """Construct the agency slug."""
+        # Use the module path to construct the agency slug, which we'll use downstream
+        mod = Path(__file__)
+        state_postal = mod.parent.stem
+        return f"{state_postal}_{mod.stem}"  # ca_sacramento_sheriffs
+
+    def scrape_meta(self, throttle=0):
+        # Construct a local filename relative to the cache directory:
+        # agency slug + page name (ca_sacramento_sheriffs/released_cases.html)
+        # Download the index page and save it to the cache under that name
+        # (if not already cached)
+        base_name = f"{self.base_url.split('/')[-1].split('.')[0]}.html"
+        filename = f"{self.agency_slug}/{base_name}"
+        self.cache.download(filename, self.base_url)
+        metadata = []
+        html = self.cache.read(filename)
+        soup = BeautifulSoup(html, "html.parser")
+        body = soup.find("div", class_="interior_section")
+        rows = body.select("tbody tr")
+        for row in rows:
+            # Get the relevant data from each row
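
The hunk above is cut off mid-loop. Purely as an illustration, here is a minimal sketch of how such a row loop might collect metadata, assuming each <tr> carries a case-title cell plus <a> links to the released files; the column layout and field names are assumptions, not part of the patch:

    from urllib.parse import urljoin

    from bs4 import BeautifulSoup

    base_url = "https://www.sacsheriff.com/pages/released_cases.php"
    # Stand-in HTML mimicking the assumed table structure of the live page
    html = """
    <div class="interior_section"><table><tbody>
      <tr><td>Case 2024-001</td><td><a href="/files/case1.pdf">Report</a></td></tr>
    </tbody></table></div>
    """
    metadata = []
    soup = BeautifulSoup(html, "html.parser")
    for row in soup.select("div.interior_section tbody tr"):
        cells = row.find_all("td")
        case_name = cells[0].get_text(strip=True)  # assumed first column
        for link in row.find_all("a"):
            metadata.append(
                {
                    "case_name": case_name,
                    "name": link.get_text(strip=True),
                    # Hrefs are often relative; resolve against the page URL
                    "asset_url": urljoin(base_url, link["href"]),
                }
            )
    print(metadata)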
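
And a hedged usage sketch, assuming the surrounding scraper conventions hold (the import path follows the file path in the diff; that scrape_meta() persists or returns the collected metadata is an assumption, since the patch is truncated):

    from clean.ca.sacramento_sheriffs import Site

    site = Site()  # defaults to utils.CLEAN_DATA_DIR / utils.CLEAN_CACHE_DIR
    site.scrape_meta(throttle=2)  # throttle presumably sleeps between requests, via the time import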