added scrape meta for chula_vista_pd #94 (#95)

* added scrape meta for chula_vista_pd #94
* removed user-agent
* changes done
* Rework URL handling; clean up a little more text
* Linted. Oops.

---------

Co-authored-by: Mike Stucka <[email protected]>
biglocalnews · Sep 3, 2024 · b139e7c · b139e7c
1 parent f12b238
commit b139e7c
Show file tree

Hide file tree

Showing 2 changed files with 127 additions and 0 deletions.
diff --git a/clean/ca/chula_vista_pd.py b/clean/ca/chula_vista_pd.py
@@ -0,0 +1,113 @@
+import logging
+import time
+import urllib.parse
+from pathlib import Path
+
+from bs4 import BeautifulSoup
+
+from .. import utils
+from ..cache import Cache
+from .config.chula_vista_pd import index_request_headers
+
+logger = logging.getLogger(__name__)
+
+
+class Site:
+    """Scrape file metadata and download files for the City of Chula Vista Police Department.
+
+    Attributes:
+        name (str): The official name of the agency
+    """
+
+    name = "Chula Vista Police Department"
+
+    def __init__(
+        self,
+        data_dir: Path = utils.CLEAN_DATA_DIR,
+        cache_dir: Path = utils.CLEAN_CACHE_DIR,
+    ):
+        """Initialize a new instance.
+
+        Args:
+            data_dir (Path): The directory where downstream processed files/data will be saved
+            cache_dir (Path): The directory where files will be cached
+        """
+        self.base_url = "https://www.chulavistaca.gov/departments/police-department/senate-bill-1421"
+        self.data_dir = data_dir
+        self.cache_dir = cache_dir
+        self.cache = Cache(cache_dir)
+
+    @property
+    def agency_slug(self) -> str:
+        """Construct the agency slug."""
+        # Use module path to construct agency slug, which we'll use downstream
+        mod = Path(__file__)
+        state_postal = mod.parent.stem
+        return f"{state_postal}_{mod.stem}"  # ca_chula_vista_pd
+
+    def scrape_meta(self, throttle=0):
+        # construct a local filename relative to the cache directory - agency slug + page url (ca_chula_vista_pd/senate-bill-1421.html)
+        # download the page (if not already cached)
+        # save the index page url to cache (sensible name)
+        base_name = f"{self.base_url.split('/')[-1]}.html"
+        filename = f"{self.agency_slug}/{base_name}"
+        self.cache.download(
+            filename, self.base_url, force=True, headers=index_request_headers
+        )
+        metadata = []
+        html = self.cache.read(filename)
+        soup = BeautifulSoup(html, "html.parser")
+        content_areas = soup.find_all("div", class_="content_area clearfix")
+        desired_element = None
+        for content_area in content_areas:
+            previous_h2 = content_area.find_previous("h2")
+            if previous_h2 and previous_h2.text == "Documents":
+                desired_element = content_area
+                break
+
+        if desired_element:
+            sections = desired_element.find_all("div", class_="accordion-item")
+            for section in sections:
+                case_type = section.find("div", class_="title").get_text(strip=True)
+                links = section.find_all("a")
+                for link in links:
+                    link_href = link.get("href", None)
+                    case_id = link.find_previous("p").text
+                    case_id = case_id.replace("\u00a0", " ").replace("\u2014", "--")
+                    if link_href:
+                        title = link.string
+                        title = title.replace("\u00a0", " ").replace("\u2014", "--")
+                        redirect_start = "/?splash="
+                        redirect_end = "&____isexternal=true"
+
+                        # Clean up links. Check to see if it's a redirect:
+                        if redirect_start in link_href:
+                            link_href = link_href.replace(redirect_start, "").replace(
+                                redirect_end, ""
+                            )
+                            link_href = urllib.parse.unquote(link_href)
+                            name = title
+                        else:
+                            name = link_href.split("/")[-1]
+
+                        # See if it's a relative link
+                        if urllib.parse.urlparse(link_href).netloc == "":
+                            link_href = f"https://www.chulavistaca.gov{link_href}"
+
+                        payload = {
+                            "asset_url": link_href,
+                            "case_id": case_id,
+                            "name": name,
+                            "title": title,
+                            "parent_page": str(filename),
+                            "details": {"case_type": case_type},
+                        }
+                        metadata.append(payload)
+
+                    time.sleep(throttle)
+        else:
+            logger.error("HTML for the desired Elelemt")
+
+        outfile = self.data_dir.joinpath(f"{self.agency_slug}.json")
+        self.cache.write_json(outfile, metadata)
+        return outfile
diff --git a/clean/ca/config/chula_vista_pd.py b/clean/ca/config/chula_vista_pd.py
@@ -0,0 +1,14 @@
+# HTTP request headers used when downloading the agency's index page: the
+# scraper in clean/ca/chula_vista_pd.py imports this dict and passes it as the
+# ``headers`` argument of its cache download call.
+# The sec-ch-ua* entries describe desktop Google Chrome 127 on Windows; no
+# user-agent header is defined here.
+# NOTE(review): presumably the site rejects requests lacking browser-like
+# headers -- confirm before trimming any of these.
+index_request_headers = {
+    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+    "accept-language": "en-US,en;q=0.9,nl;q=0.8,ur;q=0.7,ru;q=0.6",
+    "cache-control": "max-age=0",
+    "priority": "u=0, i",
+    "sec-ch-ua": '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
+    "sec-ch-ua-mobile": "?0",
+    "sec-ch-ua-platform": '"Windows"',
+    "sec-fetch-dest": "document",
+    "sec-fetch-mode": "navigate",
+    "sec-fetch-site": "none",
+    "sec-fetch-user": "?1",
+    "upgrade-insecure-requests": "1",
+}