NextRequest auth #182 Mendo SO #169 #188

Open · wants to merge 7 commits into base: dev
7 changes: 7 additions & 0 deletions CONTRIBUTORS.md
@@ -1,3 +1,10 @@
* Ocheze Amuzie ([@ochezems](https://github.com/ochezems))
* Irene Casado Sánchez ([@irenecasado](https://github.com/irenecasado))
* Dilcia Mercedes ([@dilcia19](https://github.com/dilcia19))
* Lisa Pickoff-White ([@pickoffwhite](https://github.com/pickoffwhite))
* Gerald Rich ([@newsroomdev](https://github.com/newsroomdev))
* Jordan Rynning ([@jrynning](https://github.com/jrynning))
* Tarak Shah ([@tarakc02](https://github.com/tarakc02))
* Nauman Sharif ([@naumansharifwork](https://github.com/naumansharifwork))
* Mike Stucka ([@stucka](https://github.com/stucka))
* Serdar Tumgoren ([@zstumgoren](https://github.com/zstumgoren))
71 changes: 71 additions & 0 deletions clean/ca/mendocino_county_sheriff.py
@@ -0,0 +1,71 @@
import logging
from pathlib import Path
from typing import Dict, List

from .. import utils
from ..cache import Cache
from ..platforms.nextrequest import auth_nextrequest, process_nextrequest

# from ..utils import MetadataDict

logger = logging.getLogger(__name__)


class Site:
    """Scrape file metadata for the Mendocino County Sheriff's Office.

    Attributes:
        name (str): The official name of the agency
    """

    name = "Mendocino County Sheriff"

    def __init__(
        self,
        data_dir: Path = utils.CLEAN_DATA_DIR,
        cache_dir: Path = utils.CLEAN_CACHE_DIR,
    ):
        """Initialize a new instance.

        Args:
            data_dir (Path): The directory where downstream processed files/data will be saved
            cache_dir (Path): The directory where files will be cached
        """
        self.site_slug = "ca_mendocino_county_sheriff"
        self.base_url = "https://mendocinocounty.nextrequest.com"
        # The initial disclosure page (aka where they start complying with the law)
        # contains a list of "detail"/child pages with links to the SB16/SB1421/AB748
        # videos and files, along with additional index pages.
        self.data_dir = data_dir
        self.cache_dir = cache_dir
        self.subpages_dir = cache_dir / (self.site_slug + "/subpages")
        self.cache = Cache(cache_dir)
        for localdir in [self.cache_dir, self.data_dir, self.subpages_dir]:
            utils.create_directory(localdir)

    def scrape_meta(self, throttle: int = 2) -> Path:
        """Gather metadata on downloadable files (videos, etc.).

        Args:
            throttle (int): Number of seconds to wait between requests. Defaults to 2.

        Returns:
            Path: Local path of JSON file containing metadata on downloadable files
        """
        metadata: List = []

        for folder in ["22-18", "23-27", "20-30"]:
            username = utils.get_credentials(f"MENDOSO{folder}_USER")
            password = utils.get_credentials(f"MENDOSO{folder}_PASS")
            start_url = f"https://mendocinocounty.nextrequest.com/requests/{folder}"
            auth: Dict = auth_nextrequest(self.base_url, username, password)
            local_metadata = process_nextrequest(
                self.subpages_dir, start_url, force=True, throttle=throttle, auth=auth
            )
            for i, _entry in enumerate(local_metadata):
                local_metadata[i]["auth"] = auth
            metadata.extend(local_metadata)

        json_filename = self.data_dir / (self.site_slug + ".json")
        self.cache.write_json(json_filename, metadata)

        return json_filename
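
For reviewers trying this locally, here is a minimal sketch of how the scraper might be driven. It assumes `utils.get_credentials` reads environment variables and that the import path mirrors the file path — both assumptions, since the diff shows neither:

```python
# Hypothetical local run; the credential-storage mechanism is assumed,
# not shown in this diff.
import os
from pathlib import Path

from clean.ca.mendocino_county_sheriff import Site

# One user/pass pair per request folder, mirroring the loop in scrape_meta
for folder in ["22-18", "23-27", "20-30"]:
    os.environ[f"MENDOSO{folder}_USER"] = "reporter@example.com"
    os.environ[f"MENDOSO{folder}_PASS"] = "not-a-real-password"

site = Site(data_dir=Path("data"), cache_dir=Path("cache"))
json_path = site.scrape_meta(throttle=2)  # seconds between requests
print(f"Wrote metadata to {json_path}")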
108 changes: 97 additions & 11 deletions clean/platforms/nextrequest.py
@@ -5,6 +5,9 @@
 from typing import Dict, List
 from urllib.parse import parse_qs, urlparse
 
+import requests
+from bs4 import BeautifulSoup
+
 from .. import utils
 from ..cache import Cache
 
@@ -17,8 +20,53 @@
 """
 
 
+def auth_nextrequest(base_url: str, username: str, password: str, throttle: int = 2):
+    """Try to retrieve and return necessary authentication.
+
+    Args:
+        base_url (str): The base URL of the NextRequest portal.
+            Example: https://mendocinocounty.nextrequest.com
+        username (str): The username for the NextRequest portal
+        password (str): The password for the NextRequest portal
+        throttle (int, default 2): Seconds to wait after each request
+    Returns:
+        auth (dict): Dictionary of 'headers' and 'cookies' dictionaries
+
+    Notes:
+        Basic approach from https://github.com/danem/foiatool/blob/main/foiatool/apis/nextrequest.py
+    """
+    session = requests.Session()
+    session.headers["User-Agent"] = (
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 "
+        "(KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
+    )
+    login_url = f"{base_url}/users/sign_in"
+    page = session.get(login_url)
+    sleep(throttle)
+    soup = BeautifulSoup(page.content, "html5lib")
+    token = soup.find(attrs={"name": "csrf-token"})["content"]  # type: ignore
+    payload = {
+        "authenticity_token": token,
+        "user[email]": username,
+        "user[password]": password,
+        "user[remember_me]": "0",
+        "button": "",
+    }
+    session.headers.update({"x-csrf-token": token})  # type: ignore
+    session.post(login_url, params=payload)
+    # Force conversion from requests' case-insensitive structures to plain dicts
+    auth: dict = {}
+    auth["headers"] = dict(session.headers)
+    auth["cookies"] = dict(session.cookies.get_dict())
+    sleep(throttle)
+    session = None
+    return auth
+
+
 def process_nextrequest(
-    base_directory: Path, start_url: str, force: bool = False, throttle: int = 2
+    base_directory: Path,
+    start_url: str,
+    force: bool = False,
+    throttle: int = 2,
+    auth=None,  # type: ignore
 ):
     """Turn a base filepath and NextRequest folder URL into saved data and parsed Metadata.
 
@@ -29,12 +77,13 @@ def process_nextrequest(
         start_url (str): The web page for the folder of NextRequest docs you want
         force (bool, default False): Overwrite file, if it exists? Otherwise, use cached version.
         throttle (int, default 2): Time to wait between calls
+        auth (dict, optional, default None): Dictionary of 'headers' and 'cookies' dictionaries
     Returns:
         List(Metadata)
     """
     # Download data, if necessary
     filename, returned_json, file_needs_write = fetch_nextrequest(
-        base_directory, start_url, force, throttle=throttle
+        base_directory, start_url, force, throttle=throttle, auth=auth
     )
 
     # Write data, if necessary
@@ -49,7 +98,11 @@
 
 # Type base_directory to Path
 def fetch_nextrequest(
-    base_directory: Path, start_url: str, force: bool = False, throttle: int = 2
+    base_directory: Path,
+    start_url: str,
+    force: bool = False,
+    throttle: int = 2,
+    auth=None,
 ):
     """
     Given a link to a NextRequest documents folder, return a proposed filename and the JSON contents.
@@ -58,6 +111,7 @@ def fetch_nextrequest(
         base_directory (Path): The directory to save data in, e.g., cache/site-name/subpages
         start_url (str): The web page for the folder of NextRequest docs you want
         force (bool, default False): Overwrite file, if it exists? Otherwise, use cached version.
+        auth (dict, optional, default None): Dictionary of 'headers' and 'cookies' dictionaries
     Returns:
         filename (str): Proposed filename; file NOT saved
         returned_json (None | dict): None if no rescrape needed; dict if JSON had to be downloaded
@@ -83,21 +137,40 @@
     # Remember pagination here!
     page_number = 1
     page_url = f"{json_url}{page_number}"
-    r = utils.get_url(page_url)
+    if auth:
+        if auth["headers"]:
+            headers = auth["headers"]
+        else:
+            headers = {}
+        if auth["cookies"]:
+            cookies = auth["cookies"]
+        else:
+            cookies = {}
+        r = utils.get_url(page_url, headers=headers, cookies=cookies)
+    else:
+        r = utils.get_url(page_url)
     if not r.ok:
         logger.error(f"Problem downloading {page_url}: {r.status_code}")
         returned_json: Dict = {}  # type: ignore
         file_needs_write = False
     else:
         returned_json = r.json()
         # local_cache.write_json(filename,
         file_needs_write = True
         total_documents = returned_json[profile["tally_field"]]
-        for i, _entry in enumerate(returned_json["documents"]):
-            returned_json["documents"][i]["bln_page_url"] = page_url
-            returned_json["documents"][i]["bln_total_documents"] = total_documents
-        page_size = profile["page_size"]
-        max_pages = find_max_pages(total_documents, page_size)
+        if total_documents == 0:
+            logger.debug(f"No documents found for processing! {returned_json}")
+            max_pages = 0
+        else:
+            for i, _entry in enumerate(returned_json["documents"]):
+                returned_json["documents"][i]["bln_page_url"] = page_url
+                returned_json["documents"][i][
+                    "bln_total_documents"
+                ] = total_documents
+            page_size = profile["page_size"]
+            max_pages = find_max_pages(total_documents, page_size)
+            logger.debug(
+                f"Total documents: {total_documents}. Page size: {page_size}. Max pages: {max_pages}."
+            )
     sleep(throttle)
     if total_documents > profile["doc_limit"]:
         message = f"Request found with {total_documents:,} documents, exceeding limits. "
@@ -116,7 +189,20 @@ def fetch_nextrequest(
                 message += f"199 pages. Not trying to scrape {page_url}."
                 logger.warning(message)
             else:
-                r = utils.get_url(page_url)
+                if auth:
+                    if auth["headers"]:
+                        headers = auth["headers"]
+                    else:
+                        headers = {}
+                    if auth["cookies"]:
+                        cookies = auth["cookies"]
+                    else:
+                        cookies = {}
+                    r = utils.get_url(
+                        page_url, headers=headers, cookies=cookies
+                    )
+                else:
+                    r = utils.get_url(page_url)
                 if not r.ok:
                     logger.error(
                         f"Problem downloading {page_url}: {r.status_code}"
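
For reviewers, here is a rough sketch of the `auth` payload this PR introduces and how a cached copy could be replayed against a documents endpoint. The endpoint URL, header values, cookie name, and pagination field are illustrative guesses, not values confirmed by the diff:

```python
import math

import requests

# Shape produced by auth_nextrequest (illustrative values only)
auth = {
    "headers": {"User-Agent": "Mozilla/5.0 ...", "x-csrf-token": "abc123"},
    "cookies": {"_session_id": "deadbeef"},  # Rails-style session cookie; name is a guess
}

# fetch_nextrequest replays the session via utils.get_url, roughly equivalent to:
page_url = (
    "https://mendocinocounty.nextrequest.com/client/request_documents"
    "?request_id=22-18&page_number=1"  # hypothetical endpoint shape
)
r = requests.get(page_url, headers=auth["headers"], cookies=auth["cookies"])
if r.ok:
    data = r.json()
    # Pagination math, assuming find_max_pages is a ceiling division:
    # e.g., 120 documents at a page size of 50 -> 3 pages
    total = data["total_documents_count"]  # field name hypothetical; code uses profile["tally_field"]
    max_pages = math.ceil(total / 50)
```

Because the CSRF token and session cookies are captured once and converted to plain dicts, the scraper can serialize them into the metadata JSON for later asset downloads, which is exactly what `scrape_meta` does when it stamps `auth` onto each entry.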
7 changes: 7 additions & 0 deletions docs/contributing.md
@@ -191,6 +191,11 @@ The metadata file should contain an array of one or more objects with the below
- Complete date: YYYY-MM-DD (eg 1997-07-16)
- Complete date plus hours and minutes: YYYY-MM-DDThh:mmTZD (eg 1997-07-16T19:20+01:00)
- Complete date plus hours, minutes and seconds: YYYY-MM-DDThh:mm:ssTZD (eg 1997-07-16T19:20:30+01:00)
- `auth`: (optional) Details needed to handle authentication when downloading assets, on a per-asset basis. Per-project authentication methods can be handled through agency-level files in state-level `config` folders. (See the sketch below.)
  - `headers`: (optional) Dictionary of request headers needed to download the asset.
  - `cookies`: (optional) Dictionary of cookies to be sent with the request.
  - `method`: (optional) Either `POST` or `GET`.
  - `payload`: (optional) Payload to be sent with `POST` requests.
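
A hypothetical asset entry carrying an `auth` block might look like the following sketch; the field names and values are illustrative, not copied from a real metadata file:

```json
{
    "asset_url": "https://mendocinocounty.nextrequest.com/documents/12345/download",
    "name": "bodycam_2022-01-15.mp4",
    "auth": {
        "headers": {"x-csrf-token": "abc123"},
        "cookies": {"_session_id": "deadbeef"},
        "method": "GET"
    }
}
```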

Below is an example from the `ca_san_diego_pd.json` metadata file.

@@ -213,6 +218,8 @@
]
```

For an example of authentication handling, see `ca_mendocino_county_sheriff.json`.
> **Review comment (Member):** Can you add some more details? Should contribs generate that?


#### Assets

The `clean.cache.Cache.download` method is available to help simplify the process of downloading file "assets" -- e.g. police videos and the HTML of pages where those video links are found -- to a local cache directory.