diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index d541b40e..a1b65ea0 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -1,3 +1,10 @@
 * Ocheze Amuzie ([@ochezems](https://github.com/ochezems))
+* Irene Casado Sánchez ([@irenecasado](https://github.com/irenecasado))
+* Dilcia Mercedes ([@dilcia19](https://github.com/dilcia19))
+* Lisa Pickoff-White ([@pickoffwhite](https://github.com/pickoffwhite))
+* Gerald Rich ([@newsroomdev](https://github.com/newsroomdev))
 * Jordan Rynning ([@jrynning](https://github.com/jrynning))
+* Tarak Shah ([@tarakc02](https://github.com/tarakc02))
+* Nauman Sharif ([@naumansharifwork](https://github.com/naumansharifwork))
+* Mike Stucka ([@stucka](https://github.com/stucka))
 * Serdar Tumgoren ([@zstumgoren](https://github.com/zstumgoren))
diff --git a/clean/ca/mendocino_county_sheriff.py b/clean/ca/mendocino_county_sheriff.py
new file mode 100644
index 00000000..0d4dfd10
--- /dev/null
+++ b/clean/ca/mendocino_county_sheriff.py
@@ -0,0 +1,71 @@
+import logging
+from pathlib import Path
+from typing import Dict, List
+
+from .. import utils
+from ..cache import Cache
+from ..platforms.nextrequest import auth_nextrequest, process_nextrequest
+
+# from ..utils import MetadataDict
+
+logger = logging.getLogger(__name__)
+
+
+class Site:
+    """Scrape file metadata for the Mendocino County Sheriff's Office.
+
+    Attributes:
+        name (str): The official name of the agency
+    """
+
+    name = "Mendocino County Sheriff"
+
+    def __init__(
+        self,
+        data_dir: Path = utils.CLEAN_DATA_DIR,
+        cache_dir: Path = utils.CLEAN_CACHE_DIR,
+    ):
+        """Initialize a new instance.
+
+        Args:
+            data_dir (Path): The directory where downstream processed files/data will be saved
+            cache_dir (Path): The directory where files will be cached
+        """
+        self.site_slug = "ca_mendocino_county_sheriff"
+        self.base_url = "https://mendocinocounty.nextrequest.com"
+        # Initial disclosure page (aka where they start complying with law) contains list of "detail"/child pages with links to the SB16/SB1421/AB748 videos and files
+        # along with additional index pages
+        self.data_dir = data_dir
+        self.cache_dir = cache_dir
+        self.subpages_dir = cache_dir / (self.site_slug + "/subpages")
+        self.cache = Cache(cache_dir)
+        for localdir in [self.cache_dir, self.data_dir, self.subpages_dir]:
+            utils.create_directory(localdir)
+
+    def scrape_meta(self, throttle: int = 2) -> Path:
+        """Gather metadata on downloadable files (videos, etc.).
+
+        Args:
+            throttle (int): Number of seconds to wait between requests. Defaults to 2.
+
+        Returns:
+            Path: Local path of JSON file containing metadata on downloadable files
+        """
+        metadata: List = []
+
+        for folder in ["22-18", "23-27", "20-30"]:
+            username = utils.get_credentials(f"MENDOSO{folder}_USER")
+            password = utils.get_credentials(f"MENDOSO{folder}_PASS")
+            start_url = f"https://mendocinocounty.nextrequest.com/requests/{folder}"
+            auth: Dict = auth_nextrequest(self.base_url, username, password)
+            local_metadata = process_nextrequest(
+                self.subpages_dir, start_url, force=True, throttle=throttle, auth=auth
+            )
+            for i, _entry in enumerate(local_metadata):
+                local_metadata[i]["auth"] = auth
+            metadata.extend(local_metadata)
+
+        json_filename = self.data_dir / (self.site_slug + ".json")
+        self.cache.write_json(json_filename, metadata)
+
+        return json_filename
diff --git a/clean/platforms/nextrequest.py b/clean/platforms/nextrequest.py
index 9f20d761..04be46ae 100644
--- a/clean/platforms/nextrequest.py
+++ b/clean/platforms/nextrequest.py
@@ -5,6 +5,9 @@
 from typing import Dict, List
 from urllib.parse import parse_qs, urlparse
 
+import requests
+from bs4 import BeautifulSoup
+
 from .. import utils
 from ..cache import Cache
 
@@ -17,8 +20,53 @@
 """
 
 
+def auth_nextrequest(base_url: str, username: str, password: str, throttle: int = 2):
+    """Try to retrieve and return necessary authentication.
+
+    Args:
+        base_url (str): The base URL of the NextRequest portal.
+            Example: https://mendocinocounty.nextrequest.com
+        username (str): The username for the NextRequest portal
+        password (str): The password for the NextRequest portal
+    Returns:
+        auth (dict): Dictionary of 'headers' and 'cookies' dictionaries
+
+    Notes:
+        Basic approach from https://github.com/danem/foiatool/blob/main/foiatool/apis/nextrequest.py
+    """
+    session = None
+    session = requests.Session()
+    session.headers["User-Agent"] = (
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
+    )
+    login_url = f"{base_url}/users/sign_in"
+    page = session.get(login_url)
+    sleep(throttle)
+    soup = BeautifulSoup(page.content, "html5lib")
+    token = soup.find(attrs={"name": "csrf-token"})["content"]  # type: ignore
+    payload = {
+        "authenticity_token": token,
+        "user[email]": username,
+        "user[password]": password,
+        "user[remember_me]": "0",
+        "button": "",
+    }
+    session.headers.update({"x-csrf-token": token})  # type: ignore
+    session.post(login_url, params=payload)
+    auth: dict = {}
+    auth["headers"] = dict(session.headers)
+    auth["cookies"] = dict(session.cookies.get_dict())
+    sleep(throttle)
+    session = None
+    return auth  # Force conversion from case-insensitive dict
+
+
 def process_nextrequest(
-    base_directory: Path, start_url: str, force: bool = False, throttle: int = 2
+    base_directory: Path,
+    start_url: str,
+    force: bool = False,
+    throttle: int = 2,
+    auth=None,  # type: ignore
 ):
     """Turn a base filepath and NextRequest folder URL into saved data and parsed Metadata.
 
@@ -29,12 +77,13 @@ def process_nextrequest(
         start_url (str): The web page for the folder of NextRequest docs you want
         force (bool, default False): Overwrite file, if it exists? Otherwise, use cached version.
         throttle (int, default 2): Time to wait between calls
+        auth (dict, optional, default None): Dictionary of 'headers' and 'cookies' dictionaries
     Returns:
         List(Metadata)
     """
     # Download data, if necessary
     filename, returned_json, file_needs_write = fetch_nextrequest(
-        base_directory, start_url, force, throttle=throttle
+        base_directory, start_url, force, throttle=throttle, auth=auth
     )
 
     # Write data, if necessary
@@ -49,7 +98,11 @@ def process_nextrequest(
 
 # Type base_directory to Path
 def fetch_nextrequest(
-    base_directory: Path, start_url: str, force: bool = False, throttle: int = 2
+    base_directory: Path,
+    start_url: str,
+    force: bool = False,
+    throttle: int = 2,
+    auth=None,
 ):
     """
     Given a link to a NextRequest documents folder, return a proposed filename and the JSON contents.
@@ -58,6 +111,7 @@
         base_directory (Path): The directory to save data in, e.g., cache/site-name/subpages
         start_url (str): The web page for the folder of NextRequest docs you want
         force (bool, default False): Overwrite file, if it exists? Otherwise, use cached version.
+        auth (dict, optional, default None): Dictionary of 'headers' and 'cookies' dictionaries
     Returns:
         filename (str): Proposed filename; file NOT saved
         returned_json (None | dict): None if no rescrape needed; dict if JSON had to be downloaded
@@ -83,21 +137,40 @@ def fetch_nextrequest(
     # Remember pagination here!
     page_number = 1
     page_url = f"{json_url}{page_number}"
-    r = utils.get_url(page_url)
+    if auth:
+        if auth["headers"]:
+            headers = auth["headers"]
+        else:
+            headers = {}
+        if auth["cookies"]:
+            cookies = auth["cookies"]
+        else:
+            cookies = {}
+        r = utils.get_url(page_url, headers=headers, cookies=cookies)
+    else:
+        r = utils.get_url(page_url)
     if not r.ok:
         logger.error(f"Problem downloading {page_url}: {r.status_code}")
         returned_json: Dict = {}  # type: ignore
         file_needs_write = False
     else:
         returned_json = r.json()
-        # local_cache.write_json(filename,
         file_needs_write = True
        total_documents = returned_json[profile["tally_field"]]
-        for i, _entry in enumerate(returned_json["documents"]):
-            returned_json["documents"][i]["bln_page_url"] = page_url
-            returned_json["documents"][i]["bln_total_documents"] = total_documents
-        page_size = profile["page_size"]
-        max_pages = find_max_pages(total_documents, page_size)
+        if total_documents == 0:
+            logger.debug(f"No documents found for processing! {returned_json}")
+            max_pages = 0
+        else:
+            for i, _entry in enumerate(returned_json["documents"]):
+                returned_json["documents"][i]["bln_page_url"] = page_url
+                returned_json["documents"][i][
+                    "bln_total_documents"
+                ] = total_documents
+            page_size = profile["page_size"]
+            max_pages = find_max_pages(total_documents, page_size)
+            logger.debug(
+                f"Total documents: {total_documents}. Page size: {page_size}. Max pages: {max_pages}."
+            )
         sleep(throttle)
         if total_documents > profile["doc_limit"]:
             message = f"Request found with {total_documents:,} documents, exceeding limits. "
@@ -116,7 +189,20 @@
                message += f"199 pages. Not trying to scrape {page_url}."
                logger.warning(message)
            else:
-                r = utils.get_url(page_url)
+                if auth:
+                    if auth["headers"]:
+                        headers = auth["headers"]
+                    else:
+                        headers = {}
+                    if auth["cookies"]:
+                        cookies = auth["cookies"]
+                    else:
+                        cookies = {}
+                    r = utils.get_url(
+                        page_url, headers=headers, cookies=cookies
+                    )
+                else:
+                    r = utils.get_url(page_url)
                if not r.ok:
                    logger.error(
                        f"Problem downloading {page_url}: {r.status_code}"
diff --git a/docs/contributing.md b/docs/contributing.md
index 6324a2ad..52a3c916 100644
--- a/docs/contributing.md
+++ b/docs/contributing.md
@@ -191,6 +191,11 @@ The metadata file should contain an array of one or more objects with the below
 - Complete date: YYYY-MM-DD (eg 1997-07-16)
 - Complete date plus hours and minutes: YYYY-MM-DDThh:mmTZD (eg 1997-07-16T19:20+01:00)
 - Complete date plus hours, minutes and seconds: YYYY-MM-DDThh:mm:ssTZD (eg 1997-07-16T19:20:30+01:00)
+- `auth`: (optional) Details needed to handle authentication to download assets, on a per-asset basis. Per-project authentication methods can be handled through agency-level files in state-level `config` folders.
+  - `headers`: (optional) Dictionary of request headers needed to download asset.
+  - `cookies`: (optional) Dictionary of cookies to be sent with request
+  - `method`: (optional) Text of either `POST` or `GET`
+  - `payload`: (optional) Payload to be sent with POST requests
 
 Below is an example from `ca_san_diego_pd.json` metadata JSON.
 
@@ -213,6 +218,8 @@ Below is an example from `ca_san_diego_pd.json` metadata JSON.
 ]
 ```
 
+For an example of authentication handling, see `ca_mendocino_county_sheriff.json`.
+
 #### Assets
 
 The `clean.cache.Cache.download` method is available to help simplify the process of downloading file "assets" -- e.g. police videos and the HTML of pages where those video links are found -- to a local cache directory.
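
Usage note (illustration only, not part of the patch above): a rough sketch of how a downstream consumer might replay the per-asset `auth` block that `scrape_meta` writes into the metadata JSON. The `asset_url` field name, the metadata file location, and the use of plain `requests` are assumptions for the sketch; the repository's own asset-download path may differ.

```python
# Hypothetical sketch: reuse the stored headers/cookies from a metadata
# entry's "auth" block when fetching an asset. Field names other than
# "auth", "headers", and "cookies" are assumed, not confirmed by the patch.
import json
from pathlib import Path

import requests

metadata_path = Path("ca_mendocino_county_sheriff.json")  # assumed location
entries = json.loads(metadata_path.read_text())

for entry in entries:
    auth = entry.get("auth") or {}
    # Fall back to an unauthenticated request when no auth block is present
    response = requests.get(
        entry["asset_url"],  # "asset_url" is an assumed field name
        headers=auth.get("headers") or {},
        cookies=auth.get("cookies") or {},
    )
    response.raise_for_status()
    # ...write response.content to the local cache here
```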