NextRequest auth #182 Mendo SO #169 #188

Open · wants to merge 7 commits into base: dev
7 changes: 7 additions & 0 deletions CONTRIBUTORS.md
@@ -1,3 +1,10 @@
* Ocheze Amuzie ([@ochezems](https://github.com/ochezems))
* Irene Casado Sánchez ([@irenecasado](https://github.com/irenecasado))
* Dilcia Mercedes ([@dilcia19](https://github.com/dilcia19))
* Lisa Pickoff-White ([@pickoffwhite](https://github.com/pickoffwhite))
* Gerald Rich ([@newsroomdev](https://github.com/newsroomdev))
* Jordan Rynning ([@jrynning](https://github.com/jrynning))
* Tarak Shah ([@tarakc02](https://github.com/tarakc02))
* Nauman Sharif ([@naumansharifwork](https://github.com/naumansharifwork))
* Mike Stucka ([@stucka](https://github.com/stucka))
* Serdar Tumgoren ([@zstumgoren](https://github.com/zstumgoren))
71 changes: 71 additions & 0 deletions clean/ca/mendocino_county_sheriff.py
@@ -0,0 +1,71 @@
import logging
from pathlib import Path
from typing import Dict, List

from .. import utils
from ..cache import Cache
from ..platforms.nextrequest import auth_nextrequest, process_nextrequest

# from ..utils import MetadataDict

logger = logging.getLogger(__name__)


class Site:
    """Scrape file metadata for the Mendocino County Sheriff's Office.

    Attributes:
        name (str): The official name of the agency
    """

    name = "Mendocino County Sheriff"

    def __init__(
        self,
        data_dir: Path = utils.CLEAN_DATA_DIR,
        cache_dir: Path = utils.CLEAN_CACHE_DIR,
    ):
        """Initialize a new instance.

        Args:
            data_dir (Path): The directory where downstream processed files/data will be saved
            cache_dir (Path): The directory where files will be cached
        """
        self.site_slug = "ca_mendocino_county_sheriff"
        self.base_url = "https://mendocinocounty.nextrequest.com"
        # The initial disclosure page (aka where they start complying with the law)
        # contains a list of "detail"/child pages with links to the SB16/SB1421/AB748
        # videos and files, along with additional index pages.
        self.data_dir = data_dir
        self.cache_dir = cache_dir
        self.subpages_dir = cache_dir / (self.site_slug + "/subpages")
        self.cache = Cache(cache_dir)
        for localdir in [self.cache_dir, self.data_dir, self.subpages_dir]:
            utils.create_directory(localdir)

    def scrape_meta(self, throttle: int = 2) -> Path:
        """Gather metadata on downloadable files (videos, etc.).

        Args:
            throttle (int): Number of seconds to wait between requests. Defaults to 2.

        Returns:
            Path: Local path of JSON file containing metadata on downloadable files
        """
        metadata: List = []

        for folder in ["22-18", "23-27", "20-30"]:
            username = utils.get_credentials(f"MENDOSO{folder}_USER")
            password = utils.get_credentials(f"MENDOSO{folder}_PASS")
            start_url = f"https://mendocinocounty.nextrequest.com/requests/{folder}"
            auth: Dict = auth_nextrequest(self.base_url, username, password)
            local_metadata = process_nextrequest(
                self.subpages_dir, start_url, force=True, throttle=throttle, auth=auth
            )
            for i, _entry in enumerate(local_metadata):
                local_metadata[i]["auth"] = auth
            metadata.extend(local_metadata)

        json_filename = self.data_dir / (self.site_slug + ".json")
        self.cache.write_json(json_filename, metadata)

        return json_filename
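
For reviewers trying this locally, here is a minimal sketch of how the scraper might be driven. It assumes `utils.get_credentials` reads environment variables and that the import path mirrors the file path — both assumptions, since the diff shows neither:

```python
# Hypothetical local run; the credential-storage mechanism is assumed,
# not shown in this diff.
import os
from pathlib import Path

from clean.ca.mendocino_county_sheriff import Site

# One user/pass pair per request folder, mirroring the loop in scrape_meta
for folder in ["22-18", "23-27", "20-30"]:
    os.environ[f"MENDOSO{folder}_USER"] = "reporter@example.com"
    os.environ[f"MENDOSO{folder}_PASS"] = "not-a-real-password"

site = Site(data_dir=Path("data"), cache_dir=Path("cache"))
json_path = site.scrape_meta(throttle=2)  # seconds between requests
print(f"Wrote metadata to {json_path}")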
108 changes: 97 additions & 11 deletions clean/platforms/nextrequest.py
@@ -5,6 +5,9 @@
 from typing import Dict, List
 from urllib.parse import parse_qs, urlparse
 
+import requests
+from bs4 import BeautifulSoup
+
 from .. import utils
 from ..cache import Cache
 
@@ -17,8 +20,53 @@
 """
 
 
+def auth_nextrequest(base_url: str, username: str, password: str, throttle: int = 2):
+    """Try to retrieve and return necessary authentication.
+
+    Args:
+        base_url (str): The base URL of the NextRequest portal.
+            Example: https://mendocinocounty.nextrequest.com
+        username (str): The username for the NextRequest portal
+        password (str): The password for the NextRequest portal
+        throttle (int, default 2): Seconds to wait after each request
+    Returns:
+        auth (dict): Dictionary of 'headers' and 'cookies' dictionaries
+
+    Notes:
+        Basic approach from https://github.com/danem/foiatool/blob/main/foiatool/apis/nextrequest.py
+    """
+    session = requests.Session()
+    session.headers["User-Agent"] = (
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 "
+        "(KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
+    )
+    login_url = f"{base_url}/users/sign_in"
+    page = session.get(login_url)
+    sleep(throttle)
+    soup = BeautifulSoup(page.content, "html5lib")
+    token = soup.find(attrs={"name": "csrf-token"})["content"]  # type: ignore
+    payload = {
+        "authenticity_token": token,
+        "user[email]": username,
+        "user[password]": password,
+        "user[remember_me]": "0",
+        "button": "",
+    }
+    session.headers.update({"x-csrf-token": token})  # type: ignore
+    session.post(login_url, params=payload)
+    # Force conversion from requests' case-insensitive structures to plain dicts
+    auth: dict = {}
+    auth["headers"] = dict(session.headers)
+    auth["cookies"] = dict(session.cookies.get_dict())
+    sleep(throttle)
+    session = None
+    return auth
+
+
 def process_nextrequest(
-    base_directory: Path, start_url: str, force: bool = False, throttle: int = 2
+    base_directory: Path,
+    start_url: str,
+    force: bool = False,
+    throttle: int = 2,
+    auth=None,  # type: ignore
 ):
     """Turn a base filepath and NextRequest folder URL into saved data and parsed Metadata.
 
@@ -29,12 +77,13 @@ def process_nextrequest(
         start_url (str): The web page for the folder of NextRequest docs you want
         force (bool, default False): Overwrite file, if it exists? Otherwise, use cached version.
         throttle (int, default 2): Time to wait between calls
+        auth (dict, optional, default None): Dictionary of 'headers' and 'cookies' dictionaries
     Returns:
         List(Metadata)
     """
     # Download data, if necessary
     filename, returned_json, file_needs_write = fetch_nextrequest(
-        base_directory, start_url, force, throttle=throttle
+        base_directory, start_url, force, throttle=throttle, auth=auth
     )
 
     # Write data, if necessary
@@ -49,7 +98,11 @@
 
 # Type base_directory to Path
 def fetch_nextrequest(
-    base_directory: Path, start_url: str, force: bool = False, throttle: int = 2
+    base_directory: Path,
+    start_url: str,
+    force: bool = False,
+    throttle: int = 2,
+    auth=None,
 ):
     """
     Given a link to a NextRequest documents folder, return a proposed filename and the JSON contents.
@@ -58,6 +111,7 @@ def fetch_nextrequest(
         base_directory (Path): The directory to save data in, e.g., cache/site-name/subpages
         start_url (str): The web page for the folder of NextRequest docs you want
         force (bool, default False): Overwrite file, if it exists? Otherwise, use cached version.
+        auth (dict, optional, default None): Dictionary of 'headers' and 'cookies' dictionaries
     Returns:
         filename (str): Proposed filename; file NOT saved
         returned_json (None | dict): None if no rescrape needed; dict if JSON had to be downloaded
@@ -83,21 +137,40 @@
     # Remember pagination here!
     page_number = 1
     page_url = f"{json_url}{page_number}"
-    r = utils.get_url(page_url)
+    if auth:
+        if auth["headers"]:
+            headers = auth["headers"]
+        else:
+            headers = {}
+        if auth["cookies"]:
+            cookies = auth["cookies"]
+        else:
+            cookies = {}
+        r = utils.get_url(page_url, headers=headers, cookies=cookies)
+    else:
+        r = utils.get_url(page_url)
     if not r.ok:
         logger.error(f"Problem downloading {page_url}: {r.status_code}")
         returned_json: Dict = {}  # type: ignore
         file_needs_write = False
     else:
         returned_json = r.json()
         # local_cache.write_json(filename,
         file_needs_write = True
         total_documents = returned_json[profile["tally_field"]]
-        for i, _entry in enumerate(returned_json["documents"]):
-            returned_json["documents"][i]["bln_page_url"] = page_url
-            returned_json["documents"][i]["bln_total_documents"] = total_documents
-        page_size = profile["page_size"]
-        max_pages = find_max_pages(total_documents, page_size)
+        if total_documents == 0:
+            logger.debug(f"No documents found for processing! {returned_json}")
+            max_pages = 0
+        else:
+            for i, _entry in enumerate(returned_json["documents"]):
+                returned_json["documents"][i]["bln_page_url"] = page_url
+                returned_json["documents"][i][
+                    "bln_total_documents"
+                ] = total_documents
+            page_size = profile["page_size"]
+            max_pages = find_max_pages(total_documents, page_size)
+            logger.debug(
+                f"Total documents: {total_documents}. Page size: {page_size}. Max pages: {max_pages}."
+            )
     sleep(throttle)
     if total_documents > profile["doc_limit"]:
         message = f"Request found with {total_documents:,} documents, exceeding limits. "
@@ -116,7 +189,20 @@ def fetch_nextrequest(
                 message += f"199 pages. Not trying to scrape {page_url}."
                 logger.warning(message)
             else:
-                r = utils.get_url(page_url)
+                if auth:
+                    if auth["headers"]:
+                        headers = auth["headers"]
+                    else:
+                        headers = {}
+                    if auth["cookies"]:
+                        cookies = auth["cookies"]
+                    else:
+                        cookies = {}
+                    r = utils.get_url(
+                        page_url, headers=headers, cookies=cookies
+                    )
+                else:
+                    r = utils.get_url(page_url)
                 if not r.ok:
                     logger.error(
                         f"Problem downloading {page_url}: {r.status_code}"
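
For reviewers, here is a rough sketch of the `auth` payload this PR introduces and how a cached copy could be replayed against a documents endpoint. The endpoint URL, header values, cookie name, and pagination field are illustrative guesses, not values confirmed by the diff:

```python
import math

import requests

# Shape produced by auth_nextrequest (illustrative values only)
auth = {
    "headers": {"User-Agent": "Mozilla/5.0 ...", "x-csrf-token": "abc123"},
    "cookies": {"_session_id": "deadbeef"},  # Rails-style session cookie; name is a guess
}

# fetch_nextrequest replays the session via utils.get_url, roughly equivalent to:
page_url = (
    "https://mendocinocounty.nextrequest.com/client/request_documents"
    "?request_id=22-18&page_number=1"  # hypothetical endpoint shape
)
r = requests.get(page_url, headers=auth["headers"], cookies=auth["cookies"])
if r.ok:
    data = r.json()
    # Pagination math, assuming find_max_pages is a ceiling division:
    # e.g., 120 documents at a page size of 50 -> 3 pages
    total = data["total_documents_count"]  # field name hypothetical; code uses profile["tally_field"]
    max_pages = math.ceil(total / 50)
```

Because the CSRF token and session cookies are captured once and converted to plain dicts, the scraper can serialize them into the metadata JSON for later asset downloads, which is exactly what `scrape_meta` does when it stamps `auth` onto each entry.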
7 changes: 7 additions & 0 deletions docs/contributing.md
@@ -191,6 +191,11 @@ The metadata file should contain an array of one or more objects with the below
- Complete date: YYYY-MM-DD (eg 1997-07-16)
- Complete date plus hours and minutes: YYYY-MM-DDThh:mmTZD (eg 1997-07-16T19:20+01:00)
- Complete date plus hours, minutes and seconds: YYYY-MM-DDThh:mm:ssTZD (eg 1997-07-16T19:20:30+01:00)
- `auth`: (optional) Details needed to handle authentication when downloading assets, on a per-asset basis. Per-project authentication methods can be handled through agency-level files in state-level `config` folders. (See the sketch below.)
  - `headers`: (optional) Dictionary of request headers needed to download the asset.
  - `cookies`: (optional) Dictionary of cookies to be sent with the request.
  - `method`: (optional) Either `POST` or `GET`.
  - `payload`: (optional) Payload to be sent with `POST` requests.
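
A hypothetical asset entry carrying an `auth` block might look like the following sketch; the field names and values are illustrative, not copied from a real metadata file:

```json
{
    "asset_url": "https://mendocinocounty.nextrequest.com/documents/12345/download",
    "name": "bodycam_2022-01-15.mp4",
    "auth": {
        "headers": {"x-csrf-token": "abc123"},
        "cookies": {"_session_id": "deadbeef"},
        "method": "GET"
    }
}
```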

Below is an example from the `ca_san_diego_pd.json` metadata file.

@@ -213,6 +218,8 @@
]
```

For an example of authentication handling, see `ca_mendocino_county_sheriff.json`.
> **Review comment (Member):** Can you add some more details? Should contribs generate that?


#### Assets

The `clean.cache.Cache.download` method is available to help simplify the process of downloading file "assets" -- e.g. police videos and the HTML of pages where those video links are found -- to a local cache directory.