added scrape-meta for lake_county_sheriff #132 #157

Open · wants to merge 4 commits into base: dev
162 changes: 162 additions & 0 deletions clean/ca/lake_county_sheriff.py
@@ -0,0 +1,162 @@
import os
import time
from pathlib import Path

from bs4 import BeautifulSoup
from dotenv import load_dotenv

from .. import utils
from ..cache import Cache


class Site:
    """Scrape file metadata and download files for the Lake County Sheriff's Office (CA).

    Attributes:
        name (str): The official name of the agency
    """

    name = "Lake County Sheriff"

    def __init__(
        self,
        data_dir: Path = utils.CLEAN_DATA_DIR,
        cache_dir: Path = utils.CLEAN_CACHE_DIR,
    ):
        """Initialize a new instance.

        Args:
            data_dir (Path): The directory where downstream processed files/data will be saved
            cache_dir (Path): The directory where files will be cached
        """
        self.base_url = "https://www.lakesheriff.com/969/Use-of-Force"
        self.zenrows_api_url = "https://api.zenrows.com/v1/"
        self.data_dir = data_dir
        self.cache_dir = cache_dir
        self.cache = Cache(cache_dir)
        dotenv_path = "env/.env"
        load_dotenv(dotenv_path=dotenv_path)
        self.params = {
            "apikey": os.getenv("ZENROWS_KEY"),
            "url": "",  # Target website URL
            # Add any other ZenRows parameters here (optional)
        }

    @property
    def agency_slug(self) -> str:
        """Construct the agency slug."""
        # Use module path to construct agency slug, which we'll use downstream
        mod = Path(__file__)
        state_postal = mod.parent.stem
        return f"{state_postal}_{mod.stem}"  # ca_lake_county_sheriff

    def scrape_meta(self, throttle=0):
        """Gather metadata on downloadable files (videos, documents and images).

        Args:
            throttle (int): Number of seconds to wait between requests. Defaults to 0.

        Returns:
            Path: Local path of JSON file containing metadata on downloadable files
        """
        # Construct a local filename relative to the cache directory from the agency slug
        # and the page name (e.g. ca_lake_county_sheriff/Use-of-Force.html), then download
        # the index page through ZenRows if it is not already cached.
        base_name = f"{self.base_url.split('/')[-1]}.html"
        filename = f"{self.agency_slug}/{base_name}"
        self.params["url"] = self.base_url
        self.cache.download(filename, self.zenrows_api_url, params=self.params)
        metadata = []
        child_pages = []
        # Each row of the use-of-force table links to a child page for a single case.
        html = self.cache.read(filename)
        soup = BeautifulSoup(html, "html.parser")
        body = soup.find("table", class_="fr-alternate-rows")
        child_links = body.find_all("a")
        for link in child_links:
            tr_tag = link.find_parent("tr")
            td_tag = tr_tag.find_all("td")
            child_page_data = dict()
            child_page_data["date"] = td_tag[0].text
            child_page_data["location"] = td_tag[1].get_text(separator=", ")
            child_page_data["name"] = td_tag[2].text
            child_page_data["incident_type"] = td_tag[3].abbr.text
            child_page_data["case_number"] = link.text
            child_file_name = (
                f'{self.agency_slug}/{child_page_data["case_number"]}.html'
            )
            if link["href"]:
                link_url = f"https://www.lakesheriff.com{link['href']}"
                self.params["url"] = link_url
                self.cache.download(
                    child_file_name, self.zenrows_api_url, params=self.params
                )
                child_page_data["page_filename"] = child_file_name
                child_pages.append(child_page_data)
                time.sleep(throttle)
        # Parse each cached child page for assets: YouTube videos, Document Center
        # files and photo galleries.
        for child_page in child_pages:
            html = self.cache.read(child_page["page_filename"])
            soup = BeautifulSoup(html, "html.parser")
            body = soup.find(attrs={"data-cprole": "mainContentContainer"})
            links = body.find_all("a")
            for link in links:
                link_href = link.get("href", None)
                if link_href:
                    if "youtu" in link_href:
                        payload = {
                            "asset_url": link_href,
                            "case_id": child_page["case_number"],
                            "name": link.text,
                            "title": link.text,
                            "parent_page": str(child_page["page_filename"]),
                            "details": {
                                "date": child_page["date"],
                                "location": child_page["location"],
                                "name": child_page["name"],
                                "incident_type": child_page["incident_type"],
                            },
                        }
                        metadata.append(payload)
                    elif "DocumentCenter" in link_href:
                        payload = {
                            "asset_url": f"https://www.lakesheriff.com{link_href}",
                            "case_id": child_page["case_number"],
                            "name": link.text,
                            "title": link.text,
                            "parent_page": str(child_page["page_filename"]),
                            "details": {
                                "date": child_page["date"],
                                "location": child_page["location"],
                                "name": child_page["name"],
                                "incident_type": child_page["incident_type"],
                            },
                        }
                        metadata.append(payload)
                    elif "gallery" in link_href:
                        # Photo galleries are rendered by SlideShow.aspx; download the
                        # slideshow page and collect each image it references.
                        gallery_id = link_href.split("=")[-1]
                        gallery_link = f"https://www.lakesheriff.com/SlideShow.aspx?AID={gallery_id}&AN=Sheriff%20-%20Use%20of%20Force%20-%20Case%2014110123"
                        self.params["url"] = gallery_link
                        images_file_name = (
                            f"{self.agency_slug}/images_{gallery_id}.html"
                        )
                        self.cache.download(
                            images_file_name, self.zenrows_api_url, params=self.params
                        )
                        html = self.cache.read(images_file_name)
                        soup = BeautifulSoup(html, "html.parser")
                        body = soup.find("div", class_="slides")
                        a_tags = body.find_all("a")
                        for a_tag in a_tags:
                            img_tag = a_tag.find("img")
                            # Get the 'src' and 'alt' attributes
                            image_src = img_tag.get("src")
                            image_alt = img_tag.get("alt")
                            payload = {
                                "asset_url": f"https://www.lakesheriff.com{image_src}",
                                "case_id": child_page["case_number"],
                                "name": image_alt,
                                "title": link.text,
                                "parent_page": str(child_page["page_filename"]),
                                "details": {
                                    "date": child_page["date"],
                                    "location": child_page["location"],
                                    "name": child_page["name"],
                                    "incident_type": child_page["incident_type"],
                                },
                            }
                            metadata.append(payload)

        outfile = self.data_dir.joinpath(f"{self.agency_slug}.json")
        self.cache.write_json(outfile, metadata)
        return outfile
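
For reviewers: a minimal sketch of how the new scraper could be exercised locally, assuming the package layout shown in this diff (`clean/ca/lake_county_sheriff.py`) and a valid `ZENROWS_KEY` in `env/.env`; the `throttle` value is only an example.

```python
from clean.ca.lake_county_sheriff import Site

# Use the default data/cache directories and wait 2 seconds between child-page downloads.
site = Site()
outfile = site.scrape_meta(throttle=2)
print(f"Metadata written to {outfile}")  # e.g. .../ca_lake_county_sheriff.json
```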
5 changes: 5 additions & 0 deletions docs/contributing.md
@@ -358,3 +358,8 @@ git push origin your-branch-name
The final step is to submit a [pull request](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/about-pull-requests) to the main repository, asking the maintainers to consider integrating your patch.

GitHub has [a short guide](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request) that can walk you through the process. You should tag your issue number in the request so that it gets linked in GitHub's system.


## Using ZenRows

Some sites use the ZenRows API to fetch pages. Click [here](https://app.zenrows.com/login) to sign up for an account and get a `ZENROWS_KEY`, which you can add to your `.env` file.
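
Scrapers pass the key as the `apikey` parameter alongside the target `url` when calling the ZenRows endpoint, as the `Site` class in this PR does. The snippet below is a minimal sketch of that pattern (illustrative only; it assumes `requests` is available and the key lives in `env/.env`).

```python
import os

import requests
from dotenv import load_dotenv

# Load ZENROWS_KEY from the project's .env file (path assumed to be env/.env).
load_dotenv(dotenv_path="env/.env")

params = {
    "apikey": os.getenv("ZENROWS_KEY"),
    "url": "https://www.lakesheriff.com/969/Use-of-Force",  # target page to fetch
}
response = requests.get("https://api.zenrows.com/v1/", params=params)
response.raise_for_status()
html = response.text  # rendered HTML returned by ZenRows
```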
1 change: 1 addition & 0 deletions env.sample/.env.sample
@@ -2,3 +2,4 @@ MUCKROCK_SAMPLE=TKTKTK
GOVQA_CA_SAMPLECITY_PASSWORD=TKTKTK
GOVQA_CA_SAMPLECITY_USERNAME=TKTKTK
MUCKROCK_CRP=TKTKTK
ZENROWS_KEY=TKTKTK