From 65e06586d8b56aa7f2cc408f4fee59f924e0647b Mon Sep 17 00:00:00 2001 From: Nauman <157045300+naumansharifwork@users.noreply.github.com> Date: Thu, 24 Oct 2024 13:31:21 +0500 Subject: [PATCH 1/2] added scrape-meta for lake_county_sheriff --- clean/ca/lake_county_sheriff.py | 162 ++++++++++++++++++++++++++++++++ 1 file changed, 162 insertions(+) create mode 100644 clean/ca/lake_county_sheriff.py diff --git a/clean/ca/lake_county_sheriff.py b/clean/ca/lake_county_sheriff.py new file mode 100644 index 0000000..5bb0886 --- /dev/null +++ b/clean/ca/lake_county_sheriff.py @@ -0,0 +1,162 @@ +import os +import time +from pathlib import Path + +from bs4 import BeautifulSoup +from dotenv import load_dotenv + +from .. import utils +from ..cache import Cache + + +class Site: + """Scrape file metadata and download files for the City of Lake County Sheriff. + + Attributes: + name (str): The official name of the agency + """ + + name = "Lake County Sheriff" + + def __init__( + self, + data_dir: Path = utils.CLEAN_DATA_DIR, + cache_dir: Path = utils.CLEAN_CACHE_DIR, + ): + """Initialize a new instance. 
+ + Args: + data_dir (Path): The directory where downstream processed files/data will be saved + cache_dir (Path): The directory where files will be cached + """ + self.base_url = "https://www.lakesheriff.com/969/Use-of-Force" + self.zenrows_api_url = "https://api.zenrows.com/v1/" + self.data_dir = data_dir + self.cache_dir = cache_dir + self.cache = Cache(cache_dir) + dotenv_path = "env/.env" + load_dotenv(dotenv_path=dotenv_path) + self.params = { + "apikey": os.getenv("zenrows_key"), + "url": "", # Target website URL + # Add any other ZenRows parameters here (optional) + } + + @property + def agency_slug(self) -> str: + """Construct the agency slug.""" + # Use module path to construct agency slug, which we'll use downstream + mod = Path(__file__) + state_postal = mod.parent.stem + return f"{state_postal}_{mod.stem}" # ca_lake_county_sheriff + + def scrape_meta(self, throttle=0): + # construct a local filename relative to the cache directory - agency slug + page url (ca_lake_county_sheriff/Use-of-Force.html) + # download the page (if not already cached) + # save the index page url to cache (sensible name) + base_name = f"{self.base_url.split('/')[-1]}.html" + filename = f"{self.agency_slug}/{base_name}" + self.params["url"] = self.base_url + self.cache.download(filename, self.zenrows_api_url, params=self.params) + metadata = [] + child_pages = [] + html = self.cache.read(filename) + soup = BeautifulSoup(html, "html.parser") + body = soup.find("table", class_="fr-alternate-rows") + child_links = body.find_all("a") + for link in child_links: + tr_tag = link.find_parent("tr") + td_tag = tr_tag.find_all("td") + child_page_data = dict() + child_page_data["date"] = td_tag[0].text + child_page_data["location"] = td_tag[1].get_text(separator=", ") + child_page_data["name"] = td_tag[2].text + child_page_data["incident_type"] = td_tag[3].abbr.text + child_page_data["case_number"] = link.text + child_file_name = ( + 
f'{self.agency_slug}/{child_page_data["case_number"]}.html' + ) + if link["href"]: + link_url = f"https://www.lakesheriff.com{link['href']}" + self.params["url"] = link_url + self.cache.download( + child_file_name, self.zenrows_api_url, params=self.params + ) + child_page_data["page_filename"] = child_file_name + child_pages.append(child_page_data) + time.sleep(throttle) + for child_page in child_pages: + html = self.cache.read(child_page["page_filename"]) + soup = BeautifulSoup(html, "html.parser") + body = soup.find(attrs={"data-cprole": "mainContentContainer"}) + links = body.find_all("a") + for link in links: + link_href = link.get("href", None) + if link_href: + if "youtu" in link_href: + payload = { + "asset_url": link_href, + "case_id": child_page["case_number"], + "name": link.text, + "title": link.text, + "parent_page": str(child_page["page_filename"]), + "details": { + "date": child_page["date"], + "location": child_page["location"], + "name": child_page["name"], + "incident_type": child_page["incident_type"], + }, + } + metadata.append(payload) + elif "DocumentCenter" in link_href: + payload = { + "asset_url": f"https://www.lakesheriff.com{link_href}", + "case_id": child_page["case_number"], + "name": link.text, + "title": link.text, + "parent_page": str(child_page["page_filename"]), + "details": { + "date": child_page["date"], + "location": child_page["location"], + "name": child_page["name"], + "incident_type": child_page["incident_type"], + }, + } + metadata.append(payload) + elif "gallery" in link_href: + gallery_id = link_href.split("=")[-1] + galley_link = f"https://www.lakesheriff.com/SlideShow.aspx?AID={gallery_id}&AN=Sheriff%20-%20Use%20of%20Force%20-%20Case%2014110123" + self.params["url"] = galley_link + images_file_name = ( + f"{self.agency_slug}/images_{gallery_id}.html" + ) + self.cache.download( + images_file_name, self.zenrows_api_url, params=self.params + ) + html = self.cache.read(images_file_name) + soup = BeautifulSoup(html, 
"html.parser") + body = soup.find("div", class_="slides") + a_tags = body.find_all("a") + for a_tag in a_tags: + img_tag = a_tag.find("img") + # Get the 'src' and 'alt' attributes + image_src = img_tag.get("src") + image_alt = img_tag.get("alt") + payload = { + "asset_url": f"https://www.lakesheriff.com{image_src}", + "case_id": child_page["case_number"], + "name": image_alt, + "title": link.text, + "parent_page": str(child_page["page_filename"]), + "details": { + "date": child_page["date"], + "location": child_page["location"], + "name": child_page["name"], + "incident_type": child_page["incident_type"], + }, + } + metadata.append(payload) + + outfile = self.data_dir.joinpath(f"{self.agency_slug}.json") + self.cache.write_json(outfile, metadata) + return outfile From 1e5e4d2aaa130ceafb42aafdca351b785ab8a962 Mon Sep 17 00:00:00 2001 From: Nauman <157045300+naumansharifwork@users.noreply.github.com> Date: Tue, 29 Oct 2024 14:55:22 +0500 Subject: [PATCH 2/2] added zen-rows doc --- clean/ca/lake_county_sheriff.py | 2 +- docs/contributing.md | 5 +++++ env.sample/.env.sample | 1 + 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/clean/ca/lake_county_sheriff.py b/clean/ca/lake_county_sheriff.py index 5bb0886..79e9c26 100644 --- a/clean/ca/lake_county_sheriff.py +++ b/clean/ca/lake_county_sheriff.py @@ -37,7 +37,7 @@ def __init__( dotenv_path = "env/.env" load_dotenv(dotenv_path=dotenv_path) self.params = { - "apikey": os.getenv("zenrows_key"), + "apikey": os.getenv("ZENROWS_KEY"), "url": "", # Target website URL # Add any other ZenRows parameters here (optional) } diff --git a/docs/contributing.md b/docs/contributing.md index 6324a2a..ee1f83c 100644 --- a/docs/contributing.md +++ b/docs/contributing.md @@ -358,3 +358,8 @@ git push origin your-branch-name The final step is to submit a [pull request](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/about-pull-requests) to the main 
respository, asking the maintainers to consider integrating your patch. GitHub has [a short guide](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request) that can walk you through the process. You should tag your issue number in the request so that it gets linked in GitHub's system. + + +## ZenRows Use + +Some sites use the ZenRows API. Click [here](https://app.zenrows.com/login) to sign up for an account and get a ZENROWS_KEY, which you can add to your .env file. diff --git a/env.sample/.env.sample b/env.sample/.env.sample index 7b161b2..6f19092 100644 --- a/env.sample/.env.sample +++ b/env.sample/.env.sample @@ -2,3 +2,4 @@ MUCKROCK_SAMPLE=TKTKTK GOVQA_CA_SAMPLECITY_PASSWORD=TKTKTK GOVQA_CA_SAMPLECITY_USERNAME=TKTKTK MUCKROCK_CRP=TKTKTK +ZENROWS_KEY=TKTKTK