From 65e06586d8b56aa7f2cc408f4fee59f924e0647b Mon Sep 17 00:00:00 2001
From: Nauman <157045300+naumansharifwork@users.noreply.github.com>
Date: Thu, 24 Oct 2024 13:31:21 +0500
Subject: [PATCH 1/2] added scrape-meta for lake_county_sheriff

---
 clean/ca/lake_county_sheriff.py | 202 ++++++++++++++++++++++++++++++++
 1 file changed, 202 insertions(+)
 create mode 100644 clean/ca/lake_county_sheriff.py

diff --git a/clean/ca/lake_county_sheriff.py b/clean/ca/lake_county_sheriff.py
new file mode 100644
index 0000000..5bb0886
--- /dev/null
+++ b/clean/ca/lake_county_sheriff.py
@@ -0,0 +1,202 @@
+import os
+import time
+from pathlib import Path
+from urllib.parse import urljoin
+
+from bs4 import BeautifulSoup
+from dotenv import load_dotenv
+
+from .. import utils
+from ..cache import Cache
+
+
+class Site:
+    """Scrape file metadata and download files for the Lake County Sheriff.
+
+    Attributes:
+        name (str): The official name of the agency
+    """
+
+    name = "Lake County Sheriff"
+
+    def __init__(
+        self,
+        data_dir: Path = utils.CLEAN_DATA_DIR,
+        cache_dir: Path = utils.CLEAN_CACHE_DIR,
+    ):
+        """Initialize a new instance.
+
+        Args:
+            data_dir (Path): The directory where downstream processed files/data will be saved
+            cache_dir (Path): The directory where files will be cached
+        """
+        self.base_url = "https://www.lakesheriff.com/969/Use-of-Force"
+        self.zenrows_api_url = "https://api.zenrows.com/v1/"
+        self.data_dir = data_dir
+        self.cache_dir = cache_dir
+        self.cache = Cache(cache_dir)
+        dotenv_path = "env/.env"
+        load_dotenv(dotenv_path=dotenv_path)
+        self.params = {
+            "apikey": os.getenv("zenrows_key"),
+            "url": "",  # Target website URL
+            # Add any other ZenRows parameters here (optional)
+        }
+
+    @property
+    def agency_slug(self) -> str:
+        """Construct the agency slug."""
+        # Use module path to construct agency slug, which we'll use downstream
+        mod = Path(__file__)
+        state_postal = mod.parent.stem
+        return f"{state_postal}_{mod.stem}"  # ca_lake_county_sheriff
+
+    def scrape_meta(self, throttle: float = 0) -> Path:
+        """Gather metadata on downloadable files (videos, documents, images).
+
+        Downloads the Use-of-Force index page and each case page it links to
+        through the ZenRows API, then records one metadata entry per YouTube
+        link, DocumentCenter file and gallery image found on the case pages.
+
+        Args:
+            throttle (float): Seconds to sleep after each case page download
+
+        Returns:
+            Path: Local path of the JSON file containing the metadata
+
+        Raises:
+            ValueError: If an expected page element is missing from a cached page
+        """
+        # Cache the index page under the agency slug, named after the last URL
+        # segment (ca_lake_county_sheriff/Use-of-Force.html)
+        index_filename = f"{self.agency_slug}/{self.base_url.split('/')[-1]}.html"
+        self._download(index_filename, self.base_url)
+        metadata = []
+        for case in self._scrape_index(index_filename, throttle):
+            metadata.extend(self._scrape_case_page(case))
+        outfile = self.data_dir.joinpath(f"{self.agency_slug}.json")
+        self.cache.write_json(outfile, metadata)
+        return outfile
+
+    def _download(self, filename: str, url: str) -> None:
+        """Download a URL through the ZenRows API into the given cache file."""
+        # Build per-request params so the shared dict is never left pointing
+        # at whichever page was fetched last
+        params = {**self.params, "url": url}
+        self.cache.download(filename, self.zenrows_api_url, params=params)
+
+    def _soup(self, filename: str) -> BeautifulSoup:
+        """Parse a cached HTML page."""
+        return BeautifulSoup(self.cache.read(filename), "html.parser")
+
+    @staticmethod
+    def _require(element, description: str, filename: str):
+        """Return a parsed element, raising a clear error when it is missing."""
+        if element is None:
+            raise ValueError(f"Could not find {description} in {filename}")
+        return element
+
+    def _scrape_index(self, index_filename: str, throttle: float) -> list:
+        """Read case rows from the cached index page and download their pages.
+
+        Returns:
+            list: One dict per case link with the row's fields plus the URL
+                and cache filename of the downloaded case page
+        """
+        table = self._require(
+            self._soup(index_filename).find("table", class_="fr-alternate-rows"),
+            "the cases table",
+            index_filename,
+        )
+        cases = []
+        for link in table.find_all("a"):
+            href = link.get("href")
+            row = link.find_parent("tr")
+            cells = row.find_all("td") if row is not None else []
+            # Anchors without a target or outside a four-column data row
+            # cannot describe a case page
+            if not href or len(cells) < 4:
+                continue
+            # Fall back to the cell text when the type is not wrapped in <abbr>
+            abbr = cells[3].find("abbr")
+            case_number = link.text
+            page_url = urljoin(self.base_url, href)
+            page_filename = f"{self.agency_slug}/{case_number}.html"
+            self._download(page_filename, page_url)
+            cases.append(
+                {
+                    "date": cells[0].text,
+                    "location": cells[1].get_text(separator=", "),
+                    "name": cells[2].text,
+                    "incident_type": (abbr if abbr is not None else cells[3]).text,
+                    "case_number": case_number,
+                    "page_url": page_url,
+                    "page_filename": page_filename,
+                }
+            )
+            time.sleep(throttle)
+        return cases
+
+    def _scrape_case_page(self, case: dict) -> list:
+        """Collect asset metadata from the links on one cached case page."""
+        container = self._require(
+            self._soup(case["page_filename"]).find(
+                attrs={"data-cprole": "mainContentContainer"}
+            ),
+            "the main content container",
+            case["page_filename"],
+        )
+        assets = []
+        for link in container.find_all("a"):
+            href = link.get("href")
+            if not href:
+                continue
+            if "youtu" in href:
+                # Video links are recorded exactly as written on the page
+                assets.append(self._asset(case, href, link.text, link.text))
+            elif "DocumentCenter" in href:
+                asset_url = urljoin(case["page_url"], href)
+                assets.append(self._asset(case, asset_url, link.text, link.text))
+            elif "gallery" in href:
+                assets.extend(self._scrape_gallery(case, href, link.text))
+        return assets
+
+    def _scrape_gallery(self, case: dict, href: str, title: str) -> list:
+        """Download a gallery's slideshow page and describe each image on it."""
+        gallery_id = href.split("=")[-1]
+        # NOTE(review): AN names case 14110123 for every gallery; confirm the
+        # slideshow endpoint does not depend on it
+        gallery_url = f"https://www.lakesheriff.com/SlideShow.aspx?AID={gallery_id}&AN=Sheriff%20-%20Use%20of%20Force%20-%20Case%2014110123"
+        filename = f"{self.agency_slug}/images_{gallery_id}.html"
+        self._download(filename, gallery_url)
+        slides = self._require(
+            self._soup(filename).find("div", class_="slides"),
+            "the slides container",
+            filename,
+        )
+        assets = []
+        for anchor in slides.find_all("a"):
+            image = anchor.find("img")
+            # Anchors without an image source have no asset to record
+            if image is None or not image.get("src"):
+                continue
+            asset_url = urljoin(gallery_url, image["src"])
+            assets.append(self._asset(case, asset_url, image.get("alt"), title))
+        return assets
+
+    @staticmethod
+    def _asset(case: dict, asset_url: str, name, title) -> dict:
+        """Build the metadata entry for one asset belonging to a case."""
+        return {
+            "asset_url": asset_url,
+            "case_id": case["case_number"],
+            "name": name,
+            "title": title,
+            "parent_page": str(case["page_filename"]),
+            "details": {
+                "date": case["date"],
+                "location": case["location"],
+                "name": case["name"],
+                "incident_type": case["incident_type"],
+            },
+        }

From 1e5e4d2aaa130ceafb42aafdca351b785ab8a962 Mon Sep 17 00:00:00 2001
From: Nauman <157045300+naumansharifwork@users.noreply.github.com>
Date: Tue, 29 Oct 2024 14:55:22 +0500
Subject: [PATCH 2/2] added zen-rows doc

---
 clean/ca/lake_county_sheriff.py | 2 +-
 docs/contributing.md            | 5 +++++
 env.sample/.env.sample          | 1 +
 3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/clean/ca/lake_county_sheriff.py b/clean/ca/lake_county_sheriff.py
index 5bb0886..79e9c26 100644
--- a/clean/ca/lake_county_sheriff.py
+++ b/clean/ca/lake_county_sheriff.py
@@ -37,7 +37,7 @@ def __init__(
         dotenv_path = "env/.env"
         load_dotenv(dotenv_path=dotenv_path)
         self.params = {
-            "apikey": os.getenv("zenrows_key"),
+            "apikey": os.getenv("ZENROWS_KEY"),
             "url": "",  # Target website URL
             # Add any other ZenRows parameters here (optional)
         }
diff --git a/docs/contributing.md b/docs/contributing.md
index 6324a2a..ee1f83c 100644
--- a/docs/contributing.md
+++ b/docs/contributing.md
@@ -358,3 +358,8 @@ git push origin your-branch-name
 The final step is to submit a [pull request](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/about-pull-requests) to the main respository, asking the maintainers to consider integrating your patch.
 
 GitHub has [a short guide](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request) that can walk you through the process. You should tag your issue number in the request so that it gets linked in GitHub's system.
+
+
+## ZenRows Use
+
+Some sites use the ZenRows API. Click [here](https://app.zenrows.com/login) to sign up for an account and get a ZENROWS_KEY, which you can add to your .env file.
diff --git a/env.sample/.env.sample b/env.sample/.env.sample
index 7b161b2..6f19092 100644
--- a/env.sample/.env.sample
+++ b/env.sample/.env.sample
@@ -2,3 +2,4 @@ MUCKROCK_SAMPLE=TKTKTK
 GOVQA_CA_SAMPLECITY_PASSWORD=TKTKTK
 GOVQA_CA_SAMPLECITY_USERNAME=TKTKTK
 MUCKROCK_CRP=TKTKTK
+ZENROWS_KEY=TKTKTK