Documentation tweaks
stucka committed Aug 16, 2024
1 parent 30dbd04 commit da91b41
Showing 1 changed file with 6 additions and 6 deletions.
12 changes: 6 additions & 6 deletions clean/ca/ventura_county_sheriff.py
@@ -25,21 +25,19 @@ class Site:
     agency_slug = "ca_ventura_county_sheriff"

     def __init__(self, data_dir=utils.CLEAN_DATA_DIR, cache_dir=utils.CLEAN_CACHE_DIR):
-        # Start page contains list of "detail"/child pages with links to the SB16/SB1421/AB748 videos and files
-        # along with additional index pages
+        # Start page contains list of "detail"/child pages with links to the SB16/SB1421/AB748
+        # videos and files along with additional index pages
         self.base_url = "https://www.venturasheriff.org"
         self.index_urls = {
             f"{self.base_url}/sb1421/officer-involved-shooting-ois/": "ois.html",
             f"{self.base_url}/sb1421/use-of-force-great-bodily-injury-cases-gbi/": "gbi.html",
             f"{self.base_url}/ab748/": "ab748.html",
         }

-        # HEY! Need to build out dirs with appropriate slug structure
-        # Need to add detail dir
-
         self.cache = Cache(cache_dir)  # ~/.clean-scraper/cache/
         self.data_dir = data_dir
         self.cache_dir = cache_dir

         # Use module path to construct agency slug, which we'll use downstream
         # to create a subdir inside the main cache directory to stash files for this agency
         mod = Path(__file__)
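Editor's note: the slug construction that the retained comment describes falls just past the end of this hunk, so the sketch below is an assumption-based illustration of the idea (deriving ca_ventura_county_sheriff from the module path), not the file's verbatim code.

    # Illustrative sketch only; the real implementation is outside this hunk.
    # Assumes the slug is built from the module path clean/ca/ventura_county_sheriff.py
    from pathlib import Path

    mod = Path(__file__)                        # .../clean/ca/ventura_county_sheriff.py
    state_postal = mod.parent.stem              # "ca"
    agency_slug = f"{state_postal}_{mod.stem}"  # "ca_ventura_county_sheriff"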
@@ -51,12 +49,14 @@ def __init__(self, data_dir=utils.CLEAN_DATA_DIR, cache_dir=utils.CLEAN_CACHE_DIR):
     def scrape_meta(self, throttle: int = 0) -> Path:
         metadata: List[MetadataDict] = []
         page_urls = []
+        # Scrape index pages for both assets and links to case directories/subpages
         for index_url in self.index_urls:
             detail_page_links, local_metadata = self._process_index_page(index_url)
             page_urls.extend(detail_page_links)
             metadata.extend(local_metadata)
             time.sleep(throttle)

+        # Now, process the links of case directories/subpages
         for page_url in page_urls:
             local_metadata = self._process_detail_page(page_url)
             metadata.extend(local_metadata)
@@ -67,6 +67,7 @@ def scrape_meta(self, throttle: int = 0) -> Path:
         full_filename = self.cache.write_json(outfile, metadata)
         return full_filename

+    # Helper/Private Methods
     def _process_detail_page(self, target_url) -> List[MetadataDict]:
         """Extract links to files such as videos from a detail page and write to JSON file."""
         local_metadata: List[MetadataDict] = []
@@ -139,7 +140,6 @@ def _process_detail_page(self, target_url) -> List[MetadataDict]:
            local_metadata.append(line)
        return local_metadata

-    # Helper/Private Methods
    def _process_index_page(self, target_url):
        local_metadata: List[MetadataDict] = []
        subpages = []
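Editor's note: for context, this is roughly how the two-phase flow that the new comments document (index pages first, then detail pages) would be driven from calling code. A minimal hypothetical runner: Site and scrape_meta come from this file, but the script itself and its throttle value are illustrative.

    # Hypothetical driver script for the scraper changed above.
    from clean.ca.ventura_county_sheriff import Site

    site = Site()  # defaults to the ~/.clean-scraper data and cache dirs
    metadata_path = site.scrape_meta(throttle=2)  # pause 2s between index-page requests
    print(f"Metadata JSON written to {metadata_path}")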
