Commit: scraper sonoma da
irenecasado authored and tarakc02 committed Jan 15, 2025
1 parent c61c7ce commit 929afb0
Showing 1 changed file with 5 additions and 4 deletions.
clean/ca/sonoma_county_da.py: 9 changes (5 additions & 4 deletions)
@@ -56,7 +56,7 @@ def get_main_page_links(self) -> List[str]:
         html = self.cache.read(cache_path)
         soup = BeautifulSoup(html, "html.parser")
 
-        # Find all h2 tags with class 'h3'
+        # Find all h2 tags with class 'h3', where the links are located
         for h2 in soup.find_all("h2", class_="h3"):
            for link in h2.find_all("a", href=True):
                href = link["href"]
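
For context, a minimal self-contained sketch of the pattern this comment documents, using invented HTML that is only assumed to mirror the county page's layout:

from bs4 import BeautifulSoup

# Invented sample markup; the real page structure is an assumption, not copied.
html = """
<h2 class="h3"><a href="/case-files/2024">2024 case files</a></h2>
<h2 class="h3"><a href="/case-files/2023">2023 case files</a></h2>
<h2 class="subtitle"><a href="/about">About the office</a></h2>
"""

soup = BeautifulSoup(html, "html.parser")
links = [
    a["href"]
    for h2 in soup.find_all("h2", class_="h3")  # only h2 tags styled with class 'h3'
    for a in h2.find_all("a", href=True)        # anchors that actually carry an href
]
print(links)  # ['/case-files/2024', '/case-files/2023']; the 'subtitle' h2 is skipped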
@@ -90,8 +90,9 @@ def get_detail_page_links(
             for link in h2.find_all("a", href=True):
                 href = link["href"]
                 if ".pdf" in href.lower():
-                    asset_url = urljoin(self.base_url, href.strip())
-                    file_name = Path(asset_url).name
+                    # Construct full URL and extract file name
+                    asset_url = urljoin(self.base_url, href.strip())  # Full URL
+                    file_name = Path(asset_url).name  # File name
 
                     payload: MetadataDict = {
                         "asset_url": asset_url,
@@ -101,7 +102,7 @@
                         "parent_page": detail_page,
                     }
                     metadata.append(payload)
-                    time.sleep(throttle)  # Respect throttle delay
+                    time.sleep(throttle)
         return metadata
 
     def _download_index_page(self, page_url: str) -> Path:
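
The last hunk keeps time.sleep(throttle) but drops its comment; the method's shape is append a payload, then pause before the next request. A rough sketch of that collect-and-throttle pattern, with stand-in names (collect_pdf_links, hrefs) that are not from the repo:

import time
from typing import List

def collect_pdf_links(hrefs: List[str], throttle: float = 2.0) -> List[dict]:
    """Stand-in for the real method: gather metadata, sleeping between items."""
    metadata: List[dict] = []
    for href in hrefs:
        if ".pdf" in href.lower():                # case-insensitive PDF filter
            metadata.append({"asset_url": href})  # the real payload carries more fields
            time.sleep(throttle)                  # polite pause between requests
    return metadata

print(collect_pdf_links(["/a/report.PDF", "/b/page.html"], throttle=0))
# [{'asset_url': '/a/report.PDF'}]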
