Commit: scraper sonoma da
irenecasado authored and tarakc02 committed Jan 15, 2025
1 parent c61c7ce commit 929afb0
Showing 1 changed file with 5 additions and 4 deletions.
clean/ca/sonoma_county_da.py: 9 changes (5 additions & 4 deletions)
@@ -56,7 +56,7 @@ def get_main_page_links(self) -> List[str]:
         html = self.cache.read(cache_path)
         soup = BeautifulSoup(html, "html.parser")
 
-        # Find all h2 tags with class 'h3'
+        # Find all h2 tags with class 'h3', where the links are located
         for h2 in soup.find_all("h2", class_="h3"):
            for link in h2.find_all("a", href=True):
                href = link["href"]
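
For context, a minimal self-contained sketch of the pattern this comment documents, using invented HTML that is only assumed to mirror the county page's layout:

from bs4 import BeautifulSoup

# Invented sample markup; the real page structure is an assumption, not copied.
html = """
<h2 class="h3"><a href="/case-files/2024">2024 case files</a></h2>
<h2 class="h3"><a href="/case-files/2023">2023 case files</a></h2>
<h2 class="subtitle"><a href="/about">About the office</a></h2>
"""

soup = BeautifulSoup(html, "html.parser")
links = [
    a["href"]
    for h2 in soup.find_all("h2", class_="h3")  # only h2 tags styled with class 'h3'
    for a in h2.find_all("a", href=True)        # anchors that actually carry an href
]
print(links)  # ['/case-files/2024', '/case-files/2023']; the 'subtitle' h2 is skipped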
@@ -90,8 +90,9 @@ def get_detail_page_links(
             for link in h2.find_all("a", href=True):
                 href = link["href"]
                 if ".pdf" in href.lower():
-                    asset_url = urljoin(self.base_url, href.strip())
-                    file_name = Path(asset_url).name
+                    # Construct full URL and extract file name
+                    asset_url = urljoin(self.base_url, href.strip())  # Full URL
+                    file_name = Path(asset_url).name  # File name
 
                     payload: MetadataDict = {
                         "asset_url": asset_url,
@@ -101,7 +102,7 @@
                         "parent_page": detail_page,
                     }
                     metadata.append(payload)
-                    time.sleep(throttle)  # Respect throttle delay
+                    time.sleep(throttle)
         return metadata
 
     def _download_index_page(self, page_url: str) -> Path:
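
The last hunk keeps time.sleep(throttle) but drops its comment; the method's shape is append a payload, then pause before the next request. A rough sketch of that collect-and-throttle pattern, with stand-in names (collect_pdf_links, hrefs) that are not from the repo:

import time
from typing import List

def collect_pdf_links(hrefs: List[str], throttle: float = 2.0) -> List[dict]:
    """Stand-in for the real method: gather metadata, sleeping between items."""
    metadata: List[dict] = []
    for href in hrefs:
        if ".pdf" in href.lower():                # case-insensitive PDF filter
            metadata.append({"asset_url": href})  # the real payload carries more fields
            time.sleep(throttle)                  # polite pause between requests
    return metadata

print(collect_pdf_links(["/a/report.PDF", "/b/page.html"], throttle=0))
# [{'asset_url': '/a/report.PDF'}]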
