Skip to content

Commit

Permalink
refactor: move page processing to its own method
Browse files Browse the repository at this point in the history
  • Loading branch information
NTFSvolume committed Jan 28, 2025
1 parent b1f8fae commit c125941
Showing 1 changed file with 26 additions and 23 deletions.
49 changes: 26 additions & 23 deletions cyberdrop_dl/scraper/crawlers/xenforo_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,8 @@ async def thread(self, scrape_item: ScrapeItem) -> None:
"""Scrapes a forum thread."""
scrape_item.set_type(FORUM, self.manager)
thread = self.get_thread_info(scrape_item.url)
title = last_scraped_post_number = None
title = None
last_post_url = thread.url
async for soup in self.thread_pager(scrape_item):
if not title:
title_block = soup.select_one(self.selectors.title.element)
Expand All @@ -163,27 +164,31 @@ async def thread(self, scrape_item: ScrapeItem) -> None:
title = self.create_title(title_block.text.replace("\n", ""), thread_id=thread.id_)
scrape_item.add_to_parent_title(title)

posts = soup.select(self.selectors.posts.element)
continue_scraping = False
create = partial(self.create_scrape_item, scrape_item)
for post in posts:
current_post = ForumPost(post, selectors=self.selectors.posts, post_name=self.POST_NAME)
last_scraped_post_number = current_post.number
scrape_post, continue_scraping = self.check_post_number(thread.post, current_post.number)
date = current_post.date
if scrape_post:
post_string = f"{self.POST_NAME}{current_post.number}"
parent_url = thread.url / post_string
new_scrape_item = create(thread.url, possible_datetime=date, add_parent=parent_url)
self.manager.task_group.create_task(self.post(new_scrape_item, current_post))
scrape_item.add_children()

if not continue_scraping:
break
continue_scraping, last_post_url = self.process_thread_page(scrape_item, thread, soup)
if not continue_scraping:
break

await self.write_last_forum_post(thread.url, last_scraped_post_number)
await self.write_last_forum_post(thread.url, last_post_url)

def process_thread_page(self, scrape_item: ScrapeItem, thread: ThreadInfo, soup: BeautifulSoup) -> tuple[bool, URL]:
    """Scrape every eligible post on a single page of a forum thread.

    Walks the posts found on *soup*, queues a scrape task for each post that
    passes the post-number check, and tracks the URL of the last post seen.

    Returns a ``(continue_scraping, last_post_url)`` tuple: ``continue_scraping``
    is False once the target post has been reached, and ``last_post_url`` falls
    back to ``thread.url`` when the page contains no posts.
    """
    keep_going = False
    last_seen_url = thread.url
    build_item = partial(self.create_scrape_item, scrape_item)
    for post_tag in soup.select(self.selectors.posts.element):
        forum_post = ForumPost(post_tag, selectors=self.selectors.posts, post_name=self.POST_NAME)
        should_scrape, keep_going = self.check_post_number(thread.post, forum_post.number)
        post_date = forum_post.date
        # Always advance the bookmark, even for posts we skip, so the log
        # reflects the furthest post actually seen on this page.
        last_seen_url = thread.url / f"{self.POST_NAME}{forum_post.number}"
        if should_scrape:
            new_item = build_item(thread.url, possible_datetime=post_date, add_parent=last_seen_url)
            self.manager.task_group.create_task(self.post(new_item, forum_post))
            scrape_item.add_children()

        if not keep_going:
            break
    return keep_going, last_seen_url

async def post(self, scrape_item: ScrapeItem, post: ForumPost) -> None:
"""Scrapes a post."""
Expand Down Expand Up @@ -327,11 +332,9 @@ async def handle_confirmation_link(self, link: URL, *, origin: ScrapeItem | None
link_str: str = confirm_button.get("href")
return self.parse_url(link_str)

async def write_last_forum_post(self, thread_url: URL, last_post_url: URL | None) -> None:
    """Persist the URL of the last scraped post for this thread.

    Skips the write when there is nothing useful to record: ``last_post_url``
    is ``None``, or it still equals the bare ``thread_url`` (meaning no post
    on the thread was actually seen).
    """
    if not last_post_url or last_post_url == thread_url:
        return
    await self.manager.log_manager.write_last_post_log(last_post_url)

def is_valid_post_link(self, link_obj: Tag) -> bool:
Expand Down

0 comments on commit c125941

Please sign in to comment.