From e54898cb9fce644fab839d8c1d3bb05e747daf87 Mon Sep 17 00:00:00 2001 From: Andrew Gramigna Date: Sun, 17 Nov 2024 20:32:40 -0500 Subject: [PATCH] add page number logic to levergreen id --- .../spiders/greenhouse_jobs_outline_spider.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/job_board_scraper/job_board_scraper/spiders/greenhouse_jobs_outline_spider.py b/job_board_scraper/job_board_scraper/spiders/greenhouse_jobs_outline_spider.py index f91c095..fb9e939 100644 --- a/job_board_scraper/job_board_scraper/spiders/greenhouse_jobs_outline_spider.py +++ b/job_board_scraper/job_board_scraper/spiders/greenhouse_jobs_outline_spider.py @@ -49,7 +49,7 @@ def parse_job_boards_prefix(self, i, j, department_ids, opening): item=GreenhouseJobsOutlineItem(), selector=Selector(text=opening.get(), type="html"), ) - self.logger.info(f"Parsing row {j+1}, {self.company_name} {self.name}") + # self.logger.info(f"Parsing row {j+1}, {self.company_name} {self.name}") il.add_value("department_ids", department_ids) # nested.add_xpath("office_ids", "@office_id") @@ -57,7 +57,7 @@ def parse_job_boards_prefix(self, i, j, department_ids, opening): il.add_xpath("opening_title", "//p[contains(@class, 'body--medium')]/text()") il.add_xpath("location", "//p[contains(@class, 'body--metadata')]/text()") - il.add_value("id", self.determine_row_id(i * 1000 + j)) + il.add_value("id", self.determine_row_id(i * 1000 + j * 100 + self.page_number)) il.add_value("created_at", self.created_at) il.add_value("updated_at", self.updated_at) il.add_value("source", self.html_source) @@ -78,6 +78,7 @@ def parse(self, response): department_ids, job_openings = self.get_department_ids(job_post) for j, opening in enumerate(job_openings): il = self.parse_job_boards_prefix(i, j, department_ids, opening) + print(il.load_item().get("opening_title"), il.load_item().get("id")) yield il.load_item() if len(job_posts) != 0: self.page_number += 1