diff --git a/.github/actions/deploy/action.yml b/.github/actions/deploy/action.yml
new file mode 100644
index 0000000..7dc6b0b
--- /dev/null
+++ b/.github/actions/deploy/action.yml
@@ -0,0 +1,32 @@
+inputs:
+  DOCKERHUB_USERNAME:
+    description: "DOCKERHUB_USERNAME"
+    required: true
+  DOCKERHUB_TOKEN:
+    description: "DOCKERHUB_TOKEN"
+    required: true
+
+runs:
+  using: "composite"
+  steps:
+    - name: Set up QEMU
+      uses: docker/setup-qemu-action@v2
+
+    - name: Set up Docker Buildx
+      uses: docker/setup-buildx-action@v2
+
+    - name: Login to DockerHub
+      uses: docker/login-action@v2
+      with:
+        username: ${{ inputs.DOCKERHUB_USERNAME }}
+        password: ${{ inputs.DOCKERHUB_TOKEN }}
+
+    - name: Build and push
+      id: docker_build
+      uses: docker/build-push-action@v2
+      with:
+        context: .
+        file: ./Dockerfile
+        platforms: linux/arm64,linux/amd64
+        push: true
+        tags: godmountain/jrecin-job-information-downloader:latest, godmountain/jrecin-job-information-downloader:${{ github.ref_name }}
diff --git a/.github/actions/docker/action.yml b/.github/actions/docker/action.yml
new file mode 100644
index 0000000..670058a
--- /dev/null
+++ b/.github/actions/docker/action.yml
@@ -0,0 +1,7 @@
+runs:
+  using: "composite"
+  steps:
+    - name: build dockerfile
+      shell: bash
+      run: |
+        docker build . -t jrecin
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
new file mode 100644
index 0000000..d2ac3c2
--- /dev/null
+++ b/.github/workflows/release.yml
@@ -0,0 +1,29 @@
+name: release_docker_image
+
+on:
+  release:
+    types: [ published ]
+
+jobs:
+
+  docker-build:
+    timeout-minutes: 10
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: call docker-build workflow
+        uses: ./.github/actions/docker
+
+  deploy:
+    needs: [ docker-build ]
+    timeout-minutes: 20
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: call deploy workflow
+        uses: ./.github/actions/deploy
+        with:
+          DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }}
+          DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }}
diff --git a/.gitignore b/.gitignore
index 68bc17f..e605249 100644
--- a/.gitignore
+++ b/.gitignore
@@ -157,4 +157,6 @@ cython_debug/
 # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
+.idea/
+
+.DS_Store
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..5691693
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,12 @@
+FROM python:3.11-slim-bullseye
+
+LABEL maintainer="MGMCN"
+
+COPY . /APP
+
+WORKDIR /APP
+
+RUN pip3 install -r requirements.txt && \
+    mkdir excels
+
+CMD python3 main.py
\ No newline at end of file
diff --git a/app.py b/app.py
new file mode 100644
index 0000000..9dbd284
--- /dev/null
+++ b/app.py
@@ -0,0 +1,91 @@
+import json
+import time
+import pandas as pd
+import re
+
+from flask import Flask, request, Response, render_template, stream_with_context
+
+from crawler import Crawler
+
+
+class App:
+    def __init__(self):
+        self.app = Flask(__name__)
+        self.set_app_routes()
+        self.tasks = {}
+        self.path = "./excels/"
+
+    def set_app_routes(self):
+        def save_data_to_excel(task_id):
+            df = pd.DataFrame(self.tasks[task_id]['job_descriptions'])
+
+            invalid_chars = r'\/:*?"<>|'
+            filename = f'jobs_{task_id}.xlsx'
+            cleaned_filename = re.sub(f"[{re.escape(invalid_chars)}]", '_', filename)
+
+            df.to_excel(self.path + cleaned_filename, index=False, engine='openpyxl')
+
+        def generate_download_stream(task_id):
+            print(f"Starting download for task ID: {task_id}")
+            crawler = Crawler(self.tasks[task_id]['url'])
+
+            success, message, html = crawler.do_request(crawler.get_search_request_url(None))
+            if success:
+                search_page_urls = crawler.parse_item_count_and_generate_search_page_url(html)
+
+                detail_page_urls = []
+                for search_url in search_page_urls:
+                    success, message, html = crawler.do_request(search_url)
+                    if success:
+                        detail_page_urls.extend(crawler.parse_search_page_and_generte_item_detail_page_url(html))
+
+                progress_bar_increment = 100 / len(detail_page_urls)
+
+                data = {'message': "init_done"}  # Not graceful
+                js_data = json.dumps(data)
+                yield f"data: {js_data}\n\n"
+
+                for detail_page_url in detail_page_urls:
+                    data = {'message': "", 'increment': progress_bar_increment}
+
+                    if "jrecin" in detail_page_url:
+                        success, message, html = crawler.do_request(detail_page_url)
+                        if success:
+                            job_description = crawler.parse_detail_page_and_generate_job_description_item(html)
+                            job_description['url'] = detail_page_url
+                            self.tasks[task_id]['job_descriptions'].append(job_description)
+                            data['message'] = f"Crawled {detail_page_url}"
+                        else:
+                            data['message'] = f"{message}"
+                    else:
+                        job_description = crawler.parse_detail_page_and_generate_job_description_item(html, False)
+                        job_description['url'] = detail_page_url
+                        self.tasks[task_id]['job_descriptions'].append(job_description)
+                        data['message'] = f"Pages not from jrec cannot be parsed {detail_page_url}"
+
+                    js_data = json.dumps(data)
+                    yield f"data: {js_data}\n\n"
+
+                save_data_to_excel(task_id)
+                data = {'message': "done"}  # Not graceful
+                js_data = json.dumps(data)
+                yield f"data: {js_data}\n\n"
+
+        @self.app.route("/", methods=["GET"])
+        def fake_it():
+            return render_template("html/index.html")
+
+        @self.app.route("/download", methods=["GET"])
+        def download():
+            task_id = hash(request.url.__str__() + time.time().__str__()).__str__()
+            self.tasks[task_id] = {'url': request.url.__str__()}
+            self.tasks[task_id]['job_descriptions'] = []
+            return render_template('html/download.html', task_id=task_id)
+
+        @self.app.route('/start_download')
+        def start_download():
+            task_id = request.args.get('task_id', None)
+            return Response(stream_with_context(generate_download_stream(task_id)), content_type='text/event-stream')
+
+    def run(self):
+        self.app.run(host='0.0.0.0', port=3333)
diff --git a/crawler.py b/crawler.py
new file mode 100644
index 0000000..46fcbad
--- /dev/null
+++ b/crawler.py
@@ -0,0 +1,170 @@
+from urllib.parse import urlparse, parse_qs, urlencode
+import requests
+from bs4 import BeautifulSoup
+
+
+class Crawler:
+    home_base_url = "https://jrecin.jst.go.jp"
+    search_base_url = "https://jrecin.jst.go.jp/seek/SeekJorSearch?"
+
+    headers = {
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,'
+                  'application/signed-exchange;v=b3;q=0.7',
+        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,ja;q=0.7',
+        'Connection': 'keep-alive',
+        'Referer': 'https://jrecin.jst.go.jp/seek/SeekJorSearch',
+        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
+        'sec-ch-ua': '"Chromium";v="122", "Not(A:Brand";v="24", "Google Chrome";v="122"',
+        'sec-ch-ua-mobile': '?0',
+        'sec-ch-ua-platform': '"macOS"',
+    }
+
+    def __init__(self, url):
+        self.original_url = url
+
+    def get_search_request_url(self, url):
+        parsed_url = urlparse(url if url else self.original_url)
+        query_params = parse_qs(parsed_url.query)
+        query_string = urlencode(query_params, doseq=True)
+        return Crawler.search_base_url + query_string
+
+    def do_request(self, url) -> (bool, str, str):
+        response = requests.get(url, headers=Crawler.headers, timeout=20)
+
+        success = True
+        message = f"Request {url} succeeded"
+
+        if response.status_code != 200:
+            success = False
+            message = f'Request {url} failed,status_code:{response.status_code}'
+
+        return success, message, response.text
+
+    def parse_item_count_and_generate_search_page_url(self, html) -> list:
+        soup = BeautifulSoup(html, 'html.parser')  # 'lxml'
+        item_count = soup.find('span', class_="em_text me-2")
+        count = int(item_count.text) if item_count else 0
+        pages = 0
+        if count % 50 != 0:
+            pages = int((count / 50) + 1)
+        else:
+            pages = int(count / 50)
+
+        search_page_urls = [
+            self.get_search_request_url(f'{self.original_url}&page={page + 1}&dispcount=50') for page in
+            range(pages)]
+
+        # for url in search_page_urls:
+        #     print(url)
+        return search_page_urls
+
+    def parse_search_page_and_generte_item_detail_page_url(self, html) -> list:
+        soup = BeautifulSoup(html, 'html.parser')
+        a_tags = soup.find_all('a', class_='btn dtl_btn text-center mb-2')
+        detail_page_urls = [f"{Crawler.home_base_url}{tag.get('href')}" for tag in a_tags]
+
+        button_tags = soup.find_all('button', class_='btn dtl_btn text-center mb-2')
+        outside_detail_page_urls = [tag.get('data-url') for tag in button_tags]
+
+        # print(len(detail_page_urls), detail_page_urls)
+        # print(len(outside_detail_page_urls), outside_detail_page_urls)
+
+        detail_page_urls.extend(outside_detail_page_urls)
+        return detail_page_urls
+
+    def parse_detail_page_and_generate_job_description_item(self, html, flag=True) -> dict:
+        if flag:
+            soup = BeautifulSoup(html, 'html.parser')
+
+            def clean_text(element):
+                if element:
+                    return element.get_text(strip=True).replace("\n", "").replace("\t", "")
+                return ""
+
+            location = clean_text(soup.find('p', string=lambda x: x and '勤務地' in x))
+            research_field = clean_text(soup.find('p', string=lambda x: x and '研究分野' in x))
+            start_date = clean_text(soup.find('p', string=lambda x: x and '公開開始日' in x))
+            end_date = clean_text(soup.find('p', string=lambda x: x and '募集終了日' in x))
+
+            salary = ""
+            salaries = soup.find('p', string='給与')
+            if salaries:
+                salary_list = salaries.find_next_sibling('ul')
+                if salary_list:
+                    salary_details = salary_list.find_all('li')
+                    salary = ''.join([clean_text(salary_detail) for salary_detail in salary_details])
+
+            job_type = ""
+            job_types = soup.find('p', string='職種')
+            if job_types:
+                job_type_list = job_types.find_next_sibling('ul')
+                if job_type_list:
+                    job_types = job_type_list.find_all('li')
+                    job_type = ''.join([clean_text(type) for type in job_types])
+
+            qualification = ""
+            education_requirements = soup.find_all(string="応募に必要な学歴・学位")
+            for item in education_requirements:
+                parent = item.find_parent('li')
+                if parent:
+                    qualification += clean_text(parent)
+
+            experience_requirements = soup.find_all(string="業務における経験")
+            for item in experience_requirements:
+                parent = item.find_parent('li')
+                if parent:
+                    qualification += clean_text(parent)
+
+            # print(f'Location: {location}')
+            # print(f'Research Field: {research_field}')
+            # print(f'Start Date: {start_date}')
+            # print(f'End Date: {end_date}')
+            # print(f'Job Type: {job_type}')
+            # print(f'Salary: {salary}')
+            # print(f'Qualification: {qualification}')
+
+            return {
+                "location": location,
+                "research_field": research_field,
+                "start_date": start_date,
+                "end_date": end_date,
+                "job_type": job_type,
+                "salary": salary,
+                "qualification": qualification
+            }
+        else:
+            return {
+                "location": None,
+                "research_field": None,
+                "start_date": None,
+                "end_date": None,
+                "job_type": None,
+                "salary": None,
+                "qualification": None
+            }
+
+
+# only for test
+if __name__ == '__main__':
+    page = 1
+    dispcount = 50
+    c = Crawler("http://127.0.0.1:3333/download?keyword_and=&keyword_or=&keyword_not=&_bgCode=1&bgCode=00289&_smCode"
+                "=1&_orgtype=1&_area=1&_prefecture=1&_jobkind=1&jobkind=00011&jobkind=00022&jobkind=00014&jobkind"
+                "=00016&jobkind=00018&jobkind=00020&jobkind=00024&jobkind=00026&jobkind=00028&jobkind=00030&jobkind"
+                "=00033&employmentform=01&_employmentform=on&employmentform=02&_employmentform=on&_employmentform=on"
+                "&_employmentform=on&_employmentform=on&_employmentform=on&_jobterm=on&_jobterm=on&_jobterm=on"
+                "&_jobterm=on&wageValue=&_consideration=on&_consideration=on&_consideration=on&_entry=on&_entry=on"
+                "&_entry=on&_duration=on&_duration=on&_duration=on&sort=0&fn=0")
+    success, message, html = c.do_request(c.get_search_request_url(None))
+    search_page_urls = c.parse_item_count_and_generate_search_page_url(html)
+
+    detail_page_urls = []
+    for search_url in search_page_urls:
+        success, message, html = c.do_request(search_url)
+        detail_page_urls.extend(c.parse_search_page_and_generte_item_detail_page_url(html))
+
+    print(len(detail_page_urls))
+
+    print(detail_page_urls[1])
+    success, message, html = c.do_request(detail_page_urls[1])
+    c.parse_detail_page_and_generate_job_description_item(html)
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..919b17e
--- /dev/null
+++ b/main.py
@@ -0,0 +1,5 @@
+from app import App
+
+if __name__ == '__main__':
+    app = App()
+    app.run()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..ce0e564
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,5 @@
+beautifulsoup4==4.12.3
+flask==3.0.2
+openpyxl==3.1.2
+pandas==2.2.1
+requests==2.31.0
diff --git a/templates/html/download.html b/templates/html/download.html
new file mode 100644
index 0000000..f0609d2
--- /dev/null
+++ b/templates/html/download.html
@@ -0,0 +1,106 @@
+
+
+
+