Commit
Showing 11 changed files with 23,481 additions and 1 deletion.
@@ -0,0 +1,32 @@ .github/actions/deploy/action.yml (path inferred from the workflow's "uses: ./.github/actions/deploy" step below)
inputs:
  DOCKERHUB_USERNAME:
    description: "DOCKERHUB_USERNAME"
    required: true
  DOCKERHUB_TOKEN:
    description: "DOCKERHUB_TOKEN"
    required: true

runs:
  using: "composite"
  steps:
    - name: Set up QEMU
      uses: docker/setup-qemu-action@v2

    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@v2

    - name: Login to DockerHub
      uses: docker/login-action@v2
      with:
        username: ${{ inputs.DOCKERHUB_USERNAME }}
        password: ${{ inputs.DOCKERHUB_TOKEN }}

    - name: Build and push
      id: docker_build
      uses: docker/build-push-action@v2
      with:
        context: .
        file: ./Dockerfile
        platforms: linux/arm64,linux/amd64
        push: true
        tags: godmountain/jrecin-job-information-downloader:latest, godmountain/jrecin-job-information-downloader:${{ github.ref_name }}
@@ -0,0 +1,7 @@ .github/actions/docker/action.yml (path inferred from the workflow's "uses: ./.github/actions/docker" step below)
runs:
  using: "composite"
  steps:
    - name: build dockerfile
      shell: bash
      run: |
        docker build . -t jrecin
@@ -0,0 +1,29 @@ .github/workflows/release_docker_image.yml (filename assumed from the workflow's name field)
name: release_docker_image

on:
  release:
    types: [ published ]

jobs:

  docker-build:
    timeout-minutes: 10
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3

      - name: call docker-build workflow
        uses: ./.github/actions/docker

  deploy:
    needs: [ docker-build ]
    timeout-minutes: 20
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3

      - name: call deploy workflow
        uses: ./.github/actions/deploy
        with:
          DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }}
          DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }}
@@ -0,0 +1,12 @@ Dockerfile
FROM python:3.11-slim-bullseye

LABEL maintainer="MGMCN"

COPY . /APP

WORKDIR /APP

RUN pip3 install -r requirements.txt && \
    mkdir excels

CMD python3 main.py
@@ -0,0 +1,91 @@ app.py (filename inferred from "from app import App" in main.py)
import json
import time
import pandas as pd
import re

from flask import Flask, request, Response, render_template, stream_with_context

from crawler import Crawler


class App:
    def __init__(self):
        self.app = Flask(__name__)
        self.set_app_routes()
        self.tasks = {}
        self.path = "./excels/"

    def set_app_routes(self):
        def save_data_to_excel(task_id):
            df = pd.DataFrame(self.tasks[task_id]['job_descriptions'])

            # Replace characters that are invalid in filenames on common platforms.
            invalid_chars = r'\/:*?"<>|'
            filename = f'jobs_{task_id}.xlsx'
            cleaned_filename = re.sub(f"[{re.escape(invalid_chars)}]", '_', filename)

            df.to_excel(self.path + cleaned_filename, index=False, engine='openpyxl')

        def generate_download_stream(task_id):
            print(f"Starting download for task ID: {task_id}")
            crawler = Crawler(self.tasks[task_id]['url'])

            success, message, html = crawler.do_request(crawler.get_search_request_url(None))
            if success:
                search_page_urls = crawler.parse_item_count_and_generate_search_page_url(html)

                detail_page_urls = []
                for search_url in search_page_urls:
                    success, message, html = crawler.do_request(search_url)
                    if success:
                        detail_page_urls.extend(crawler.parse_search_page_and_generte_item_detail_page_url(html))

                # Guard against an empty result set to avoid a ZeroDivisionError.
                progress_bar_increment = 100 / len(detail_page_urls) if detail_page_urls else 100

                data = {'message': "init_done"}  # Not graceful
                js_data = json.dumps(data)
                yield f"data: {js_data}\n\n"

                for detail_page_url in detail_page_urls:
                    data = {'message': "", 'increment': progress_bar_increment}

                    if "jrecin" in detail_page_url:
                        success, message, html = crawler.do_request(detail_page_url)
                        if success:
                            job_description = crawler.parse_detail_page_and_generate_job_description_item(html)
                            job_description['url'] = detail_page_url
                            self.tasks[task_id]['job_descriptions'].append(job_description)
                            data['message'] = f"Crawled {detail_page_url}"
                        else:
                            data['message'] = f"{message}"
                    else:
                        # Postings hosted outside jrecin are recorded with empty fields; only the URL is kept.
                        job_description = crawler.parse_detail_page_and_generate_job_description_item(html, False)
                        job_description['url'] = detail_page_url
                        self.tasks[task_id]['job_descriptions'].append(job_description)
                        data['message'] = f"Pages not from jrec cannot be parsed {detail_page_url}"

                    js_data = json.dumps(data)
                    yield f"data: {js_data}\n\n"

                save_data_to_excel(task_id)
                data = {'message': "done"}  # Not graceful
                js_data = json.dumps(data)
                yield f"data: {js_data}\n\n"

        @self.app.route("/", methods=["GET"])
        def fake_it():
            return render_template("html/index.html")

        @self.app.route("/download", methods=["GET"])
        def download():
            # A hash of the request URL plus a timestamp serves as a cheap unique task id.
            task_id = str(hash(request.url + str(time.time())))
            self.tasks[task_id] = {'url': request.url}
            self.tasks[task_id]['job_descriptions'] = []
            return render_template('html/download.html', task_id=task_id)

        @self.app.route('/start_download')
        def start_download():
            task_id = request.args.get('task_id', None)
            return Response(stream_with_context(generate_download_stream(task_id)), content_type='text/event-stream')

    def run(self):
        self.app.run(host='0.0.0.0', port=3333)
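The /start_download route streams progress as server-sent events, one "data: {...}" frame per crawled page, ending with a {'message': 'done'} frame after the Excel file is written. Below is a minimal sketch of a Python client consuming that stream; it assumes the app is running locally on port 3333, and the task_id value and script name are hypothetical (a real id is minted by first loading /download).

# sse_client_example.py -- hypothetical helper, not part of this commit
import json
import requests

task_id = "1234567890"  # hypothetical; a real id comes from GET /download
url = f"http://127.0.0.1:3333/start_download?task_id={task_id}"

with requests.get(url, stream=True, timeout=120) as resp:
    for line in resp.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data: "):
            continue  # skip the blank lines separating SSE frames
        event = json.loads(line[len("data: "):])
        print(event.get("message"), event.get("increment"))
        if event.get("message") == "done":
            break  # server signals completion after saving the Excel file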
@@ -0,0 +1,170 @@ crawler.py (filename inferred from "from crawler import Crawler" in app.py)
from urllib.parse import urlparse, parse_qs, urlencode
import requests
from bs4 import BeautifulSoup


class Crawler:
    home_base_url = "https://jrecin.jst.go.jp"
    search_base_url = "https://jrecin.jst.go.jp/seek/SeekJorSearch?"

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,'
                  'application/signed-exchange;v=b3;q=0.7',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,ja;q=0.7',
        'Connection': 'keep-alive',
        'Referer': 'https://jrecin.jst.go.jp/seek/SeekJorSearch',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Chromium";v="122", "Not(A:Brand";v="24", "Google Chrome";v="122"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"macOS"',
    }

    def __init__(self, url):
        self.original_url = url

    def get_search_request_url(self, url):
        # Re-attach the query string of the incoming /download URL to jrecin's search endpoint.
        parsed_url = urlparse(url if url else self.original_url)
        query_params = parse_qs(parsed_url.query)
        query_string = urlencode(query_params, doseq=True)
        return Crawler.search_base_url + query_string

    def do_request(self, url) -> tuple[bool, str, str]:
        try:
            response = requests.get(url, headers=Crawler.headers, timeout=20)
        except requests.RequestException as e:
            # Network failures must not crash the SSE generator upstream.
            return False, f'Request {url} failed: {e}', ""

        success = True
        message = f"Request {url} succeeded"

        if response.status_code != 200:
            success = False
            message = f'Request {url} failed, status_code: {response.status_code}'

        return success, message, response.text

    def parse_item_count_and_generate_search_page_url(self, html) -> list:
        soup = BeautifulSoup(html, 'html.parser')  # 'lxml'
        item_count = soup.find('span', class_="em_text me-2")
        count = int(item_count.text) if item_count else 0
        # Ceiling division: 50 results per page.
        if count % 50 != 0:
            pages = int((count / 50) + 1)
        else:
            pages = int(count / 50)

        search_page_urls = [
            self.get_search_request_url(f'{self.original_url}&page={page + 1}&dispcount=50')
            for page in range(pages)]

        return search_page_urls

    def parse_search_page_and_generte_item_detail_page_url(self, html) -> list:
        soup = BeautifulSoup(html, 'html.parser')
        # Postings hosted on jrecin link via <a> tags; external postings use <button data-url="...">.
        a_tags = soup.find_all('a', class_='btn dtl_btn text-center mb-2')
        detail_page_urls = [f"{Crawler.home_base_url}{tag.get('href')}" for tag in a_tags]

        button_tags = soup.find_all('button', class_='btn dtl_btn text-center mb-2')
        outside_detail_page_urls = [tag.get('data-url') for tag in button_tags]

        detail_page_urls.extend(outside_detail_page_urls)
        return detail_page_urls

    def parse_detail_page_and_generate_job_description_item(self, html, flag=True) -> dict:
        if flag:
            soup = BeautifulSoup(html, 'html.parser')

            def clean_text(element):
                if element:
                    return element.get_text(strip=True).replace("\n", "").replace("\t", "")
                return ""

            location = clean_text(soup.find('p', string=lambda x: x and '勤務地' in x))
            research_field = clean_text(soup.find('p', string=lambda x: x and '研究分野' in x))
            start_date = clean_text(soup.find('p', string=lambda x: x and '公開開始日' in x))
            end_date = clean_text(soup.find('p', string=lambda x: x and '募集終了日' in x))

            salary = ""
            salaries = soup.find('p', string='給与')
            if salaries:
                salary_list = salaries.find_next_sibling('ul')
                if salary_list:
                    salary_details = salary_list.find_all('li')
                    salary = ''.join([clean_text(salary_detail) for salary_detail in salary_details])

            job_type = ""
            job_types = soup.find('p', string='職種')
            if job_types:
                job_type_list = job_types.find_next_sibling('ul')
                if job_type_list:
                    job_types = job_type_list.find_all('li')
                    job_type = ''.join([clean_text(jt) for jt in job_types])

            qualification = ""
            education_requirements = soup.find_all(string="応募に必要な学歴・学位")
            for item in education_requirements:
                parent = item.find_parent('li')
                if parent:
                    qualification += clean_text(parent)

            experience_requirements = soup.find_all(string="業務における経験")
            for item in experience_requirements:
                parent = item.find_parent('li')
                if parent:
                    qualification += clean_text(parent)

            return {
                "location": location,
                "research_field": research_field,
                "start_date": start_date,
                "end_date": end_date,
                "job_type": job_type,
                "salary": salary,
                "qualification": qualification
            }
        else:
            return {
                "location": None,
                "research_field": None,
                "start_date": None,
                "end_date": None,
                "job_type": None,
                "salary": None,
                "qualification": None
            }


# only for test
if __name__ == '__main__':
    page = 1
    dispcount = 50
    c = Crawler("http://127.0.0.1:3333/download?keyword_and=&keyword_or=&keyword_not=&_bgCode=1&bgCode=00289&_smCode"
                "=1&_orgtype=1&_area=1&_prefecture=1&_jobkind=1&jobkind=00011&jobkind=00022&jobkind=00014&jobkind"
                "=00016&jobkind=00018&jobkind=00020&jobkind=00024&jobkind=00026&jobkind=00028&jobkind=00030&jobkind"
                "=00033&employmentform=01&_employmentform=on&employmentform=02&_employmentform=on&_employmentform=on"
                "&_employmentform=on&_employmentform=on&_employmentform=on&_jobterm=on&_jobterm=on&_jobterm=on"
                "&_jobterm=on&wageValue=&_consideration=on&_consideration=on&_consideration=on&_entry=on&_entry=on"
                "&_entry=on&_duration=on&_duration=on&_duration=on&sort=0&fn=0")
    success, message, html = c.do_request(c.get_search_request_url(None))
    search_page_urls = c.parse_item_count_and_generate_search_page_url(html)

    detail_page_urls = []
    for search_url in search_page_urls:
        success, message, html = c.do_request(search_url)
        detail_page_urls.extend(c.parse_search_page_and_generte_item_detail_page_url(html))

    print(len(detail_page_urls))

    print(detail_page_urls[1])
    success, message, html = c.do_request(detail_page_urls[1])
    c.parse_detail_page_and_generate_job_description_item(html)
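The key trick in get_search_request_url is that the app's /download form uses the same query parameter names as jrecin's SeekJorSearch endpoint, so the crawler simply lifts the local URL's query string and re-attaches it upstream. A minimal standalone sketch of that rewriting (the query values here are made up, not from the commit):

# Standalone illustration of Crawler.get_search_request_url's URL rewriting.
from urllib.parse import urlparse, parse_qs, urlencode

local_url = "http://127.0.0.1:3333/download?keyword_and=physics&sort=0&fn=0"
query = urlencode(parse_qs(urlparse(local_url).query), doseq=True)
print("https://jrecin.jst.go.jp/seek/SeekJorSearch?" + query)
# -> https://jrecin.jst.go.jp/seek/SeekJorSearch?keyword_and=physics&sort=0&fn=0

Note that parse_qs drops blank-valued parameters by default, so empty form fields vanish from the rewritten URL; pagination works the same way, with "&page=N&dispcount=50" appended before the rewrite.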
@@ -0,0 +1,5 @@ main.py (filename inferred from the Dockerfile's CMD)
from app import App

if __name__ == '__main__':
    app = App()
    app.run()
@@ -0,0 +1,5 @@ requirements.txt
beautifulsoup4==4.12.3
flask==3.0.2
openpyxl==3.1.2
pandas==2.2.1
requests==2.31.0