
Commit

project updated
MGMCN committed Mar 14, 2024
1 parent 390a6ef commit 4553d03
Showing 11 changed files with 23,481 additions and 1 deletion.
32 changes: 32 additions & 0 deletions .github/actions/deploy/action.yml
@@ -0,0 +1,32 @@
inputs:
  DOCKERHUB_USERNAME:
    description: "DOCKERHUB_USERNAME"
    required: true
  DOCKERHUB_TOKEN:
    description: "DOCKERHUB_TOKEN"
    required: true

runs:
  using: "composite"
  steps:
    - name: Set up QEMU
      uses: docker/setup-qemu-action@v2

    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@v2

    - name: Login to DockerHub
      uses: docker/login-action@v2
      with:
        username: ${{ inputs.DOCKERHUB_USERNAME }}
        password: ${{ inputs.DOCKERHUB_TOKEN }}

    - name: Build and push
      id: docker_build
      uses: docker/build-push-action@v2
      with:
        context: .
        file: ./Dockerfile
        platforms: linux/arm64,linux/amd64
        push: true
        tags: godmountain/jrecin-job-information-downloader:latest, godmountain/jrecin-job-information-downloader:${{ github.ref_name }}
7 changes: 7 additions & 0 deletions .github/actions/docker/action.yml
@@ -0,0 +1,7 @@
runs:
  using: "composite"
  steps:
    - name: build dockerfile
      shell: bash
      run: |
        docker build . -t jrecin
29 changes: 29 additions & 0 deletions .github/workflows/release.yml
@@ -0,0 +1,29 @@
name: release_docker_image

on:
  release:
    types: [ published ]

jobs:

  docker-build:
    timeout-minutes: 10
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3

      - name: call docker-build workflow
        uses: ./.github/actions/docker

  deploy:
    needs: [ docker-build ]
    timeout-minutes: 20
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3

      - name: call deploy workflow
        uses: ./.github/actions/deploy
        with:
          DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }}
          DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }}
4 changes: 3 additions & 1 deletion .gitignore
@@ -157,4 +157,6 @@ cython_debug/
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
.idea/

.DS_Store
12 changes: 12 additions & 0 deletions Dockerfile
@@ -0,0 +1,12 @@
FROM python:3.11-slim-bullseye

LABEL maintainer="MGMCN"

COPY . /APP

WORKDIR /APP

RUN pip3 install -r requirements.txt && \
    mkdir excels

CMD python3 main.py
91 changes: 91 additions & 0 deletions app.py
@@ -0,0 +1,91 @@
import json
import time
import pandas as pd
import re

from flask import Flask, request, Response, render_template, stream_with_context

from crawler import Crawler


class App:
    def __init__(self):
        self.app = Flask(__name__)
        self.set_app_routes()
        self.tasks = {}
        self.path = "./excels/"

    def set_app_routes(self):
        def save_data_to_excel(task_id):
            df = pd.DataFrame(self.tasks[task_id]['job_descriptions'])

            invalid_chars = r'\/:*?"<>|'
            filename = f'jobs_{task_id}.xlsx'
            cleaned_filename = re.sub(f"[{re.escape(invalid_chars)}]", '_', filename)

            df.to_excel(self.path + cleaned_filename, index=False, engine='openpyxl')

        def generate_download_stream(task_id):
            print(f"Starting download for task ID: {task_id}")
            crawler = Crawler(self.tasks[task_id]['url'])

            success, message, html = crawler.do_request(crawler.get_search_request_url(None))
            if success:
                search_page_urls = crawler.parse_item_count_and_generate_search_page_url(html)

                detail_page_urls = []
                for search_url in search_page_urls:
                    success, message, html = crawler.do_request(search_url)
                    if success:
                        detail_page_urls.extend(crawler.parse_search_page_and_generte_item_detail_page_url(html))

                progress_bar_increment = 100 / len(detail_page_urls)

                data = {'message': "init_done"}  # Not graceful
                js_data = json.dumps(data)
                yield f"data: {js_data}\n\n"

                for detail_page_url in detail_page_urls:
                    data = {'message': "", 'increment': progress_bar_increment}

                    if "jrecin" in detail_page_url:
                        success, message, html = crawler.do_request(detail_page_url)
                        if success:
                            job_description = crawler.parse_detail_page_and_generate_job_description_item(html)
                            job_description['url'] = detail_page_url
                            self.tasks[task_id]['job_descriptions'].append(job_description)
                            data['message'] = f"Crawled {detail_page_url}"
                        else:
                            data['message'] = f"{message}"
                    else:
                        job_description = crawler.parse_detail_page_and_generate_job_description_item(html, False)
                        job_description['url'] = detail_page_url
                        self.tasks[task_id]['job_descriptions'].append(job_description)
                        data['message'] = f"Pages not from jrec cannot be parsed {detail_page_url}"

                    js_data = json.dumps(data)
                    yield f"data: {js_data}\n\n"

                save_data_to_excel(task_id)
                data = {'message': "done"}  # Not graceful
                js_data = json.dumps(data)
                yield f"data: {js_data}\n\n"

        @self.app.route("/", methods=["GET"])
        def fake_it():
            return render_template("html/index.html")

        @self.app.route("/download", methods=["GET"])
        def download():
            task_id = hash(request.url.__str__() + time.time().__str__()).__str__()
            self.tasks[task_id] = {'url': request.url.__str__()}
            self.tasks[task_id]['job_descriptions'] = []
            return render_template('html/download.html', task_id=task_id)

        @self.app.route('/start_download')
        def start_download():
            task_id = request.args.get('task_id', None)
            return Response(stream_with_context(generate_download_stream(task_id)), content_type='text/event-stream')

    def run(self):
        self.app.run(host='0.0.0.0', port=3333)
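
For reference, the /start_download route streams progress to the browser as server-sent events: each event is a single "data: {...}" JSON line followed by a blank line, and the stream finishes after a "done" message. A minimal client sketch (an illustration, not part of this commit), assuming the app is running locally on port 3333 and that task_id was read from a previously rendered /download page; the value below is a placeholder:

import json
import requests

task_id = "1234567890"  # placeholder: the real id is embedded in the /download page

with requests.get("http://127.0.0.1:3333/start_download",
                  params={"task_id": task_id}, stream=True, timeout=60) as resp:
    for line in resp.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data: "):
            continue  # skip the blank separator lines between events
        event = json.loads(line[len("data: "):])
        print(event.get("message"), event.get("increment"))
        if event.get("message") == "done":
            break
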
170 changes: 170 additions & 0 deletions crawler.py
@@ -0,0 +1,170 @@
from urllib.parse import urlparse, parse_qs, urlencode
import requests
from bs4 import BeautifulSoup


class Crawler:
    home_base_url = "https://jrecin.jst.go.jp"
    search_base_url = "https://jrecin.jst.go.jp/seek/SeekJorSearch?"

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,'
                  'application/signed-exchange;v=b3;q=0.7',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,ja;q=0.7',
        'Connection': 'keep-alive',
        'Referer': 'https://jrecin.jst.go.jp/seek/SeekJorSearch',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Chromium";v="122", "Not(A:Brand";v="24", "Google Chrome";v="122"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"macOS"',
    }

    def __init__(self, url):
        self.original_url = url

    def get_search_request_url(self, url):
        parsed_url = urlparse(url if url else self.original_url)
        query_params = parse_qs(parsed_url.query)
        query_string = urlencode(query_params, doseq=True)
        return Crawler.search_base_url + query_string
    def do_request(self, url) -> (bool, str, str):
        response = requests.get(url, headers=Crawler.headers, timeout=20)

        success = True
        message = f"Request {url} succeeded"

        if response.status_code != 200:
            success = False
            message = f'Request {url} failed, status_code: {response.status_code}'

        return success, message, response.text

    def parse_item_count_and_generate_search_page_url(self, html) -> list:
        soup = BeautifulSoup(html, 'html.parser')  # 'lxml'
        item_count = soup.find('span', class_="em_text me-2")
        count = int(item_count.text) if item_count else 0
        pages = 0
        if count % 50 != 0:
            pages = int((count / 50) + 1)
        else:
            pages = int(count / 50)

        search_page_urls = [
            self.get_search_request_url(f'{self.original_url}&page={page + 1}&dispcount=50') for page in
            range(pages)]

        # for url in search_page_urls:
        #     print(url)
        return search_page_urls

    def parse_search_page_and_generte_item_detail_page_url(self, html) -> list:
        soup = BeautifulSoup(html, 'html.parser')
        a_tags = soup.find_all('a', class_='btn dtl_btn text-center mb-2')
        detail_page_urls = [f"{Crawler.home_base_url}{tag.get('href')}" for tag in a_tags]

        button_tags = soup.find_all('button', class_='btn dtl_btn text-center mb-2')
        outside_detail_page_urls = [tag.get('data-url') for tag in button_tags]

        # print(len(detail_page_urls), detail_page_urls)
        # print(len(outside_detail_page_urls), outside_detail_page_urls)

        detail_page_urls.extend(outside_detail_page_urls)
        return detail_page_urls

    def parse_detail_page_and_generate_job_description_item(self, html, flag=True) -> dict:
        if flag:
            soup = BeautifulSoup(html, 'html.parser')

            def clean_text(element):
                if element:
                    return element.get_text(strip=True).replace("\n", "").replace("\t", "")
                return ""

            location = clean_text(soup.find('p', string=lambda x: x and '勤務地' in x))
            research_field = clean_text(soup.find('p', string=lambda x: x and '研究分野' in x))
            start_date = clean_text(soup.find('p', string=lambda x: x and '公開開始日' in x))
            end_date = clean_text(soup.find('p', string=lambda x: x and '募集終了日' in x))

            salary = ""
            salaries = soup.find('p', string='給与')
            if salaries:
                salary_list = salaries.find_next_sibling('ul')
                if salary_list:
                    salary_details = salary_list.find_all('li')
                    salary = ''.join([clean_text(salary_detail) for salary_detail in salary_details])

            job_type = ""
            job_types = soup.find('p', string='職種')
            if job_types:
                job_type_list = job_types.find_next_sibling('ul')
                if job_type_list:
                    job_types = job_type_list.find_all('li')
                    job_type = ''.join([clean_text(type) for type in job_types])

            qualification = ""
            education_requirements = soup.find_all(string="応募に必要な学歴・学位")
            for item in education_requirements:
                parent = item.find_parent('li')
                if parent:
                    qualification += clean_text(parent)

            experience_requirements = soup.find_all(string="業務における経験")
            for item in experience_requirements:
                parent = item.find_parent('li')
                if parent:
                    qualification += clean_text(parent)

            # print(f'Location: {location}')
            # print(f'Research Field: {research_field}')
            # print(f'Start Date: {start_date}')
            # print(f'End Date: {end_date}')
            # print(f'Job Type: {job_type}')
            # print(f'Salary: {salary}')
            # print(f'Qualification: {qualification}')

            return {
                "location": location,
                "research_field": research_field,
                "start_date": start_date,
                "end_date": end_date,
                "job_type": job_type,
                "salary": salary,
                "qualification": qualification
            }
        else:
            return {
                "location": None,
                "research_field": None,
                "start_date": None,
                "end_date": None,
                "job_type": None,
                "salary": None,
                "qualification": None
            }


# only for test
if __name__ == '__main__':
    page = 1
    dispcount = 50
    c = Crawler("http://127.0.0.1:3333/download?keyword_and=&keyword_or=&keyword_not=&_bgCode=1&bgCode=00289&_smCode"
                "=1&_orgtype=1&_area=1&_prefecture=1&_jobkind=1&jobkind=00011&jobkind=00022&jobkind=00014&jobkind"
                "=00016&jobkind=00018&jobkind=00020&jobkind=00024&jobkind=00026&jobkind=00028&jobkind=00030&jobkind"
                "=00033&employmentform=01&_employmentform=on&employmentform=02&_employmentform=on&_employmentform=on"
                "&_employmentform=on&_employmentform=on&_employmentform=on&_jobterm=on&_jobterm=on&_jobterm=on"
                "&_jobterm=on&wageValue=&_consideration=on&_consideration=on&_consideration=on&_entry=on&_entry=on"
                "&_entry=on&_duration=on&_duration=on&_duration=on&sort=0&fn=0")
    success, message, html = c.do_request(c.get_search_request_url(None))
    search_page_urls = c.parse_item_count_and_generate_search_page_url(html)

    detail_page_urls = []
    for search_url in search_page_urls:
        success, message, html = c.do_request(search_url)
        detail_page_urls.extend(c.parse_search_page_and_generte_item_detail_page_url(html))

    print(len(detail_page_urls))

    print(detail_page_urls[1])
    success, message, html = c.do_request(detail_page_urls[1])
    c.parse_detail_page_and_generate_job_description_item(html)
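
As a quick illustration of what get_search_request_url does: it keeps only the query string of whatever URL the app received on /download and re-attaches it to the JREC-IN search endpoint. A small sketch (not part of this commit; the query values below are made up):

from crawler import Crawler

c = Crawler("http://127.0.0.1:3333/download?keyword_and=python&page=1&dispcount=50")
print(c.get_search_request_url(None))
# prints: https://jrecin.jst.go.jp/seek/SeekJorSearch?keyword_and=python&page=1&dispcount=50
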
5 changes: 5 additions & 0 deletions main.py
@@ -0,0 +1,5 @@
from app import App

if __name__ == '__main__':
app = App()
app.run()
5 changes: 5 additions & 0 deletions requirements.txt
@@ -0,0 +1,5 @@
beautifulsoup4==4.12.3
flask==3.0.2
openpyxl==3.1.2
pandas==2.2.1
requests==2.31.0