scrape-data.py
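"""Scrape job postings from camhr.com with Selenium and BeautifulSoup.

Pages through the public job listing, opens every posting, pulls the
summary table, the description/requirements blocks, the company profile
and the recruiter contact details, and writes everything to camHr.csv.
"""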
import os
import json

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


class DataScraper:
    def __init__(self):
        # Resolve the directory this script lives in so relative paths work
        # regardless of the current working directory.
        self.base_path = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
        config_path = os.path.join(self.base_path, 'config.json')
        if not os.path.isfile(config_path):
            raise FileNotFoundError(f"Missing config file: {config_path}")
        with open(config_path) as config:
            self.config = json.load(config)
        # Selenium 3-style constructor: config['driver'] names the chromedriver
        # binary expected to sit next to this script.
        self.driver = webdriver.Chrome(os.path.join(self.base_path, self.config['driver']))

    def getUrlPerPage(self):
        """Collect the detail-page URL of every job on the current listing page."""
        urlList = []
        soup = BeautifulSoup(self.driver.page_source, "html.parser")
        jobs_list = soup.find('div', class_='jobs-list')
        job_items = jobs_list.find_all('div', class_="job-item")
        for job in job_items:
            anchorElement = job.find('a', class_="job-title")
            # Listing hrefs are relative; rebuild the absolute detail-page URL
            # under the site's '/a' path prefix.
            url = self.config['web_base_url'] + '/a' + anchorElement["href"][1:]
            urlList.append(url)
        return urlList

    def hasNextPage(self, currentPage):
        """Return True while the pagination widget shows pages beyond currentPage."""
        soup = BeautifulSoup(self.driver.page_source, "html.parser")
        paginationElement = soup.find('ul', class_="el-pager")
        liElement = paginationElement.find_all('li', class_="number")
        lastPage = liElement[-1].text
        return str(currentPage) != lastPage

    def main(self):
        urlList = []
        informations = []
        page = 1
        # Step 1: walk the paginated listing and collect every posting URL.
        while True:
            url = f'https://www.camhr.com/a/job?page={page}&param={{"page":{page},"size":50}}'
            self.driver.get(url)
            try:
                WebDriverWait(self.driver, 10).until(
                    EC.presence_of_all_elements_located((By.CLASS_NAME, 'job-item')))
            except TimeoutException:
                # The job items never rendered; retry the same page.
                continue
            urlList.extend(self.getUrlPerPage())
            if self.hasNextPage(page):
                page += 1
            else:
                break
        # Step 2: visit every collected URL and scrape the posting details.
        for url in urlList:
            self.driver.get(url)
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'job-maininfo')))
            soup = BeautifulSoup(self.driver.page_source, "html.parser")
            jobTitle = soup.find("span", class_='job-name-span').text
            # The summary table lists the twelve job attributes in a fixed order.
            table = soup.find("table", class_="mailTable")
            tableData = table.find_all("td")
            (level, term, yearOfExp, function, hiring, industry, salary,
             qualification, sex, language, age, location) = (td.text for td in tableData[:12])
            print(level, term, yearOfExp, function, hiring, industry, salary,
                  qualification, sex, language, age, location)
            print(url)
            # A posting carries one or two "job-descript" blocks: a description,
            # the requirements, or both.
            try:
                jobDescription = soup.find_all('div', class_='job-descript')
                if len(jobDescription) == 2:
                    description = jobDescription[0].find('div', class_="descript-list").text
                    jobRequirement = jobDescription[1].find('div', class_="descript-list").text
                else:
                    title = jobDescription[0].find('div', class_="descript-title").text
                    if title == 'Job Requirements':
                        jobRequirement = jobDescription[0].find('div', class_="descript-list").text
                        description = ""
                    else:
                        description = jobDescription[0].find('div', class_="descript-list").text
                        jobRequirement = ""
            except (AttributeError, IndexError):
                description = ""
                jobRequirement = ""
            # The company profile is optional; when present it occupies the first
            # jobdetail-item and pushes the recruiter block to the second.
            jobDetail = soup.find_all('div', class_='jobdetail-item')
            try:
                companyProfile = jobDetail[0].find('div', class_="company-info").text
            except (AttributeError, IndexError):
                companyProfile = ""
            if companyProfile:
                companyContact = jobDetail[1].find('div', class_="recruiter-info")
            else:
                companyContact = jobDetail[0].find('div', class_="recruiter-info")
            recruiterName = companyContact.find('span', class_="recruiter-name").text
            recruiterJob = companyContact.find('span', class_="recruiter-job").text
            recruiterInformation = companyContact.find("div", class_="recruiter-baseinfo")
            recruiterInformation = recruiterInformation.find_all("a", class_="d-inline-block")
            # Phone, email, and address links are all optional.
            try:
                recruiterNumber = recruiterInformation[0].text
            except IndexError:
                recruiterNumber = ""
            try:
                recruiterEmail = recruiterInformation[1].text
            except IndexError:
                recruiterEmail = ""
            try:
                recruiterLocation = recruiterInformation[2].text
            except IndexError:
                recruiterLocation = ""
            print(recruiterEmail, recruiterNumber, recruiterLocation, recruiterName, recruiterJob)
            sendDate = soup.find("div", class_="send-date")
            publishedDate = sendDate.find("span").text
            closeDate = sendDate.find("span", class_="close-date").text
            contact = (f'Contact Information {recruiterName} {recruiterJob} '
                       f'{recruiterNumber} {recruiterEmail} {recruiterLocation}')
            information = {
                "jobUrl": url, "job title": jobTitle, "position": jobTitle,
                "Level": level, "Year of Exp": yearOfExp, "Hiring": hiring,
                "Salary": salary, "Sex": sex, "Age": age, "Term": term,
                "Function/Category": function, "Industry": industry,
                "Qualification": qualification, "Language": language,
                "Location": location, "Job Description": description,
                "Job Requirement": jobRequirement, "Company Profile": companyProfile,
                "Publish Date": publishedDate, "Closing Date": closeDate,
                "Contact Info": contact,
            }
            informations.append(information)
        for info in informations:
            print(info)
        df = pd.DataFrame(informations)
        df.to_csv("camHr.csv")
        self.driver.quit()


if __name__ == "__main__":
    scraper = DataScraper()
    scraper.main()