#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
This is a modification of the code sourced from this article: https://www.linkedin.com/pulse/how-easy-scraping-data-from-linkedin-profiles-david-craven/?trackingId=HUfuRSjER1iAyeWmcgHbyg%3D%3D
It is a web scraper scraping google for linkedin profiles; the use case would be recruiters sourcing target candidates for recruiting purposes.
Also copied the find_profiles function from here: https://www.pingshiuanchua.com/blog/post/scraping-search-results-from-google-search
"""
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from bs4.element import Tag
from time import sleep
from itertools import zip_longest
import csv
import parameters
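# parameters.py is a local module holding your credentials and output file
# name; its contents are not shown here. A minimal sketch of what it is
# assumed to contain, based on the attributes used below:
#
#   # parameters.py (hypothetical values -- replace with your own)
#   linkedin_username = 'you@example.com'
#   linkedin_password = 'your_password'
#   file_name = 'profiles.csv'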
# Extracts the title, link and description of each Google result on the
# current page. The page source is re-parsed on every call so that results
# from later pages are picked up after the "Next" button is clicked
def find_profiles():
    soup = BeautifulSoup(driver.page_source, 'lxml')
    # 'g' and 'st' are the classes Google used for organic results at the
    # time of writing; Google's markup changes often, so adjust as needed
    result_div = soup.find_all('div', attrs={'class': 'g'})
    for r in result_div:
        # Check that each element is present, else skip to the next result
        try:
            link = r.find('a', href=True)
            title = r.find('h3')
            # isinstance() returns True if an object is of the given type (Tag here)
            if isinstance(title, Tag):
                title = title.get_text()
            description = r.find('span', attrs={'class': 'st'})
            if isinstance(description, Tag):
                description = description.get_text()
            # Only append once all three pieces are present
            if link and title and description:
                links.append(link['href'])
                titles.append(title)
                descriptions.append(description)
        # Move on to the next result if an element is missing
        except Exception as e:
            print(e)
            continue
# Scrapes the current page, then clicks the "Next" button at the bottom of
# the Google results page to advance to the next page
def profiles_loop():
    find_profiles()
    next_button = driver.find_element(By.XPATH, '//*[@id="pnnext"]')
    next_button.click()

# Calls function f the given number of times
def repeat_fun(times, f):
    for i in range(times):
        f()
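# find_element raises NoSuchElementException when there is no further page;
# a more defensive variant (a sketch, not part of the original) would stop
# early instead of crashing mid-run:
#
#   from selenium.common.exceptions import NoSuchElementException
#   def profiles_loop():
#       find_profiles()
#       try:
#           driver.find_element(By.XPATH, '//*[@id="pnnext"]').click()
#       except NoSuchElementException:
#           print('No further result pages.')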
# With Selenium 4.6+ the bundled Selenium Manager downloads chromedriver
# automatically; on older versions pass the executable path instead, e.g.
# webdriver.Chrome('/Users/your_alias/project_folder/chromedriver')
driver = webdriver.Chrome()
# driver.get() navigates to the page at the given URL
driver.get('https://www.linkedin.com')
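# Optional sketch: to run without a visible browser window, create the driver
# above with Chrome options instead (note that headless browsers are more
# likely to be flagged as bots by LinkedIn and Google):
#
#   opts = webdriver.ChromeOptions()
#   opts.add_argument('--headless=new')
#   driver = webdriver.Chrome(options=opts)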
# locate the email field by its id
username = driver.find_element(By.ID, 'session_key')
# send_keys() simulates key strokes
username.send_keys(parameters.linkedin_username)
sleep(0.5)
# locate the password field by its id
password = driver.find_element(By.ID, 'session_password')
password.send_keys(parameters.linkedin_password)
sleep(0.5)
# locate the submit button by its class name
log_in_button = driver.find_element(By.CLASS_NAME, 'sign-in-form__submit-button')
# .click() mimics a button click
log_in_button.click()
sleep(0.5)
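# Fixed sleep() calls are fragile on slow networks. An explicit wait is more
# robust (a sketch using Selenium's standard WebDriverWait helpers, e.g.
# waiting until LinkedIn redirects to the feed after login):
#
#   from selenium.webdriver.support.ui import WebDriverWait
#   from selenium.webdriver.support import expected_conditions as EC
#   WebDriverWait(driver, 10).until(EC.url_contains('feed'))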
# navigate to Google
driver.get('https://www.google.com')
sleep(3)
# locate the search box by its name attribute
search_query = driver.find_element(By.NAME, 'q')
# type the search text; the site: operator restricts results to LinkedIn profile pages
search_query.send_keys('site:linkedin.com/in/ AND "python developer" AND "London"')
# simulate pressing the return key
search_query.send_keys(Keys.RETURN)
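# The query could also be assembled from variables rather than hard-coded
# (a sketch; 'keywords' and 'location' would be hypothetical additions to
# parameters.py):
#
#   query = 'site:linkedin.com/in/ AND "{}" AND "{}"'.format(
#       parameters.keywords, parameters.location)
#   search_query.send_keys(query)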
# initialize empty lists
links = []
titles = []
descriptions = []
# Run profiles_loop 10 times; change the number to scrape as many result pages as you like
repeat_fun(10, profiles_loop)
print(titles)
print(links)
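# Google sometimes repeats results across pages. A sketch for dropping
# duplicate (title, link) pairs while preserving order, if desired:
#
#   pairs = list(dict.fromkeys(zip(titles, links)))
#   titles = [t for t, l in pairs]
#   links = [l for t, l in pairs]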
# Keep just the first two words of each title (the first/last name); joining
# with a space avoids writing a Python list literal into the csv
titles01 = [' '.join(i.split()[0:2]) for i in titles]
# Store the scraped data in a .csv file
d = [titles01, links]
# zip_longest pads the shorter list with '' so every row has both columns
export_data = zip_longest(*d, fillvalue='')
# utf-8 handles names with characters outside the Latin-1 range
with open(parameters.file_name, 'w', encoding='utf-8', newline='') as myfile:
    wr = csv.writer(myfile)
    # the last two columns are left empty by the script
    wr.writerow(("Titles", "Links", "Current_Job", "Current_Location"))
    wr.writerows(export_data)
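# close the browser session once the csv has been written
driver.quit()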