yellowpages_scraper.py
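
"""Scrape business listings from yellowpages.com search results.

For each extracted listing the script collects the business name, email,
website, street address, city/state, and phone number, then writes the
rows to a CSV file.
"""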
import csv
import re
import time

import requests
from bs4 import BeautifulSoup

def extract_yellowpages_links(url, start_page=1, end_page=1, all_pages=False):
    """Extract individual business links from a yellowpages.com search URL."""
    url = url + "&page={0}"
    extracted_links = []
    page = start_page
    # With all_pages=True, keep going until a results page comes back empty;
    # otherwise stop after end_page (inclusive).
    while all_pages or page <= end_page:
        content = requests.get(url.format(page)).content
        soup = BeautifulSoup(content, "html.parser")
        # Each search result wraps the business name in an <a itemprop="name">.
        links = [a["href"] for a in soup.find_all("a", {"itemprop": "name"})]
        if not links:
            break
        extracted_links.extend(links)
        page += 1
    return extracted_links

def extracted_links_info(links, wait_interval=0):
    """Fetch each business page and collect its details."""

    def text_or_dash(tag):
        # Return the element's text, or a placeholder when it is missing.
        return tag.text if tag else "---"

    business_names = []
    business_emails = []
    business_websites = []
    street_addresses = []
    cities = []
    business_contact_numbers = []
    for link in links:
        time.sleep(wait_interval)  # optional delay between requests
        content = requests.get("http://www.yellowpages.com" + link).content
        soup = BeautifulSoup(content, "html.parser")
        business_names.append(soup.find("h1", {"itemprop": "name"}).text)
        # Email links look like "mailto:info@example.com"; strip the scheme.
        email = soup.find("a", {"class": "email-business"})
        business_emails.append(re.sub(r"^mailto:", "", email["href"]) if email else "---")
        site = soup.find("a", {"class": "custom-link"})
        business_websites.append(site["href"] if site else "---")
        street_addresses.append(text_or_dash(soup.find("p", {"class": "street-address"})))
        cities.append(text_or_dash(soup.find("p", {"class": "city-state"})))
        business_contact_numbers.append(text_or_dash(soup.find("p", {"class": "phone"})))
    return (business_names, business_emails, business_websites,
            street_addresses, cities, business_contact_numbers)

def save_to_csv(filename, extracted_info):
    """Save the collected columns as <filename>.csv in the current directory."""
    rows = list(zip(*extracted_info))  # transpose column lists into per-business rows
    with open("{0}.csv".format(filename), "w", newline="") as shops:
        writer = csv.writer(shops)
        writer.writerows(rows)

if __name__ == "__main__":
    extracted_links = extract_yellowpages_links(
        "http://www.yellowpages.com/search?search_terms=restaurants&geo_location_terms=tx",
        start_page=1, end_page=2)
    parsed_info = extracted_links_info(extracted_links)
    save_to_csv("test", parsed_info)
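
# A sketch of an alternative run, assuming the same search-URL shape: crawl
# every result page with all_pages=True and pause between detail requests via
# wait_interval. The "plumbers"/"ny" query below is illustrative only.
#
#     links = extract_yellowpages_links(
#         "http://www.yellowpages.com/search?search_terms=plumbers&geo_location_terms=ny",
#         all_pages=True)
#     info = extracted_links_info(links, wait_interval=1)
#     save_to_csv("plumbers_ny", info)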