-
Notifications
You must be signed in to change notification settings - Fork 4
/
scrape_mushroom_urls.py
110 lines (94 loc) · 3.71 KB
/
scrape_mushroom_urls.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# Functions to scrape data from mushroom.world
# If called from the command line, the script prints out a json object of mushroom information
# example:
# python scrape_mushroom_urls.py 'http://www.mushroom.world/show?n=Amanita-virosa' 'http://www.mushroom.world/show?n=Galerina-marginata'
#
# See the comments above the functions for further documentation
#
# Tuomo Nieminen 10/2017
from bs4 import BeautifulSoup
import requests
import sys
import re
import json
# Scrape mushroom url
#
# scrape_mushroom_url takes as input an url to a page in mushroom.world containing information
# related to a single mushroom and returns a python dictionary of information (including image url's)
# related to the mushroom.
#
# @param url An url to a mushroom web page in mushroom.world
#
# @return
# Returns a python dictionary containing the following keys:
# - name1: (string) Name of the mushroom.
# - name2 (string) Name given in parenthesis. Can be '' if no such name was given
# - images: (list) A list of image urls
# - info: (dict) A dictionary of information related to the mushroom.
# keys: Family, Location, Dimensions, Edibility, Description (dict)
# Description keys: General, Cap, Gills, Stem
#
# @examples
#
# from bs4 import BeautifulSoup
# import requests
#
# url = 'http://www.mushroom.world/show?n=Galerina-marginata'
# mushroom = scrape_mushroom(url)
# print(mushroom)
#
def scrape_mushroom(url):
# retrive site data as BeautifullSoup object
data = requests.get(url).text
soup = BeautifulSoup(data, 'html.parser')
# etract and parse name, labels (Family, Location, Dimensions, Edibility, Description)
# and content text related to the labels
name_content = soup.find(class_ = "caption").find("b").contents
names = re.sub('[^A-Za-z0-9( ]+', '', name_content[0]).split("(")
names = [n.strip() for n in names]
name1 = names[0]
if(len(names) > 1):
name2 = names[1]
else:
name2 = ''
labels = soup.find_all(class_ ="labelus")
labels = [label.contents[0] for label in labels]
texts = soup.find_all(class_ = "textus")
texts = [text.contents[0] for text in texts]
# extract mushroom description as a dictionary
description = soup.find(class_ = "longtextus").contents
description = [re.sub('[^A-Za-z0-9,.<> ]+', '', str(d)).strip() for d in description]
description = [re.sub('<b>', '', d) for d in description if (d != "") & (d != "<br>")]
description.insert(0, 'General')
description = dict(zip(description[0::2], description[1::2]))
texts.append(description)
assert len(labels) == len(texts)
# find image urls
images = soup.find(id="mushroom-list").find_all(class_ = "image")
image_urls = ['http://www.mushroom.world' + image.a["href"] for image in images]
# contruct the mushroom dictionary
mushroom = dict(name1 = name1, name2 = name2, images = image_urls, info = dict(),)
# add labels as keys and text as values
for i in range(len(labels)):
mushroom["info"][labels[i]] = texts[i]
return mushroom
# Get shrooms
# Get shoorm takes as input a list of mushroom urls and scrapes each url using scrape_mushroom().
#
# @param urls A list of mushroom.world mushroom page urls
#
# @return Returns a list of python dictionaries returned by scrape_mushroom().
# See the documentation of scrape_mushroom for details of the dictionaries.
# @examples
#
# urls = ['http://www.mushroom.world/show?n=Amanita-virosa', 'http://www.mushroom.world/show?n=Galerina-marginata']
# shrooms = GetShrooms(urls)
#
# import json
# json.dumps(shrooms)
#
def GetShrooms(urls):
return [scrape_mushroom(url) for url in urls]
if __name__ == '__main__':
shrooms = GetShrooms(sys.argv[1:])
print(json.dumps(shrooms))