-
Notifications
You must be signed in to change notification settings - Fork 0
/
dds_scrape.py
executable file
·58 lines (48 loc) · 1.79 KB
/
dds_scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import urllib2, urllib, webbrowser
from BeautifulSoup import BeautifulSoup
import re
import difflib
def removeTags(thing):
    """Flatten every tag nested inside ``thing`` into its rendered contents.

    Each tag returned by ``thing.findAll(True)`` is replaced, in place, with
    the result of its own ``renderContents()``.  ``thing`` is modified and
    nothing is returned.
    """
    nested_tags = thing.findAll(True)
    for nested in nested_tags:
        inner_markup = nested.renderContents()
        nested.replaceWith(inner_markup)
def unescape(s):
    """Decode the basic HTML entities found in scraped text.

    Handles ``&lt;``, ``&gt;``, ``&nbsp;`` (turned into a plain space) and
    ``&amp;``.  Any other entity is left untouched.

    Args:
        s: Text that may contain the entities listed above.

    Returns:
        A copy of ``s`` with those entities replaced by their characters.
    """
    s = s.replace("&lt;", "<")
    s = s.replace("&gt;", ">")
    s = s.replace("&nbsp;", " ")
    # "&amp;" has to be decoded last: doing it earlier would turn escaped text
    # such as "&amp;lt;" into "&lt;" and then wrongly decode it a second time.
    s = s.replace("&amp;", "&")
    return s
def removeHTML(string):
    """Return ``string`` with every ``<...>`` span deleted.

    The match is non-greedy and may cross newlines, so each tag is removed
    individually even when it is wrapped over several lines.  A ``<`` with no
    later ``>`` is left as it is.
    """
    tag_pattern = re.compile("<(.|\n)*?>")
    return tag_pattern.sub("", string)
def cleanString(string):
    """Strip markup from ``string`` and trim surrounding whitespace.

    The text is first passed through ``removeHTML`` and the result then has
    its leading and trailing whitespace removed.
    """
    without_markup = removeHTML(string)
    return without_markup.strip()
def noms():
    """Download the dining page and return the food items listed on it.

    The page is fetched, parsed with BeautifulSoup, and the ``div`` with class
    ``a1`` is scanned.  Each ``h2`` inside it sets the current place name
    (presumably the dining location -- confirm against the live page).  Each
    ``li`` of the form ``category: item, item; item`` contributes one entry
    per item.

    Returns:
        A list of ``(food item, category, place)`` tuples.  The list is empty
        when the expected ``div`` is not present on the page.
    """
    # Single parenthesised string: prints identically under the Python 2
    # print statement this file uses, and is still valid as a function call.
    print('Downloading page...')
    page = urllib2.urlopen("http://www.dartmouth.edu/dining/")
    try:
        print('Page downloaded.')
        soup = BeautifulSoup(page)
    finally:
        # The response is fully consumed by the parser; release the socket.
        page.close()

    sidebar = soup.find("div", {"class": "a1"})
    if sidebar is None:
        # find() returns None when nothing matches; without this guard the
        # loop below would fail with an AttributeError.
        print('Menu section not found on page.')
        return []

    found = []
    currentPlace = ""
    for thing in sidebar.findAll():
        if thing.name == 'h2':
            removeTags(thing)
            currentPlace = thing.renderContents()
        elif thing.name == 'li':
            # Split on the first colon only, so a colon inside the item list
            # does not cut off the remaining items.
            parts = thing.renderContents().split(":", 1)
            if len(parts) > 1:
                category = cleanString(parts[0])
                items = unescape(parts[1]).replace(";", ",").split(",")
                for foodItem in items:
                    found.append((cleanString(foodItem), category, currentPlace))
    print('Page parsed.')
    return found
def stringInString(needle, haystack):
    """Report whether ``needle`` occurs in ``haystack``, ignoring case.

    Both arguments are lower-cased before the substring test.  An empty
    ``needle`` is considered to occur in any ``haystack``.
    """
    lowered_haystack = haystack.lower()
    lowered_needle = needle.lower()
    return lowered_needle in lowered_haystack
def close_match(stringA, stringB):
    """Report whether ``stringB`` is a close match for ``stringA``.

    ``difflib.get_close_matches`` is asked to pick matches for ``stringA``
    from the one-element list ``[stringB]`` using its default cutoff; the
    result is True when that list comes back non-empty.
    """
    candidates = [stringB]
    matches = difflib.get_close_matches(stringA, candidates)
    return bool(matches)
def removePunctuation(text):
    """Return ``text`` reduced to its ASCII letters and digits.

    Everything else -- punctuation, whitespace, underscores -- is removed, so
    two strings that differ only in spacing or punctuation compare equal.

    Args:
        text: The string to normalise.

    Returns:
        ``text`` with every character outside ``A-Z``, ``a-z`` and ``0-9``
        deleted.
    """
    # The class must spell out both letter ranges: "A-z" would also cover the
    # ASCII characters between "Z" and "a" ([ \ ] ^ _ `) and let them through.
    return re.sub(r"[^A-Za-z0-9]", "", text)
# Scrape the menu once, when this module is loaded.  NOTE: this rebinds the
# module-level name ``noms`` from the function defined above to the list it
# returns, so the function can no longer be called through that name.
noms = noms()
def isThere(nomName):
    """Return the scraped menu entries whose food name contains ``nomName``.

    Both the query and each entry's food name (the first element of the
    entry) are passed through ``removePunctuation`` and then compared with the
    case-insensitive ``stringInString``.

    Args:
        nomName: The food name, or part of one, to look for.

    Returns:
        A list of the matching entries from the module-level ``noms`` list, in
        their original order; empty when nothing matches.
    """
    # Normalise the query once rather than once per menu entry.
    wanted = removePunctuation(nomName)
    # A comprehension always yields a list, whereas filter() returns a lazy
    # iterator on Python 3.  No ``global`` is needed to read ``noms``.
    return [entry for entry in noms
            if stringInString(wanted, removePunctuation(entry[0]))]