#!/usr/bin/env python
# bscrawler lite by Ian Lurie
# Huge kudos to BeautifulSoup
# The lite version differs only in that it does not use database storage.
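# Assumed environment: Python 2.x with BeautifulSoup 3 importable as
# `from BeautifulSoup import BeautifulSoup` (the script uses print statements,
# httplib, and urllib2, so it will not run under Python 3).
# Example invocation:
#   python cmcrawler.py http://www.portent.com/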
import sys
import httplib
import urllib2
from urllib2 import Request, urlopen, URLError
import urlparse
import string
from BeautifulSoup import BeautifulSoup, SoupStrainer
from time import gmtime, strftime, time
print "start time ",strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime()),"\n\n\n"
try:
    root = sys.argv[1]
except IndexError:
    print " Usage: ./bscrawler.py link"
    print " Example: ./bscrawler.py http://www.portent.com/"
    exit()
linkz = []
crawled = []
errorz = []
imgz = []
counter = 0
start = time()
result=0
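# linkz is the crawl frontier of page URLs (it grows as new links are found);
# imgz collects image URLs. crawled, errorz, and result appear unused in this
# lite version and are presumably kept for parity with the full crawler.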
parsedRoot = urlparse.urlparse(root)
if parsedRoot.port == 80:
    hostRoot = parsedRoot.netloc[:-3]  # drop the explicit ':80' suffix
else:
    hostRoot = parsedRoot.netloc
linkz.append(root)
conn = httplib.HTTPConnection(hostRoot) # open http request
for l in linkz:
    pagelinkz = []
    giffound = l.find('.gif')
    jpgfound = l.find('.jpg')
    pngfound = l.find('.png')
    pdffound = l.find('.pdf')
    try:
        conn = urllib2.urlopen(l)  # get the url
        src = conn.read()  # read page contents
        code = conn.code  # read response code - later need to make this more sensible
        links = SoupStrainer('a')  # grab all anchors
        imgs = SoupStrainer('img')  # grab all img elements
        bs = BeautifulSoup(src, parseOnlyThese=links)  # parse for anchors
        try:
            # only crawl the current URL if it is not an image or a PDF
            if (giffound == -1) & (jpgfound == -1) & (pngfound == -1) & (pdffound == -1):
                print "Crawling\t", l, "\t", code
                # loop through all of the anchors found on the page
                # the crawler only records the FIRST time it finds a link: if a link is on
                # 20 pages it will still only show up once in the log
                for j in bs.findAll('a', {'href': True}):
                    try:
                        testresult = 0
                        absUrl = urlparse.urljoin(l, j['href'])
                        absUrl = absUrl.strip()
                        parsedUrl = urlparse.urlparse(absUrl)
                        # check for any images that snuck in via a link
                        giffound = absUrl.find('.gif')
                        jpgfound = absUrl.find('.jpg')
                        pngfound = absUrl.find('.png')
                        if (giffound == -1) & (jpgfound == -1) & (pngfound == -1):
                            filetype = 1  # regular page
                        else:
                            filetype = 2  # image file
                        if parsedUrl.port == 80:
                            hostUrl = parsedUrl.netloc[:-3]  # drop the explicit ':80' suffix
                        else:
                            hostUrl = parsedUrl.netloc
                        absUrl = urlparse.urlunparse((parsedUrl.scheme, hostUrl, parsedUrl.path, parsedUrl.params, parsedUrl.query, parsedUrl.fragment))
                        # only queue http links on the crawled domain that haven't been seen yet
                        if (parsedUrl.scheme == 'http') & \
                           ((parsedUrl.netloc.endswith('.' + hostRoot)) | (parsedUrl.netloc == hostRoot)) & \
                           (absUrl not in linkz) & (filetype == 1):
                            tester = absUrl.find('#')
                            if tester == -1:  # skip fragment-only variations of a URL
                                cleanUrl = absUrl.strip()
                                print '\t' + cleanUrl + '\tpage'
                                linkz.append(cleanUrl)
                                counter = counter + 1
                            else:
                                counter = counter + 1
                    except:
                        pass
                # now try to grab the images on the same page
                # the crawler records EVERY place images are found, not just the first.
                # Long story, but we needed this at Portent.
                bsi = BeautifulSoup(src, parseOnlyThese=imgs)
                for i in bsi.findAll('img', {'src': True}):
                    absUrl = urlparse.urljoin(l, i['src'])
                    parsedUrl = urlparse.urlparse(absUrl)
                    conn = urllib2.urlopen(absUrl)  # get the url
                    icode = conn.code  # read response code
                    try:
                        if parsedUrl.port == 80:
                            hostUrl = parsedUrl.netloc[:-3]
                        else:
                            hostUrl = parsedUrl.netloc
                        absUrl = urlparse.urlunparse((parsedUrl.scheme, hostUrl, parsedUrl.path, parsedUrl.params, parsedUrl.query, parsedUrl.fragment))
                        if (parsedUrl.scheme == 'http') & \
                           ((parsedUrl.netloc.endswith('.' + hostRoot)) | (parsedUrl.netloc == hostRoot)):
                            cleanUrl = absUrl.strip()
                            cleanUrl = cleanUrl.replace('&amp;', '&')  # decode HTML-escaped ampersands
                            print '\t', cleanUrl, '\timage', '\t', icode
                            imgz.append(cleanUrl)
                            counter = counter + 1
                    except:
                        print sys.exc_info()  # debugging only; disable this if you want clean output
                        pass
        except:
            print sys.exc_info()  # debugging only; disable this if you want clean output
            pass
    except URLError, e:
        if hasattr(e, 'reason'):
            print 'Reason: ', e.reason
        elif hasattr(e, 'code'):
            print "Crawling\t", l, "\t", e.code
        else:
            pass
    except:
        print sys.exc_info()  # debugging only; disable this if you want clean output
        pass
print "Completed at ",strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime()),"\n\n\n",counter," urls in ", (time() - start)