Skip to content

Commit

Permalink
Enable Ctrl+C to terminate program + more accurate timing in main thread
Browse files Browse the repository at this point in the history
  • Loading branch information
XuYan committed Oct 19, 2016
1 parent 02252b4 commit 7bfc3d0
Show file tree
Hide file tree
Showing 4 changed files with 21 additions and 16 deletions.
33 changes: 19 additions & 14 deletions crawler_mt.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,14 @@
import time
import threading

thread_list = []

def defensiveCopy(info_list):
    """Return a new list holding the same items as *info_list*.

    The copy is shallow: the returned list is a distinct object, so
    appending to or extending it does not affect *info_list*, but the
    items themselves are shared between the two lists.

    Parameters
    ----------
    info_list : iterable
        The items to copy.

    Returns
    -------
    list
        A new list with the items of *info_list* in the same order.
    """
    # list() performs the element-by-element copy in one call, replacing
    # the hand-written append loop.
    return list(info_list)

class CrawlThread(threading.Thread):
terminated = False
def __init__(self, crawler, info_list, current_level_info, next_url, next_level):
"""
Parameters
Expand All @@ -29,10 +28,12 @@ def __init__(self, crawler, info_list, current_level_info, next_url, next_level)
self.info_list = defensiveCopy(info_list)
self.next_url = next_url
self.next_level = next_level
self.daemon = True

def run(self):
self.info_list.extend(self.current_level_info)
self.crawler.crawl(self.next_url, self.next_level, self.info_list)
if not CrawlThread.terminated:
self.info_list.extend(self.current_level_info)
self.crawler.crawl(self.next_url, self.next_level, self.info_list)

class Selector():
def __init__(self, css_selector):
Expand Down Expand Up @@ -82,7 +83,6 @@ def crawl(self, url, level, information_list):
current_level_info.append(list_info_list[j][i])
crawl_thread = CrawlThread(self, information_list, current_level_info, redirection_links[i], level + 1)
crawl_thread.start()
thread_list.append(crawl_thread)

else: # This is the last level in our crawling process
(target_length, list_info_list) = self.getListInfoList(html_doc, selectors)
Expand Down Expand Up @@ -208,12 +208,17 @@ def parseArgs():
return (args.url, args.css, args.domain)

if __name__ == '__main__':
base_url, css_selectors, domain = parseArgs()
crawler = Crawler(css_selectors, domain)
start_time = time.time()
crawler.crawl(base_url, 0, [])
for thread in thread_list:
thread.join()
end_time = time.time()
print("Running Time: " + str(end_time - start_time) + " seconds")
crawler.output.close()
try:
base_url, css_selectors, domain = parseArgs()
crawler = Crawler(css_selectors, domain)
start_time = time.time()
crawler.crawl(base_url, 0, [])
while threading.active_count() > 1:
time.sleep(0.5)
end_time = time.time()
print("Running Time: " + str(end_time - start_time) + " seconds")
crawler.output.close()
except KeyboardInterrupt:
print("Program is terminated with ctrl + c")
CrawlThread.terminated = True
raise
File renamed without changes.
2 changes: 1 addition & 1 deletion python_error_page
Original file line number Diff line number Diff line change
@@ -1 +1 @@
python crawler.py -url "https://docs.python.org/2/tutorial/errors.html" -css "information|attribute href|separate|h2 > a.headerlink"
python crawler_st.py -url "https://docs.python.org/2/tutorial/errors.html" -css "information|attribute href|separate|h2 > a.headerlink"
2 changes: 1 addition & 1 deletion yellow_page
Original file line number Diff line number Diff line change
@@ -1 +1 @@
python crawler.py -domain "http://www.yellowpages.com" -url "http://www.yellowpages.com/search?search_terms=event+coordinate&geo_location_terms=bellevue%2C+WA&page=1" -css "redirection|attribute href|separate|div.v-card > div.info > h3.n > a" "information|element|combination|div.sales-info > h1, information|attribute href|combination|div.business-card > section > footer > a.email-business"
python crawler_st.py -domain "http://www.yellowpages.com" -url "http://www.yellowpages.com/search?search_terms=event+coordinate&geo_location_terms=bellevue%2C+WA&page=1" -css "redirection|attribute href|separate|div.v-card > div.info > h3.n > a" "information|element|combination|div.sales-info > h1, information|attribute href|combination|div.business-card > section > footer > a.email-business"

0 comments on commit 7bfc3d0

Please sign in to comment.