Enable Ctrl+C to terminate program + more accurate timing in main thread

XuYan · Oct 19, 2016 · 7bfc3d0 · 7bfc3d0
1 parent 02252b4
commit 7bfc3d0
Show file tree

Hide file tree

Showing 4 changed files with 21 additions and 16 deletions.
diff --git a/crawler_mt.py b/crawler_mt.py
@@ -5,15 +5,14 @@
 import time
 import threading
 
-thread_list = []
-
 def defensiveCopy(info_list):
 	copy = []
 	for info in info_list:
 		copy.append(info)
 	return copy
 
 class CrawlThread(threading.Thread):
+	terminated = False
 	def __init__(self, crawler, info_list, current_level_info, next_url, next_level):
 		"""
 			Parameters
@@ -29,10 +28,12 @@ def __init__(self, crawler, info_list, current_level_info, next_url, next_level)
 		self.info_list = defensiveCopy(info_list)
 		self.next_url = next_url
 		self.next_level = next_level
+		self.daemon = True
 
 	def run(self):
-		self.info_list.extend(self.current_level_info)
-		self.crawler.crawl(self.next_url, self.next_level, self.info_list)
+		if not CrawlThread.terminated:
+			self.info_list.extend(self.current_level_info)
+			self.crawler.crawl(self.next_url, self.next_level, self.info_list)
 
 class Selector():
 	def __init__(self, css_selector):
@@ -82,7 +83,6 @@ def crawl(self, url, level, information_list):
 					current_level_info.append(list_info_list[j][i])
 				crawl_thread = CrawlThread(self, information_list, current_level_info, redirection_links[i], level + 1)
 				crawl_thread.start()
-				thread_list.append(crawl_thread)
 
 		else: # This is the last level in our crawling process
 			(target_length, list_info_list) = self.getListInfoList(html_doc, selectors)
@@ -208,12 +208,17 @@ def parseArgs():
 	return (args.url, args.css, args.domain)
 
 if __name__ == '__main__':
-	base_url, css_selectors, domain = parseArgs()
-	crawler = Crawler(css_selectors, domain)
-	start_time = time.time()
-	crawler.crawl(base_url, 0, [])
-	for thread in thread_list:
-		thread.join()
-	end_time = time.time()
-	print("Running Time: " + str(end_time - start_time) + " seconds")
-	crawler.output.close()
+	try:
+		base_url, css_selectors, domain = parseArgs()
+		crawler = Crawler(css_selectors, domain)
+		start_time = time.time()
+		crawler.crawl(base_url, 0, [])
+		while threading.active_count() > 1:
+			time.sleep(0.5)
+		end_time = time.time()
+		print("Running Time: " + str(end_time - start_time) + " seconds")
+		crawler.output.close()
+	except KeyboardInterrupt:
+		print("Program is terminated with ctrl + c")
+		CrawlThread.terminated = True
+		raise
diff --git a/crawler.py → crawler_st.py b/crawler.py → crawler_st.py
diff --git a/python_error_page b/python_error_page
@@ -1 +1 @@
-python crawler.py -url "https://docs.python.org/2/tutorial/errors.html" -css "information|attribute href|separate|h2 > a.headerlink"
+python crawler_st.py -url "https://docs.python.org/2/tutorial/errors.html" -css "information|attribute href|separate|h2 > a.headerlink"
diff --git a/yellow_page b/yellow_page
@@ -1 +1 @@
-python crawler.py -domain "http://www.yellowpages.com" -url "http://www.yellowpages.com/search?search_terms=event+coordinate&geo_location_terms=bellevue%2C+WA&page=1" -css "redirection|attribute href|separate|div.v-card > div.info > h3.n > a" "information|element|combination|div.sales-info > h1, information|attribute href|combination|div.business-card > section > footer > a.email-business"
+python crawler_st.py -domain "http://www.yellowpages.com" -url "http://www.yellowpages.com/search?search_terms=event+coordinate&geo_location_terms=bellevue%2C+WA&page=1" -css "redirection|attribute href|separate|div.v-card > div.info > h3.n > a" "information|element|combination|div.sales-info > h1, information|attribute href|combination|div.business-card > section > footer > a.email-business"
Original file line number	Diff line number	Diff line change
@@ -1 +1 @@
-python crawler.py -url "https://docs.python.org/2/tutorial/errors.html" -css "information|attribute href|separate|h2 > a.headerlink"
+python crawler_st.py -url "https://docs.python.org/2/tutorial/errors.html" -css "information|attribute href|separate|h2 > a.headerlink"
Original file line number	Diff line number	Diff line change
@@ -1 +1 @@
-python crawler.py -domain "http://www.yellowpages.com" -url "http://www.yellowpages.com/search?search_terms=event+coordinate&geo_location_terms=bellevue%2C+WA&page=1" -css "redirection|attribute href|separate|div.v-card > div.info > h3.n > a" "information|element|combination|div.sales-info > h1, information|attribute href|combination|div.business-card > section > footer > a.email-business"
+python crawler_st.py -domain "http://www.yellowpages.com" -url "http://www.yellowpages.com/search?search_terms=event+coordinate&geo_location_terms=bellevue%2C+WA&page=1" -css "redirection|attribute href|separate|div.v-card > div.info > h3.n > a" "information|element|combination|div.sales-info > h1, information|attribute href|combination|div.business-card > section > footer > a.email-business"