From c918851f0bd9e4b1dc0d784f47244f981a98e794 Mon Sep 17 00:00:00 2001 From: minyoung347 Date: Wed, 3 Jul 2019 11:56:11 +0900 Subject: [PATCH 1/2] repeat_request --- korea_news_crawler/articlecrawler.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/korea_news_crawler/articlecrawler.py b/korea_news_crawler/articlecrawler.py index a372164..23566de 100644 --- a/korea_news_crawler/articlecrawler.py +++ b/korea_news_crawler/articlecrawler.py @@ -76,6 +76,16 @@ def make_news_page_url(category_url, start_year, end_year, start_month, end_mont for page in range(1, totalpage + 1): made_urls.append(url + "&page=" + str(page)) return made_urls + + def get_url_data(self, url, max_tries=10): + remaining_tries = int(max_tries) + while remaining_tries > 0: + try: + return requests.get(url) + except requests.exceptions.RequestException: + time.sleep(60) + remaining_tries = remaining_tries - 1 + raise Exception("Couldn't get the data.") def crawling(self, category_name): # Multi Process PID @@ -96,7 +106,8 @@ def crawling(self, category_name): regex = re.compile("date=(\d+)") news_date = regex.findall(URL)[0] - request = requests.get(URL) + request = self.get_url_data(URL) + document = BeautifulSoup(request.content, 'html.parser') # html - newsflash_body - type06_headline, type06 @@ -115,7 +126,7 @@ def crawling(self, category_name): sleep(0.01) # 기사 HTML 가져옴 - request_content = requests.get(content_url) + request_content = self.get_url_data(content_url) document_content = BeautifulSoup(request_content.content, 'html.parser') try: From 272185a3e768481b56ce92671bda163ea636cc82 Mon Sep 17 00:00:00 2001 From: minyoung347 Date: Wed, 3 Jul 2019 22:47:05 +0900 Subject: [PATCH 2/2] repeat_request & html error --- korea_news_crawler/articlecrawler.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/korea_news_crawler/articlecrawler.py b/korea_news_crawler/articlecrawler.py index 23566de..c4f40b4 100644 --- a/korea_news_crawler/articlecrawler.py +++ 
b/korea_news_crawler/articlecrawler.py @@ -127,7 +127,10 @@ def crawling(self, category_name): # 기사 HTML 가져옴 request_content = self.get_url_data(content_url) - document_content = BeautifulSoup(request_content.content, 'html.parser') + try: + document_content = BeautifulSoup(request_content.content, 'html.parser') + except Exception: + continue try: # 기사 제목 가져옴