Commit

repeat_request
minyoung347 committed Jul 3, 2019
1 parent 615e0aa commit c918851
Showing 1 changed file with 13 additions and 2 deletions.
15 changes: 13 additions & 2 deletions korea_news_crawler/articlecrawler.py
@@ -76,6 +76,16 @@ def make_news_page_url(category_url, start_year, end_year, start_month, end_month):
         for page in range(1, totalpage + 1):
             made_urls.append(url + "&page=" + str(page))
         return made_urls

+    def get_url_data(self, url, max_tries=10):
+        remaining_tries = int(max_tries)
+        while remaining_tries > 0:
+            try:
+                return requests.get(url)
+            except requests.exceptions.RequestException:
+                time.sleep(60)
+            remaining_tries = remaining_tries - 1
+        raise Exception("Couldn't get the data.")
+
     def crawling(self, category_name):
         # Multi Process PID
@@ -96,7 +106,8 @@ def crawling(self, category_name):
             regex = re.compile("date=(\d+)")
             news_date = regex.findall(URL)[0]
 
-            request = requests.get(URL)
+            request = self.get_url_data(URL)
+
             document = BeautifulSoup(request.content, 'html.parser')

             # html - newsflash_body - type06_headline, type06
@@ -115,7 +126,7 @@ def crawling(self, category_name):
                 sleep(0.01)
 
                 # Fetch the article HTML
-                request_content = requests.get(content_url)
+                request_content = self.get_url_data(content_url)
                 document_content = BeautifulSoup(request_content.content, 'html.parser')
 
                 try:
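
For reference, here is a minimal standalone sketch of the retry helper this commit introduces. It catches requests.exceptions.RequestException, the base class for all requests errors (the bare requests.exceptions module is not itself a catchable exception). The imports and the example URL below are assumptions for illustration; in the crawler itself, get_url_data is a method on the crawler class and relies on the module-level imports of requests and time.

    import time
    import requests

    def get_url_data(url, max_tries=10):
        # Try the request up to max_tries times; on any requests-level
        # error, wait 60 seconds before the next attempt.
        remaining_tries = int(max_tries)
        while remaining_tries > 0:
            try:
                return requests.get(url)
            except requests.exceptions.RequestException:
                time.sleep(60)
            remaining_tries = remaining_tries - 1
        raise Exception("Couldn't get the data.")

    # Hypothetical usage: the URL is only an example.
    response = get_url_data("https://news.naver.com", max_tries=3)
    print(response.status_code)

Note that the counter is decremented outside the try block: a successful requests.get returns immediately, so only failed attempts reach the decrement, and the final raise fires only after every retry is exhausted.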
