Skip to content

Commit

Permalink
Add OverbalanceMonth and Fix Parser
Browse files Browse the repository at this point in the history
뉴스 본문에서 "flash 우회 오류 ..." 내용 제거 및 왼쪽 공백 제거
start_month 및 end_month 조건 추가
운영체제에 따라 euc-kr 또는 utf-8 사용
  • Loading branch information
lumyjuwon committed May 16, 2019
1 parent 88d90de commit 69b1b21
Show file tree
Hide file tree
Showing 5 changed files with 44 additions and 14 deletions.
12 changes: 12 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,15 @@ korea_news_crawler/__pycache__/exceptions.cpython-36.pyc
korea_news_crawler/__pycache__/articleparser.cpython-36.pyc
korea_news_crawler/__pycache__/articlecrawler.cpython-36.pyc
.idea/KoreaNewsCrawler.iml
build/lib/korea_news_crawler/articlecrawler.py
build/lib/korea_news_crawler/articleparser.py
build/lib/korea_news_crawler/exceptions.py
build/lib/korea_news_crawler/sample.py
build/lib/korea_news_crawler/sportcrawler.py
dist/KoreaNewsCrawler-1.0-py3-none-any.whl
KoreaNewsCrawler.egg-info/dependency_links.txt
KoreaNewsCrawler.egg-info/not-zip-safe
KoreaNewsCrawler.egg-info/PKG-INFO
KoreaNewsCrawler.egg-info/requires.txt
KoreaNewsCrawler.egg-info/SOURCES.txt
KoreaNewsCrawler.egg-info/top_level.txt
26 changes: 17 additions & 9 deletions korea_news_crawler/articlecrawler.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# -*- coding: utf-8, euc-kr -*-

from time import sleep
from bs4 import BeautifulSoup
from multiprocessing import Process
from korea_news_crawler.exceptions import *
from korea_news_crawler.articleparser import ArticleParser
import os
import platform
import calendar
import requests
import csv
Expand All @@ -20,6 +21,7 @@ def __init__(self):
'politics': 100, 'economy': 101, 'society': 102, 'living_culture': 103, 'world': 104, 'IT_science': 105}
self.selected_categories = []
self.date = {'start_year': 0, 'start_month': 0, 'end_year': 0, 'end_month': 0}
self.user_operating_system = str(platform.system())

def set_category(self, *args):
for key in args:
Expand All @@ -35,6 +37,8 @@ def set_date_range(self, start_year, start_month, end_year, end_month):
raise InvalidMonth(start_month)
if end_month < 1 or end_month > 12:
raise InvalidMonth(end_month)
if start_month > end_month:
raise OverbalanceMonth(start_month, end_month)
for key, date in zip(self.date, args):
self.date[key] = date
print(self.date)
Expand Down Expand Up @@ -87,14 +91,18 @@ def crawling(self, category_name):
save_endmonth = str(self.date['end_month'])

# 각 카테고리 기사 저장 할 CSV
file = open('Article_' + category_name + '_' + str(self.date['start_year']) + save_startmonth\
+ '_' + str(self.date['end_year']) + save_endmonth + '.csv', 'w', encoding='utf-8', newline='')
# Windows use euc-kr
if self.user_operating_system == "Windows":
file = open('Article_' + category_name + '_' + str(self.date['start_year']) + save_startmonth
+ '_' + str(self.date['end_year']) + save_endmonth + '.csv', 'w', encoding='euc-kr', newline='')
else:
file = open('Article_' + category_name + '_' + str(self.date['start_year']) + save_startmonth
+ '_' + str(self.date['end_year']) + save_endmonth + '.csv', 'w', encoding='utf-8', newline='')
wcsv = csv.writer(file)
del save_startmonth, save_endmonth

# 기사 URL 형식
url = "http://news.naver.com/main/list.nhn?mode=LSD&mid=sec&sid1=" + str(
self.categories.get(category_name)) + "&date="
url = "http://news.naver.com/main/list.nhn?mode=LSD&mid=sec&sid1=" + str(self.categories.get(category_name)) + "&date="
# start_year년 start_month월 ~ end_year의 end_month 날짜까지 기사를 수집합니다.
final_urlday = self.make_news_page_url(url, self.date['start_year'], self.date['end_year'], self.date['start_month'], self.date['end_month'])
print(category_name + " Urls are generated")
Expand Down Expand Up @@ -145,7 +153,7 @@ def crawling(self, category_name):
tag_company = document_content.find_all('meta', {'property': 'me2:category1'})
text_company = '' # 언론사 초기화
text_company = text_company + str(tag_company[0].get('content'))
if not text_company: # 공백일 경우 기사 제외 처리 굳.
if not text_company: # 공백일 경우 기사 제외 처리
continue
# CSV 작성
wcsv.writerow([news_date, category_name, text_company, text_headline, text_sentence, content_url])
Expand All @@ -156,7 +164,7 @@ def crawling(self, category_name):
del request_content, document_content

except Exception as ex: # UnicodeEncodeError ..
wcsv.writerow([ex, content_url])
# wcsv.writerow([ex, content_url])
del request_content, document_content
pass
file.close()
Expand All @@ -170,6 +178,6 @@ def start(self):

if __name__ == "__main__":
Crawler = ArticleCrawler()
Crawler.set_category("생활문화","IT과학") # 정치, 경제, 생활문화, IT과학, 세계, 사회 카테고리 사용 가능
Crawler.set_date_range(2017, 1, 2018, 4) # 2017년 1월부터 2018년 4월까지 크롤링 시작
Crawler.set_category("생활문화", "IT과학")
Crawler.set_date_range(2017, 1, 2017, 1)
Crawler.start()
6 changes: 3 additions & 3 deletions korea_news_crawler/articleparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,14 @@
class ArticleParser(object):
def __init__(self):
self.special_symbol = re.compile('[\{\}\[\]\/?,;:|\)*~`!^\-_+<>@\#$&▲▶◆◀■【】\\\=\(\'\"]')
self.content_pattern = re.compile('본문 내용|TV플레이어| 동영상 뉴스|flash 오류를 우회하기 위한 함수 추가fuctio flashremoveCallback|tt|앵커 멘트|xa0')
self.content_pattern = re.compile('본문 내용|TV플레이어| 동영상 뉴스|flash 오류를 우회하기 위한 함수 추가function flash removeCallback|tt|앵커 멘트|xa0')

def clear_content(self, text):
# 기사 본문에서 필요없는 특수문자 및 본문 양식 등을 다 지움
newline_symbol_removed_text = text.replace('\\n','').replace('\\t','')
newline_symbol_removed_text = text.replace('\\n', '').replace('\\t', '')
special_symbol_removed_content = re.sub(self.special_symbol, ' ', newline_symbol_removed_text)
end_phrase_removed_content = re.sub(self.content_pattern, '', special_symbol_removed_content)
blank_removed_content = re.sub(' +', ' ',end_phrase_removed_content) # 공백 에러 삭제
blank_removed_content = re.sub(' +', ' ', end_phrase_removed_content).lstrip() # 공백 에러 삭제
reversed_content = ''.join(reversed(blank_removed_content)) # 기사 내용을 reverse 한다.
content = ''
for i in range(0, len(blank_removed_content)):
Expand Down
10 changes: 10 additions & 0 deletions korea_news_crawler/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,3 +51,13 @@ def __init__(self, month):

def __str__(self):
return str(self.message)


class OverbalanceMonth(Exception):
    """Raised when the crawl range's start month is later than its end month.

    Both bounds are kept on the instance so callers can inspect which
    values triggered the error.
    """

    def __init__(self, start_month, end_month):
        self.start_month = start_month
        self.end_month = end_month
        # Pre-render the human-readable message once at construction time.
        self.message = (
            f"start_month({self.start_month}) "
            f"is an overbalance with end_month({self.end_month})"
        )

    def __str__(self):
        return str(self.message)
4 changes: 2 additions & 2 deletions korea_news_crawler/sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@

if __name__ == "__main__":
Crawler = ArticleCrawler()
Crawler.set_category("IT과학", "경제") # 정치, 경제, 생활문화, IT과학, 사회 카테고리 사용 가능
Crawler.set_date_range(2017, 2018, 3) # 2017년 1월부터 2018년 3월까지 크롤링 시작
Crawler.set_category("IT과학", "경제") # 정치, 경제, 생활문화, IT과학, 사회, 세계 카테고리 사용 가능
Crawler.set_date_range(2017, 1, 2018, 3) # 2017년 1월부터 2018년 3월까지 크롤링 시작
Crawler.start()

0 comments on commit 69b1b21

Please sign in to comment.