diff --git a/LICENSE b/LICENSE
index 896da30..0b7f64c 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2020 lumyjuwon
+Copyright (c) 2022 lumyjuwon
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/dist/KoreaNewsCrawler-1.51-py3-none-any.whl b/dist/KoreaNewsCrawler-1.51-py3-none-any.whl
new file mode 100644
index 0000000..b71894c
Binary files /dev/null and b/dist/KoreaNewsCrawler-1.51-py3-none-any.whl differ
diff --git a/korea_news_crawler/articlecrawler.py b/korea_news_crawler/articlecrawler.py
index 524ee5a..2d883fa 100644
--- a/korea_news_crawler/articlecrawler.py
+++ b/korea_news_crawler/articlecrawler.py
@@ -18,7 +18,7 @@ def __init__(self):
         self.categories = {'정치': 100, '경제': 101, '사회': 102, '생활문화': 103, '세계': 104, 'IT과학': 105, '오피니언': 110,
                            'politics': 100, 'economy': 101, 'society': 102, 'living_culture': 103, 'world': 104, 'IT_science': 105, 'opinion': 110}
         self.selected_categories = []
-        self.date = {'start_year': 0, 'start_month': 0, 'end_year': 0, 'end_month': 0}
+        self.date = {'start_year': 0, 'start_month': 0, 'start_day' : 0, 'end_year': 0, 'end_month': 0, 'end_day':0}
         self.user_operating_system = str(platform.system())
 
     def set_category(self, *args):
@@ -27,47 +27,94 @@ def set_category(self, *args):
             raise InvalidCategory(key)
         self.selected_categories = args
 
-    def set_date_range(self, start_year, start_month, end_year, end_month):
-        args = [start_year, start_month, end_year, end_month]
+    def set_date_range(self, start_date:str, end_date:str):
+        start = list(map(int, start_date.split("-")))
+        end = list(map(int, end_date.split("-")))
+
+        # Setting Start Date
+        if len(start) == 1: # Input Only Year
+            start_year = start[0]
+            start_month = 1
+            start_day = 1
+        elif len(start) == 2: # Input Year and month
+            start_year, start_month = start
+            start_day = 1
+        elif len(start) == 3: # Input Year, month and day
+            start_year, start_month, start_day = start
+
+        # Setting End Date
+        if len(end) == 1: # Input Only Year
+            end_year = end[0]
+            end_month = 12
+            end_day = 31
+        elif len(end) == 2: # Input Year and month
+            end_year, end_month = end
+            end_day = calendar.monthrange(end_year, end_month)[1]
+        elif len(end) == 3: # Input Year, month and day
+            end_year, end_month, end_day = end
+
+        args = [start_year, start_month, start_day, end_year, end_month, end_day]
+
         if start_year > end_year:
             raise InvalidYear(start_year, end_year)
         if start_month < 1 or start_month > 12:
             raise InvalidMonth(start_month)
         if end_month < 1 or end_month > 12:
             raise InvalidMonth(end_month)
+        if start_day < 1 or calendar.monthrange(start_year, start_month)[1] < start_day:
+            raise InvalidDay(start_day)
+        if end_day < 1 or calendar.monthrange(end_year, end_month)[1] < end_day:
+            raise InvalidDay(end_day)
         if start_year == end_year and start_month > end_month:
             raise OverbalanceMonth(start_month, end_month)
+        if start_year == end_year and start_month == end_month and start_day > end_day:
+            raise OverbalanceDay(start_day, end_day)
+
         for key, date in zip(self.date, args):
             self.date[key] = date
         print(self.date)
 
     @staticmethod
-    def make_news_page_url(category_url, start_year, end_year, start_month, end_month):
+    def make_news_page_url(category_url, date):
         made_urls = []
-        for year in range(start_year, end_year + 1):
-            target_start_month = start_month
-            target_end_month = end_month
-
-            if start_year != end_year:
-                if year == start_year:
-                    target_start_month = start_month
+        for year in range(date['start_year'], date['end_year'] + 1):
+            if date['start_year'] == date['end_year']:
+                target_start_month = date['start_month']
+                target_end_month = date['end_month']
+            else:
+                if year == date['start_year']:
+                    target_start_month = date['start_month']
                     target_end_month = 12
-                elif year == end_year:
+                elif year == date['end_year']:
                     target_start_month = 1
-                    target_end_month = end_month
+                    target_end_month = date['end_month']
                 else:
                     target_start_month = 1
                     target_end_month = 12
-
+
             for month in range(target_start_month, target_end_month + 1):
-                for month_day in range(1, calendar.monthrange(year, month)[1] + 1):
+                if date['start_month'] == date['end_month']:
+                    target_start_day = date['start_day']
+                    target_end_day = date['end_day']
+                else:
+                    if year == date['start_year'] and month == date['start_month']:
+                        target_start_day = date['start_day']
+                        target_end_day = calendar.monthrange(year, month)[1]
+                    elif year == date['end_year'] and month == date['end_month']:
+                        target_start_day = 1
+                        target_end_day = date['end_day']
+                    else:
+                        target_start_day = 1
+                        target_end_day = calendar.monthrange(year, month)[1]
+
+                for day in range(target_start_day, target_end_day + 1):
                     if len(str(month)) == 1:
                         month = "0" + str(month)
-                    if len(str(month_day)) == 1:
-                        month_day = "0" + str(month_day)
+                    if len(str(day)) == 1:
+                        day = "0" + str(day)
 
                     # 날짜별로 Page Url 생성
-                    url = category_url + str(year) + str(month) + str(month_day)
+                    url = category_url + str(year) + str(month) + str(day)
 
                     # totalpage는 네이버 페이지 구조를 이용해서 page=10000으로 지정해 totalpage를 알아냄
                     # page=10000을 입력할 경우 페이지가 존재하지 않기 때문에 page=totalpage로 이동 됨 (Redirect)
@@ -94,12 +141,11 @@ def crawling(self, category_name):
         writer = Writer(category='Article', article_category=category_name, date=self.date)
         # 기사 url 형식
         url_format = f'http://news.naver.com/main/list.nhn?mode=LSD&mid=sec&sid1={self.categories.get(category_name)}&date='
-        # start_year년 start_month월 ~ end_year의 end_month 날짜까지 기사를 수집합니다.
-        target_urls = self.make_news_page_url(url_format, self.date['start_year'], self.date['end_year'], self.date['start_month'], self.date['end_month'])
-
-        print(category_name + " Urls are generated")
-        print("The crawler starts")
+        # start_year년 start_month월 start_day일 부터 ~ end_year년 end_month월 end_day일까지 기사를 수집합니다.
+        target_urls = self.make_news_page_url(url_format, self.date)
 
+        print(f'{category_name} Urls are generated')
+        print(f'{category_name} is collecting ...')
         for url in target_urls:
             request = self.get_url_data(url)
             document = BeautifulSoup(request.content, 'html.parser')
@@ -186,5 +232,5 @@ def start(self):
 if __name__ == "__main__":
     Crawler = ArticleCrawler()
     Crawler.set_category('생활문화')
-    Crawler.set_date_range(2018, 1, 2018, 2)
+    Crawler.set_date_range('2018-01', '2018-02')
     Crawler.start()
diff --git a/korea_news_crawler/exceptions.py b/korea_news_crawler/exceptions.py
index eebae56..9ede287 100644
--- a/korea_news_crawler/exceptions.py
+++ b/korea_news_crawler/exceptions.py
@@ -53,6 +53,15 @@ def __init__(self, month):
     def __str__(self):
         return self.message
 
+# 일이 올바르지 않을 때
+class InvalidDay(Exception):
+    def __init__(self, day):
+        self.message = f'{day} is an invalid day'
+
+    def __str__(self):
+        return self.message
+
+
 
 # 시작 달과 끝나는 달이 올바르지 않을 때
 class OverbalanceMonth(Exception):
@@ -62,6 +71,13 @@ def __init__(self, start_month, end_month):
     def __str__(self):
         return self.message
 
+class OverbalanceDay(Exception):
+    def __init__(self, start_day, end_day):
+        self.message = f'{start_day}(start day) is an overbalance with {end_day}(end day)'
+
+    def __str__(self):
+        return self.message
+
 
 # 실행시간이 너무 길어서 데이터를 얻을 수 없을 때
 class ResponseTimeout(Exception):
diff --git a/korea_news_crawler/sample.py b/korea_news_crawler/sample.py
index 0818d5b..6faef12 100644
--- a/korea_news_crawler/sample.py
+++ b/korea_news_crawler/sample.py
@@ -1,9 +1,8 @@
 from korea_news_crawler.articlecrawler import ArticleCrawler
-
 if __name__ == "__main__":
     Crawler = ArticleCrawler()
     # 정치, 경제, 생활문화, IT과학, 사회, 세계 카테고리 사용 가능
-    Crawler.set_category("IT과학", "경제", "생활문화", "IT과학", "사회", "세계")
-    # 2017년 12월부터 2018년 1월까지 크롤링 시작
-    Crawler.set_date_range(2017, 12, 2018, 1)
+    Crawler.set_category("IT과학", "세계")
+    # 2017년 12월 (1일) 부터 2018년 1월 13일까지 크롤링 시작 YYYY-MM-DD의 형식으로 입력
+    Crawler.set_date_range('2017-12', '2018-01-13')
     Crawler.start()
diff --git a/korea_news_crawler/writer.py b/korea_news_crawler/writer.py
index 80968a4..3cca41f 100644
--- a/korea_news_crawler/writer.py
+++ b/korea_news_crawler/writer.py
@@ -7,9 +7,10 @@ class Writer(object):
     def __init__(self, category, article_category, date):
         self.start_year = date['start_year']
         self.start_month = f'0{date["start_month"]}' if len(str(date['start_month'])) == 1 else str(date['start_month'])
+        self.start_day = f'0{date["start_day"]}' if len(str(date['start_day'])) == 1 else str(date['start_day'])
         self.end_year = date['end_year']
         self.end_month = f'0{date["end_month"]}' if len(str(date['end_month'])) == 1 else str(date['end_month'])
-
+        self.end_day = f'0{date["end_day"]}' if len(str(date['end_day'])) == 1 else str(date['end_day'])
         self.file = None
         self.initialize_file(category, article_category)
 
@@ -20,7 +21,7 @@ def initialize_file(self, category, article_category):
         if os.path.exists(output_path) is not True:
             os.mkdir(output_path)
 
-        file_name = f'{output_path}/{category}_{article_category}_{self.start_year}{self.start_month}_{self.end_year}{self.end_month}.csv'
+        file_name = f'{output_path}/{category}_{article_category}_{self.start_year}{self.start_month}{self.start_day}_{self.end_year}{self.end_month}{self.end_day}.csv'
         if os.path.isfile(file_name):
             raise ExistFile(file_name)
 
diff --git a/setup.cfg b/setup.cfg
index 224a779..0f94f37 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,2 +1,2 @@
 [metadata]
-description-file = README.md
\ No newline at end of file
+description_file = README.md
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 13b0d6e..a75ab27 100644
--- a/setup.py
+++ b/setup.py
@@ -1,16 +1,16 @@
 from setuptools import setup
 
 # build package command: python setup.py bdist_wheel
-# release package command: twine upload dist/KoreaNewsCrawler-version-py3-none-any.whl
+# release package command: twine upload dist/KoreaNewsCrawler-${version}-py3-none-any.whl
 
 setup(
     name = 'KoreaNewsCrawler',
-    version = '1.50',
+    version = '1.51',
     description = 'Crawl the korean news',
     author = 'lumyjuwon',
     author_email = 'lumyjuwon@gmail.com',
     url = 'https://github.com/lumyjuwon/KoreaNewsCrawler',
-    download_url = 'https://github.com/lumyjuwon/KoreaNewsCrawler/archive/1.50.tar.gz',
+    download_url = 'https://github.com/lumyjuwon/KoreaNewsCrawler/archive/1.51.tar.gz',
     install_requires = ['requests', 'beautifulsoup4'],
     packages = ['korea_news_crawler'],
     keywords = ['crawl', 'KoreaNews', 'crawler'],