RetrieveNewsByKeyword.py
# Reference: https://github.com/ranahaani/GNews
from gnews import GNews
from tqdm import tqdm
import json
from datetime import timedelta, datetime
import argparse

parser = argparse.ArgumentParser(description='Process arguments for crawling news')
parser.add_argument('--keywords', type=str, nargs='+',
                    help='A list of keywords to crawl news for')
parser.add_argument('--start_date', type=str,
                    help='The start date, in YYYY/MM/DD format')
parser.add_argument('--output_file', type=str, default="output.jsonl",
                    help='The output file. Default is output.jsonl')
parser.add_argument('--end_date', type=str,
                    help='The end date, in YYYY/MM/DD format. Note that news published on this day will also be crawled')
parser.add_argument('--country', type=str, default="United States",
                    help='The country of the news. This filter may not always take effect; check https://github.com/ranahaani/GNews#supported-countries for more details')
parser.add_argument('--language', type=str, default="english",
                    help='The language of the news. This filter may not always take effect; check https://github.com/ranahaani/GNews#supported-languages for more details')
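
# Example invocation (the keyword and date values below are hypothetical):
#   python RetrieveNewsByKeyword.py --keywords covid vaccine \
#       --start_date 2021/01/01 --end_date 2021/01/31 --output_file covid_news.jsonl
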
def main():
    # Parse arguments
    args = parser.parse_args()
    # Parse dates. We have to crawl one day at a time within this period because
    # GNews.get_news returns at most 100 results per query. In my tests there are
    # usually fewer than 100 articles per day once keywords are specified, but you
    # may want to double-check that for your own case.
    start_date = datetime.strptime(args.start_date, "%Y/%m/%d")
    end_date = datetime.strptime(args.end_date, "%Y/%m/%d")
    one_day = timedelta(days=1)
    # Maintain a set of URLs: duplicate news may be retrieved, so we check whether
    # each news URL already exists in this set and discard the duplicates.
    url_set = set()
source_name_set = set()
# Final output list
output_jsonl = []
# For showing stats
    stats = {
        "Total News": None,
        "Total News Sources": None,
        "Total News w/ Article": 0,
    }
# Start crawling news day by day
for i in tqdm(range((end_date-start_date).days+1)):
next_day = start_date + one_day
# Crawl one day at a time
google_news = GNews(start_date=(start_date.year, start_date.month, start_date.day), end_date=(next_day.year, next_day.month, next_day.day))
google_news.country = args.country
google_news.language = args.language
# Crawl news by each keyword
for keyword in args.keywords:
            # Retrieve a list of news for the current keyword
            news_list = google_news.get_news(keyword)
            for news in news_list:
                # Skip news whose URL already exists in url_set (a duplicate)
                if news['url'] in url_set:
                    continue
                # New article: record its URL and source name
                url_set.add(news['url'])
                source_name_set.add(news['publisher']['title'])
                # Try to download the full article; get_full_article returns None
                # when the article cannot be retrieved
                article = google_news.get_full_article(news['url'])
                if article is not None: # Have article
news_dict = {
"source_url": news['url'],
"source_name": news['publisher']['title'],
"title": news["title"],
"publish_time": news['published date'],
"content": [line for line in article.text.split("\n") if line != ""],
"img": list(article.images)
}
stats["Total News w/ Article"] +=1
else:
news_dict = {
"source_url": news['url'],
"source_name": news['publisher']['title'],
"title": news["title"],
"publish_time": news['published date'],
"content": [],
"img": []
}
output_jsonl.append(news_dict)
start_date = next_day
    # Write the output as JSON Lines (one JSON object per line), matching the
    # .jsonl default of --output_file
    with open(args.output_file, "w") as F:
        for news_dict in output_jsonl:
            F.write(json.dumps(news_dict) + "\n")
    print(f"Successfully wrote file to {args.output_file}")
# Print stats
stats["Total News"] = len(output_jsonl)
stats["Total News Sources"] = len(source_name_set)
for k, v in stats.items():
print(f"{k}: {v}")
if __name__ == "__main__":
main()
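
# The output file can be read back one record per line; a minimal sketch,
# assuming the default output file produced by this script:
#
#   with open("output.jsonl") as f:
#       records = [json.loads(line) for line in f]
#   print(records[0]["title"], records[0]["source_name"])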