# RetrieveNewsByURL.py
# https://github.com/ranahaani/GNews
from gnews import GNews
from tqdm import tqdm
import json
from multiprocessing import Pool
import argparse

parser = argparse.ArgumentParser(description='Process arguments for crawling news')
parser.add_argument('--url_list', type=str, nargs='+',
                    help='A list of URLs to crawl. Specify either url_list or url_list_file.')
parser.add_argument('--url_list_file', type=str,
                    help='A txt file with one URL per line. Specify either url_list or url_list_file.')
parser.add_argument('--output_file', type=str, default="output.jsonl",
                    help='The output file. Default is output.jsonl.')
parser.add_argument('--num_workers', type=int, default=1,
                    help='The number of multiprocessing workers. Note that rapid queries to the server may get your IP banned.')
# Note: the two options below are parsed but never passed to GNews() in this
# script, so they currently have no effect on the results.
parser.add_argument('--country', type=str, default="United States",
                    help='The country of the news. See https://github.com/ranahaani/GNews#supported-countries.')
parser.add_argument('--language', type=str, default="english",
                    help='The language of the news. See https://github.com/ranahaani/GNews#supported-languages.')
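
# Example invocations (file and URL names below are illustrative, not real):
#   python RetrieveNewsByURL.py --url_list https://example.com/story1 https://example.com/story2
#   python RetrieveNewsByURL.py --url_list_file urls.txt --output_file news.jsonl --num_workers 4
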
def get_article_by_url(url):
    google_news = GNews()
    article = google_news.get_full_article(url)
    if article is not None:
        # Search Google News for the article's title to recover metadata
        # (publisher name, publish date) that get_full_article() does not return.
        news = google_news.get_news(article.title)
        news_dict = {
            "source_url": url,
            "source_name": news[0]['publisher']['title'] if len(news) > 0 else "",
            "title": article.title,
            "publish_time": news[0]['published date'] if len(news) > 0 else "",
            "content": [line for line in article.text.split("\n") if line != ""],
            "img": list(article.images),
        }
    else:
        # The article could not be fetched; emit an empty record so the output
        # stays aligned with the input URL list.
        news_dict = {
            "source_url": url,
            "source_name": "",
            "title": "",
            "publish_time": "",  # unavailable without the article
            "content": [],
            "img": [],
        }
    return news_dict
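
# For reference, a successfully fetched record looks roughly like this
# (all values are illustrative, not real output):
# {
#     "source_url": "https://example.com/some-story",
#     "source_name": "Example News",
#     "title": "Some Story Headline",
#     "publish_time": "Mon, 01 Jan 2024 08:00:00 GMT",
#     "content": ["First paragraph...", "Second paragraph..."],
#     "img": ["https://example.com/lead-image.jpg"]
# }
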
def main():
    # Parse arguments
    args = parser.parse_args()
    # Exactly one of url_list / url_list_file must be given.
    assert (args.url_list is not None) or (args.url_list_file is not None), \
        "Error: specify either 'url_list' or 'url_list_file'!"
    assert not ((args.url_list is not None) and (args.url_list_file is not None)), \
        "Error: specify only one of 'url_list' and 'url_list_file'!"
    # Load url_list
    if args.url_list is not None:
        url_list = args.url_list
    else:
        with open(args.url_list_file, "r") as f:
            url_list = [line.strip() for line in f]
    # For showing stats
    stats = {
        "Total News": None,
        "Total News Sources": None,
        "Total News w/ Article": 0,
    }
    # Retrieve articles in parallel. imap yields results in input order as
    # they complete; total= lets tqdm render a proper progress bar.
    with Pool(args.num_workers) as p:
        output_jsonl = list(tqdm(p.imap(get_article_by_url, url_list),
                                 total=len(url_list)))
    # Write one JSON object per line, matching the .jsonl output format.
    with open(args.output_file, "w") as f:
        for news in output_jsonl:
            f.write(json.dumps(news) + "\n")
    # Show stats
    source_set = set()
    for news in output_jsonl:
        if news['content']:
            stats["Total News w/ Article"] += 1
            source_set.add(news['source_name'])
    # Drop the "" placeholder; discard(), unlike remove(), does not raise
    # if the set never contained an empty source name.
    source_set.discard("")
    stats["Total News"] = len(output_jsonl)
    stats["Total News Sources"] = len(source_set)
    for k, v in stats.items():
        print(f"{k}: {v}")
if __name__ == '__main__':
    main()