news_model.py
#!/usr/bin/env python3
""" NewsModel
This module contains logic to update a RandomWriter object from news headlines
pulled from the internet.
"""
import argparse
from datetime import datetime
from html.parser import HTMLParser
import os
import pickle
import re
from urllib.parse import urlparse

import psycopg2
import requests

import config_reader
import randomwriter
from randomwriter import RandomWriter

class NewsHTMLParser(HTMLParser):
    """ Parses HTML from a news website, pulling out headline text """
    def __init__(self, save_headline, is_headline_tag):
        # cur_count tracks how deeply we are nested inside a headline tag;
        # zero means we are not currently inside a headline
        self.cur_count = 0
        self.save_headline = save_headline
        self.is_headline_tag = is_headline_tag
        self.headline_count = 0
        self.cur_headline = ""
        super().__init__()

    def handle_starttag(self, tag, attrs):
        if self.cur_count:
            self.cur_count += 1
        elif self.is_headline_tag(tag, attrs):
            self.cur_count = 1
            self.cur_headline = ""

    def handle_endtag(self, tag):
        if self.cur_count:
            self.cur_count -= 1
            if not self.cur_count:
                # closing the outermost headline tag: collapse whitespace and save
                if self.save_headline(' '.join(self.cur_headline.split())):
                    self.headline_count += 1

    def handle_data(self, data):
        if self.cur_count:
            self.cur_headline += data

    @classmethod
    def identify_headline_class(cls, name):
        """ Return a headline tag identifier based on the class attribute """
        def is_headline_tag(tag, attrs):
            """ Identify a tag as one containing a headline by the class attribute """
            return ('class', name) in attrs
        return is_headline_tag
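

# Illustrative sketch (not part of the original module): one way NewsHTMLParser
# could be exercised on its own, assuming the target page marks headlines with
# class="headline". The helper name and the class name are hypothetical.
def _example_collect_headlines(html_text, class_name="headline"):
    """ Collect headline strings from raw HTML using NewsHTMLParser (example only). """
    collected = []

    def save_headline(headline):
        # keep any non-empty headline and report success back to the parser
        if headline:
            collected.append(headline)
            return True
        return False

    parser = NewsHTMLParser(save_headline,
                            NewsHTMLParser.identify_headline_class(class_name))
    parser.feed(html_text)
    return collected
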
class NewsModel(object):
    """ This class contains logic to access the PostgreSQL database on news models """
    def __init__(self):
        # cache of url -> (headlines, count) so each site is fetched only once,
        # even when several models are updated in one run
        self.urlmap = {}

    def update_model(self, writer, url, is_headline_tag, old_headlines,
                     new_headlines, blacklist=None):
        """ Update a single model with the URL and other parameters """
        if not hasattr(self, 'window'):
            print("please update the news model through news_model.py")
            return
        # we don't make blacklist a set() by default in the signature because the
        # set is mutable and would be instantiated once, when the module is loaded
        if blacklist is None:
            blacklist = set()
        if url in self.urlmap:
            headlines, count = self.urlmap[url]
        else:
            headlines = []
            # matches trailing timestamps such as "5:30 PM ET" on a headline
            endtime_re = re.compile('^.*(?P<time>\\d\\d?:\\d{2} [AP]M \\wT)$')

            def save_headline(headline):
                """ Filter out useless headlines and store the useful ones for training later """
                if "ttp://" in headline:
                    return False
                endtime_match = endtime_re.match(headline)
                if endtime_match is not None:  # for New York Times headlines ending in the time
                    headline = headline[:len(headline) - len(endtime_match.group('time'))].strip()
                ret = headline not in blacklist and headline.strip()
                if ret:
                    headlines.append(headline)
                    new_headlines.add(headline)
                return ret

            html_parser = NewsHTMLParser(save_headline, is_headline_tag)
            html_page = requests.get(url)
            html_parser.feed(html_page.text)
            count = html_parser.headline_count
            self.urlmap[url] = (headlines, count)
        for headline in headlines:
            writer.train(headline)
        for headline in old_headlines:
            writer.untrain(headline)
        return count
    @staticmethod
    def get_db_conn():
        """ Open a connection with the PostgreSQL database """
        # DATABASE_URL is expected in the usual form
        # postgres://user:password@host:port/dbname; the leading '/' is
        # stripped from the path to get the database name
        url = urlparse(os.environ['DATABASE_URL'])
        return psycopg2.connect(
            database=url.path[1:],
            user=url.username,
            password=url.password,
            host=url.hostname,
            port=url.port
        )
    def get_news_model(self, name, level=5, strategy=randomwriter.CharacterStrategy):
        """ Retrieve a news model from the database """
        with self.get_db_conn() as conn:
            with conn.cursor() as cur:
                # check if our table exists
                cur.execute("SELECT EXISTS(SELECT * FROM information_schema.tables"
                            + " where table_name=%s);", ('models',))
                if not cur.fetchone()[0]:
                    # our table does not exist
                    cur.execute("CREATE TABLE models (id serial PRIMARY KEY,"
                                + " name text NOT NULL UNIQUE, pickle bytea NOT NULL);")
                # get the pickle from the database
                cur.execute("SELECT pickle FROM models WHERE name=%s;", (name,))
                res = cur.fetchone()
                if res:
                    res = res[0].tobytes()
                    random_writer = RandomWriter.unpickle(res)
                else:
                    random_writer = RandomWriter(level, strategy)
                    pkl = pickle.dumps(random_writer)
                    cur.execute("INSERT INTO models (name, pickle) VALUES (%s, %s);", (name, pkl))
                return random_writer
    def delete_news_model(self, name):
        """ Delete a news model from the PostgreSQL database """
        with self.get_db_conn() as conn:
            with conn.cursor() as cur:
                cur.execute("DELETE FROM models WHERE name=%s;", (name,))

    def delete_headlines(self):
        """ Clear the headlines table """
        with self.get_db_conn() as conn:
            with conn.cursor() as cur:
                cur.execute("TRUNCATE headlines;")
    def update_news_models(self, models, conf=None):
        """ Update a list of models with headlines pulled from the websites in the config """
        if not hasattr(self, 'window'):
            print("please update the news model through news_model.py")
            return
        if conf is None:
            conf = config_reader.read_configs()
        news_sites = conf['NEWS']
        with self.get_db_conn() as conn:
            with conn.cursor() as cur:
                # get table of already used headlines
                cur.execute("SELECT EXISTS(SELECT * FROM information_schema.tables"
                            + " where table_name=%s);", ('headlines',))
                if not cur.fetchone()[0]:
                    # our table does not exist
                    cur.execute("CREATE TABLE headlines (id serial PRIMARY KEY,"
                                + " headline text NOT NULL UNIQUE,"
                                + " date_added date);")
                cur.execute("SELECT headline FROM headlines;")
                blacklist = set(res[0] for res in cur)
                # headlines older than the configured window get untrained below
                cur.execute("SELECT headline FROM headlines"
                            + " WHERE date_added < now() - interval '%s days';",
                            (self.window,))
                old_headlines = set(res[0] for res in cur)
        if isinstance(models, str):
            models = {models: None}
        output = {}
        new_headlines = set()
        for name, settings in models.items():
            if settings is None:
                model = self.get_news_model(name)
            else:
                model = self.get_news_model(name, settings[0], settings[1])
            for url, class_name in news_sites.items():
                # counts should never differ between models, so just save the latest one
                output[url] = self.update_model(model, "http://" + url,
                                                NewsHTMLParser.identify_headline_class(class_name),
                                                old_headlines, new_headlines, blacklist)
            with self.get_db_conn() as conn:
                with conn.cursor() as cur:
                    cur.execute("UPDATE models SET pickle=%s WHERE name=%s",
                                (pickle.dumps(model), name))
        for url, count in output.items():
            print(url, count)
        # clear out old headlines and add new ones
        with self.get_db_conn() as conn:
            with conn.cursor() as cur:
                cur.execute("DELETE FROM headlines"
                            + " WHERE date_added < now() - interval '%s days';",
                            (self.window,))
                for headline in new_headlines:
                    cur.execute("INSERT INTO headlines (headline, date_added) VALUES (%s, %s);",
                                (headline, datetime.now()))
                cur.execute("SELECT COUNT(headline) FROM headlines;")
                num_headlines = next(cur)[0]
                remove_headlines = num_headlines - int(conf['DB']['max_headlines'])
                if remove_headlines > 0:
                    # PostgreSQL DELETE has no LIMIT clause, so the limit lives in the subquery
                    cur.execute("DELETE FROM headlines WHERE ctid IN"
                                + " (SELECT ctid FROM headlines ORDER BY date_added, id LIMIT %s);",
                                (remove_headlines,))
                    print("Deleting {} headlines due to too many rows".format(remove_headlines))
def __main():
    config = config_reader.read_configs()
    models = {}
    for model_name in config['MODEL_WEIGHTS']:
        if model_name in config['CHARACTER_MODELS']:
            models[model_name] = (int(config['CHARACTER_MODELS'][model_name]),
                                  randomwriter.CharacterStrategy)
        elif model_name in config['WORD_MODELS']:
            models[model_name] = (int(config['WORD_MODELS'][model_name]), randomwriter.WordStrategy)
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers(dest='cmd')
    update_parser = subparsers.add_parser('update')
    update_parser.add_argument('-o', '--omit', default=[], nargs='+', choices=list(models.keys()))
    delete_parser = subparsers.add_parser('delete')
    # no arguments means delete headlines only
    delete_parser.add_argument('models', nargs='*', choices=list(models.keys()) + [[]])
    parser.set_defaults(cmd='update', omit=[])
    args = parser.parse_args()
    news_model = NewsModel()
    news_model.window = int(config['DB']['window'])
    if args.cmd == 'update':
        for model in args.omit:
            del models[model]
        news_model.update_news_models(models, config)
    elif args.cmd == 'delete':
        if args.models:
            for model in args.models:
                print("deleting model", model)
                news_model.delete_news_model(model)
            print("done")
        else:
            news_model.delete_headlines()


if __name__ == '__main__':
    __main()
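
# Example invocations (illustrative; real model names come from the config file,
# 'some_model' below is hypothetical):
#   python news_model.py update                    # update every configured model
#   python news_model.py update --omit some_model  # skip one model
#   python news_model.py delete some_model         # delete one model's pickle
#   python news_model.py delete                    # no names: clear the headlines table only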