#!/usr/bin/env python3
"""For sites that don't have RSS, this module pulls links from an
HTML page according to the specification in html_index_links,
which is a string indicating which HTML tag and attributes to look for,
e.g. 'div class="layout-homepage__lite"'
Multiple attributes are allowed.
"""
from datetime import datetime, timedelta
import urllib.request, urllib.parse
import shlex
import ast
from bs4 import BeautifulSoup
import feedparser
import email.utils as email_utils
from types import SimpleNamespace
import sys
# feedme modules
import pageparser
import utils


def parse(feedname, html_index_links, verbose=True):
    """Parse an HTML page and try to extract all the links matching
       the string html_index_links.
       Return a feed object with a structure similar to what
       feedparser returns.
    """
    identifiers = shlex.split(html_index_links)
    tagname = None
    attrs = {}
    for ident in identifiers:
        if '=' in ident:
            # Split only on the first '=' so attribute values
            # may themselves contain '='
            name, val = ident.split('=', 1)
            try:
                val = ast.literal_eval(val)
            except (ValueError, SyntaxError):
                # Not a Python literal: keep it as a plain string
                pass
            attrs[name] = val
        else:
            tagname = ident
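
    # For example, the (illustrative, hypothetical) spec
    #   'div class="story-list" data-zone=top'
    # would parse to tagname = 'div' and
    # attrs = { 'class': 'story-list', 'data-zone': 'top' }.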

    downloader = pageparser.FeedmeURLDownloader(feedname)
    feedurl = utils.g_config.get(feedname, 'url')
    soup = BeautifulSoup(downloader.download_url(feedurl), 'lxml')

    # Relative link URLs will be resolved against feedurl later,
    # using urllib.parse.urljoin.

    # feed needs to be accessible via dot notation as feed.feed
    # to match expectations based on feedparser.
    feedret = SimpleNamespace(
        encoding='utf-8',
        feed=SimpleNamespace(title=feedname),
        entries=[]
    )

    # In feed.entries, feedme uses: link, links, id, title, author,
    # published, and summary|content|description.
    # Generally it uses them like: if 'link' in entry: link = entry.link
    # so the object has to handle both "in" and dot notation.
    #
    # What is the difference between item["link"] and item["links"]?
    # link is a single URL;
    # links is a list of dictionaries with 'href',
    # 'rel' (='alternate') and 'type' (='text/html').
    # item.pubDate is a unicode string, supposed to be in the format
    # Thu, 11 Aug 2016 14:46:50 GMT (or +0000),
    # but feedme actually uses published:
    #   pub_date = time.mktime(email_utils.parsedate(item.published))
    # feedparser has added published_parsed, which is a time.struct_time,
    # but feedme doesn't count on that. Probably best to provide all three.
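    # So a finished entry from this module looks roughly like this
    # (values are illustrative):
    #   { 'id': 'https://example.com/story',
    #     'link': 'https://example.com/story',
    #     'links': [ { 'href': 'https://example.com/story',
    #                  'rel': 'alternate', 'type': 'text/html' } ],
    #     'title': 'Story title',
    #     'summary': SimpleNamespace(value='Story title'),
    #     'published': 'Sun, 11 Feb 2024 20:34:41 GMT',
    #     'published_parsed': time.struct_time(...) }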

    # I haven't found a BeautifulSoup find syntax that lets the tag name
    # be optional.
    if tagname:
        finder = soup.find_all(tagname, attrs=attrs)
    else:
        finder = soup.find_all(attrs=attrs)

    for container in finder:
        for link in container.find_all('a', href=True):
            linktext = link.text.strip()
            linkhref = link.get('href')
            urlparts = urllib.parse.urlparse(linkhref)
            if not urlparts.scheme:
                # Relative URL: resolve it against the feed's URL
                linkhref = urllib.parse.urljoin(feedurl, linkhref)

            # IMPORTANT: With FeedParserDict, you can set either
            # fpd.id = 123 or fpd['id'] = 123, but YOU SHOULD ONLY USE
            # THE SECOND FORM. If you use dot notation to set an
            # attribute, that attribute will NOT create a corresponding
            # dict-style attribute, so 'id' in fpd will still be False.
            thisentry = feedparser.util.FeedParserDict()
            thisentry['id'] = linkhref
            thisentry['link'] = linkhref
            # links is a list of dictionaries, as documented above
            thisentry['links'] = [ { 'href': linkhref,
                                     'rel': 'alternate',
                                     'type': 'text/html' } ]
            thisentry['title'] = linktext

            # summary has to be some sort of object where summary.value
            # is the linktext, so feedme can look at entry.summary.value
            thisentry['summary'] = SimpleNamespace(value=linktext)

            # Try to get the last-modified date. Some websites have
            # last-modified, CNN has X-Last-Modified; possibly this
            # list will have to grow.
            lastmodheaders = [ 'last-modified', 'X-Last-Modified' ]
            lastmodstr = None
            lastmod = None
            try:
                with urllib.request.urlopen(linkhref, timeout=30) as conn:
                    for lmh in lastmodheaders:
                        lastmodstr = conn.headers.get(lmh)
                        if lastmodstr:
                            # print("Last mod string", lastmodstr,
                            #       "for", linkhref)
                            break
            except OSError as e:
                # Covers urllib.error.URLError and socket errors;
                # fall through with no last-modified string.
                print("htmlindex: couldn't fetch %s: %s" % (linkhref, e),
                      file=sys.stderr)

            if lastmodstr:
                # Sites mostly seem to use RFC 2822 dates, e.g.
                # 'Sun, 11 Feb 2024 20:34:41 GMT'.
                # email.utils copes with the timezone suffix,
                # which datetime.strptime's %Z doesn't handle reliably.
                try:
                    lastmod = email_utils.parsedate_to_datetime(lastmodstr)
                except (TypeError, ValueError) as e:
                    print("htmlindex: couldn't parse '%s': %s"
                          % (lastmodstr, e), file=sys.stderr)
            if not lastmod:
                # Set it to a bit under a week ago
                lastmod = datetime.now() - timedelta(days=5)
            if not lastmodstr:
                lastmodstr = email_utils.format_datetime(lastmod)

            # Now both lastmod and lastmodstr are set.
            thisentry['published'] = lastmodstr
            thisentry['published_parsed'] = lastmod.timetuple()

            feedret.entries.append(thisentry)

    return feedret


if __name__ == '__main__':
    utils.read_config_file()

    from pprint import pprint
    pprint(parse('CNN', 'li class="card--lite"'))