#!/usr/bin/env python3

# URL parser for feedme, http://shallowsky.com/software/feedme/
# Copyright 2011-2017 by Akkana Peck.
# Share and enjoy under the GPL v2 or later.

from __future__ import print_function

import os, sys
import urllib.request, urllib.error, urllib.parse
import re

import lxml.html
from bs4 import BeautifulSoup

from http.cookiejar import CookieJar

import io
import gzip

import utils
import traceback

import imagecache

# Use XDG for the config and cache directories if it's available
try:
    import xdg.BaseDirectory
except ImportError:
    pass


class NoContentError(Exception):
    pass


SKIP_NODE_PAT = r'''\s*([a-zA-Z]+)\s+(?:([a-zA-Z]+)\s*=\s*['"](.*)['"])?'''
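
# A quick illustration (not part of the original file) of what
# SKIP_NODE_PAT captures: the groups are node name, attribute name and
# attribute value, so a config line like
#     div class="sticky-box"
# yields
#     re.match(SKIP_NODE_PAT, 'div class="sticky-box"').groups()
#     # -> ('div', 'class', 'sticky-box')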


class CookieError(Exception):
    def __init__(self, message, longmessage):
        """message is a one-line summary.
           longmessage is the traceback.format_exc() stack trace.
        """
        self.message = message
        self.longmessage = longmessage

        # It would be nice to be able to pass the stack trace
        # in a way that could be examined (e.g. print only the
        # last message), but traceback.extract_stack() doesn't
        # survive being passed through another exception handler;
        # it would probably require making a deep copy of it.
        # For now, just pass traceback.format_exc().


class FeedmeURLDownloader(object):
    """An object that can download stories while retaining
       information about a feed, such as feed name, user_agent,
       encoding, cookie file and other config values.
       If there are no network errors but the content is empty after
       any substitutions (or just empty to begin with), raises
       NoContentError.
    """

    def __init__(self, feedname, verbose=False):
        self.feedname = feedname
        self.user_agent = utils.VersionString
        self.encoding = None
        self.cookiejar = None
        self.verbose = verbose
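
    # A minimal usage sketch (an assumption, not part of the original
    # file; it presumes the caller has already loaded utils.g_config):
    #     downloader = FeedmeURLDownloader("Some Feed", verbose=True)
    #     page = downloader.download_url("http://example.com/story.html")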

    def download_url(self, url, referrer=None, user_agent=None):
        """Download a URL (likely http or RSS) from the web and return its
           contents as a str. Allow for possible vagaries like cookies,
           redirection, compression etc.
        """
        if not user_agent:
            user_agent = utils.VersionString

        if url.startswith("file://"):
            # In file:, allow for relative filenames even though that's not
            # part of the real file:// spec, to make testing a little easier.
            filename = url[7:]
            with open(filename) as fp:
                return fp.read()

        request = urllib.request.Request(url)

        # If we're after the single-page URL, we may need a referrer
        if referrer:
            request.add_header('Referer', referrer)

        request.add_header('User-Agent', user_agent)

        # A few sites, like http://nymag.com, gzip their http.
        # urllib doesn't handle that automatically: we have to ask for it.
        # But some other sites, like the LA Monitor, return bad content
        # if you ask for gzip. (This header has to be added before the
        # request is opened, or it has no effect.)
        if utils.g_config.getboolean(self.feedname, 'allow_gzip'):
            request.add_header('Accept-encoding', 'gzip')

        if self.verbose:
            print("download_url", url, "referrer=", referrer,
                  "user_agent", user_agent, file=sys.stderr)

        if not self.cookiejar:
            # Create the cookiejar once per site; it will be reused
            # for all stories fetched from the site, but it won't be
            # saved for subsequent days.
            self.cookiejar = None
            cookiefile = utils.g_config.get(self.feedname, "cookiefile",
                                            fallback=None)
            if cookiefile:
                try:
                    cookiefile = os.path.expanduser(cookiefile)
                    # If a cookiefile was specified, use those cookies.
                    self.cookiejar = get_firefox_cookie_jar(cookiefile)
                except Exception:
                    errmsg = "Couldn't get cookies from file %s" % cookiefile
                    print(errmsg, file=sys.stderr)
                    raise CookieError(errmsg,
                                      traceback.format_exc()) from None

            if not self.cookiejar:
                # Allow for cookies in the request even if no cookiejar was
                # specified. Some sites, notably nytimes.com, degrade to
                # an infinite redirect loop if cookies aren't enabled.
                self.cookiejar = CookieJar()

        opener = urllib.request.build_opener(
            urllib.request.HTTPCookieProcessor(self.cookiejar))
        response = opener.open(request, timeout=20)
        # Lots of ways this can fail.
        # e.g. ValueError, "unknown url type"
        # or BadStatusLine: ''

        # At this point it would be lovely to check whether the
        # mime type is HTML or RSS. Unfortunately, all we have is an
        # http.client.HTTPMessage instance which is almost completely
        # undocumented (see http://bugs.python.org/issue3428).

        # It's not documented, but sometimes after urlopen
        # we can actually get a content type. If it's not
        # text/something, that's bad.
        ctype = response.headers['content-type']
        if ctype and ctype != '' and not ctype.startswith("text") \
           and not ctype.startswith("application/rss") \
           and not ctype.startswith("application/xml") \
           and not ctype.startswith("application/x-rss+xml") \
           and not ctype.startswith("application/atom+xml"):
            if self.verbose:
                print(url, "isn't text -- content-type was",
                      ctype, ". Skipping.", file=sys.stderr)
            response.close()
            # Used to raise a RuntimeError here -- but then the feed
            # (especially Xtra) ends up empty with no indication why.
            # Instead, return a simple string explaining the problem.
            print("Contents not text (%s)! %s" % (ctype, url),
                  file=sys.stderr)
            return '<p>Contents not text! (%s) <a href="%s">%s</a></p>' \
                % (ctype, url, url)

        # Were we redirected? geturl() will tell us that.
        self.cur_url = response.geturl()

        # but sadly, that means we need another request object
        # to parse out the host and prefix:
        real_request = urllib.request.Request(self.cur_url)
        real_request.add_header('User-Agent', user_agent)

        # feed() is going to need to know the host, to rewrite urls.
        # So save host and prefix based on any redirects we've had:
        # the page parser will need them.
        self.host = real_request.host
        self.prefix = real_request.type + '://' + self.host + '/'

        # urllib unfortunately doesn't decode to unicode for us,
        # so try to figure out the current encoding:
        if not self.encoding:
            if self.verbose:
                print("download_url: self.encoding not set, "
                      "getting it from headers", file=sys.stderr)
            self.encoding = response.headers.get_content_charset()
            enctype = response.headers['content-type'].split('charset=')
            # If there are multiple values, the encoding should be
            # the last one
            if len(enctype) > 1:
                self.encoding = enctype[-1]
            else:
                if self.verbose:
                    print("No enctype; defaulting to utf-8",
                          file=sys.stderr)
                self.encoding = 'utf-8'

        # theoatmeal sets this to 'ISO-8859-1; filename=feed.xml'
        if ';' in self.encoding:
            self.encoding = self.encoding.split(';')[0]

        if self.verbose:
            print("final encoding is", self.encoding, file=sys.stderr)

        # Is the content gzipped? If so, we'll need to uncompress it.
        is_gzip = response.info().get('Content-Encoding') == 'gzip'

        # Read the content of the link:
        # This can die with socket.error, "connection reset by peer",
        # and it may not set contents, so initialize it first:
        contents = None
        try:
            contents = response.read()
        # XXX Need to guard against IncompleteRead -- in Python 3
        # that would be http.client.IncompleteRead:
        # except http.client.IncompleteRead as e:
        #     print("Ignoring IncompleteRead on", url, file=sys.stderr)
        except Exception:
            print("Unknown error from response.read()", url,
                  file=sys.stderr)

        # contents can be empty here.
        # If so, no point in doing anything else.
        if not contents:
            if self.verbose:
                print("Didn't read anything from response.read()",
                      file=sys.stderr)
            response.close()
            raise NoContentError("Empty response.read()")

        if is_gzip:
            buf = io.BytesIO(contents)
            f = gzip.GzipFile(fileobj=buf)
            contents = f.read()

        # No docs say I should close this. I can only assume.
        response.close()

        # response.read() returns bytes. Convert to str as soon as possible
        # so the rest of the program can work with str.
        # But this sometimes fails with:
        # UnicodeDecodeError: 'utf-8' codec can't decode bytes in
        # position nnn-nnn: invalid continuation byte
        try:
            return contents.decode(encoding=self.encoding)
        except UnicodeDecodeError:
            print("UnicodeDecodeError on", self.cur_url, file=sys.stderr)
            return contents.decode(encoding=self.encoding,
                                   errors="backslashreplace")
        except Exception as e:
            s = "Unknown error trying to decode %s: %s" % (self.cur_url, e)
            print(s, file=sys.stderr)
            return s


class FeedmeHTMLParser(FeedmeURLDownloader):

    def __init__(self, feedname):
        super(FeedmeHTMLParser, self).__init__(feedname)

        self.outfile = None
        self.skipping = None
        self.base_href = None
        self.multipages = []
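
    # A minimal usage sketch (an assumption, not part of the original
    # file; the feed name, URL and paths are made up for illustration):
    #     parser = FeedmeHTMLParser("Some Feed")
    #     parser.fetch_url("http://example.com/story.html",
    #                      "/tmp/feeds/somefeed", "story1.html",
    #                      title="A Story")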

    def fetch_url(self, url, newdir, newname, title=None, author=None,
                  html=None,
                  footer='', referrer=None, user_agent=None,
                  sub_page=False):
        """Read a URL from the web. Parse it, rewriting any links,
           downloading any images and making any other changes needed
           according to the config file and current feed name.
           If the optional argument html contains a string,
           skip the downloading and use the html provided.
           Write the modified HTML output to newdir/newname
           (unless newname is None, in which case just return the html)
           and download any images into newdir.
           If sub_page is true, append to an existing file
           rather than replacing it.
           Raises NoContentError if it can't get the page or skipped it.
        """
        self.verbose = utils.g_config.getboolean(self.feedname, 'verbose')
        if self.verbose:
            if html:
                print("Parsing html from index, corresponding to", url,
                      "to", newdir + "/" + newname, file=sys.stderr)
            elif newname:
                print("Fetching link", url,
                      "to", newdir + "/" + newname,
                      "sub_page=", sub_page,
                      file=sys.stderr)
            else:
                print("Parsing html from", url, "with dir", newdir,
                      file=sys.stderr)

        self.newdir = newdir
        self.newname = newname
        self.cururl = url
        if type(title) is not str:
            title = str(title)
        if type(author) is not str:
            author = str(author)

        # If no base href has been set yet, set it here based on
        # the first URL fetched from RSS.
        if not self.base_href:
            urlparts = urllib.parse.urlparse(url)
            urlparts = urlparts._replace(path='/')
            self.base_href = urllib.parse.urlunparse(urlparts)
            if self.verbose:
                print("On first fetched URL, set base_href to",
                      self.base_href, file=sys.stderr)

        # A flag to indicate when we're skipping everything --
        # e.g. inside <script> tags.
        self.skipping = None

        # Do we need to do any substitution on the URL first?
        urlsub = utils.g_config.get_multiline(self.feedname,
                                              'url_substitute')
        if urlsub:
            if self.verbose:
                print("Multiline: Substituting", urlsub[0],
                      "to", urlsub[1], file=sys.stderr)
                print("Rewriting:", url, file=sys.stderr)
            url = re.sub(urlsub[0], urlsub[1], url)
            if self.verbose:
                print("Became: ", url, file=sys.stderr)

        self.encoding = utils.g_config.get(self.feedname, 'encoding')
        if not self.encoding:
            self.encoding = "utf-8"

        if not html:
            html = self.download_url(url, referrer, user_agent)

        # Does it contain any of skip_content_pats anywhere? If so, bail.
        skip_content_pats = utils.g_config.get_multiline(
            self.feedname, 'skip_content_pats')
        for pat in skip_content_pats:
            if re.search(pat, html):
                raise NoContentError("Skipping, skip_content_pats " + pat)

        if self.newname:
            outfilename = os.path.join(self.newdir, self.newname)
            if sub_page:
                self.outfile = open(outfilename, "a",
                                    encoding=self.encoding)
            else:
                # XXX Open outfile with the right encoding -- which seems to
                # be a no-op, as we'll still get
                # "UnicodeEncodeError: 'ascii' codec can't encode character"
                # unless we explicitly encode everything with fallbacks.
                # So much for python3 being easier to deal with for unicode.
                self.outfile = open(outfilename, "w",
                                    encoding=self.encoding)
                self.outfile.write("""<html>\n<head>
<meta http-equiv="Content-Type" content="text/html; charset=%s">
<meta name="viewport" content="width=device-width, initial-scale=1">
<link rel="stylesheet" type="text/css" title="Feeds" href="../../feeds.css"/>
<title>%s</title>
</head>
""" % (self.encoding, title))
        else:
            outfilename = None
            self.outfile = io.StringIO()

        #
        # First, some operations on the HTML source,
        # like regexps that match patterns in the source.
        # The edited source will later be parsed by BeautifulSoup.
        #

        if author:
            # Insert a byline just inside the <body> tag, if there is one.
            body = html.find('<body>')
            if body >= 0:
                insert_at = body + len('<body>')
                html = html[:insert_at] + "By: %s\n<p>\n" % author \
                    + html[insert_at:]
        # Throw out everything before the first page_start re pattern seen,
        # and after the first page_end pattern seen.
        page_starts = utils.g_config.get_multiline(self.feedname,
                                                   'page_start')
        page_ends = utils.g_config.get_multiline(self.feedname, 'page_end')

        if len(page_starts) > 0:
            for page_start in page_starts:
                if self.verbose:
                    print("looking for page_start", page_start,
                          file=sys.stderr)
                # The flags must go into re.compile: passing re.IGNORECASE
                # as the second argument of search() would be misread
                # as a string position.
                start_re = re.compile(page_start,
                                      flags=re.DOTALL | re.IGNORECASE)
                match = start_re.search(html)
                if match:
                    if self.verbose:
                        print("Found page_start regexp", page_start,
                              file=sys.stderr)
                    html = html[match.end():]
                    break

        if len(page_ends) > 0:
            for page_end in page_ends:
                if self.verbose:
                    print("looking for page_end", page_end,
                          file=sys.stderr)
                end_re = re.compile(page_end,
                                    flags=re.DOTALL | re.IGNORECASE)
                match = end_re.search(html)
                if match:
                    if self.verbose:
                        print("Found page_end regexp", page_end,
                              file=sys.stderr)
                    html = html[0:match.start()]
                    break

        # Skip anything matching any of the skip_pats.
        # This is an earlier, regex-based version of skip_nodes.
        # Most sites should use skip_nodes, but there may be some
        # sites where skip_pats works better.
        skip_pats = utils.g_config.get_multiline(self.feedname, 'skip_pats')
        for skip in skip_pats:
            if self.verbose:
                print("Trying to skip '%s'" % skip, file=sys.stderr)
                # print("in", html.encode('utf-8'), file=sys.stderr)
                # sys.stderr.flush()
            # flags=DOTALL doesn't exist in re.sub until Python 2.7,
            # but does in a compiled re expression:
            try:
                regexp = re.compile(skip, flags=re.DOTALL)
            except Exception as e:
                print("Couldn't compile regexp", skip, file=sys.stderr)
                print(str(e), file=sys.stderr)
                continue
            html = regexp.sub('', html)

        # print("After all skip_pats, html is:", file=sys.stderr)
        # print(html.encode(self.encoding, 'replace'), file=sys.stderr)
        self.single_page_url = None

        # Keep a record of whether we've seen any content:
        self.wrote_data = False

        # Delete any skip_nodes
        skip_nodespecs = utils.g_config.get_multiline(self.feedname,
                                                      'skip_nodes')
        if skip_nodespecs:
            soup = BeautifulSoup(html, "lxml")
            changed = False
            for nodespec in skip_nodespecs:
                print("looking for skip_node", nodespec, file=sys.stderr)
                # Syntax is something like: div class="sticky-box"
                # The first word is the node type,
                # which may be followed by someattr="somename"
                try:
                    nodename, attrname, attrval = \
                        re.match(SKIP_NODE_PAT, nodespec).groups()
                    print((f"nodename '{nodename}', "
                           f"attrname '{attrname}', "
                           f"attrval='{attrval}'"), file=sys.stderr)
                    if attrname and attrval is not None:
                        # attrval is a regexp, which BeautifulSoup won't
                        # notice unless it's already compiled.
                        attrs = { attrname: re.compile(attrval) }
                    else:
                        # A bare node name: match every node of that type.
                        attrs = {}
                    for node in soup.find_all(nodename, attrs=attrs):
                        print(" found a node", file=sys.stderr)
                        node.decompose()
                        changed = True
                except Exception as e:
                    print("Problem finding SKIP_NODE_PAT '%s': %s"
                          % (nodespec, e), file=sys.stderr)
                    utils.ptraceback()
                    continue
            if changed:
                print("Changed nodes in the HTML: rewriting",
                      file=sys.stderr)
                html = str(soup)
        # Iterate through the HTML, making any necessary simplifications:
        try:
            self.handle_html(html, title, footer)
        except Exception as e:
            if self.verbose:
                print("error in handle_html:", e, file=sys.stderr)
                traceback.print_exc(file=sys.stderr)
                # traceback.print_stack(limit=6, file=sys.stderr)

            # We're in trouble here, but try to write some indication
            # of the error to the outfile.
            try:
                print("error in handle_html:", e, file=self.outfile)
                self.wrote_data = True
                self.outfile.close()
            except Exception as e:
                print("Couldn't save handle_html error in output file"
                      " because:", e,
                      file=sys.stderr)

        # handle_html() should have closed the file, but if it bombed out
        # early it might not have.
        try:
            self.outfile.close()
        except:
            pass

        # Did we write anything real, any real content?
        # XXX Currently this requires text; might want to add img tags.
        if not self.wrote_data:
            errstr = "No real content"
            print(errstr, file=sys.stderr)
            if outfilename:
                if self.verbose:
                    print("No content, removing", outfilename,
                          file=sys.stderr)
                os.remove(outfilename)
            raise NoContentError(errstr)
        # Now we've fetched the normal URL.
        # Did we see a single-page link? If so, move the fetched
        # file aside and call ourselves recursively to try to fetch
        # the single-page.
        if not self.wrote_data:
            # Don't look for single page or multipage if there
            # wasn't a story.
            pass

        elif self.single_page_url and self.single_page_url != url:
            # Call ourself recursively.
            # It should only be possible for this to happen once;
            # when we're called recursively, url will be the single
            # page url so we won't make another recursive call.
            singlefile = outfilename + ".single"
            try:
                if self.verbose:
                    # self.cur_url is the (possibly redirected) URL
                    # that download_url() saved.
                    print("Trying to fetch single-page url with referrer =",
                          self.cur_url, "instead of", url, file=sys.stderr)
                self.fetch_url(self.single_page_url, newdir, singlefile,
                               title=title, footer=footer,
                               referrer=self.cur_url)

                # If the fetch succeeded and we have a single-page file,
                # replace the original file with it
                # and remove the original.
                if os.path.exists(singlefile):
                    # os.rename(outfilename, outfilename + '.1')
                    os.remove(outfilename)
                    os.rename(singlefile, outfilename)
                    if self.verbose:
                        print("Removing", outfilename,
                              "and renaming", singlefile, file=sys.stderr)
                else:
                    print("Tried to fetch single-page file "
                          "but apparently failed", file=sys.stderr)
            except (IOError, urllib.error.HTTPError) as e:
                print("Couldn't read single-page URL",
                      self.single_page_url, file=sys.stderr)
                print(e, file=sys.stderr)

        # Are there multiple pages? Try to fetch them.
        elif self.multipages and not sub_page:
            if self.verbose:
                print("Chasing", len(self.multipages), "extra pages",
                      file=sys.stderr)
            for href in self.multipages:
                try:
                    # href is the link to this page.
                    # Fetch the content, append it to the current file.
                    if self.verbose:
                        print("Recursively fetching next page", href,
                              file=sys.stderr)
                    self.fetch_url(href, newdir, newname,
                                   title=title, author=author,
                                   html=None,
                                   footer=footer, referrer=referrer,
                                   user_agent=user_agent,
                                   sub_page=True)
                except Exception as e:
                    print("Couldn't parse", href, ":", e, file=sys.stderr)
                    continue

        if not outfilename and isinstance(self.outfile, io.StringIO):
            return self.outfile.getvalue()

    def handle_html(self, uhtml, title=None, footer=''):
        """Parse the given unicode as HTML and make all needed substitutions.
           Append the footer if any, write the resulting <body>
           to self.outfile, then close the outfile.
           (The caller has already opened the file and written a header.
           XXX should handle the header here too, for consistency.)
        """
        soup = BeautifulSoup(uhtml, features='lxml')

        # Does the page have an H1 header already? If not, manufacture one.
        if title and not soup.h1:
            h1 = soup.new_tag("h1")
            soup.body.insert(0, h1)
            h1.append(title)
        # Tags to remove, but keep children if any
        for tagname in [
                # Don't want embedded <head> stuff.
                # Unfortunately, skipping the <head> means we miss
                # meta and base. Missing meta is a problem because it
                # means we don't get the charset. XXX But note: we
                # probably won't see the charset anyway, because we'll
                # look for it in the first head, the one we create
                # ourselves, rather than the one that comes from the
                # original page. We really need to merge the minimal
                # information from the page head into the generated one.
                # Meanwhile, these tags may do more harm than good.
                "head",

                # Omit form elements, since it's too easy to land on
                # them accidentally when scrolling and trigger an
                # unwanted Android onscreen keyboard:
                "form", "input", "textarea",

                # font tags are almost always there to impose colors that
                # only work against certain backgrounds
                "font",

                # Omit iframes -- they badly confuse Android's WebView
                # (goBack fails if there's an iframe anywhere in the page:
                # you have to goBack multiple times, I think once for every
                # iframe in the page, and this doesn't seem to be a bug
                # that's getting fixed any time soon).
                # We don't want iframes in simplified HTML anyway.
                "iframe",

                # assorted other unhelpful tags.
                # object is probably flash or video or some such.
                "object",
        ]:
            for t in soup.find_all(tagname):
                t.replace_with_children()

        # Tags to remove entirely along with all children.
        # These must not go in the keep-children list above: if they
        # were unwrapped there first, this loop would never see them.
        for tagname in [
                # disallow scripts
                "script",

                # style tags are often evil MS-Word crap
                "style",

                # The source tag is used to specify alternate forms
                # of media. But the LA Daily Post uses it for images,
                # and many browsers including Android WebView use it
                # to override the img src. So leaving in the source tag
                # may cause images to be fetched from the net rather
                # than from the locally fetched files.
                "source",

                # Skip videos regardless of the skip_images setting,
                # since there's no mechanism to download videos,
                # and including them inline leads to unwanted data charges.
                # Some day maybe this could be a separate pref.
                "video",

                # <link rel="stylesheet" isn't always in the head.
                # Undark puts them at the end of the document but they
                # still apply to the whole document, making text
                # unreadable. We definitely need to remove
                # <link type="text/css", and I don't know of any other
                # legitimate uses for <link>, so just remove them all.
                "link",
        ]:
            for t in soup.find_all(tagname):
                t.decompose()
        # <base> tags can confuse the HTML displayer program
        # into looking remotely for images we've copied locally,
        # so remove them. But it might be useful to save the base.
        for t in soup.find_all("base"):
            if "href" in t.attrs:
                self.base_href = t.attrs["href"]
            t.decompose()

        # Remove img if skipping images
        if utils.g_config.getboolean(self.feedname, 'skip_images'):
            for tagname in [ "img", "svg", "figure" ]:
                for t in soup.find_all(tagname):
                    t.decompose()

        # Embedded body tags often have unfortunate color settings.
        # Embedded <html> tags don't seem to do any harm, but seem wrong.
        # Keep only the first of each.
        for tagname in [ "html", "body" ]:
            for i, t in enumerate(soup.find_all(tagname)):
                if i > 0:
                    t.replace_with_children()

        # meta refreshes won't work when we're offline, but we
        # might want to display them to give the user the option.
        # <meta http-equiv="Refresh" content="0; URL=http://blah"></meta>
        # meta charset is the other meta tag we care about.
        # Meta tags have to be examined here, before being removed:
        # if they were stripped with the other head tags above, we'd
        # never see the charset or the refresh. Decompose each one
        # once we've looked at it, since no meta tag belongs in the
        # simplified output.
        for meta in soup.find_all("meta"):
            if 'charset' in meta.attrs and meta.attrs['charset']:
                self.encoding = meta.attrs['charset']
            if 'http-equiv' in meta.attrs and \
               meta.attrs['http-equiv'].lower() == 'refresh':
                self.outfile.write("Meta refresh suppressed.<br />")
                if 'content' in meta.attrs:
                    content = meta.attrs['content'].split(';')
                    if len(content) > 1:
                        href = content[1].strip()
                    else:
                        href = content[0].strip()
                    # XXX Next comparison might be better done with re,
                    # in case of spaces around the =.
                    if href.upper().startswith('URL='):
                        href = href[4:]
                    self.outfile.write('<a href="' + href + '">'
                                       + href + '</a>')

                    # Also set the refresh target as the single_page_url.
                    # Maybe we can actually get it here.
                    if not self.single_page_url:
                        self.single_page_url = \
                            imagecache.make_absolute(href, self.base_href)
                        if self.verbose:
                            print("\nTrying meta refresh as "
                                  "single-page pat:",
                                  self.single_page_url, file=sys.stderr)
            meta.decompose()
        if utils.g_config.getboolean(self.feedname, 'skip_links'):
            for t in soup.find_all("a"):
                t.replace_with_children()

        # Look for a tags matching the single-page pattern,
        # if we're not already following one.
        if not self.single_page_url:
            single_page_pats = utils.g_config.get_multiline(
                self.feedname, 'single_page_pats')
            for single_page_pat in single_page_pats:
                # The pattern must be compiled: BeautifulSoup treats
                # a plain string as a literal href to match.
                singlepage = soup.find("a",
                                       href=re.compile(single_page_pat))
                if singlepage:
                    self.single_page_url = imagecache.make_absolute(
                        singlepage.get('href'), self.base_href)
                    if self.verbose:
                        print("\nFound single-page pattern:",
                              self.single_page_url, file=sys.stderr)
                    # But continue fetching the regular pattern,
                    # since the single-page one may fail
                    break

        # Try to make links absolute.
        for t in soup.find_all("a", href=True):
            try:
                abs_href = imagecache.make_absolute(t['href'],
                                                    self.base_href)
                if abs_href != t['href']:
                    t['href'] = abs_href
            except:
                continue

        # Now crawl all tags, removing any style attribute that
        # tries to impose colors:
        for t in soup.find_all(style=True):
            style = t.attrs['style']
            if 'color' in style or 'background' in style:
                del t.attrs["style"]

        # Finally, handle images
        for tagname in [ "img", "svg" ]:
            for t in soup.find_all(tagname):
                try:
                    imagecache.process_img_tag(t, self.feedname,
                                               self.base_href, self.newdir)
                except Exception as e:
                    print("Error handling image tag", t, ":", e,
                          file=sys.stderr)

        # Find out whether we'll need to look for subsequent pages.
        multipage_pat = utils.g_config.get(self.feedname, "multipage_pat",
                                           fallback=None)
        if multipage_pat:
            links = soup.find_all('a', href=re.compile(multipage_pat))
            self.multipages = [ a.attrs['href'] for a in links ]
            # Eliminate duplicates, preserving order
            self.multipages = list(dict.fromkeys(self.multipages))
            if self.verbose:
                print("Found multipage links:", file=sys.stderr)
                for l in self.multipages:
                    print("  ", l, file=sys.stderr)
        else:
            self.multipages = None
        # Done with processing! Write the soup's body to self.outfile.
        pretty = soup.body.prettify()
        if pretty:
            if footer:
                # pretty already ends with </body>, so find the last
                # occurrence of </body> and prepend the footer.
                # Anything after the last </body> will be lost,
                # but there shouldn't be anything there.
                spl = pretty.rsplit('</body>', 1)
                pretty = spl[0] + footer + '\n</body>\n</html>\n'
            self.outfile.write(pretty)
            self.wrote_data = True
        else:
            print("Empty body! Not writing", file=sys.stderr)


def simplify_html(inhtml):
    """Simplify the HTML used for the blurbs in each site's toplevel
       page, on sites that put the whole article in the RSS and so
       need to be truncated. Currently this just strips style
       attributes; despite the name, it doesn't (yet) reduce the
       HTML to only a few tags.
    """
    soup = BeautifulSoup(inhtml, "lxml")
    for tag in soup.body.find_all():
        if "style" in tag.attrs:
            del tag.attrs["style"]
    return soup.prettify()
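
# For example (illustrative input, not part of the original file),
#     simplify_html('<p style="color: white">Hello</p>')
# returns the same paragraph with the style attribute removed.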


#
# Adapted from:
# https://stackoverflow.com/a/33078599
# Author: Noah Fontes nfontes AT cynigram DOT com
# License: MIT
# Original:
# http://blog.mithis.net/archives/python/90-firefox3-cookies-in-python
# Ported to Python 3 by Dotan Cohen
#
def get_firefox_cookie_jar(filename):
    """Create a CookieJar based on a Firefox cookies.sqlite."""

    import sqlite3
    from http.cookiejar import MozillaCookieJar

    con = sqlite3.connect(filename)
    cur = con.cursor()
    cur.execute("SELECT host, path, isSecure, expiry, name, value "
                "FROM moz_cookies")

    ftstr = ["FALSE", "TRUE"]

    s = io.StringIO()
    s.write("""\
# Netscape HTTP Cookie File
# http://www.netscape.com/newsref/std/cookie_spec.html
# This is a generated file! Do not edit.
""")
    for item in cur.fetchall():
        s.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (
            item[0], ftstr[item[0].startswith('.')], item[1],
            ftstr[item[2]], item[3], item[4], item[5]))

    s.seek(0)
    cookie_jar = MozillaCookieJar()
    # _really_load is a private API, but there's no public way to load
    # cookies from an in-memory file object.
    cookie_jar._really_load(s, '', True, True)

    return cookie_jar
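
# A minimal usage sketch (an assumption, not part of the original file;
# real Firefox profile directory names vary):
#     jar = get_firefox_cookie_jar(os.path.expanduser(
#         "~/.mozilla/firefox/xxxxxxxx.default/cookies.sqlite"))
#     opener = urllib.request.build_opener(
#         urllib.request.HTTPCookieProcessor(jar))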