#!/usr/bin/env python3
# feedme: download RSS/Atom feeds and convert to HTML, epub, Plucker,
# or other formats suitable for offline reading on a handheld device,
#
# Copyright 2009-2023 by Akkana Peck <[email protected]>
# and licensed under the GPLv2 or later.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details:
# <http://www.gnu.org/licenses/>.
from __future__ import print_function
ConfigHelp = """Configuration options:
Configuration options most useful in a DEFAULT section,
applicable to all feeds:
ascii
Convert all pages to plain ASCII. Useful for reading devices like Palm
that can't display other character sets reliably.
block_nonlocal_images
If images aren't downloaded, normally the img tag will still point
to the image on the original website. But that can result in unwanted
data use: people with limited or metered bandwidth can set this to True
to remove the original image URL so it can't be downloaded when the
story is viewed.
dir
Where to save the collected pages.
See save_days for how long they will be kept.
formats
Comma-separated list of output formats.
Default "none", which will result in HTML output.
Other options: epub, fb2, plucker.
logfile
Save output (including debugging) to this log.
verbose
Print lots of debugging chatter while feeding.
min_width
The minimum number of characters in an item link. Links shorter than this
will be padded to this length (to make tapping easier). Default 25.
save_days
How long to retain feeds locally.
order
The order of the feeds you'd like to see: an ordered list
(one name per line) of the full names (not filenames) of each feed.
Feed directories will have _01, _02 etc. prepended.
Feeds not listed in order will be sorted alphabetically after
the ordered ones.
Configuration options you might want to reset for specific feeds:
continue_on_timeout
Normally, if one page times out, feedme will assume the site is down.
On sites that link to content from many different URLs, set this
to true.
encoding
Normally feedme will try to guess the encoding from the page.
But some pages lie, so use this to override that.
levels
Level 1: only save the RSS page.
Level 1.5: only read the RSS page, but make story pages from it
Level 2: save sub-pages.
nocache
Don't check whether we've seen an entry before: collect everything.
nonlocal_images
Normally feedme will ignore images from other domains (usually ads).
But some sites link to images from all over; set this to true in that case.
page_start, page_end
regexps that define the part of a page that will be fetched.
skip_images
Don't save images. Default true.
skip_links
For sites with levels=1 where you just want a single news feed and
never want to click on anything (e.g. slashdot), this can eliminate
distracting links that you might tap on accidentally while scrolling.
skip_pats
Throw out anything matching these patterns
url
The RSS URL for the site.
when
When to check this site, if not every time.
May be a weekday, e.g. Sat, or a month date, e.g. 1 to check only
on the first day of any month.
"""
import time
import os, sys
import re
#import types
import shutil
import traceback
import feedparser
import output_fmt
import urllib.error
import socket
import posixpath
import unicodedata
# sheesh, this is apparently the recommended way to parse RFC 2822 dates:
import email.utils as email_utils
# FeedMe's module for parsing HTML inside feeds:
import pageparser
from tee import tee
import msglog
from cache import FeedmeCache
from utils import falls_between, last_time_this_feed, expanduser
# Rewriting image URLs to local ones
import imagecache
# utilities, mostly config-file related:
import utils
# For parsing sites that don't have RSS, just HTML with a list of links
import htmlindex
# Allow links in top page content.
# Feedparser 6.0 has dropped _HTMLSanitizer.acceptable_elements,
# but the documentation says that now it allows a and img by default.
# https://pythonhosted.org/feedparser/html-sanitization.html
try:
feedparser._HTMLSanitizer.acceptable_elements.add('a')
feedparser._HTMLSanitizer.acceptable_elements.add('img')
except AttributeError:
print("Don't know how to whitelist elements in feedparser",
feedparser.__version__, file=sys.stderr)
from bs4 import BeautifulSoup
# For importing helper modules
import importlib
sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)),
"helpers"))
verbose = False
#
# Clean up old feed directories
#
def clean_up():
try:
days = int(utils.g_config.get('DEFAULT', 'save_days'))
feedsdir = expanduser(utils.g_config.get('DEFAULT', 'dir'))
cachedir = FeedmeCache.get_cache_dir()
except:
print("Error trying to get save_days and feed dir; can't clean up", file=sys.stderr)
return
now = time.time()
def clean_up_dir(dirname, rmdir):
'''If rmdir==True, remove (recursively) old directories,
ignoring files at the toplevel.
Otherwise remove old files at the toplevel.
'''
for f in os.listdir(dirname):
# Files never to delete:
if f in ['feedme.dat', 'feeds.css', 'darkfeeds.css',
'LOG', 'urlrss.log' ]:
continue
f = os.path.join(dirname, f)
# Logical xor: if rmdir is set, or it's not a directory,
# but not both, then skip this entry.
# ^ is bitwise xor but works if both args are bool.
if rmdir ^ os.path.isdir(f):
continue
try:
howold = (now - os.path.getctime(f)) / 60 / 60 / 24
if howold > days:
print("Deleting", f, file=sys.stderr)
if os.path.isdir(f):
shutil.rmtree(f)
else:
os.unlink(f)
except Exception as e:
print("Couldn't unlink", f, str(e))
print("Cleaning up files older than %d days from feed and cache dirs"
% days)
clean_up_dir(feedsdir, True)
clean_up_dir(cachedir, False)
#
# Ctrl-C Interrupt handler: prompt for what to do.
#
def handle_keyboard_interrupt(msg):
# os.isatty() doesn't work, so:
if not hasattr(sys.stdin, "isatty"):
print("Interrupt, and not running interactively. Exiting.")
sys.exit(1)
try:
response = input(msg)
except EOFError:
# This happens if we're run from a script rather than interactively
# and yet someone sends a SIGINT, perhaps because we're timing out
# and someone logged in to kick us back into operation.
# In this case, pretend the user typed 'n',
# meaning skip to next site.
return 'n'
if response == '':
return '\0'
if response[0] == 'q':
sys.exit(1)
return response[0]
def parse_name_from_conf_file(feedfile):
"""Given the full pathname to a .conf file name,
return the site name from the initial [The Site Name] line.
"""
with open(feedfile) as fp:
for line in fp:
line = line.strip()
if line.startswith('[') and line.endswith(']'):
return line[1:-1].strip()
# Could also do line.strip('][')
return None
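# For illustration (hypothetical file): a .conf file whose first section
# header line is
#   [Example Daily News]
# makes parse_name_from_conf_file() return "Example Daily News".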
allow_unicode = False
def slugify(value, allow_unicode=False):
"""
Adapted from https://github.com/django/django/blob/master/django/utils/text.py
Convert to ASCII if 'allow_unicode' is False. Convert runs of spaces or
dashes to single underscores. Remove characters that aren't alphanumerics,
underscores, or hyphens. Also strip leading and trailing whitespace,
dashes, and underscores. (Unlike Django's version, this doesn't lowercase.)
"""
value = str(value)
if allow_unicode:
value = unicodedata.normalize('NFKC', value)
else:
value = unicodedata.normalize('NFKD', value) \
.encode('ascii', 'ignore').decode('ascii')
value = re.sub(r'[^\w\s-]', '', value)
return re.sub(r'[-\s]+', '_', value).strip('-_')
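# For illustration (hypothetical inputs): slugify maps a feed's display name
# to a filesystem-safe directory name, e.g.
#   slugify("Washington Post")       -> "Washington_Post"
#   slugify("L.A. Daily Post: news") -> "LA_Daily_Post_news"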
# A number prepended to each feed if the user specifies feed order.
g_feednum = 0
#
# Get a single feed
#
def get_feed(feedname, cache, last_time, msglog):
"""Fetch a single site's feed.
feedname can be the feed's config name ("Washington Post")
or the conf file name ("washingtonpost" or "washingtonpost.conf").
"""
global verbose, g_feednum
verbose = (utils.g_config.get("DEFAULT", 'verbose').lower() == 'true')
# Mandatory arguments:
try:
sitefeedurl = utils.g_config.get(feedname, 'url')
feedsdir = utils.g_config.get(feedname, 'dir')
except Exception as e:
sitefeedurl = None
# If feedname isn't a name in the config files, maybe it's the name
# of a config file itself, e.g. if not "Foo News",
# then maybe foonews or foonews.conf.
if not sitefeedurl:
fakefeedname = None
if os.path.exists(feedname):
print("feedname is", feedname, "and it exists")
# XXX This clause will accept the full path to a .conf file as
# a commandline argument -- but that file will only be
# used for the feed name, not for the actual feed parameters
# or other config values, which probably isn't what the user
# expects. The config object has already been initialized
# by this time, and overwriting it is probably more work than
# is warranted given that I never actually expect to use
# config files from outside the configdir.
fakefeedname = parse_name_from_conf_file(feedname)
print("fakefeedname is", fakefeedname)
if fakefeedname:
msglog.warn("Warning: Using name '%s' from %s,"
" but config parameters will actually be parsed "
" from files in %s"
% (fakefeedname, feedname,
utils.g_default_confdir ))
else:
feedfile = os.path.join(utils.g_default_confdir, feedname)
if os.path.exists(feedfile):
fakefeedname = parse_name_from_conf_file(feedfile)
if not sitefeedurl and not feedfile.endswith(".conf"):
feedfile += ".conf"
if os.path.exists(feedfile):
fakefeedname = parse_name_from_conf_file(feedfile)
if fakefeedname:
try:
sitefeedurl = utils.g_config.get(fakefeedname, 'url')
feedsdir = utils.g_config.get(fakefeedname, 'dir')
feedname = fakefeedname
except:
if verbose:
print(feedname, "isn't a site feed name either",
file=sys.stderr)
if not sitefeedurl:
msglog.err("Can't find a config for: " + feedname)
return
verbose = (utils.g_config.get(feedname, 'verbose').lower() == 'true')
levels = float(utils.g_config.get(feedname, 'levels'))
feedsdir = expanduser(feedsdir)
todaystr = time.strftime("%m-%d-%a")
feedsdir = os.path.join(feedsdir, todaystr)
formats = utils.g_config.get(feedname, 'formats').split(',')
encoding = utils.g_config.get(feedname, 'encoding')
ascii = utils.g_config.getboolean(feedname, 'ascii')
skip_links = utils.g_config.getboolean(feedname, 'skip_links')
skip_link_pats = utils.g_config.get_multiline(feedname, 'skip_link_pats')
skip_title_pats = utils.g_config.get_multiline(feedname, 'skip_title_pats')
user_agent = utils.g_config.get(feedname, 'user_agent')
if verbose:
print("\n=============\nGetting %s feed" % feedname, file=sys.stderr)
# Is this a feed we should only check occasionally?
# If so, has such a time passed since the last time the
# cache file was written?
when = utils.g_config.get(feedname, "when")
if when and when != '' and last_time:
if not falls_between(when, last_time, time.localtime()):
print("Skipping", feedname, "-- not", when, file=sys.stderr)
return
print("Yes, it's time to feed:", when, file=sys.stderr)
#encoding = utils.g_config.get(feedname, 'encoding')
print("\n============\nfeedname:", feedname, file=sys.stderr)
# Make it a legal and sane dirname
feednamedir = slugify(feedname)
# Make sure the link is at least some minimum width.
# This is for viewers that have special areas defined on the
# screen, e.g. areas for paging up/down or adjusting brightness.
minwidth = utils.g_config.getint(feedname, 'min_width')
# Is there already a feednamedir, with or without a prepended order number?
# If it has an index.html in it, then feedme has already fed this
# site today, and should bail rather than overwriting what's
# already there.
if os.path.exists(feedsdir):
for d in os.listdir(feedsdir):
if d.endswith(feednamedir):
if os.path.exists(os.path.join(feedsdir, d, "index.html")):
print("Already fed %s: not overwriting" % d)
return
# Partially fed this site earlier today, but didn't finish.
# Continue, but note the fact on stderr.
if verbose:
print("Partially fed %s: will overwrite old files" % d)
# If the user specified an order, prepend its number
g_feednum += 1
try:
order = utils.g_config.get_multiline('DEFAULT', 'order')
feednamedir = "%02d_%s" % (g_feednum, feednamedir)
except:
pass
outdir = os.path.join(feedsdir, feednamedir)
if verbose:
print("feednamedir:", feednamedir, file=sys.stderr)
print("outdir:", outdir, file=sys.stderr)
# Get any helpers for this feed, if any.
# A feed_helper takes precedence over a page_helper.
# The helpers subdir has already been added to os.path,
# at the end, so if the user has an earlier version
# it will override a built-in of the same name.
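# A sketch of the helper API, inferred from the calls below rather than
# from any helper documentation: a page_helper module provides
#     initialize(helper_args)
#     fetch_article(url)    # returns an HTML string, or None on failure
# while a feed_helper module provides
#     fetch_feed(outdir, helper_args)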
try:
feed_helper = utils.g_config.get(feedname, 'feed_helper')
except:
feed_helper = None
try:
page_helper = utils.g_config.get(feedname, 'page_helper')
except:
page_helper = None
if feed_helper or page_helper:
# Read all helper args, which start with "helper_",
# $d will map to today's date dir, and $f to this feed's output dir.
# No tilde expansion will be done.
# Turn them into a dictionary, e.g.
# helper_executable_path = ~/firefox-esr
# helper_log = $d/nyt_selenium.log
# -> {
# "executable_path": "~/firefox-esr",
# "log": "/home/username/feeds/10-25-Mon/nyt_selenium.log"
# }
confoptions = utils.g_config.options(feedname)
helper_args = {}
for opt in confoptions:
if opt.startswith("helper_"):
key = opt[7:]
if key:
helper_args[key] = utils.g_config.get(feedname, opt)
if '$f' in helper_args[key] and \
r'\$f' not in helper_args[key]:
helper_args[key] = helper_args[key].replace("$f",
outdir)
if '$d' in helper_args[key] and \
r'\$d' not in helper_args[key]:
helper_args[key] = helper_args[key].replace("$d",
feedsdir)
else:
print("Skipping bad key '%s' in %s config file"
% (opt, feedname), file=sys.stderr)
if verbose:
print(feedname, "helper args:", helper_args, file=sys.stderr)
if feed_helper:
if verbose:
print("Trying to import", feed_helper)
try:
helpermod = importlib.import_module(feed_helper)
except Exception as e:
traceback.print_exc(file=sys.stderr)
print("Couldn't import module '%s'" % feed_helper,
file=sys.stderr)
try:
helpermod.fetch_feed(outdir, helper_args)
if verbose:
print("Fetched feed with %s(%s) to %s"
% (feed_helper, helper_args, outdir))
except Exception as e:
traceback.print_exc(file=sys.stderr)
print("Couldn't run helper module '%s'" % feed_helper,
file=sys.stderr)
# Whether the copy helper was successful or not,
# it's time to return.
return
else: # must be a page_helper
if verbose:
print("Trying to import", page_helper)
try:
helpermod = importlib.import_module(page_helper)
if verbose:
print("Initializing", page_helper, file=sys.stderr)
helpermod.initialize(helper_args)
except Exception as e:
print("Couldn't import module '%s'" % page_helper,
file=sys.stderr)
traceback.print_exc(file=sys.stderr)
return
else:
helpermod = None
# When did we last run this feed?
# This code is probably brittle so wrap it in try/except.
last_fed_this = last_time_this_feed(cache, feedname)
if verbose:
print("Last fetched %s on %s" % (feedname, str(last_fed_this)),
file=sys.stderr)
if cache is None:
nocache = True
else:
nocache = (utils.g_config.get(feedname, 'nocache') == 'true')
if verbose and nocache:
msglog.msg(feedname + ": Ignoring cache")
downloaded_string ="\n<hr><i>(Downloaded by " + \
utils.VersionString + ")</i>\n"
html_index_links = utils.g_config.get(feedname, 'html_index_links')
# feedparser doesn't understand file:// URLs, so translate those
# to a local file:
# Ironically, with newer changes to feedparser, now file://
# is the only type it *does* handle reliably, and anything else
# we have to fetch for it before it can parse.
# if sitefeedurl.startswith('file://'):
# sitefeedurl = sitefeedurl[7:]
# feedparser.parse() can throw unexplained errors like
# "xml.sax._exceptions.SAXException: Read failed (no details available)"
# which will kill our whole process, so guard against that.
# Sadly, feedparser usually doesn't give any details about what went wrong.
socket.setdefaulttimeout(100)
try:
print("Running: feedparser.parse(%s)" % (sitefeedurl), file=sys.stderr)
# Feedparser sometimes makes bogus decisions about charsets
# fetched from http servers.
# For instance, on http://www.lamonitor.com/todaysnews/rss.xml
# some versions of feedparser (actually, some version of some
# underlying library it uses, but since feedparser's documentation
# is so sketchy and it's so inflexible, it's impossible to tell
# exactly where the problem is) will ignore the encoding specified
# in the feed and randomly decide to use something else.
# For instance, http://www.lamonitor.com/todaysnews/rss.xml
# specifies encoding="utf-8", but on Debian Jessie the parsed feed
# has 'encoding': u'iso-8859-2'.
# (Debian Stretch gets the right answer of utf-8. Even though
# the versions of feedfetcher, chardet and urllib2 are identical.)
# But if we fetch the RSS explicitly with urllib2 and pass it
# as a string to feedfetcher.parse(), it doesn't do this.
# feed = feedparser.parse(sitefeedurl)
# HOWEVER:
# If we do this on file:// URLs it causes an "unknown url type"
# error -- no idea why. I just love feedparser so much. :-(
if sitefeedurl.startswith("file://"):
feed = feedparser.parse(sitefeedurl)
elif html_index_links:
feed = htmlindex.parse(feedname, html_index_links, verbose=verbose)
else:
downloader = pageparser.FeedmeURLDownloader(feedname,
verbose=verbose)
rss_str = downloader.download_url(sitefeedurl)
feed = feedparser.parse(rss_str)
rss_str = None
response = None
# except xml.sax._exceptions.SAXException, e:
except urllib.error.HTTPError as e:
print("HTTP error parsing URL:", sitefeedurl, file=sys.stderr)
print(str(e), file=sys.stderr)
return
except pageparser.CookieError as e:
msglog.err("No cookies, skipping site")
msglog.err("Error was: %s" % e.message)
msglog.err("Cookiefile details: %s\n" % e.longmessage)
return
except ValueError as e:
msglog.err("Exception fetching URL %s: %s" % (url, e))
return
except Exception as e:
print("Couldn't parse feed: URL:", sitefeedurl, file=sys.stderr)
print(str(e), file=sys.stderr)
# raise(e)
utils.ptraceback()
# print(traceback.format_exc())
return
# feedparser has no error return! One way is to check len(feed.feed).
# Which makes no sense, since feed is an object; why should it have a length?
# if len(feed.feed) == 0:
if len(feed.entries) == 0:
msglog.err("Can't read " + sitefeedurl)
return
# XXX Sometimes feeds die a few lines later getting feed.feed.title.
# Here's a braindead guard against it -- but why isn't this
# whole clause inside a try? It should be.
try:
title = feed.feed.title
except:
title = None
# if not 'title' in feed.feed:
if not title:
msglog.msg(sitefeedurl + " lacks a title!")
feed.feed.title = '[' + feedname + ']'
if cache and not nocache:
try:
feedcachedict = cache.thedict[sitefeedurl]
except:
feedcachedict = []
newfeedcachedict = []
# suburls: mapping of URLs we've encountered to local URLs.
# Any anchors (#anchor) will be discarded.
# This is for sites like WorldWideWords that make many links
# to the same page.
suburls = []
# Some sites, like Washington Post, repeat the same story
# several times but with different URLs (and no ID specified).
# The only way to tell we've seen them before is by title.
titles = []
# indexstr is the contents of the index.html file.
# Kept as a string until we know whether there are new, non-cached
# stories so it's worth updating the copy on disk.
# The stylesheet is for FeedViewer and shouldn't bother plucker etc.
day = time.strftime("%a")
indexstr = """<html>\n<head>
<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>%s: %s</title>
<link rel="stylesheet" type="text/css" title="Feeds" href="../../feeds.css"/>
</head>
<body>\n<h1>%s: %s: %s</h1>
\n""" % (day, feedname, day, feedname, feed.feed.title)
if verbose:
print("********* Reading", sitefeedurl, file=sys.stderr)
# A pattern to tell the user how to get to the next story: >->
# We also might want to remove that pattern later, in case
# a story wasn't successfully downloaded -- so make a
# regexp that can match it.
next_item_string = '<br>\n<center><i><a href=\"#%d\">>-></a></i></center>\n<br>\n'
next_item_pattern = '<br>\n<center><i><a href=\"#[0-9]+\">>-></a></i></center>\n<br>\n'
try:
urlrewrite = utils.g_config.get_multiline(feedname, 'story_url_rewrite')
if urlrewrite:
print("**** urlrewrite:", urlrewrite, file=sys.stderr)
except:
urlrewrite = None
days = int(utils.g_config.get('DEFAULT', 'save_days'))
too_old = time.time() - days * 60 * 60 * 24
print("too_old would be", too_old, "(now is", time.time(), ")",
file=sys.stderr)
# We'll increment itemnum as soon as we start showing entries,
# so start it negative so anchor links will start at zero.
itemnum = -1
last_page_written = None
for item in feed.entries:
try:
#
# Get the list of links (href) and a (hopefully) unique ID.
# Make sure the id is a string; most of these components
# inside item are bytes.
# XXX Is href[] ever even used? Is this clause obsolete?
# Answer: not obsolete, at least A Word A Day uses it.
# But sometimes this clause will be triggered on a site
# that doesn't have "links" in its RSS source
# (e.g. Washington Post), which doesn't have href either.
#
if 'links' in item:
hrefs = [str(i['href']) for i in item.links
if 'rel' in i and 'href' in i
and i['rel'] == 'alternate']
elif 'link' in item:
hrefs = [ str(item.link) ]
else:
hrefs = []
if 'id' in item:
item_id = str(item.id)
if verbose:
print("\nID %s" % item_id, file=sys.stderr)
elif 'guid' in item:
item_id = str(item.guid)
if verbose:
print("\nGUID %s" % item_id, file=sys.stderr)
elif hrefs:
item_id = str(hrefs[0])
if verbose:
print("Using URL '%s' for ID" % item_id, file=sys.stderr)
else:
if verbose:
print("Item in %s had no ID or URL." % str(href[0]),
file=sys.stderr)
continue # or return?
# Whatever pattern we're using for the ID, it will need to
# have spaces mapped to + before putting it in the cache.
# So do that now.
item_id = FeedmeCache.id_encode(item_id)
# Does the link match a pattern we're skipping?
item_link = str(item.link)
if skip_link_pats:
skipping = False
for spat in skip_link_pats:
if re.search(spat, item_link):
skipping = True
if verbose:
print("Skipping", item_link, \
"because it matches", spat, file=sys.stderr)
break
if skipping:
continue
# How about the title? Does that match a skip pattern?
item_title = str(item.title)
if skip_title_pats:
skipping = False
for pat in skip_title_pats:
if re.search(pat, item_title, flags=re.IGNORECASE):
skipping = True
if verbose:
print("Skipping", item_link, \
"because of skip_title_pats " + pat,
file=sys.stderr)
break
if skipping:
continue
# Filter out file types known not to work
# XXX Only mp3 for now. Obviously, make this more general.
# Wish we could do this using the server's type rather than
# file extension!
if item_link.endswith("mp3"):
print("Filtering out mp3 link", item_link, file=sys.stderr)
continue
# Make sure ids don't have named anchors appended:
anchor_index = item_id.rfind('#')
if anchor_index >= 0:
anchor = item_id[anchor_index:]
item_id = item_id[0:anchor_index]
else:
anchor = ""
# See if we've already seen this page's ID in this run.
try:
pagenum = suburls.index(item_id)
# We've already seen a link to this URL.
# That could mean it's a link to a different named anchor
# within the same file, or it could mean that it's just
# a duplicate (see below).
if verbose:
print("already seen item id", item_id, "this run: skipping",
file=sys.stderr)
continue
except ValueError:
if verbose:
print("haven't seen item id", item_id, "yet this run",
file=sys.stderr)
# Is it a duplicate story that we've already seen in this run?
# Some sites, like Washington Post, repeat the same stories
# multiple times on their RSS feed, but stories won't be
# added to our real feedcachedict until we've succeeded in
# fetching the whole site. So check the temporary cache.
# On the other hand, Risks Digest has a single story and
# a lot of RSS entries with links to #name tags in that story.
# So in that case we should include the entry but not
# re-fetch the story.
if newfeedcachedict and item_id in newfeedcachedict:
if verbose:
print("%s repeated today -- skipping" % item_id,
file=sys.stderr)
continue
# How about the title? Have we already seen that before?
# Washington Post runs the same story (same title) several
# times with different URLs.
# But on other sites, like the Los Alamos Daily Post Legal Notices,
# titles can be as simple as "LEGAL NOTICE" and are often dups,
# but the actual stories/links are different.
if item.title in titles and not utils.g_config.getboolean(
feedname, 'allow_dup_titles'):
print('Skipping repeated title with a new ID: "%s", ID "%s"' \
% (item.title, item_id), file=sys.stderr)
continue
titles.append(item.title)
# Get the published date.
# item.pubDate is a unicode string, supposed to be in format
# Thu, 11 Aug 2016 14:46:50 GMT (or +0000)
# email.utils.parsedate returns a tuple.
# Pass it to time.mktime() to get seconds since epoch.
# XXX feedparser now has published_parsed which is
# a time.struct_time. Can we count on that and not
# have to do parsing here?
try:
pub_date = time.mktime(email_utils.parsedate(item.published))
except Exception as e:
pub_date = None
if verbose:
print("Couldn't read real pubdate:", e, file=sys.stderr)
# Haven't seen it yet this run. But is it in the cache already?
if not nocache:
# We want it in the cache, whether it's new or not:
if verbose:
print("Will cache as %s" % item_id, file=sys.stderr)
if item_id not in newfeedcachedict:
newfeedcachedict.append(item_id)
if item_id in feedcachedict:
if verbose:
print("Seen it before, it's in the cache",
file=sys.stderr)
# We've seen this ID before. HOWEVER, it may still
# be new: a site might have a static URL for the
# monthly photo contest that gets updated once
# a month with all-new content.
if not utils.g_config.getboolean(feedname, 'allow_repeats'):
if verbose:
print(item_id, "already cached -- skipping",
file=sys.stderr)
continue
# Repeats are allowed. So check the pub date.
# XXX Unfortunately cache entries don't include
# a date, so for now, allow repeat URLs if their
# content was updated since the last feedme run.
# This will unfortunately miss sites that
# aren't checked every day.
if verbose:
print("Seen this before, but repeats are allowed")
print("Last time this feed", last_fed_this)
print("pub_date", pub_date)
if pub_date <= last_fed_this:
print("No new changes, skipping")
continue
print("Recent change, re-fetching", file=sys.stderr)
elif verbose:
print("'%s' is not in the cache -- fetching" % item_id,
file=sys.stderr)
# We're probably including this item. Add it to suburls.
suburls.append(item_id)
pagenum = len(suburls) - 1
# Sanity check: is the pubDate newer than the last
# time we ran feedme? A negative answer isn't
# necessarily a reason not to get it.
# See if it's newer or older. If it's older,
# we've probably seen it already; give a warning.
if pub_date and last_fed_this:
if verbose:
print("Comparing pub_date %s to last_fed_this %s"
% (str(pub_date), str(last_fed_this)),
file=sys.stderr)
if pub_date < last_fed_this and (verbose or not nocache):
# If an entry is older than the maximum age
# for the cache, skip it with a warning.
if pub_date <= too_old and not nocache:
msglog.warn("%s is so old (%s -> %s) it's expired from the cache -- skipping" \
% (item_id, str(item.published),
str(pub_date)))
# XXX Remove from suburls?
continue
# Else warn about it, but include it in the feed.
msglog.warn("%s is older (%s) than the last time we updated this feed (%s)" \
% (item_id, str(item.published),
time.strftime("%m-%d-%a-%y",
time.gmtime(last_fed_this))))
else:
print("%s: last updated %s, pubDate is %s" \
% (item_id, str(item.published),
time.strftime("%m-%d-%a-%y",
time.gmtime(last_fed_this))))
elif verbose and not pub_date:
print(item_id, ": No pub_date!", file=sys.stderr)
itemnum += 1
if verbose:
print("Item:", item_title, file=sys.stderr)
# Now itemnum is the number of the entry on the index page;
# pagenum is the html file of the subentry, e.g. 3.html.
# Make the directory for this feed if we haven't already
if not os.access(outdir, os.W_OK):
if verbose:
print("Making", outdir, file=sys.stderr)
os.makedirs(outdir)
if 'author' in item:
author = str(item.author)
else:
author = None
content = get_content(item)
# A parser is mostly needed for levels > 1, but even with
# levels=1 we'll use it at the end for rewriting images
# in the index string.
parser = pageparser.FeedmeHTMLParser(feedname)
#
# If it's a normal multi-level site,
# follow the link and make a file for it:
#
if levels > 1:
try: # Try to trap keyboard interrupts, + others
# For the sub-pages, we're getting HTML, not RSS.
# Nobody seems to have RSS pointing to RSS.
fnam = str(pagenum) + ".html"
# Add a nextitem link in the footer to the next story,
# Shouldn't do this for the last story;
# but there's no easy way to tell if this is the last story,
# because we don't know until we try whether the next
# story will actually be fetched or not.
footer = '<center><a href="%d.html">>-%d-></a></center>' \
% (itemnum+1, itemnum+1)
# Add the page's URL to the footer:
footer += downloaded_string
footer += '\n<br>\n<a href="%s">%s</a>' % (item_link,
item_link)
if helpermod:
try:
htmlstr = helpermod.fetch_article(item_link)
except Exception as e:
print("Helper couldn't fetch", item_link,
file=sys.stderr)
traceback.print_exc(file=sys.stderr)
continue
if not htmlstr:
if verbose:
print("fetch failed on", item_link,
file=sys.stderr)
continue
else:
if levels == 1.5:
# On sites that put the full story in the RSS
# entry, we can use that for the story,
# no need to fetch another file.
htmlstr = content
else:
htmlstr = None
if urlrewrite:
if len(urlrewrite) == 2:
oldlink = item_link
item_link = re.sub(urlrewrite[0],
urlrewrite[1],
item_link)
print("Rewrote", oldlink, "to", item_link,
file=sys.stderr)
else:
print("story_url_rewrite had wrong # args:",
len(urlrewrite), urlrewrite,
file=sys.stderr)
parser.fetch_url(item_link,
outdir, fnam,
title=item_title, author=author,
footer=footer, html=htmlstr,
user_agent=user_agent)
# On level 1.5 sites, import any changes just made
# to the final output.
# On level 2 sites, pageparser didn't change the
# index page, just sub-pages so this wouldn't help.
# XXX should be a better way to get that from
# the pageparser, and a way to get pageparser
# to clean the index page for levels 1 and 2.
if levels == 1.5:
justwrotefile = os.path.join(outdir, fnam)
if os.path.exists(justwrotefile):
with open(justwrotefile) as justwrotefp:
content = justwrotefp.read()
elif verbose:
print("File", justwrotefile,
"doesn't exist, can't read back",
file=sys.stderr)
last_page_written = fnam
except pageparser.NoContentError as e:
# fetch_url didn't get the page or didn't write a file.
# So don't increment pagenum or itemnum for the next story.
msglog.warn("Didn't find any content on " + item_link
+ ": " + str(e))
# It is so annoying needing to repeat these
# lines every time! Isn't there a way I can define
# a subfunction that knows about this function's
# local variables?
itemnum -= 1
suburls.remove(item_id)
# Include a note in the indexstr
indexstr += '<p>No content for <a href="%s">%s</a>\n' \
% (item_link, item_title)