# -*- coding: utf-8 -*-
import os.path
import re
import itertools
import logging
from codecs import open

from bs4 import BeautifulSoup

try:
    from urlparse import urljoin        # Python 2
except ImportError:
    from urllib.parse import urljoin    # Python 3

from pelican import signals
from pelican.generators import CachingGenerator

logger = logging.getLogger(__name__)
def unTypography(string):
    """Normalise typographic characters back to plain ASCII equivalents."""
    ret = string

    # Collapse runs of newlines and carriage returns
    ret = re.sub(r"[\n\r]+", "\n", ret)

    # Large replacements
    ret = ret.replace("^", '^')
    ret = ret.replace('\u00a0', ' ')  # non-breaking space -> plain space
    ret = ret.replace("“ ", '"')

    # Group replacements
    ret = re.sub(r"[“”]", '"', ret)
    ret = re.sub(r"[‘’]", "'", ret)

    # Single character replacements: map each character in `fr` to the
    # character at the same index in `to`
    to = " "
    fr = "¶"
    for (pattern, repl) in [(c, to[i]) for i, c in enumerate(fr)]:
        # ret = re.sub(pattern, repl, ret)
        ret = ret.replace(pattern, repl)

    return ret
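

# A quick illustration of the normalisation above; an example of the expected
# behaviour rather than part of the plugin itself:
#
#     >>> unTypography("“Related\r\nreading”")
#     '"Related\nreading"'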


class RelatedReadingAggregateGenerator(CachingGenerator):
    """Collect "related reading" links from every page and article and
    render them on a single aggregate page."""

    def __init__(self, context, settings, path, theme, output_path, *null):
        super().__init__(context, settings, path, theme, output_path)
        self.output_path = output_path
        self.context = context
        self.siteurl = settings.get('SITEURL')
        self.relative_urls = settings.get('RELATIVE_URLS')
        self.tpages = settings.get('TEMPLATE_PAGES')
        self.json_nodes = []
        self.save_as = "related_reading.html"
    def generate_output(self, writer):
        # The primary entry point, called by Pelican once content is processed.
        # Gather all the content we can
        pages = self.context['pages'] + self.context['articles']
        for article in self.context['articles']:
            pages += article.translations

        # Process template pages
        for srclink in self.tpages:
            self.json_nodes.append(self.nodeFromRawPage(srclink))

        # Process non-template pages; drafts come back as None and are skipped
        for page in pages:
            node = self.nodeFromPage(page)
            if node is not None:
                self.json_nodes.append(node)

        # Keep only pages that actually contain related-reading links,
        # then group them by category for the template
        keyf = lambda p: p['category']
        related_reading = itertools.groupby(
            sorted([p for p in self.json_nodes if p['links']], key=keyf),
            keyf
        )
        # logger.info(str(related_reading))

        writer.write_file(
            name=self.save_as,
            template=self.get_template("relatedreading"),
            context=self.context,
            relative_urls=self.settings['RELATIVE_URLS'],
            related_reading=related_reading
        )
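
    # The template receives `related_reading` as an itertools.groupby result:
    # (category, iterator-of-nodes) pairs, where the nodes are the dicts built
    # by nodeFromPage/nodeFromRawPage below. As a rough sketch, the template's
    # loop is equivalent to:
    #
    #     for category, nodes in related_reading:
    #         for node in nodes:
    #             print(node['title'], node['url'], node['links'])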

    def nodeFromPage(self, page):
        # Takes a page or article and creates a related-reading node
        # Don't index drafts or other non-published documents
        if getattr(page, 'status', 'published') != 'published':
            return None

        soup_title = BeautifulSoup(page.title, 'html.parser')
        page_title = unTypography(soup_title.get_text(' ', strip=True))

        soup_text = BeautifulSoup(page._content, 'html.parser')

        # Collect anchors that either carry the related-reading class
        # themselves or sit inside a related-reading container
        page_links = []
        anchors = itertools.chain(
            soup_text.find_all("a", class_="related-reading"),
            soup_text.select(".related-reading a")
        )
        for anchor in anchors:
            page_links.append(dict(text=anchor.text, href=anchor.get('href')))

        page_category = page.category.name if getattr(page, 'category', None) else ''

        page_url = '.'
        if page.url:
            page_url = page.url  # if self.relative_urls else (self.siteurl + '/' + page.url)

        node = {
            'title': page_title,
            'category': page_category,
            'url': page_url,
            'links': page_links
        }
        return node
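
    # Shape of a node produced above, with purely illustrative values
    # (the title, category, and link are assumptions, not real data):
    #
    #     {'title': 'Example Post',
    #      'category': 'Essays',
    #      'url': 'example-post.html',
    #      'links': [{'text': 'Some source', 'href': 'https://example.com/'}]}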

    def nodeFromRawPage(self, srclink):
        # Takes the path of a template page and creates a related-reading node
        with open(os.path.join(self.output_path, srclink), encoding='utf-8') as srcfile:
            soup = BeautifulSoup(srcfile, 'html.parser')

        # Strip <script> tags so only rendered markup is considered
        for script in soup.find_all("script"):
            script.extract()

        page_title = unTypography(soup.title.string) if soup.title is not None else ''

        page_links = []
        anchors = itertools.chain(
            soup.find_all("a", class_="related-reading"),
            soup.select(".related-reading a")
        )
        for anchor in anchors:
            page_links.append(dict(text=anchor.text, href=anchor.get('href')))

        # Template pages don't carry a Pelican category; group them as 'page'
        page_category = 'page'

        page_url = urljoin(self.siteurl, self.tpages[srclink])

        node = {
            'title': page_title,
            'category': page_category,
            'url': page_url,
            'links': page_links
        }
        return node


def get_generators(generators):
    return RelatedReadingAggregateGenerator


def register():
    signals.get_generators.connect(get_generators)
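

# A minimal sketch of how a site would typically enable this plugin from
# pelicanconf.py; the directory name below is an assumption, not part of
# this file:
#
#     PLUGIN_PATHS = ['plugins']
#     PLUGINS = ['related_reading']
#
# The active theme also needs a "relatedreading" template (looked up via
# get_template("relatedreading") above) to render the aggregate page.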