Skip to content

Commit

Permalink
improved parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
ilude committed Mar 29, 2024
1 parent c3e36e2 commit 3eae68e
Show file tree
Hide file tree
Showing 5 changed files with 52 additions and 27 deletions.
36 changes: 32 additions & 4 deletions app/post_processor.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,18 @@
import html
import importlib
import re
import os

from bs4 import BeautifulSoup

class NoOpClass:
    """Fallback processor used when no widget-specific module exists in processors/."""

    def process(self, data):
        # Identity pass-through: return the widget data unchanged.
        return data

class PostProcessor:
    """Normalizes feed articles and dispatches them to per-feed processor classes."""

    def __init__(self):
        # Cache of already-imported processor instances, keyed by snake_case class name.
        self.loaded_classes = {}
        # Absolute directory of this file; used to locate the "processors" subdirectory.
        self.pwd = os.path.dirname(os.path.abspath(__file__))

def to_snake_case(self, input_string):
# Replace non-alphanumeric characters and apostrophes with spaces and split the string into words
Expand All @@ -22,14 +26,17 @@ def to_snake_case(self, input_string):

return snake_case_string

def process(self, class_name, data):
class_name = self.to_snake_case(class_name)
def process(self, widget):

self.normalize(widget)

# Check if the class has already been loaded
class_name = self.to_snake_case(widget['name'])
if class_name in self.loaded_classes:
instance = self.loaded_classes[class_name]
else:
# Construct file path to the "processors" subdirectory
file_path = os.path.join("processors", class_name + ".py")
file_path = os.path.join(self.pwd, "processors", class_name + ".py")
if os.path.exists(file_path):
module = importlib.import_module(f"processors.{class_name}")
cls = getattr(module, ''.join(word.title() for word in class_name.split('_')))
Expand All @@ -40,8 +47,29 @@ def process(self, class_name, data):
self.loaded_classes[class_name] = instance

# Call process() method of the instance with the provided data
result = instance.process(data)
result = instance.process(widget)
return result

def normalize(self, widget):
    """Clean article titles and summaries of *widget* in place.

    Collapses whitespace in titles; flattens newlines, unescapes entities and
    strips HTML from summaries; removes trailing "[...]"/"[…]" continuation
    boilerplate; then de-duplicates title vs. summary text.

    Mutates ``widget['articles']`` directly; returns None.
    """
    for article in widget['articles']:
        # Collapse whitespace runs in the title.  Raw string fixes the
        # invalid-escape warning the old '\s+' (non-raw) literal produced.
        article['title'] = re.sub(r'\s+', ' ', article['title'])

        if not article['summary']:
            continue

        # Flatten newlines, then decode entities and strip markup.
        article['summary'] = article['summary'].replace('\n', ' ').replace('\r', ' ').strip()
        article['summary'] = BeautifulSoup(html.unescape(article['summary']), 'lxml').text
        # Strip a "[...]"/"[…]" continuation marker and everything after it.
        # BUG FIX: the old pattern r'\[[\.+|…\]].*$' was a malformed character
        # class — it also truncated summaries at "[+", "[|" or "[]".
        article['summary'] = re.sub(r'\[(?:\.{3}|…).*$', '', article['summary']).strip()

        if article['summary'] == article['title']:
            # Summary adds nothing over the title.
            article['summary'] = None
        elif (article['title'] in article['summary'] and len(article['title']) / len(article['summary']) > 0.64):
            # Summary is a modest extension of the title: promote it to the title.
            article['title'] = article['summary']
            article['summary'] = None
        elif (article['summary'] in article['title']):
            # Title already contains the whole summary: keep the longer text
            # in the summary slot and blank the title (template tolerates it).
            article['summary'] = article['title']
            article['title'] = None

# Module-level singleton created at import time so all importers share one
# PostProcessor (and therefore one processor-class cache).
post_processor = PostProcessor()
6 changes: 3 additions & 3 deletions app/processors/cafe_hayek.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@


class CafeHayek:
    """Feed-specific cleanup for the Cafe Hayek RSS widget."""

    def process(self, widget):
        """Strip the leading "Tweet ..." boilerplate from each article summary.

        Returns the (mutated) widget so the caller can use the result directly.
        """
        for article in widget['articles']:
            # BUG FIX: summaries may be None (PostProcessor.normalize() nulls
            # them when they duplicate the title); re.sub() on None raises
            # TypeError, so guard first.
            if article['summary']:
                article['summary'] = re.sub(r'^Tweet\s*\.{0,3}|\…\s+', '', article['summary'])
        return widget
11 changes: 7 additions & 4 deletions app/processors/instapundit.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,11 @@


class Instapundit:
    """Feed-specific cleanup for the Instapundit RSS widget."""

    def process(self, widget):
        """Drop affiliate-link posts and scrub URLs out of remaining titles.

        Returns the (mutated) widget so the caller can use the result directly.
        """
        # Iterate a shallow copy: items are removed from the real list below.
        for article in widget['articles'][:]:
            if '#CommissionEarned' in article['title']:
                widget['articles'].remove(article)
                # BUG FIX: the original used a bare `next`, which is a no-op
                # expression statement — removed articles still fell through
                # to the re.sub below.  `continue` is what was intended.
                continue
            article['title'] = re.sub(r'http[s]?://\S+', '', article['title'], flags=re.IGNORECASE)
        return widget
24 changes: 9 additions & 15 deletions app/rss.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,6 @@ def __init__(self):
if not os.path.exists(cache_dir):
os.makedirs(cache_dir)
self.feed_cache = FileSystemCache(cache_dir, default_timeout=60*15)

def clean_html(self, text: str) -> str:
    """Flatten, HTML-unescape and de-markup feed text; return plain text.

    NOTE(review): this method is being deleted by this commit — its cleaning
    moved into PostProcessor.normalize().
    """
    # Collapse newlines before parsing so the soup sees a single line.
    text = text.replace('\n', ' ').replace('\r', ' ').strip()
    if text:
        # Decode HTML entities, then strip tags via the lxml parser.
        text = BeautifulSoup(html.unescape(text), 'lxml').text
        # Drop the first "[...]"-style bracket group and everything after it.
        text = re.sub(r'\[.*?\].*$', '', text)
        # text = re.sub(r'http[s]?://\S+', '', text, flags=re.IGNORECASE)
        # text = ' '.join([x.capitalize() for x in text.split(' ')])

    return text

async def load_feed(self, widget):
start_time = time.time()
Expand All @@ -38,7 +28,7 @@ async def load_feed(self, widget):
# check if feed is in self.feeds and that the last updated time is less than 15 minutes ago
if cached_widget and (start_time - cached_widget['last_updated']) < 60 * 15:
widget['articles'] = cached_widget['articles']
print(f"Loaded {widget['name']} from cache")
# print(f"Loaded {widget['name']} from cache")
else:
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'}
async with aiohttp.ClientSession() as session:
Expand All @@ -48,16 +38,20 @@ async def load_feed(self, widget):
print(await response.text())
else:
print(f"Loaded {widget['name']} with Status Code: {response.status}")
article_limit = widget.get('article_limit', 10)
parsed_feed = feedparser.parse(await response.text())

widget['articles'] = [{
'title': " ".join(entry.get('title', 'No Title').split()).strip() ,
'title': entry.get('title', 'No Title').strip() ,
'link': entry.link,
'summary': self.clean_html(entry.get('summary', ''))} for entry in parsed_feed.entries[:widget.get('article_limit',10)]] if 'entries' in parsed_feed else []
'summary': entry.get('summary', None)
} for entry in parsed_feed.entries[:article_limit]] if 'entries' in parsed_feed else []

widget['last_updated'] = start_time
widget = post_processor.process(widget['name'], widget)
self.feed_cache.set(widget['name'], widget)


widget = post_processor.process(widget)

return (time.time() - start_time)

def find_feed_links(self, url):
Expand Down
2 changes: 1 addition & 1 deletion app/templates/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@
<a href="{{ feed.link }}" target="_blank">{{ feed.name }}</a></div>
<div class="box-content {{ feed.type }}-content">
<ul>
{% for article in feed.articles %}
{% for article in feed.articles[:feed.get('article_limit', 10)] %}
<li>
<a href="{{ article.link }}" class="{{ feed.type }}-link">
{% if article.title %}
Expand Down

0 comments on commit 3eae68e

Please sign in to comment.