Skip to content

Commit

Permalink
improved parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
ilude committed Mar 29, 2024
1 parent c3e36e2 commit 3eae68e
Show file tree
Hide file tree
Showing 5 changed files with 52 additions and 27 deletions.
36 changes: 32 additions & 4 deletions app/post_processor.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,18 @@
import html
import importlib
import re
import os

from bs4 import BeautifulSoup

class NoOpClass:
    """Fallback processor used when no widget-specific module exists in processors/."""

    def process(self, data):
        # Identity pass-through: return the widget data unchanged.
        return data

class PostProcessor:
    """Normalizes feed articles and dispatches them to per-feed processor classes."""

    def __init__(self):
        # Cache of already-imported processor instances, keyed by snake_case class name.
        self.loaded_classes = {}
        # Absolute directory of this file; used to locate the "processors" subdirectory.
        self.pwd = os.path.dirname(os.path.abspath(__file__))

def to_snake_case(self, input_string):
# Replace non-alphanumeric characters and apostrophes with spaces and split the string into words
Expand All @@ -22,14 +26,17 @@ def to_snake_case(self, input_string):

return snake_case_string

def process(self, class_name, data):
class_name = self.to_snake_case(class_name)
def process(self, widget):

self.normalize(widget)

# Check if the class has already been loaded
class_name = self.to_snake_case(widget['name'])
if class_name in self.loaded_classes:
instance = self.loaded_classes[class_name]
else:
# Construct file path to the "processors" subdirectory
file_path = os.path.join("processors", class_name + ".py")
file_path = os.path.join(self.pwd, "processors", class_name + ".py")
if os.path.exists(file_path):
module = importlib.import_module(f"processors.{class_name}")
cls = getattr(module, ''.join(word.title() for word in class_name.split('_')))
Expand All @@ -40,8 +47,29 @@ def process(self, class_name, data):
self.loaded_classes[class_name] = instance

# Call process() method of the instance with the provided data
result = instance.process(data)
result = instance.process(widget)
return result

def normalize(self, widget):
    """Clean article titles and summaries of *widget* in place.

    Collapses whitespace in titles; flattens newlines, unescapes entities and
    strips HTML from summaries; removes trailing "[...]"/"[…]" continuation
    boilerplate; then de-duplicates title vs. summary text.

    Mutates ``widget['articles']`` directly; returns None.
    """
    for article in widget['articles']:
        # Collapse whitespace runs in the title.  Raw string fixes the
        # invalid-escape warning the old '\s+' (non-raw) literal produced.
        article['title'] = re.sub(r'\s+', ' ', article['title'])

        if not article['summary']:
            continue

        # Flatten newlines, then decode entities and strip markup.
        article['summary'] = article['summary'].replace('\n', ' ').replace('\r', ' ').strip()
        article['summary'] = BeautifulSoup(html.unescape(article['summary']), 'lxml').text
        # Strip a "[...]"/"[…]" continuation marker and everything after it.
        # BUG FIX: the old pattern r'\[[\.+|…\]].*$' was a malformed character
        # class — it also truncated summaries at "[+", "[|" or "[]".
        article['summary'] = re.sub(r'\[(?:\.{3}|…).*$', '', article['summary']).strip()

        if article['summary'] == article['title']:
            # Summary adds nothing over the title.
            article['summary'] = None
        elif (article['title'] in article['summary'] and len(article['title']) / len(article['summary']) > 0.64):
            # Summary is a modest extension of the title: promote it to the title.
            article['title'] = article['summary']
            article['summary'] = None
        elif (article['summary'] in article['title']):
            # Title already contains the whole summary: keep the longer text
            # in the summary slot and blank the title (template tolerates it).
            article['summary'] = article['title']
            article['title'] = None

# Module-level singleton created at import time so all importers share one
# PostProcessor (and therefore one processor-class cache).
post_processor = PostProcessor()
6 changes: 3 additions & 3 deletions app/processors/cafe_hayek.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@


class CafeHayek:
    """Feed-specific cleanup for the Cafe Hayek RSS widget."""

    def process(self, widget):
        """Strip the leading "Tweet ..." boilerplate from each article summary.

        Returns the (mutated) widget so the caller can use the result directly.
        """
        for article in widget['articles']:
            # BUG FIX: summaries may be None (PostProcessor.normalize() nulls
            # them when they duplicate the title); re.sub() on None raises
            # TypeError, so guard first.
            if article['summary']:
                article['summary'] = re.sub(r'^Tweet\s*\.{0,3}|\…\s+', '', article['summary'])
        return widget
11 changes: 7 additions & 4 deletions app/processors/instapundit.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,11 @@


class Instapundit:
    """Feed-specific cleanup for the Instapundit RSS widget."""

    def process(self, widget):
        """Drop affiliate-link posts and scrub URLs out of remaining titles.

        Returns the (mutated) widget so the caller can use the result directly.
        """
        # Iterate a shallow copy: items are removed from the real list below.
        for article in widget['articles'][:]:
            if '#CommissionEarned' in article['title']:
                widget['articles'].remove(article)
                # BUG FIX: the original used a bare `next`, which is a no-op
                # expression statement — removed articles still fell through
                # to the re.sub below.  `continue` is what was intended.
                continue
            article['title'] = re.sub(r'http[s]?://\S+', '', article['title'], flags=re.IGNORECASE)
        return widget
24 changes: 9 additions & 15 deletions app/rss.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,6 @@ def __init__(self):
if not os.path.exists(cache_dir):
os.makedirs(cache_dir)
self.feed_cache = FileSystemCache(cache_dir, default_timeout=60*15)

def clean_html(self, text: str) -> str:
    """Flatten, HTML-unescape and de-markup feed text; return plain text.

    NOTE(review): this method is being deleted by this commit — its cleaning
    moved into PostProcessor.normalize().
    """
    # Collapse newlines before parsing so the soup sees a single line.
    text = text.replace('\n', ' ').replace('\r', ' ').strip()
    if text:
        # Decode HTML entities, then strip tags via the lxml parser.
        text = BeautifulSoup(html.unescape(text), 'lxml').text
        # Drop the first "[...]"-style bracket group and everything after it.
        text = re.sub(r'\[.*?\].*$', '', text)
        # text = re.sub(r'http[s]?://\S+', '', text, flags=re.IGNORECASE)
        # text = ' '.join([x.capitalize() for x in text.split(' ')])

    return text

async def load_feed(self, widget):
start_time = time.time()
Expand All @@ -38,7 +28,7 @@ async def load_feed(self, widget):
# check if feed is in self.feeds and that the last updated time is less than 15 minutes ago
if cached_widget and (start_time - cached_widget['last_updated']) < 60 * 15:
widget['articles'] = cached_widget['articles']
print(f"Loaded {widget['name']} from cache")
# print(f"Loaded {widget['name']} from cache")
else:
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'}
async with aiohttp.ClientSession() as session:
Expand All @@ -48,16 +38,20 @@ async def load_feed(self, widget):
print(await response.text())
else:
print(f"Loaded {widget['name']} with Status Code: {response.status}")
article_limit = widget.get('article_limit', 10)
parsed_feed = feedparser.parse(await response.text())

widget['articles'] = [{
'title': " ".join(entry.get('title', 'No Title').split()).strip() ,
'title': entry.get('title', 'No Title').strip() ,
'link': entry.link,
'summary': self.clean_html(entry.get('summary', ''))} for entry in parsed_feed.entries[:widget.get('article_limit',10)]] if 'entries' in parsed_feed else []
'summary': entry.get('summary', None)
} for entry in parsed_feed.entries[:article_limit]] if 'entries' in parsed_feed else []

widget['last_updated'] = start_time
widget = post_processor.process(widget['name'], widget)
self.feed_cache.set(widget['name'], widget)


widget = post_processor.process(widget)

return (time.time() - start_time)

def find_feed_links(self, url):
Expand Down
2 changes: 1 addition & 1 deletion app/templates/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@
<a href="{{ feed.link }}" target="_blank">{{ feed.name }}</a></div>
<div class="box-content {{ feed.type }}-content">
<ul>
{% for article in feed.articles %}
{% for article in feed.articles[:feed.get('article_limit', 10)] %}
<li>
<a href="{{ article.link }}" class="{{ feed.type }}-link">
{% if article.title %}
Expand Down

0 comments on commit 3eae68e

Please sign in to comment.