Skip to content

Commit

Permalink
Minor fix to web crawling
Browse files (browse the repository at this point in the history)
  • Loading branch information
lirondos committed Feb 6, 2024
1 parent e1787c7 commit 9a13147
Show file tree
Hide file tree
Showing 2 changed files with 4 additions and 2 deletions.
4 changes: 2 additions & 2 deletions utils/constants.py
Original file line number | Diff line number | Diff line change
@@ -1,14 +1,14 @@
DAYS_SINCE = 5
LAST_WEEK = 7
-HOURS_TO_TWEET = 12
+HOURS_TO_TWEET = 14
TO_BE_TWEETED_FOLDER = "tobetweeted/"
LOGS_FOLDER = "logs/"

FORBIDDEN_URL_PATTERNS = [
"https://cat.elpais.com",
"que-ver-hoy-en-tv",
"/encatala/",
-"/horoscopo-",
+"/horoscopo",
"vodafone.es",
"/escaparate/",
"/mingote/",
Expand Down
2 changes: 2 additions & 0 deletions utils/utils.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -63,6 +63,8 @@ def clean_html(article: Article):
)
article.html = re.sub(r"<blockquote cite=\".+?</blockquote>", "", article.html)
article.html = re.sub(r"<figcaption.+?</figcaption>", "", article.html)
+article.html = re.sub(r"<span class=\"author .+?</span>", "", article.html)
+
if "vertele" in article.url:
article.html = re.sub(
r"<h2 class=\"mce\">&middot.+?</p>", "", article.html
Expand Down

0 comments on commit 9a13147

Please sign in to comment.