Skip to content

Commit

Permalink
Merge pull request #2808 from omnivore-app/fix/published-date
Browse files Browse the repository at this point in the history
fix/published date
  • Loading branch information
sywhb authored Sep 28, 2023
2 parents 3793d0f + f0abdd6 commit c9b178c
Show file tree
Hide file tree
Showing 12 changed files with 2,086 additions and 623 deletions.
55 changes: 40 additions & 15 deletions packages/readabilityjs/Readability.js
Original file line number Diff line number Diff line change
Expand Up @@ -51,16 +51,29 @@ const extractPublishedDateFromAuthor = (author)=> {
return [null, null];
}
const authorName = author.replace(/^by\s+/i, '');
const regex = /(January|February|March|April|May|June|July|August|September|Octrober|November|December)\s\d{1,2},\s\d{2,4}/;
if (!regex.test(author)) {
return [authorName, null];
}
const regex = /(January|February|March|April|May|June|July|August|September|October|November|December)\s\d{1,2},\s\d{2,4}/i;
const chineseDateRegex = /(\d{2,4})(\d{1,2})(\d{1,2})/;

// English date
if (regex.test(author)) {
const match = author.match(regex) || [];
return [authorName.replace(regex, ''), match[0]];
}

const matchedDates = author.match(regex) || [];
const publishedAt = matchedDates[0];
// Chinese date
if (chineseDateRegex.test(author)) {
const match = author.match(chineseDateRegex);
if (match) {
const year = parseInt(match[1], 10);
const month = parseInt(match[2], 10) - 1; // January is 0 in JavaScript Date
const day = parseInt(match[3], 10);

const publishedAt = new Date(year, month, day);
return [authorName.replace(chineseDateRegex, ''), publishedAt];
}
}

return [authorName.replace(regex, ''), publishedAt];
return [authorName, null];
};

/**
Expand Down Expand Up @@ -204,8 +217,10 @@ Readability.prototype = {
DATES_REGEXPS: [
/([0-9]{4}[-\/]?((0[13-9]|1[012])[-\/]?(0[1-9]|[12][0-9]|30)|(0[13578]|1[02])[-\/]?31|02[-\/]?(0[1-9]|1[0-9]|2[0-8]))|([0-9]{2}(([2468][048]|[02468][48])|[13579][26])|([13579][26]|[02468][048]|0[0-9]|1[0-6])00)[-\/]?02[-\/]?29)/i,
/(((0[13-9]|1[012])[-/]?(0[1-9]|[12][0-9]|30)|(0[13578]|1[02])[-/]?31|02[-/]?(0[1-9]|1[0-9]|2[0-8]))[-/]?[0-9]{4}|02[-/]?29[-/]?([0-9]{2}(([2468][048]|[02468][48])|[13579][26])|([13579][26]|[02468][048]|0[0-9]|1[0-6])00))/i,
/(((0[1-9]|[12][0-9]|30)[-/]?(0[13-9]|1[012])|31[-/]?(0[13578]|1[02])|(0[1-9]|1[0-9]|2[0-8])[-/]?02)[-/]?[0-9]{4}|29[-/]?02[-/]?([0-9]{2}(([2468][048]|[02468][48])|[13579][26])|([13579][26]|[02468][048]|0[0-9]|1[0-6])00))/i
]
/(((0[1-9]|[12][0-9]|30)[-/]?(0[13-9]|1[012])|31[-/]?(0[13578]|1[02])|(0[1-9]|1[0-9]|2[0-8])[-/]?02)[-/]?[0-9]{4}|29[-/]?02[-/]?([0-9]{2}(([2468][048]|[02468][48])|[13579][26])|([13579][26]|[02468][048]|0[0-9]|1[0-6])00))/i,
],
LONG_DATE_REGEXP: /^(Jan(uary)?|Feb(ruary)?|Mar(ch)?|Apr(il)?|May|Jun(e)?|Jul(y)?|Aug(ust)?|Sep(tember)?|Oct(ober)?|Nov(ember)?|Dec(ember)?)\s\d{1,2}(?:st|nd|rd|th)?(,)?\s\d{2,4}$/i,
CHINESE_DATE_REGEXP: /^\d{2,4}\d{1,2}\d{1,2}$/,
},

UNLIKELY_ROLES: ["menu", "menubar", "complementary", "navigation", "alert", "alertdialog", "dialog"],
Expand Down Expand Up @@ -1067,27 +1082,37 @@ Readability.prototype = {
// we don't want to check for dates in the URL's
if (node.tagName.toLowerCase() === 'a') return
// Searching for the real date in the text content
let dateRegExpFound = this.REGEXPS.DATES_REGEXPS.find(regexp => regexp.test(node.textContent.trim()))
dateRegExpFound && (dateRegExpFound = dateRegExpFound.exec(node.textContent.trim()))
const content = node.textContent.trim()
let dateFound
const dateRegExpFound = this.REGEXPS.DATES_REGEXPS.find(regexp => regexp.test(content))
if (dateRegExpFound) {
dateFound = dateRegExpFound.exec(content)[0]
} else if (this.REGEXPS.LONG_DATE_REGEXP.test(content)) {
dateFound = this.REGEXPS.LONG_DATE_REGEXP.exec(content)[0].replace(/st|nd|rd|th/i, '')
} else if (this.REGEXPS.CHINESE_DATE_REGEXP.test(content)) {
dateFound = this.REGEXPS.CHINESE_DATE_REGEXP.exec(content)[0].replace(/|/g, '-').replace(//g, '')
}

let publishedDateParsed
try {
// Trying to parse the Date from the content itself
publishedDateParsed = new Date(node.textContent.trim())
publishedDateParsed = new Date(content)
} catch (error) { }

if (
// Checking for the dates keywords through the attributes except URL's
((this._someNodeAttribute(node, ({ value, name }) => {
if (/href|uri|url/i.test(name)) return false;
return this.REGEXPS.publishedDate.test(value)
}) || dateRegExpFound) || (/date/i.test(matchString) && !isNaN(publishedDateParsed)))
}) || dateFound) || (/date/i.test(matchString) && !isNaN(publishedDateParsed)))
&& this._isValidPublishedDate(node.textContent)
) {
try {
if (isNaN(publishedDateParsed))
if (isNaN(publishedDateParsed)) {
// Trying to parse the Date from the found by REGEXP string
publishedDateParsed = new Date(dateRegExpFound[0])
publishedDateParsed = new Date(dateFound)
}

if (!isNaN(publishedDateParsed) && !this._articlePublishedDate)
this._articlePublishedDate = publishedDateParsed
}
Expand Down
Loading

1 comment on commit c9b178c

@vercel
Copy link

@vercel vercel bot commented on c9b178c Sep 28, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.