From 41ef59212ffeb380b34dce24b010ee20d4bf75ef Mon Sep 17 00:00:00 2001 From: Kevin Decherf Date: Tue, 28 Jun 2022 21:47:04 +0200 Subject: [PATCH] Keep h1 and other headings Even though using h1 tags for sections inside an article is semantically wrong, a lot of websites are doing it anyway. So the idea here is to stop stripping headings, including h1 on Readability's side. Fixes wallabag/wallabag#5805 Signed-off-by: Kevin Decherf --- src/Readability.php | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/Readability.php b/src/Readability.php index 4336bca..3ad909b 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -395,14 +395,17 @@ public function prepArticle(\DOMNode $articleContent): void $this->clean($articleContent, 'object'); $this->clean($articleContent, 'iframe'); $this->clean($articleContent, 'canvas'); - $this->clean($articleContent, 'h1'); /* - * If there is only one h2, they are probably using it as a main header, so remove it since we + * If there is only one h1 or h2, they are probably using it as a main header, so remove it since we * already have a header. */ + $h1s = $articleContent->getElementsByTagName('h1'); + if (1 === $h1s->length && mb_strlen($this->getInnerText($h1s->item(0), true, true)) < 100) { + $this->clean($articleContent, 'h1'); + } $h2s = $articleContent->getElementsByTagName('h2'); - if (1 === $h2s->length && mb_strlen($this->getInnerText($h2s->item(0), true, true)) < 100) { + if (0 === $h1s->length && 1 === $h2s->length && mb_strlen($this->getInnerText($h2s->item(0), true, true)) < 100) { $this->clean($articleContent, 'h2'); }