Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Iterate more node lists with foreach #91

Draft
wants to merge 6 commits into
base: master
Choose a base branch
from
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 30 additions & 59 deletions src/Readability.php
Original file line number Diff line number Diff line change
Expand Up @@ -302,8 +302,7 @@ public function addFootnotes(\DOMElement $articleContent): void
$articleLinks = $articleContent->getElementsByTagName('a');
$linkCount = 0;

for ($i = 0; $i < $articleLinks->length; ++$i) {
$articleLink = $articleLinks->item($i);
foreach ($articleLinks as $articleLink) {
$footnoteLink = $articleLink->cloneNode(true);
$refLink = $this->dom->createElement('a');
$footnote = $this->dom->createElement('li');
Expand Down Expand Up @@ -375,16 +374,15 @@ public function prepArticle(\DOMNode $articleContent): void
* which is what they were before.
*/
$elems = $xpath->query('.//p[@data-readability-styled]', $articleContent);
for ($i = $elems->length - 1; $i >= 0; --$i) {
$e = $elems->item($i);
foreach (iterator_to_array($elems) as $e) {
$e->parentNode->replaceChild($articleContent->ownerDocument->createTextNode($e->textContent), $e);
}
}

// Remove service data-candidate attribute.
$elems = $xpath->query('.//*[@data-candidate]', $articleContent);
for ($i = $elems->length - 1; $i >= 0; --$i) {
$elems->item($i)->removeAttribute('data-candidate');
foreach ($elems as $elem) {
$elem->removeAttribute('data-candidate');
}

// Clean out junk from the article content.
Expand Down Expand Up @@ -416,9 +414,7 @@ public function prepArticle(\DOMNode $articleContent): void
// Remove extra paragraphs.
$articleParagraphs = $articleContent->getElementsByTagName('p');

for ($i = $articleParagraphs->length - 1; $i >= 0; --$i) {
$item = $articleParagraphs->item($i);

foreach (iterator_to_array($articleParagraphs) as $item) {
$imgCount = $item->getElementsByTagName('img')->length;
$embedCount = $item->getElementsByTagName('embed')->length;
$objectCount = $item->getElementsByTagName('object')->length;
Expand Down Expand Up @@ -520,11 +516,12 @@ public function getLinkDensity(\DOMElement $e, bool $excludeExternal = false): f
$textLength = mb_strlen($this->getInnerText($e, true, true));
$linkLength = 0;

for ($dRe = $this->domainRegExp, $i = 0, $il = $links->length; $i < $il; ++$i) {
if ($excludeExternal && $dRe && !preg_match($dRe, $links->item($i)->getAttribute('href'))) {
$dRe = $this->domainRegExp;
foreach ($links as $link) {
if ($excludeExternal && $dRe && !preg_match($dRe, $link->getAttribute('href'))) {
continue;
}
$linkLength += mb_strlen($this->getInnerText($links->item($i)));
$linkLength += mb_strlen($this->getInnerText($link));
}

if ($textLength > 0 && $linkLength > 0) {
Expand Down Expand Up @@ -573,10 +570,8 @@ public function clean(\DOMElement $e, string $tag): void
$targetList = $e->getElementsByTagName($tag);
$isEmbed = ('audio' === $tag || 'video' === $tag || 'iframe' === $tag || 'object' === $tag || 'embed' === $tag);

for ($y = $targetList->length - 1; $y >= 0; --$y) {
foreach (iterator_to_array($targetList) as $currentItem) {
// Allow youtube and vimeo videos through as people usually want to see those.
$currentItem = $targetList->item($y);

if ($isEmbed) {
$attributeValues = $currentItem->getAttribute('src') . ' ' . $currentItem->getAttribute('href');

Expand All @@ -586,7 +581,7 @@ public function clean(\DOMElement $e, string $tag): void
}

// Then check the elements inside this element for the same.
if (preg_match($this->regexps['media'], $targetList->item($y)->getInnerHTML())) {
if (preg_match($this->regexps['media'], $currentItem->getInnerHTML())) {
continue;
}
}
Expand All @@ -607,16 +602,14 @@ public function cleanConditionally(\DOMElement $e, string $tag): void
}

$tagsList = $e->getElementsByTagName($tag);
$curTagsLength = $tagsList->length;

/*
* Gather counts for other typical elements embedded within.
* Traverse backwards so we can remove nodes at the same time without affecting the traversal.
*
* TODO: Consider taking into account original contentScore here.
*/
for ($i = $curTagsLength - 1; $i >= 0; --$i) {
$node = $tagsList->item($i);
foreach (iterator_to_array($tagsList) as $node) {
$weight = $this->getWeight($node);
$contentScore = ($node->hasAttribute('readability')) ? (int) $node->getAttribute('readability') : 0;
$this->logger->debug('Start conditional cleaning of ' . $node->getNodePath() . ' (class=' . $node->getAttribute('class') . '; id=' . $node->getAttribute('id') . ')' . (($node->hasAttribute('readability')) ? (' with score ' . $node->getAttribute('readability')) : ''));
Expand All @@ -640,15 +633,15 @@ public function cleanConditionally(\DOMElement $e, string $tag): void
$embedCount = 0;
$embeds = $node->getElementsByTagName('embed');

for ($ei = 0, $il = $embeds->length; $ei < $il; ++$ei) {
if (preg_match($this->regexps['media'], $embeds->item($ei)->getAttribute('src'))) {
foreach ($embeds as $embed) {
if (preg_match($this->regexps['media'], $embed->getAttribute('src'))) {
++$embedCount;
}
}

$embeds = $node->getElementsByTagName('iframe');
for ($ei = 0, $il = $embeds->length; $ei < $il; ++$ei) {
if (preg_match($this->regexps['media'], $embeds->item($ei)->getAttribute('src'))) {
foreach ($embeds as $embed) {
if (preg_match($this->regexps['media'], $embed->getAttribute('src'))) {
++$embedCount;
}
}
Expand Down Expand Up @@ -718,9 +711,9 @@ public function cleanHeaders(\DOMElement $e): void
for ($headerIndex = 1; $headerIndex < 3; ++$headerIndex) {
$headers = $e->getElementsByTagName('h' . $headerIndex);

for ($i = $headers->length - 1; $i >= 0; --$i) {
if ($this->getWeight($headers->item($i)) < 0 || $this->getLinkDensity($headers->item($i)) > 0.33) {
$headers->item($i)->parentNode->removeChild($headers->item($i));
foreach (iterator_to_array($headers) as $header) {
if ($this->getWeight($header) < 0 || $this->getLinkDensity($header) > 0.33) {
$header->parentNode->removeChild($header);
}
}
}
Expand Down Expand Up @@ -811,13 +804,13 @@ protected function prepDocument(): void

// Remove all style tags in head.
$styleTags = $this->dom->getElementsByTagName('style');
for ($i = $styleTags->length - 1; $i >= 0; --$i) {
$styleTags->item($i)->parentNode->removeChild($styleTags->item($i));
foreach (iterator_to_array($styleTags) as $styleTag) {
$styleTag->parentNode->removeChild($styleTag);
}

$linkTags = $this->dom->getElementsByTagName('link');
for ($i = $linkTags->length - 1; $i >= 0; --$i) {
$linkTags->item($i)->parentNode->removeChild($linkTags->item($i));
foreach (iterator_to_array($linkTags) as $linkTag) {
$linkTag->parentNode->removeChild($linkTag);
}
}

Expand Down Expand Up @@ -910,8 +903,7 @@ protected function grabArticle(?\DOMElement $page = null)

$allElements = $page->getElementsByTagName('*');

for ($nodeIndex = 0; $allElements->item($nodeIndex); ++$nodeIndex) {
$node = $allElements->item($nodeIndex);
foreach (iterator_to_array($allElements) as $node) {
$tagName = $node->tagName;

$nodeContent = $node->getInnerHTML();
Expand All @@ -924,7 +916,6 @@ protected function grabArticle(?\DOMElement $page = null)
if (!$this->isNodeVisible($node)) {
$this->logger->debug('Removing invisible node ' . $node->getNodePath());
$node->parentNode->removeChild($node);
--$nodeIndex;
continue;
}

Expand All @@ -937,7 +928,6 @@ protected function grabArticle(?\DOMElement $page = null)
) {
$this->logger->debug('Removing unlikely candidate (using conf) ' . $node->getNodePath() . ' by "' . $unlikelyMatchString . '"');
$node->parentNode->removeChild($node);
--$nodeIndex;
continue;
}

Expand All @@ -956,7 +946,6 @@ protected function grabArticle(?\DOMElement $page = null)
$newNode->setInnerHtml($nodeContent);

$node->parentNode->replaceChild($newNode, $node);
--$nodeIndex;
$nodesToScore[] = $newNode;
} catch (\Exception $e) {
$this->logger->error('Could not alter div/article to p, reverting back to div: ' . $e->getMessage());
Expand Down Expand Up @@ -1015,15 +1004,15 @@ protected function grabArticle(?\DOMElement $page = null)
* A score is determined by things like number of commas, class names, etc.
* Maybe eventually link density.
*/
for ($pt = 0, $scored = \count($nodesToScore); $pt < $scored; ++$pt) {
$ancestors = $this->getAncestors($nodesToScore[$pt], 5);
foreach ($nodesToScore as $nodeToScore) {
$ancestors = $this->getAncestors($nodeToScore, 5);

// No parent node? Move on...
if (0 === \count($ancestors)) {
continue;
}

$innerText = $this->getInnerText($nodesToScore[$pt]);
$innerText = $this->getInnerText($nodeToScore);

// If this paragraph is less than MIN_PARAGRAPH_LENGTH (default:20) characters, don't even count it.
if (mb_strlen($innerText) < self::MIN_PARAGRAPH_LENGTH) {
Expand Down Expand Up @@ -1067,20 +1056,14 @@ protected function grabArticle(?\DOMElement $page = null)
if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS) && $xpath) {
$candidates = $xpath->query('.//*[(self::footer and count(//footer)<2) or (self::aside and count(//aside)<2)]', $page->documentElement);

for ($c = $candidates->length - 1; $c >= 0; --$c) {
$node = $candidates->item($c);
foreach (iterator_to_array($candidates) as $node) {
// node should be readable but not inside of an article, otherwise it's probably a non-readable block
if ($node->hasAttribute('readability') && (int) $node->getAttributeNode('readability')->value < 40 && ($node->parentNode ? 0 !== strcasecmp($node->parentNode->tagName, 'article') : true)) {
$this->logger->debug('Removing unlikely candidate (using note) ' . $node->getNodePath() . ' by "' . $node->tagName . '" with readability ' . ($node->hasAttribute('readability') ? (int) $node->getAttributeNode('readability')->value : 0));
$node->parentNode->removeChild($node);
}
}

$candidates = $xpath->query('.//*[not(self::body) and (@class or @id or @style) and ((number(@readability) < 40) or not(@readability))]', $page->documentElement);

for ($c = $candidates->length - 1; $c >= 0; --$c) {
$node = $candidates->item($c);
}
unset($candidates);
}

Expand All @@ -1094,9 +1077,7 @@ protected function grabArticle(?\DOMElement $page = null)
$candidates = $xpath->query('.//*[@data-candidate]', $page->documentElement);
$this->logger->debug('Candidates: ' . $candidates->length);

for ($c = $candidates->length - 1; $c >= 0; --$c) {
$item = $candidates->item($c);

foreach (iterator_to_array($candidates) as $item) {
// Scale the final candidates score based on link density. Good content should have a
// relatively small link density (5% or less) and be mostly unaffected by this operation.
// If not for this we would have used XPath to find maximum @readability.
Expand Down Expand Up @@ -1231,13 +1212,7 @@ protected function grabArticle(?\DOMElement $page = null)
$parentOfTopCandidate = $topCandidate->parentNode;
$siblingNodes = $parentOfTopCandidate->childNodes;

if (0 === $siblingNodes->length) {
$siblingNodes = new \stdClass();
$siblingNodes->length = 0;
}

for ($s = 0, $sl = $siblingNodes->length; $s < $sl; ++$s) {
$siblingNode = $siblingNodes->item($s);
foreach (iterator_to_array($siblingNodes) as $siblingNode) {
$siblingNodeName = $siblingNode->nodeName;
$append = false;
$this->logger->debug('Looking at sibling node: ' . $siblingNode->getNodePath() . ((\XML_ELEMENT_NODE === $siblingNode->nodeType && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : ''));
Expand Down Expand Up @@ -1280,13 +1255,9 @@ protected function grabArticle(?\DOMElement $page = null)
} catch (\Exception $e) {
$this->logger->debug('Could not alter siblingNode "' . $siblingNodeName . '" to "div", reverting to original.');
$nodeToAppend = $siblingNode;
--$s;
--$sl;
}
} else {
$nodeToAppend = $siblingNode;
--$s;
--$sl;
}

// To ensure a node does not interfere with readability styles, remove its classnames & ids.
Expand Down
Loading