From c54b69b6894832e050fe28a3cb7e84a6fbb0db43 Mon Sep 17 00:00:00 2001
From: "FiveFilters.org"
Date: Tue, 3 Dec 2024 16:03:49 +0100
Subject: [PATCH 1/5] Initial version using PHP 8.4's HTML5 parser
---
composer.json | 1 -
src/Configuration.php | 25 +--
src/Nodes/DOM/{DOMAttr.php => Attr.php} | 2 +-
.../DOM/{DOMNotation.php => CdataSection.php} | 2 +-
...{DOMCdataSection.php => CharacterData.php} | 2 +-
src/Nodes/DOM/{DOMEntity.php => Comment.php} | 2 +-
src/Nodes/DOM/DOMDocument.php | 30 ---
src/Nodes/DOM/DOMDocumentFragment.php | 10 -
src/Nodes/DOM/DOMDocumentType.php | 10 -
src/Nodes/DOM/DOMEntityReference.php | 10 -
src/Nodes/DOM/DOMProcessingInstruction.php | 10 -
src/Nodes/DOM/DocumentFragment.php | 10 +
src/Nodes/DOM/DocumentType.php | 10 +
src/Nodes/DOM/{DOMElement.php => Element.php} | 8 +-
src/Nodes/DOM/{DOMComment.php => Entity.php} | 2 +-
...MCharacterData.php => EntityReference.php} | 2 +-
src/Nodes/DOM/{DOMNode.php => Node.php} | 2 +-
.../DOM/{DOMNodeList.php => NodeList.php} | 8 +-
src/Nodes/DOM/Notation.php | 10 +
src/Nodes/DOM/ProcessingInstruction.php | 10 +
src/Nodes/DOM/{DOMText.php => Text.php} | 2 +-
src/Nodes/NodeTrait.php | 56 ++---
src/Nodes/NodeUtility.php | 50 +++--
src/Readability.php | 210 ++++++++----------
test/ConfigurationTest.php | 2 -
test/ReadabilityTest.php | 3 -
test/test-pages/marketwatch/config.json | 3 +-
27 files changed, 221 insertions(+), 271 deletions(-)
rename src/Nodes/DOM/{DOMAttr.php => Attr.php} (79%)
rename src/Nodes/DOM/{DOMNotation.php => CdataSection.php} (73%)
rename src/Nodes/DOM/{DOMCdataSection.php => CharacterData.php} (72%)
rename src/Nodes/DOM/{DOMEntity.php => Comment.php} (77%)
delete mode 100644 src/Nodes/DOM/DOMDocument.php
delete mode 100644 src/Nodes/DOM/DOMDocumentFragment.php
delete mode 100644 src/Nodes/DOM/DOMDocumentType.php
delete mode 100644 src/Nodes/DOM/DOMEntityReference.php
delete mode 100644 src/Nodes/DOM/DOMProcessingInstruction.php
create mode 100644 src/Nodes/DOM/DocumentFragment.php
create mode 100644 src/Nodes/DOM/DocumentType.php
rename src/Nodes/DOM/{DOMElement.php => Element.php} (82%)
rename src/Nodes/DOM/{DOMComment.php => Entity.php} (76%)
rename src/Nodes/DOM/{DOMCharacterData.php => EntityReference.php} (70%)
rename src/Nodes/DOM/{DOMNode.php => Node.php} (86%)
rename src/Nodes/DOM/{DOMNodeList.php => NodeList.php} (86%)
create mode 100644 src/Nodes/DOM/Notation.php
create mode 100644 src/Nodes/DOM/ProcessingInstruction.php
rename src/Nodes/DOM/{DOMText.php => Text.php} (79%)
diff --git a/composer.json b/composer.json
index c31fc29..f573695 100644
--- a/composer.json
+++ b/composer.json
@@ -32,7 +32,6 @@
"ext-xml": "*",
"ext-mbstring": "*",
"psr/log": "^1.0 || ^2.0 || ^3.0",
- "masterminds/html5": "^2.0",
"league/uri": "^7.0"
},
"require-dev": {
diff --git a/src/Configuration.php b/src/Configuration.php
index c4fe88f..52222d3 100644
--- a/src/Configuration.php
+++ b/src/Configuration.php
@@ -20,7 +20,6 @@ class Configuration
protected bool $cleanConditionally = true;
protected bool $weightClasses = true;
protected bool $fixRelativeURLs = false;
- protected bool $substituteEntities = false;
protected bool $normalizeEntities = false;
protected bool $summonCthulhu = false;
protected string $originalURL = 'http://fakehost';
@@ -206,24 +205,6 @@ public function setFixRelativeURLs(bool $fixRelativeURLs): Configuration
return $this;
}
- /**
- * Get substitute entities.
- */
- public function getSubstituteEntities(): bool
- {
- return $this->substituteEntities;
- }
-
- /**
- * Set substitute entities.
- */
- public function setSubstituteEntities(bool $substituteEntities): Configuration
- {
- $this->substituteEntities = $substituteEntities;
-
- return $this;
- }
-
/**
* Get normalize entities.
*/
@@ -273,7 +254,11 @@ public function getParser(): string
*/
public function setParser(string $parser): Configuration
{
- $this->parser = $parser;
+ if ($parser !== 'html5') {
+ throw new \InvalidArgumentException('This version of Readability.php only supports the HTML5 parser introduced in PHP 8.4');
+ } else {
+ $this->parser = $parser;
+ }
return $this;
}
diff --git a/src/Nodes/DOM/DOMAttr.php b/src/Nodes/DOM/Attr.php
similarity index 79%
rename from src/Nodes/DOM/DOMAttr.php
rename to src/Nodes/DOM/Attr.php
index 1bdf395..ce03daf 100644
--- a/src/Nodes/DOM/DOMAttr.php
+++ b/src/Nodes/DOM/Attr.php
@@ -4,7 +4,7 @@
use fivefilters\Readability\Nodes\NodeTrait;
-class DOMAttr extends \DOMAttr
+class Attr extends \DOM\Attr
{
use NodeTrait;
}
diff --git a/src/Nodes/DOM/DOMNotation.php b/src/Nodes/DOM/CdataSection.php
similarity index 73%
rename from src/Nodes/DOM/DOMNotation.php
rename to src/Nodes/DOM/CdataSection.php
index d276e42..39bc0ce 100644
--- a/src/Nodes/DOM/DOMNotation.php
+++ b/src/Nodes/DOM/CdataSection.php
@@ -4,7 +4,7 @@
use fivefilters\Readability\Nodes\NodeTrait;
-class DOMNotation extends \DOMNotation
+class CdataSection extends \DOM\CdataSection
{
use NodeTrait;
}
diff --git a/src/Nodes/DOM/DOMCdataSection.php b/src/Nodes/DOM/CharacterData.php
similarity index 72%
rename from src/Nodes/DOM/DOMCdataSection.php
rename to src/Nodes/DOM/CharacterData.php
index 6ac3dcd..ac91426 100644
--- a/src/Nodes/DOM/DOMCdataSection.php
+++ b/src/Nodes/DOM/CharacterData.php
@@ -4,7 +4,7 @@
use fivefilters\Readability\Nodes\NodeTrait;
-class DOMCdataSection extends \DOMCdataSection
+class CharacterData extends \DOM\CharacterData
{
use NodeTrait;
}
diff --git a/src/Nodes/DOM/DOMEntity.php b/src/Nodes/DOM/Comment.php
similarity index 77%
rename from src/Nodes/DOM/DOMEntity.php
rename to src/Nodes/DOM/Comment.php
index 751b59c..7415696 100644
--- a/src/Nodes/DOM/DOMEntity.php
+++ b/src/Nodes/DOM/Comment.php
@@ -4,7 +4,7 @@
use fivefilters\Readability\Nodes\NodeTrait;
-class DOMEntity extends \DOMEntity
+class Comment extends \DOM\Comment
{
use NodeTrait;
}
diff --git a/src/Nodes/DOM/DOMDocument.php b/src/Nodes/DOM/DOMDocument.php
deleted file mode 100644
index d912338..0000000
--- a/src/Nodes/DOM/DOMDocument.php
+++ /dev/null
@@ -1,30 +0,0 @@
-registerNodeClass('DOMAttr', DOMAttr::class);
- $this->registerNodeClass('DOMCdataSection', DOMCdataSection::class);
- $this->registerNodeClass('DOMCharacterData', DOMCharacterData::class);
- $this->registerNodeClass('DOMComment', DOMComment::class);
- $this->registerNodeClass('DOMDocument', self::class);
- $this->registerNodeClass('DOMDocumentFragment', DOMDocumentFragment::class);
- $this->registerNodeClass('DOMDocumentType', DOMDocumentType::class);
- $this->registerNodeClass('DOMElement', DOMElement::class);
- $this->registerNodeClass('DOMEntity', DOMEntity::class);
- $this->registerNodeClass('DOMEntityReference', DOMEntityReference::class);
- $this->registerNodeClass('DOMNode', DOMNode::class);
- $this->registerNodeClass('DOMNotation', DOMNotation::class);
- $this->registerNodeClass('DOMProcessingInstruction', DOMProcessingInstruction::class);
- $this->registerNodeClass('DOMText', DOMText::class);
- }
-}
diff --git a/src/Nodes/DOM/DOMDocumentFragment.php b/src/Nodes/DOM/DOMDocumentFragment.php
deleted file mode 100644
index 33a3f95..0000000
--- a/src/Nodes/DOM/DOMDocumentFragment.php
+++ /dev/null
@@ -1,10 +0,0 @@
-childNodes as $node) {
if ($node->nodeType === XML_ELEMENT_NODE) {
$newList->add($node);
@@ -29,7 +29,7 @@ public function children(): DOMNodeList
*
* @deprecated Use previousElementSibling instead - introduced in PHP 8.0.
*/
- public function previousElementSibling(): ?DOMElement
+ public function previousElementSibling(): ?Element
{
return $this->previousElementSibling;
}
diff --git a/src/Nodes/DOM/DOMComment.php b/src/Nodes/DOM/Entity.php
similarity index 76%
rename from src/Nodes/DOM/DOMComment.php
rename to src/Nodes/DOM/Entity.php
index 3b691f4..721be70 100644
--- a/src/Nodes/DOM/DOMComment.php
+++ b/src/Nodes/DOM/Entity.php
@@ -4,7 +4,7 @@
use fivefilters\Readability\Nodes\NodeTrait;
-class DOMComment extends \DOMComment
+class Entity extends \DOM\Entity
{
use NodeTrait;
}
diff --git a/src/Nodes/DOM/DOMCharacterData.php b/src/Nodes/DOM/EntityReference.php
similarity index 70%
rename from src/Nodes/DOM/DOMCharacterData.php
rename to src/Nodes/DOM/EntityReference.php
index b196979..286e3f6 100644
--- a/src/Nodes/DOM/DOMCharacterData.php
+++ b/src/Nodes/DOM/EntityReference.php
@@ -4,7 +4,7 @@
use fivefilters\Readability\Nodes\NodeTrait;
-class DOMCharacterData extends \DOMCharacterData
+class EntityReference extends \DOM\EntityReference
{
use NodeTrait;
}
diff --git a/src/Nodes/DOM/DOMNode.php b/src/Nodes/DOM/Node.php
similarity index 86%
rename from src/Nodes/DOM/DOMNode.php
rename to src/Nodes/DOM/Node.php
index 4a3ab0d..e879546 100644
--- a/src/Nodes/DOM/DOMNode.php
+++ b/src/Nodes/DOM/Node.php
@@ -8,7 +8,7 @@
* @method getAttribute($attribute)
* @method hasAttribute($attribute)
*/
-class DOMNode extends \DOMNode
+class Node extends \DOM\Node
{
use NodeTrait;
}
diff --git a/src/Nodes/DOM/DOMNodeList.php b/src/Nodes/DOM/NodeList.php
similarity index 86%
rename from src/Nodes/DOM/DOMNodeList.php
rename to src/Nodes/DOM/NodeList.php
index 2b34e22..09d2df7 100644
--- a/src/Nodes/DOM/DOMNodeList.php
+++ b/src/Nodes/DOM/NodeList.php
@@ -3,7 +3,7 @@
namespace fivefilters\Readability\Nodes\DOM;
/**
- * Class DOMNodeList.
+ * Class NodeList.
*
* This is a fake DOMNodeList class that allows adding items to the list. The original class is static and the nodes
* are defined automagically when instantiating it. This fake version behaves exactly the same way but adds the function
@@ -12,7 +12,7 @@
* It cannot extend the original DOMNodeList class because the functionality behind the property ->length is hidden
* from the user and cannot be extended, changed, or tweaked.
*/
-class DOMNodeList implements \Countable, \IteratorAggregate
+class NodeList implements \Countable, \IteratorAggregate
{
/**
* @var array
@@ -42,7 +42,7 @@ public function __get($name)
/**
* Add node to the list.
*/
- public function add(DOMNode|DOMElement|DOMText|DOMComment $node): DOMNodeList
+ public function add(Node|Element|Text|Comment $node): NodeList
{
$this->items[] = $node;
$this->length++;
@@ -53,7 +53,7 @@ public function add(DOMNode|DOMElement|DOMText|DOMComment $node): DOMNodeList
/**
* Get node.
*/
- public function item(int $offset): DOMNode|DOMElement|DOMText|DOMComment
+ public function item(int $offset): Node|Element|Text|Comment
{
return $this->items[$offset];
}
diff --git a/src/Nodes/DOM/Notation.php b/src/Nodes/DOM/Notation.php
new file mode 100644
index 0000000..8a50f5c
--- /dev/null
+++ b/src/Nodes/DOM/Notation.php
@@ -0,0 +1,10 @@
+attributes)) {
- return parent::getAttribute($attributeName);
+ if ($this instanceof \Dom\HtmlElement) {
+ return parent::getAttribute($attributeName) ?? '';
}
return '';
@@ -166,7 +165,7 @@ public function getAttribute(string $attributeName): string
*/
public function hasAttribute(string $attributeName): bool
{
- if (!is_null($this->attributes)) {
+ if ($this instanceof \Dom\HtmlElement) {
return parent::hasAttribute($attributeName);
}
@@ -185,7 +184,7 @@ public function getNodeAncestors(int|bool $maxLevel = 3): array
$node = $this->parentNode;
- while ($node && !($node instanceof DOMDocument)) {
+ while ($node && !($node instanceof \Dom\HtmlDocument)) {
$ancestors[] = $node;
$level++;
if ($level === $maxLevel) {
@@ -221,7 +220,7 @@ public function getLinkDensity(): float
$links = $this->getAllLinks();
if ($links) {
- /** @var DOMElement $link */
+ /** @var Element $link */
foreach ($links as $link) {
$href = $link->getAttribute('href');
$coefficient = ($href && preg_match(NodeUtility::$regexps['hashUrl'], $href)) ? 0.3 : 1;
@@ -273,12 +272,12 @@ public function getClassWeight(): int
*/
public function getTextContent(bool $normalize = true): string
{
- $nodeValue = trim($this->textContent);
+ $textContent = mb_trim($this->textContent);
if ($normalize) {
- $nodeValue = preg_replace(NodeUtility::$regexps['normalize'], ' ', $nodeValue);
+ $textContent = preg_replace(NodeUtility::$regexps['normalize'], ' ', $textContent);
}
- return $nodeValue;
+ return $textContent;
}
/**
@@ -289,7 +288,7 @@ public function getRowAndColumnCount(): array
$rows = $columns = 0;
$trs = $this->getElementsByTagName('tr');
foreach ($trs as $tr) {
- /** @var \DOMElement $tr */
+ /** @var \DOM\Element $tr */
$rowspan = $tr->getAttribute('rowspan');
$rows += ($rowspan || 1);
@@ -297,7 +296,7 @@ public function getRowAndColumnCount(): array
$columnsInThisRow = 0;
$cells = $tr->getElementsByTagName('td');
foreach ($cells as $cell) {
- /** @var \DOMElement $cell */
+ /** @var \DOM\Element $cell */
$colspan = $cell->getAttribute('colspan');
$columnsInThisRow += ($colspan || 1);
}
@@ -310,10 +309,11 @@ public function getRowAndColumnCount(): array
/**
* Creates a new node based on the text content of the original node.
*/
- public function createNode(DOMNode $originalNode, string $tagName): DOMElement
+ public function createNode(Node $originalNode, string $tagName): Element
{
$text = $originalNode->getTextContent(false);
- $newNode = $originalNode->ownerDocument->createElement($tagName, $text);
+ $newNode = $originalNode->ownerDocument->createElement($tagName);
+ $newNode->appendChild($originalNode->ownerDocument->createTextNode($text));
return $newNode;
}
@@ -378,7 +378,7 @@ public function hasSingleChildBlockElement(): bool
$result = true;
} else {
// If any of the hasSingleChildBlockElement calls return true, return true then.
- /** @var $child DOMElement */
+ /** @var $child Element */
$result = ($result || $child->hasSingleChildBlockElement());
}
}
@@ -392,8 +392,8 @@ public function hasSingleChildBlockElement(): bool
*/
public function isElementWithoutContent(): bool
{
- return $this instanceof DOMElement &&
- mb_strlen(preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $this->textContent)) === 0 &&
+ return $this instanceof Element &&
+ mb_strlen(preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $this->textContent) ?? '') === 0 &&
($this->childNodes->length === 0 ||
$this->childNodes->length === $this->getElementsByTagName('br')->length + $this->getElementsByTagName('hr')->length
/*
@@ -405,7 +405,7 @@ public function isElementWithoutContent(): bool
* mb_strlen in this chain of checks).
*/
+ count(array_filter(iterator_to_array($this->childNodes), function ($child) {
- return $child instanceof DOMText;
+ return $child instanceof Text;
}))
);
@@ -443,7 +443,7 @@ public function isProbablyVisible(): bool
*/
public function isWhitespace(): bool
{
- return ($this->nodeType === XML_TEXT_NODE && $this->isWhitespaceInElementContent()) ||
+ return ($this->nodeType === XML_TEXT_NODE && mb_strlen(mb_trim($this->textContent)) === 0) ||
($this->nodeType === XML_ELEMENT_NODE && $this->nodeName === 'br');
}
@@ -487,9 +487,9 @@ public function shiftingAwareGetElementsByTagName(string $tag): \Generator
/**
* Git first element child or null
*/
- public function getFirstElementChild(): ?DOMElement
+ public function getFirstElementChild(): ?Element
{
- if ($this->nodeType === XML_ELEMENT_NODE || $this->nodeType === XML_DOCUMENT_NODE) {
+ if ($this->nodeType === XML_ELEMENT_NODE || $this->nodeType === XML_HTML_DOCUMENT_NODE) {
return $this->firstElementChild;
}
diff --git a/src/Nodes/NodeUtility.php b/src/Nodes/NodeUtility.php
index 2f094f1..a6d63e8 100644
--- a/src/Nodes/NodeUtility.php
+++ b/src/Nodes/NodeUtility.php
@@ -2,12 +2,12 @@
namespace fivefilters\Readability\Nodes;
-use fivefilters\Readability\Nodes\DOM\DOMDocument;
-use fivefilters\Readability\Nodes\DOM\DOMElement;
-use fivefilters\Readability\Nodes\DOM\DOMNode;
-use fivefilters\Readability\Nodes\DOM\DOMText;
-use fivefilters\Readability\Nodes\DOM\DOMComment;
-use fivefilters\Readability\Nodes\DOM\DOMNodeList;
+use fivefilters\Readability\Nodes\DOM;
+use fivefilters\Readability\Nodes\DOM\Element;
+use fivefilters\Readability\Nodes\DOM\Node;
+use fivefilters\Readability\Nodes\DOM\Text;
+use fivefilters\Readability\Nodes\DOM\Comment;
+use fivefilters\Readability\Nodes\DOM\NodeList;
/**
* Class NodeUtility.
@@ -52,7 +52,7 @@ class NodeUtility
*
* Imported from the Element class on league\html-to-markdown.
*/
- public static function nextNode(DOMNode|DOMComment|DOMText|DOMElement|null $node): DOMNode|DOMComment|DOMText|DOMElement|null
+ public static function nextNode(Node|Comment|Text|Element|null $node): Node|Comment|Text|Element|null
{
$next = $node;
while ($next
@@ -68,9 +68,10 @@ public static function nextNode(DOMNode|DOMComment|DOMText|DOMElement|null $node
* Changes the node tag name. Since tagName on DOMElement is a read only value, this must be done creating a new
* element with the new tag name and importing it to the main DOMDocument.
*/
- public static function setNodeTag(DOMNode|DOMElement $node, string $value, bool $importAttributes = true): DOMNode|DOMElement
+ public static function setNodeTag(Node|Element $node, string $value, bool $importAttributes = true): Node|Element
{
- $new = new DOMDocument('1.0', 'utf-8');
+ $new = \Dom\HtmlDocument::createEmpty();
+ NodeUtility::registerReadabilityNodeClasses($new);
$new->appendChild($new->createElement($value));
$children = $node->childNodes;
@@ -97,7 +98,7 @@ public static function setNodeTag(DOMNode|DOMElement $node, string $value, bool
/**
* Removes the current node and returns the next node to be parsed (child, sibling or parent).
*/
- public static function removeAndGetNext(DOMNode|DOMComment|DOMText|DOMElement $node): DOMNode|DOMComment|DOMText|DOMElement|null
+ public static function removeAndGetNext(Node|Comment|Text|Element $node): Node|Comment|Text|Element|null
{
$nextNode = self::getNextNode($node, true);
$node->parentNode->removeChild($node);
@@ -108,7 +109,7 @@ public static function removeAndGetNext(DOMNode|DOMComment|DOMText|DOMElement $n
/**
* Remove the selected node.
*/
- public static function removeNode(DOMNode|DOMComment|DOMText|DOMElement $node): void
+ public static function removeNode(Node|Comment|Text|Element $node): void
{
$parent = $node->parentNode;
if ($parent) {
@@ -120,7 +121,7 @@ public static function removeNode(DOMNode|DOMComment|DOMText|DOMElement $node):
* Returns the next node. First checks for children (if the flag allows it), then for siblings, and finally
* for parents.
*/
- public static function getNextNode(DOMNode|DOMComment|DOMText|DOMElement|DOMDocument $originalNode, bool $ignoreSelfAndKids = false): DOMNode|DOMComment|DOMText|DOMElement|DOMDocument|null
+ public static function getNextNode(Node|Comment|Text|Element|\Dom\HtmlDocument $originalNode, bool $ignoreSelfAndKids = false): Node|Comment|Text|Element|\Dom\HtmlDocument|null
{
/*
* Traverse the DOM from node to node, starting at the node passed in.
@@ -153,15 +154,34 @@ public static function getNextNode(DOMNode|DOMComment|DOMText|DOMElement|DOMDocu
/**
* Remove all empty DOMNodes from DOMNodeLists.
*/
- public static function filterTextNodes(\DOMNodeList $list): DOMNodeList
+ public static function filterTextNodes(\Dom\NodeList $list): NodeList
{
- $newList = new DOMNodeList();
+ $newList = new NodeList();
foreach ($list as $node) {
- if ($node->nodeType !== XML_TEXT_NODE || !$node->isWhitespaceInElementContent()) {
+ if ($node->nodeType !== XML_TEXT_NODE || mb_trim($node->nodeValue) !== '') {
$newList->add($node);
}
}
return $newList;
}
+
+ public static function registerReadabilityNodeClasses(\DOM\HtmlDocument $dom): void
+ {
+ $dom->registerNodeClass('DOM\HtmlElement', DOM\Element::class);
+ $dom->registerNodeClass('DOM\Attr', DOM\Attr::class);
+ $dom->registerNodeClass('DOM\CdataSection', DOM\CdataSection::class);
+ $dom->registerNodeClass('DOM\CharacterData', DOM\CharacterData::class);
+ $dom->registerNodeClass('DOM\Comment', DOM\Comment::class);
+ // NOTE(review): no HtmlDocument mapping is registered - this patch deletes the custom DOMDocument subclass and adds no DOM\HtmlDocument replacement class.
+ $dom->registerNodeClass('DOM\DocumentFragment', DOM\DocumentFragment::class);
+ $dom->registerNodeClass('DOM\DocumentType', DOM\DocumentType::class);
+ $dom->registerNodeClass('DOM\Element', DOM\Element::class);
+ $dom->registerNodeClass('DOM\Entity', DOM\Entity::class);
+ $dom->registerNodeClass('DOM\EntityReference', DOM\EntityReference::class);
+ $dom->registerNodeClass('DOM\Node', DOM\Node::class);
+ $dom->registerNodeClass('DOM\Notation', DOM\Notation::class);
+ $dom->registerNodeClass('DOM\ProcessingInstruction', DOM\ProcessingInstruction::class);
+ $dom->registerNodeClass('DOM\Text', DOM\Text::class);
+ }
}
diff --git a/src/Readability.php b/src/Readability.php
index c37c9b6..a2ff53b 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -2,14 +2,12 @@
namespace fivefilters\Readability;
-use fivefilters\Readability\Nodes\DOM\DOMDocument;
-use fivefilters\Readability\Nodes\DOM\DOMElement;
-use fivefilters\Readability\Nodes\DOM\DOMNode;
-use fivefilters\Readability\Nodes\DOM\DOMText;
-use fivefilters\Readability\Nodes\DOM\DOMComment;
+use fivefilters\Readability\Nodes\DOM\Element;
+use fivefilters\Readability\Nodes\DOM\Node;
+use fivefilters\Readability\Nodes\DOM\Text;
+use fivefilters\Readability\Nodes\DOM\Comment;
use fivefilters\Readability\Nodes\NodeUtility;
use Psr\Log\LoggerInterface;
-use Masterminds\HTML5;
use League\Uri\BaseUri;
/**
@@ -18,9 +16,9 @@
class Readability
{
/**
- * Main DOMDocument where all the magic happens.
+ * Main HtmlDocument where all the magic happens.
*/
- protected DOMDocument $dom;
+ protected \Dom\HtmlDocument $dom;
/**
* Title of the article.
@@ -28,9 +26,9 @@ class Readability
protected ?string $title = null;
/**
- * Final DOMDocument with the fully parsed HTML.
+ * Final HtmlDocument with the fully parsed HTML.
*/
- protected ?DOMDocument $content = null;
+ protected ?\Dom\HtmlDocument $content = null;
/**
* Excerpt of the article.
@@ -59,7 +57,7 @@ class Readability
/**
* Base URI
- * HTML5PHP doesn't appear to store it in the baseURI property like PHP's DOMDocument does when parsing with libxml
+ * HTML5PHP didn't appear to store it in the baseURI property like PHP's DOMDocument did when parsing with libxml (TODO: confirm whether \Dom\HTMLDocument populates baseURI)
*/
protected ?string $baseURI = null;
@@ -162,6 +160,7 @@ public function parse(?string $html = null): bool
throw new ParseException('Invalid or incomplete HTML.');
}
+ $root = $this->dom->getElementsByTagName('body')->item(0);
$bodyCache = $root->cloneNode(true);
@@ -170,12 +169,11 @@ public function parse(?string $html = null): bool
$this->getMainImage();
while (true) {
-
- $this->logger->debug('Starting parse loop');
+ $this->logger->debug('Starting parse loop (#' . count($this->attempts) . ')');
//$root = $root->firstChild;
$elementsToScore = $this->getNodes($root->firstChild);
- $this->logger->debug(sprintf('Elements to score: \'%s\'', count($elementsToScore)));
+ $this->logger->debug(sprintf('Elements to score: %d', count($elementsToScore)));
$result = $this->rateNodes($elementsToScore);
@@ -187,8 +185,7 @@ public function parse(?string $html = null): bool
* finding the -right- content.
*/
- $length = !$result ? 0 : mb_strlen(preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $result->textContent));
-
+ $length = !$result ? 0 : mb_strlen(preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $result->documentElement->textContent));
$this->logger->info(sprintf('[Parsing] Article parsed. Amount of words: %s. Current threshold is: %s', $length, $this->configuration->getCharThreshold()));
if ($result && $length < $this->configuration->getCharThreshold()) {
@@ -271,14 +268,15 @@ public function loadHTML(string $html): void
$this->logger->debug('[Loading] Loading HTML...');
// To avoid throwing a gazillion of errors on malformed HTMLs
- libxml_use_internal_errors(true);
+ // (libxml_use_internal_errors() is no longer called here; LIBXML_NOERROR is passed to createFromString() below instead.)
//$html = preg_replace('/(
]*>[ \n\r\t]*){2,}/i', '
', $html);
if ($this->configuration->getParser() === 'html5') {
$this->logger->debug('[Loading] Using HTML5 parser...');
- $html5 = new HTML5(['disable_html_ns' => true, 'target_document' => new DOMDocument('1.0', 'utf-8')]);
- $dom = $html5->loadHTML($html);
+ // New DOM class with HTML5 parser introduced in PHP 8.4
+ $dom = \Dom\HtmlDocument::createFromString($html, \Dom\HTML_NO_DEFAULT_NS | LIBXML_NOERROR);
+ NodeUtility::registerReadabilityNodeClasses($dom);
//TODO: Improve this so it looks inside
, not just any
$base = $dom->getElementsByTagName('base');
if ($base->length > 0) {
@@ -289,31 +287,9 @@ public function loadHTML(string $html): void
}
}
} else {
- $this->logger->debug('[Loading] Using libxml parser...');
- $dom = new DOMDocument('1.0', 'utf-8');
- if ($this->configuration->getNormalizeEntities()) {
- $this->logger->debug('[Loading] Normalized entities via mb_convert_encoding.');
- // Replace UTF-8 characters with the HTML Entity equivalent. Useful to fix html with mixed content
- $html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8');
- }
- }
-
- if (!$this->configuration->getSubstituteEntities()) {
- // Keep the original HTML entities
- $dom->substituteEntities = false;
- }
-
- if ($this->configuration->getSummonCthulhu()) {
- $this->logger->debug('[Loading] Removed script tags via regex H̶͈̩̟̬̱͠E̡̨̬͔̳̜͢͠ ̡̧̯͉̩͙̩̹̞̠͎͈̹̥̠͞ͅͅC̶͉̞̘̖̝̗͓̬̯͍͉̤̬͢͢͞Ò̟̘͉͖͎͉̱̭̣̕M̴̯͈̻̱̱̣̗͈̠̙̲̥͘͞E̷̛͙̼̲͍͕̹͍͇̗̻̬̮̭̱̥͢Ş̛̟͔̙̜̤͇̮͍̙̝̀͘');
- $html = preg_replace('/