Skip to content

Commit

Permalink
HTML API: Add functions to read inner and outer HTML.
Browse files Browse the repository at this point in the history
  • Loading branch information
dmsnell committed Aug 2, 2023
1 parent e0f5297 commit e2a7146
Show file tree
Hide file tree
Showing 3 changed files with 305 additions and 6 deletions.
123 changes: 117 additions & 6 deletions src/wp-includes/html-api/class-wp-html-processor.php
Original file line number Diff line number Diff line change
Expand Up @@ -417,6 +417,84 @@ public function next_tag( $query = null ) {
return false;
}

/**
 * Returns the raw HTML content inside a matched tag.
 *
 * Finding where the matched element ends requires stepping forward through
 * the document, so the processor is left at a later position after this call;
 * it is not rewound before returning.
 *
 * @since 6.4.0
 *
 * @throws Exception When unable to allocate a bookmark for internal tracking of the open tag.
 *
 * @return string|null The inner HTML if available, else NULL.
 */
public function get_inner_markup() {
	// Without a matched tag there is no element whose contents could be read.
	if ( null === $this->get_tag() ) {
		return null;
	}

	$opening_token = $this->current_token;
	parent::set_bookmark( 'start' );

	// @TODO: add after-pop hook to turn this into a constant boolean check.
	// Keep stepping until the document ends or the opening token leaves the stack of open elements.
	while ( true ) {
		$found_tag = $this->step();

		if ( ! $found_tag || ! $this->state->stack_of_open_elements->contains_node( $opening_token ) ) {
			break;
		}
	}

	if ( $found_tag ) {
		// The inner HTML ends just before the tag at which stepping stopped.
		parent::set_bookmark( 'end' );
		$inner_html = $this->substr_bookmarks( 'after', 'start', 'before', 'end' );
		parent::release_bookmark( 'start' );
		parent::release_bookmark( 'end' );

		return $inner_html;
	}

	/*
	 * If there's no tag to bookmark then it means the opened tag has no closing
	 * and the rest of the document is contained within the inner HTML.
	 */
	$inner_html = $this->substr_bookmark( 'after', 'start' );
	parent::release_bookmark( 'start' );

	return $inner_html;
}

/**
 * Returns the raw HTML content of a matched tag, including the tag itself.
 *
 * The returned markup always begins at the start of the matched tag. Where it
 * ends depends on what stops the search for the end of the element:
 *
 *  - the element's own closing tag: the markup runs through that closing tag.
 *  - any other tag: the markup stops just before that tag.
 *  - the end of the document: the markup runs to the end of the document.
 *
 * Finding where the matched element ends requires stepping forward through
 * the document, so the processor is left at a later position after this call;
 * it is not rewound before returning.
 *
 * @since 6.4.0
 *
 * @throws Exception When unable to allocate a bookmark for internal tracking of the open tag.
 *
 * @return string|null The outer HTML if available, else NULL.
 */
public function get_outer_markup() {
	// Without a matched tag there is no element whose markup could be read.
	if ( null === $this->get_tag() ) {
		return null;
	}

	$start = $this->current_token;
	parent::set_bookmark( 'start' );
	// @TODO: add after-pop hook to turn this into a constant boolean check.
	do {
		$found_tag = $this->step();
	} while ( $found_tag && $this->state->stack_of_open_elements->contains_node( $start ) );

	/*
	 * If there's no tag to bookmark then it means the opened tag has no closing
	 * and the rest of the document is contained within the outer HTML.
	 */
	if ( ! $found_tag ) {
		$outer_html = $this->substr_bookmark( 'before', 'start' );
	} else {
		parent::set_bookmark( 'end' );

		/*
		 * Only a closing tag with the same name as the opening tag belongs to
		 * the element; any other tag that stopped the search is left out.
		 */
		$did_close    = ( $this->get_tag() === $start->node_name ) && $this->is_tag_closer();
		$end_position = $did_close ? 'after' : 'before';
		$outer_html   = $this->substr_bookmarks( 'before', 'start', $end_position, 'end' );

		// The "end" bookmark only exists on this path, so it is released here.
		parent::release_bookmark( 'end' );
	}

	parent::release_bookmark( 'start' );

	return $outer_html;
}

/**
* Steps through the HTML document and stop at the next tag, if any.
*
Expand All @@ -437,12 +515,9 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ) {
$this->state->stack_of_open_elements->pop();
}

parent::next_tag( self::VISIT_EVERYTHING );
}

// Finish stepping when there are no more tokens in the document.
if ( null === $this->get_tag() ) {
return false;
if ( ! parent::next_tag( self::VISIT_EVERYTHING ) ) {
return false;
}
}

$this->current_token = new WP_HTML_Token(
Expand Down Expand Up @@ -722,6 +797,42 @@ private function bookmark_tag() {
return "{$this->bookmark_counter}";
}

/**
 * Returns a substring of the input HTML document from a bookmark until the end.
 *
 * @since 6.4.0
 *
 * @param string $start_position "before" to clip before bookmark, "after" to clip after.
 * @param string $start          Bookmark name at which to start clipping.
 * @return string Clipped substring of input HTML document.
 */
private function substr_bookmark( $start_position, $start ) {
	$bookmark = $this->bookmarks[ $start ];

	// Any position other than "before" is treated as "after".
	if ( 'before' === $start_position ) {
		$offset = $bookmark->start;
	} else {
		// Clip from one past the bookmark's `end` offset.
		$offset = $bookmark->end + 1;
	}

	return substr( $this->html, $offset );
}

/**
 * Returns a substring of the input HTML document delimited by bookmarks.
 *
 * @since 6.4.0
 *
 * @param string $start_position "before" to clip before bookmark, "after" to clip after.
 * @param string $start          Bookmark name at which to start clipping.
 * @param string $end_position   "before" to clip before bookmark, "after" to clip after.
 * @param string $end            Bookmark name at which to end clipping.
 * @return string Clipped substring of input HTML document.
 */
private function substr_bookmarks( $start_position, $start, $end_position, $end ) {
	$from_bookmark = $this->bookmarks[ $start ];
	$to_bookmark   = $this->bookmarks[ $end ];

	// For both edges, any position other than "before" is treated as "after".
	if ( 'before' === $start_position ) {
		$from_offset = $from_bookmark->start;
	} else {
		$from_offset = $from_bookmark->end + 1;
	}

	if ( 'before' === $end_position ) {
		$to_offset = $to_bookmark->start;
	} else {
		$to_offset = $to_bookmark->end + 1;
	}

	$length = $to_offset - $from_offset;

	return substr( $this->html, $from_offset, $length );
}

/*
* HTML semantic overrides for Tag Processor
*/
Expand Down
93 changes: 93 additions & 0 deletions tests/phpunit/tests/html-api/wpHtmlProcessorGetInnerMarkup.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
<?php
/**
 * Unit tests covering WP_HTML_Processor::get_inner_markup()
 *
 * @package WordPress
 * @subpackage HTML-API
 *
 * @since 6.4.0
 *
 * @group html-api
 *
 * @coversDefaultClass WP_HTML_Processor
 */
class Tests_HtmlApi_WpHtmlProcessorGetInnerMarkup extends WP_UnitTestCase {
	/**
	 * Ensures that no inner markup is returned when the processor is not stopped at a tag,
	 * both before any search and after a search that finds nothing.
	 *
	 * @ticket {TICKET_NUMBER}
	 *
	 * @covers WP_HTML_Processor::get_inner_markup
	 *
	 * @since 6.4.0
	 */
	public function test_returns_null_when_not_on_a_matching_tag() {
		$processor = WP_HTML_Processor::createFragment( '<p><div><span></span></div>' );

		// Nothing has been matched yet.
		$this->assertNull( $processor->get_inner_markup() );

		// A failed search must also leave nothing to read.
		$found_button = $processor->next_tag( 'BUTTON' );
		$this->assertFalse( $found_button, "Should not have found a BUTTON tag but stopped at {$processor->get_tag()}." );
		$this->assertNull( $processor->get_inner_markup() );
	}

	/**
	 * Ensures that the inner markup of the node carrying the `target` attribute is returned.
	 *
	 * @ticket {TICKET_NUMBER}
	 *
	 * @covers WP_HTML_Processor::get_inner_markup
	 *
	 * @dataProvider data_html_with_inner_markup
	 *
	 * @since 6.4.0
	 *
	 * @param string $html_with_target_node HTML containing a node with the `target` attribute set.
	 * @param string $expected_inner_markup Inner markup of target node.
	 */
	public function test_returns_appropriate_inner_markup( $html_with_target_node, $expected_inner_markup ) {
		$processor = WP_HTML_Processor::createFragment( $html_with_target_node );

		// Advance until stopped at the tag carrying `target`, or until no tags remain.
		do {
			$found_tag = $processor->next_tag();
		} while ( $found_tag && null === $processor->get_attribute( 'target' ) );

		$actual_inner_markup = $processor->get_inner_markup();

		$this->assertSame( $expected_inner_markup, $actual_inner_markup, 'Failed to return appropriate inner markup.' );
	}

	/**
	 * Data provider.
	 *
	 * Each dataset is a pair: the input HTML, then the expected inner markup of its `target` node.
	 *
	 * @return array[]
	 */
	public function data_html_with_inner_markup() {
		// No interpolation is needed here, so a nowdoc yields the same string as a heredoc would.
		$inner_html = <<<'HTML'
<p>This is inside the <strong>Match</strong></p>
<p><img></p>
<div>
<figure>
<img>
<figcaption>Look at the <strike>picture</strike> photograph.</figcaption>
</figure>
</div>
HTML;

		$html = <<<HTML
<div>
<p>This is not in the match.
<p>This is another paragraph not <a href="#">in</a> the match.
</div>
<div target>{$inner_html}</div>
<div>
<p>This is also note in the match.</p>
</div>
HTML;

		return array(
			'Empty elements'               => array( '<div target></div>', '' ),
			'Element containing only text' => array( '<div target>inside</div>', 'inside' ),
			'Element with nested tags'     => array( '<div target>inside <span>the</span> div</div>', 'inside <span>the</span> div' ),
			'Unclosed element'             => array( '<div target>This is <em>all</em> inside the DIV', 'This is <em>all</em> inside the DIV' ),
			'Partially-closed element'     => array( '<div target>This is <em>all</em> inside the DIV</div', 'This is <em>all</em> inside the DIV</div' ),
			'Implicitly-closed element'    => array( '<div><p target>Inside the P</div>Outside the P</p>', 'Inside the P' ),
			'Complicated inner nesting'    => array( $html, $inner_html ),
		);
	}
}
95 changes: 95 additions & 0 deletions tests/phpunit/tests/html-api/wpHtmlProcessorGetOuterMarkup.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
<?php
/**
 * Unit tests covering WP_HTML_Processor::get_outer_markup()
 *
 * @package WordPress
 * @subpackage HTML-API
 *
 * @since 6.4.0
 *
 * @group html-api
 *
 * @coversDefaultClass WP_HTML_Processor
 */
class Tests_HtmlApi_WpHtmlProcessorGetOuterMarkup extends WP_UnitTestCase {
	/**
	 * Ensures that it's not possible to get outer markup when not stopped at a tag in the HTML.
	 *
	 * @ticket {TICKET_NUMBER}
	 *
	 * @covers WP_HTML_Processor::get_outer_markup
	 *
	 * @since 6.4.0
	 */
	public function test_returns_null_when_not_on_a_matching_tag() {
		$p = WP_HTML_Processor::createFragment( '<p><div><span></span></div>' );

		// Nothing has been matched yet.
		$this->assertNull( $p->get_outer_markup() );

		// A failed search must also leave nothing to read.
		$this->assertFalse( $p->next_tag( 'BUTTON' ), "Should not have found a BUTTON tag but stopped at {$p->get_tag()}." );
		$this->assertNull( $p->get_outer_markup() );
	}

	/**
	 * Ensures that the outer markup of the node carrying the `target` attribute is returned.
	 *
	 * @ticket {TICKET_NUMBER}
	 *
	 * @covers WP_HTML_Processor::get_outer_markup
	 *
	 * @dataProvider data_html_with_outer_markup
	 *
	 * @since 6.4.0
	 *
	 * @param string $html_with_target_node HTML containing a node with the `target` attribute set.
	 * @param string $expected_outer_markup Outer markup of target node.
	 */
	public function test_returns_appropriate_outer_markup( $html_with_target_node, $expected_outer_markup ) {
		$p = WP_HTML_Processor::createFragment( $html_with_target_node );

		// Advance until stopped at the tag carrying `target`, or until no tags remain.
		while ( $p->next_tag() && null === $p->get_attribute( 'target' ) ) {
			continue;
		}

		$this->assertSame( $expected_outer_markup, $p->get_outer_markup(), 'Failed to return appropriate outer markup.' );
	}

	/**
	 * Data provider.
	 *
	 * Each dataset is a pair: the input HTML, then the expected outer markup of its `target` node.
	 *
	 * @return array[]
	 */
	public function data_html_with_outer_markup() {
		$data = array(
			'Empty elements'               => array( '<div target></div>', '<div target></div>' ),
			'Element containing only text' => array( '<div target>inside</div>', '<div target>inside</div>' ),
			'Element with nested tags'     => array( '<div target>inside <span>the</span> div</div>', '<div target>inside <span>the</span> div</div>' ),
			'Unclosed element'             => array( '<div target>This is <em>all</em> inside the DIV', '<div target>This is <em>all</em> inside the DIV' ),
			'Partially-closed element'     => array( '<div target>This is <em>all</em> inside the DIV</div', '<div target>This is <em>all</em> inside the DIV</div' ),
			'Implicitly-closed element'    => array( '<div><p target>Inside the P</div>Outside the P</p>', '<p target>Inside the P' ),
		);

		$inner_html = <<<HTML
<p>This is inside the <strong>Match</strong></p>
<p><img></p>
<div>
<figure>
<img>
<figcaption>Look at the <strike>picture</strike> photograph.</figcaption>
</figure>
</div>
HTML;

		$html = <<<HTML
<div>
<p>This is not in the match.
<p>This is another paragraph not <a href="#">in</a> the match.
</div>
<div target>{$inner_html}</div>
<div>
<p>This is also note in the match.</p>
</div>
HTML;
		// The expected outer markup wraps the inner fixture in the target element's own tags.
		$data['Complicated inner nesting'] = array( $html, "<div target>{$inner_html}</div>" );

		return $data;
	}
}

0 comments on commit e2a7146

Please sign in to comment.