diff --git a/.gitignore b/.gitignore index 15876fa47fee8..5a2a7820fc7b1 100644 --- a/.gitignore +++ b/.gitignore @@ -48,6 +48,7 @@ wp-tests-config.php /artifacts /setup.log /coverage +/tools/html-api-fuzz/oracles/lexbor/build/ # Files and folders that get created in wp-content /src/wp-content/blogs.dir diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 56ea0f705c2b8..8f3068adab378 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -251,6 +251,29 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { */ private $current_element = null; + /** + * Whether to record relevant tokens consumed without visitable stack events. + * + * Some semantic end tags, such as BODY and HTML closers, update parser + * state without popping their element from the stack of open elements. This + * flag enables targeted bookkeeping while set_inner_html() searches for a + * target's inner span. + * + * @since 7.0.0 + * + * @var bool + */ + private $record_nonvisitable_token_events = false; + + /** + * Non-visitable token events consumed while recording is enabled. + * + * @since 7.0.0 + * + * @var array + */ + private $nonvisitable_token_events = array(); + /** * Context node if created as a fragment parser. * @@ -2332,6 +2355,7 @@ private function step_in_body(): bool { * * This parser does not currently support this behavior: ignore the token. */ + $this->record_nonvisitable_token_event( 'html-or-body-start-tag' ); } // Ignore the token. @@ -2367,6 +2391,9 @@ private function step_in_body(): bool { 'BODY' !== ( $this->state->stack_of_open_elements->at( 2 )->node_name ?? null ) || $this->state->stack_of_open_elements->contains( 'TEMPLATE' ) ) { + if ( ! $this->state->stack_of_open_elements->contains( 'TEMPLATE' ) ) { + $this->record_nonvisitable_token_event( 'html-or-body-start-tag' ); + } // Ignore the token. 
return $this->step(); } @@ -2379,6 +2406,7 @@ private function step_in_body(): bool { * * This parser does not currently support this behavior: ignore the token. */ + $this->record_nonvisitable_token_event( 'html-or-body-start-tag' ); $this->state->frameset_ok = false; return $this->step(); @@ -2423,6 +2451,7 @@ private function step_in_body(): bool { */ $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_AFTER_BODY; + $this->record_nonvisitable_token_event( 'html-or-body-end-tag' ); /* * The BODY element is not removed from the stack of open elements. * Only internal state has changed, this does not qualify as a "step" @@ -4523,6 +4552,7 @@ private function step_after_body(): bool { } $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_AFTER_AFTER_BODY; + $this->record_nonvisitable_token_event( 'html-or-body-end-tag' ); /* * The HTML element is not removed from the stack of open elements. * Only internal state has changed, this does not qualify as a "step" @@ -5204,6 +5234,344 @@ private function step_in_foreign_content(): bool { * Internal helpers */ + /** + * Finds the byte offset where the currently matched element's inner HTML ends. + * + * @since 7.0.0 + * + * @param WP_HTML_Span $target_bookmark Bookmark span of the target opener. + * @return int|null Byte offset where inner HTML ends, or null if it cannot be found. 
+ */ + private function find_current_element_inner_html_end( WP_HTML_Span $target_bookmark ): ?int { + $processor = $this->create_processor_for_current_parsing_mode( $this->html ); + if ( null === $processor ) { + return null; + } + + $processor->record_nonvisitable_token_events = true; + $target_token = null; + + do { + $has_token = $processor->next_token(); + foreach ( $processor->consume_nonvisitable_token_events() as $event ) { + if ( + isset( $target_token ) && + 'html-or-body-end-tag' === $event['operation'] && + $target_token->node_name === $event['node_name'] + ) { + return $event['start']; + } + } + + if ( ! $has_token ) { + break; + } + + if ( ! isset( $target_token ) ) { + if ( $processor->is_at_source_span( $target_bookmark ) ) { + $target_token = $processor->current_element->token; + } + continue; + } + + if ( + isset( $processor->current_element ) && + WP_HTML_Stack_Event::POP === $processor->current_element->operation && + $target_token === $processor->current_element->token + ) { + return $processor->get_current_source_token_start(); + } + } while ( true ); + + return null; + } + + /** + * Returns the source offset for the token currently being processed. + * + * For virtual stack pops at the end of the document, there is no source + * token, so the offset is the end of the HTML string. + * + * @since 7.0.0 + * + * @return int Source offset for the current token. + */ + private function get_current_source_token_start(): int { + if ( + WP_HTML_Tag_Processor::STATE_COMPLETE === $this->parser_state || + WP_HTML_Tag_Processor::STATE_INCOMPLETE_INPUT === $this->parser_state || + ! isset( $this->state->current_token->bookmark_name ) || + ! isset( $this->bookmarks[ $this->state->current_token->bookmark_name ] ) + ) { + return strlen( $this->html ); + } + + return $this->bookmarks[ $this->state->current_token->bookmark_name ]->start; + } + + /** + * Records a relevant token that is consumed without a visitable stack event. 
+ * + * @since 7.0.0 + * + * @param string $operation Non-visitable operation name. + */ + private function record_nonvisitable_token_event( string $operation ): void { + if ( ! $this->record_nonvisitable_token_events ) { + return; + } + + $attribute_names = 'html-or-body-start-tag' === $operation + ? $this->get_attribute_names_with_prefix( '' ) + : null; + + $this->nonvisitable_token_events[] = array( + 'has_attributes' => isset( $attribute_names ) && count( $attribute_names ) > 0, + 'node_name' => $this->get_token_name(), + 'operation' => $operation, + 'start' => $this->get_current_source_token_start(), + ); + } + + /** + * Returns and clears recorded non-visitable token events. + * + * @since 7.0.0 + * + * @return array + */ + private function consume_nonvisitable_token_events(): array { + $events = $this->nonvisitable_token_events; + $this->nonvisitable_token_events = array(); + + return $events; + } + + /** + * Checks whether a proposed inner HTML replacement preserves the outside tree. + * + * @since 7.0.0 + * + * @param string $replacement Proposed raw inner HTML. + * @param WP_HTML_Span $target_bookmark Bookmark span of the target opener. + * @param int $inner_start Byte offset where the original inner HTML starts. + * @param int $inner_end Byte offset where the original inner HTML ends. + * @return bool Whether the replacement can be safely applied. + */ + private function is_safe_inner_html_replacement( string $replacement, WP_HTML_Span $target_bookmark, int $inner_start, int $inner_end ): bool { + $candidate_html = substr( $this->html, 0, $inner_start ) . $replacement . 
substr( $this->html, $inner_end ); + + $original_signature = $this->get_outer_html_signature( + $this->html, + $target_bookmark, + true, + $inner_start, + $inner_end + ); + $candidate_signature = $this->get_outer_html_signature( + $candidate_html, + $target_bookmark, + true, + $inner_start, + $inner_start + strlen( $replacement ) + ); + + return null !== $original_signature && $original_signature === $candidate_signature; + } + + /** + * Returns a signature of all parsed tokens outside a target element. + * + * @since 7.0.0 + * + * @param string $html HTML to parse. + * @param WP_HTML_Span $target_bookmark Bookmark span of the target opener. + * @param bool $reject_html_body_attr_hoisting Whether to reject ignored HTML/BODY start tags with attributes. + * @param int|null $replacement_start Start offset of replacement bytes in candidate HTML. + * @param int|null $replacement_end End offset of replacement bytes in candidate HTML. + * @return array<int, array<string, mixed>>|null Outside token signature, or null when the parse fails, the target is not found or never closes, or a disallowed change is detected. 
+ */ + private function get_outer_html_signature( + string $html, + WP_HTML_Span $target_bookmark, + bool $reject_html_body_attr_hoisting = false, + ?int $replacement_start = null, + ?int $replacement_end = null + ): ?array { + $processor = $this->create_processor_for_current_parsing_mode( $html ); + if ( null === $processor ) { + return null; + } + + $processor->record_nonvisitable_token_events = $reject_html_body_attr_hoisting; + + $signature = array(); + $target_token = null; + $target_active_formatting_elements = null; + $inside = false; + $found_target = false; + + while ( true ) { + $has_token = $processor->next_token(); + foreach ( $processor->consume_nonvisitable_token_events() as $event ) { + if ( + 'html-or-body-start-tag' === $event['operation'] && + $event['has_attributes'] && + ( + $inside || + ( + isset( $replacement_start, $replacement_end ) && + $event['start'] >= $replacement_start && + $event['start'] < $replacement_end + ) + ) + ) { + return null; + } + } + + if ( ! $has_token ) { + break; + } + + if ( ! 
$inside && $processor->is_at_source_span( $target_bookmark ) ) { + $target_token = $processor->current_element->token; + $target_active_formatting_elements = $processor->get_active_formatting_elements_signature( + $target_token, + in_array( + $target_token->node_name, + array( 'APPLET', 'CAPTION', 'MARQUEE', 'OBJECT', 'TD', 'TEMPLATE', 'TH' ), + true + ) + ); + $inside = true; + $found_target = true; + $signature[] = $processor->get_current_token_signature(); + continue; + } + + if ( $inside ) { + if ( + isset( $processor->current_element ) && + WP_HTML_Stack_Event::POP === $processor->current_element->operation && + $target_token === $processor->current_element->token + ) { + if ( $target_active_formatting_elements !== $processor->get_active_formatting_elements_signature() ) { + return null; + } + $inside = false; + $signature[] = $processor->get_current_token_signature(); + } + + continue; + } + + $signature[] = $processor->get_current_token_signature(); + } + + if ( null !== $processor->get_last_error() || ! $found_target || $inside ) { + return null; + } + + return $signature; + } + + /** + * Creates a fresh processor in the same public parsing mode as this one. + * + * @since 7.0.0 + * + * @param string $html HTML to parse. + * @return static|null Processor in the same parsing mode, or null if unsupported; only full documents and BODY-context fragments are handled. + */ + private function create_processor_for_current_parsing_mode( string $html ) { + if ( null === $this->context_node ) { + return static::create_full_parser( $html, $this->state->encoding ?? 'UTF-8' ); + } + + if ( 'html' !== $this->context_node->namespace || 'BODY' !== $this->context_node->node_name ) { + return null; + } + + return static::create_fragment( $html, '<body>', $this->state->encoding ?? 'UTF-8' ); + } + + /** + * Checks whether the current token starts at a given source span. + * + * @since 7.0.0 + * + * @param WP_HTML_Span $span Source span to match. + * @return bool Whether the current token starts at the given source span. 
+ */ + private function is_at_source_span( WP_HTML_Span $span ): bool { + if ( + $this->is_virtual() || + ! isset( $this->state->current_token->bookmark_name ) || + ! isset( $this->bookmarks[ $this->state->current_token->bookmark_name ] ) + ) { + return false; + } + + $current_span = $this->bookmarks[ $this->state->current_token->bookmark_name ]; + + return $current_span->start === $span->start && $current_span->length === $span->length; + } + + /** + * Returns a parsed token signature for outside-tree comparison. + * + * @since 7.0.0 + * + * @return array<string, mixed> Token signature keyed by 'type', 'name', 'namespace', 'is_closer', 'breadcrumbs', and 'html'. + */ + private function get_current_token_signature(): array { + return array( + 'type' => $this->get_token_type(), + 'name' => $this->get_token_name(), + 'namespace' => $this->get_namespace(), + 'is_closer' => $this->is_tag_closer(), + 'breadcrumbs' => $this->get_breadcrumbs(), + 'html' => $this->serialize_token(), + ); + } + + /** + * Returns a parser-state signature for active formatting elements. + * + * @since 7.0.0 + * + * @param WP_HTML_Token|null $target_token Optional target token to omit from the signature. + * @param bool $omit_target_marker Whether to omit a parser marker introduced by the target. + * @return array<int, array<string, mixed>> Active formatting elements signature. 
+ */ + private function get_active_formatting_elements_signature( + ?WP_HTML_Token $target_token = null, + bool $omit_target_marker = false + ): array { + $signature = array(); + + foreach ( $this->state->active_formatting_elements->walk_down() as $item ) { + if ( null !== $target_token && $item->bookmark_name === $target_token->bookmark_name ) { + continue; + } + + $signature[] = array( + 'bookmark_name' => $item->bookmark_name, + 'node_name' => $item->node_name, + 'namespace' => $item->namespace, + 'integration_node_type' => $item->integration_node_type, + ); + } + + $last_index = count( $signature ) - 1; + if ( $omit_target_marker && $last_index >= 0 && 'marker' === $signature[ $last_index ]['node_name'] ) { + array_pop( $signature ); + } + + return $signature; + } + /** * Creates a new bookmark for the currently-matched token and returns the generated name. * @@ -5446,6 +5814,64 @@ public function remove_attribute( $name ): bool { return $this->is_virtual() ? false : parent::remove_attribute( $name ); } + /** + * Replaces the inner HTML for the currently matched tag, if matched. + * + * The inner HTML is replaced as raw HTML markup. This method refuses to + * apply a replacement if the updated markup would change the parsed tree + * outside of the currently matched element. + * + * @since 7.0.0 + * + * @param string $html New raw inner HTML. + * @return bool Whether the inner HTML was able to update. + */ + public function set_inner_html( string $html ): bool { + if ( + $this->is_virtual() || + WP_HTML_Tag_Processor::STATE_MATCHED_TAG !== $this->parser_state || + $this->is_tag_closer() || + ! $this->expects_closer() + ) { + return false; + } + + /* + * Flush pending updates so that the candidate tree is validated against + * the same HTML source that will receive the inner HTML replacement. + */ + $this->get_updated_html(); + + $target_token = $this->current_element->token; + if ( null !== $target_token->integration_node_type ) { + return false; + } + + if ( ! 
isset( $target_token->bookmark_name, $this->bookmarks[ $target_token->bookmark_name ] ) ) { + return false; + } + + $target_bookmark = $this->bookmarks[ $target_token->bookmark_name ]; + $inner_start = $target_bookmark->start + $target_bookmark->length; + $inner_end = $this->find_current_element_inner_html_end( $target_bookmark ); + + if ( null === $inner_end || $inner_end < $inner_start ) { + return false; + } + + if ( ! $this->is_safe_inner_html_replacement( $html, $target_bookmark, $inner_start, $inner_end ) ) { + return false; + } + + $this->lexical_updates['inner_html'] = new WP_HTML_Text_Replacement( + $inner_start, + $inner_end - $inner_start, + $html + ); + + return true; + } + /** * Gets lowercase names of all attributes matching a given prefix in the current tag. * diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessorSetInnerHtml.php b/tests/phpunit/tests/html-api/wpHtmlProcessorSetInnerHtml.php new file mode 100644 index 0000000000000..11e85d2699b84 --- /dev/null +++ b/tests/phpunit/tests/html-api/wpHtmlProcessorSetInnerHtml.php @@ -0,0 +1,389 @@ +Old old' ); + + $this->assertTrue( $processor->next_tag( 'DIV' ), 'Failed to find the DIV opener.' ); + $this->assertTrue( $processor->set_inner_html( '

New

' ), 'Failed to set inner HTML.' ); + $this->assertSame( + '

New

', + $processor->get_updated_html(), + 'Should have replaced the inner HTML.' + ); + } + + /** + * Ensures that inner HTML can be replaced in a full document. + * + * @covers WP_HTML_Processor::set_inner_html + */ + public function test_set_inner_html_replaces_inner_html_in_full_document(): void { + $html = '
Old
'; + $processor = WP_HTML_Processor::create_full_parser( $html ); + + $this->assertTrue( $processor->next_tag( 'BODY' ), 'Failed to find the BODY opener.' ); + $this->assertTrue( $processor->set_inner_html( '
New
' ), 'Failed to set BODY inner HTML.' ); + $this->assertSame( + '
New
', + $processor->get_updated_html(), + 'Should have replaced the BODY inner HTML.' + ); + } + + /** + * Ensures BODY replacement ignores BODY-looking syntax inside TEMPLATE content. + * + * @covers WP_HTML_Processor::set_inner_html + */ + public function test_set_inner_html_replaces_body_with_body_closer_in_template(): void { + $html = '

After

'; + $processor = WP_HTML_Processor::create_full_parser( $html ); + + $this->assertTrue( $processor->next_tag( 'BODY' ), 'Failed to find the BODY opener.' ); + $this->assertTrue( $processor->set_inner_html( '
New
' ), 'Failed to set BODY inner HTML.' ); + $this->assertSame( + '
New
', + $processor->get_updated_html(), + 'Should have replaced the full BODY contents instead of stopping inside TEMPLATE content.' + ); + } + + /** + * Ensures that inner HTML cannot be set when not paused on a token. + * + * @covers WP_HTML_Processor::set_inner_html + */ + public function test_set_inner_html_rejects_when_not_paused_on_token(): void { + $html = '
Old
'; + $processor = WP_HTML_Processor::create_fragment( $html ); + + $this->assertFalse( $processor->set_inner_html( '

New

' ), 'Should not set inner HTML before matching a tag.' ); + $this->assertSame( $html, $processor->get_updated_html(), 'HTML should be unchanged.' ); + } + + /** + * Ensures that inner HTML cannot be set on tag closers. + * + * @covers WP_HTML_Processor::set_inner_html + */ + public function test_set_inner_html_rejects_tag_closers(): void { + $html = '
Old
'; + $processor = WP_HTML_Processor::create_fragment( $html ); + + $this->assertTrue( + $processor->next_tag( + array( + 'tag_name' => 'DIV', + 'tag_closers' => 'visit', + ) + ), + 'Failed to find the DIV opener.' + ); + $this->assertTrue( + $processor->next_tag( + array( + 'tag_name' => 'DIV', + 'tag_closers' => 'visit', + ) + ), + 'Failed to find the DIV closer.' + ); + + $this->assertTrue( $processor->is_tag_closer(), 'Should be paused on the DIV closer.' ); + $this->assertFalse( $processor->set_inner_html( '

New

' ), 'Should not set inner HTML on a closer.' ); + $this->assertSame( $html, $processor->get_updated_html(), 'HTML should be unchanged.' ); + } + + /** + * Ensures that inner HTML cannot be set on atomic elements. + * + * @dataProvider data_set_inner_html_rejects_atomic_elements + * + * @covers WP_HTML_Processor::set_inner_html + * + * @param string $html HTML containing an atomic target. + * @param string $target_tag Target tag to find. + */ + public function test_set_inner_html_rejects_atomic_elements( string $html, string $target_tag ): void { + $processor = WP_HTML_Processor::create_fragment( $html ); + + $this->assertTrue( $processor->next_tag( $target_tag ), "Failed to find {$target_tag}." ); + $this->assertFalse( $processor->set_inner_html( '

New

' ), "Should not set inner HTML on {$target_tag}." ); + $this->assertSame( $html, $processor->get_updated_html(), 'HTML should be unchanged.' ); + } + + /** + * Data provider. + * + * @return array + */ + public static function data_set_inner_html_rejects_atomic_elements(): array { + return array( + 'SCRIPT' => array( '', 'SCRIPT' ), + 'STYLE' => array( '', 'STYLE' ), + 'TEXTAREA' => array( '', 'TEXTAREA' ), + 'IMG' => array( 'old', 'IMG' ), + 'SVG TITLE integration' => array( 'old', 'TITLE' ), + 'SVG DESC integration' => array( 'old', 'DESC' ), + 'SVG FOREIGNOBJECT integration' => array( '

old

', 'FOREIGNOBJECT' ), + 'MathML MI integration' => array( 'old', 'MI' ), + 'MathML ANNOTATION-XML integration' => array( '

old

', 'ANNOTATION-XML' ), + ); + } + + /** + * Ensures that replacements are rejected when they alter the tree outside the target. + * + * @dataProvider data_set_inner_html_rejects_tree_leaks + * + * @covers WP_HTML_Processor::set_inner_html + * + * @param string $html Original HTML. + * @param string $target_tag Target tag to update. + * @param string $replacement Proposed inner HTML. + */ + public function test_set_inner_html_rejects_tree_leaks( string $html, string $target_tag, string $replacement ): void { + $processor = WP_HTML_Processor::create_fragment( $html ); + + $this->assertTrue( $processor->next_tag( $target_tag ), "Failed to find {$target_tag}." ); + $this->assertFalse( $processor->set_inner_html( $replacement ), 'Should have rejected leaking inner HTML.' ); + $this->assertSame( $html, $processor->get_updated_html(), 'HTML should be unchanged after a rejected replacement.' ); + } + + /** + * Data provider. + * + * @return array + */ + public static function data_set_inner_html_rejects_tree_leaks(): array { + return array( + 'original BODY attributes would be removed' => array( + '
Old
After', + 'DIV', + '

New

', + ), + 'original HTML attributes would be removed' => array( + '
Old
After', + 'DIV', + '

New

', + ), + 'nested A closes target A' => array( + 'OldAfter', + 'A', + 'New', + ), + 'explicit closer escapes target' => array( + '
Old
After', + 'DIV', + '

Leaked

', + ), + 'BODY attributes after escaped target' => array( + '
Old
After', + 'DIV', + '', + ), + 'HTML attributes after escaped target' => array( + '
Old
After', + 'DIV', + '', + ), + 'active formatting reconstructs outside' => array( + '
Old
After', + 'DIV', + 'New', + ), + 'active formatting reconstructs before textarea outside' => array( + '
Old
', + 'SECTION', + 'New', + ), + 'BODY attributes can be hoisted outside' => array( + '
Old
After', + 'MAIN', + 'New', + ), + 'HTML attributes can be hoisted outside' => array( + '
Old
After', + 'MAIN', + 'New', + ), + ); + } + + /** + * Ensures that BODY and HTML attribute hoisting is rejected in full documents. + * + * @dataProvider data_set_inner_html_rejects_full_document_attribute_hoisting + * + * @covers WP_HTML_Processor::set_inner_html + * + * @param string $replacement Proposed inner HTML. + */ + public function test_set_inner_html_rejects_full_document_attribute_hoisting( string $replacement ): void { + $html = '
Old
After'; + $processor = WP_HTML_Processor::create_full_parser( $html ); + + $this->assertTrue( $processor->next_tag( 'MAIN' ), 'Failed to find MAIN.' ); + $this->assertFalse( $processor->set_inner_html( $replacement ), 'Should reject attribute hoisting outside the target.' ); + $this->assertSame( $html, $processor->get_updated_html(), 'HTML should be unchanged after a rejected replacement.' ); + } + + /** + * Data provider. + * + * @return array + */ + public static function data_set_inner_html_rejects_full_document_attribute_hoisting(): array { + return array( + 'BODY attributes' => array( 'New' ), + 'HTML attributes' => array( 'New' ), + ); + } + + /** + * Ensures BODY and HTML-looking tags are allowed when they do not affect the outer tree. + * + * @dataProvider data_set_inner_html_allows_unhoisted_body_and_html_tags + * + * @covers WP_HTML_Processor::set_inner_html + * + * @param string $html Original HTML. + * @param string $target_tag Target tag to update. + * @param string $replacement Proposed inner HTML. + * @param string $expected_html Expected updated HTML. + */ + public function test_set_inner_html_allows_unhoisted_body_and_html_tags( + string $html, + string $target_tag, + string $replacement, + string $expected_html + ): void { + $processor = WP_HTML_Processor::create_fragment( $html ); + + $this->assertTrue( $processor->next_tag( $target_tag ), "Failed to find {$target_tag}." ); + $this->assertTrue( $processor->set_inner_html( $replacement ), 'Should allow BODY/HTML-looking tags that remain inside the target.' ); + $this->assertSame( $expected_html, $processor->get_updated_html(), 'Should preserve the safe replacement.' ); + } + + /** + * Data provider. + * + * @return array + */ + public static function data_set_inner_html_allows_unhoisted_body_and_html_tags(): array { + return array( + 'foreign HTML element' => array( + 'OldAfter', + 'SVG', + '', + 'After', + ), + 'TEMPLATE ignores BODY tag' => array( + '
Old
After', + 'DIV', + '', + '
After', + ), + ); + } + + /** + * Ensures rejected replacements do not poison the live processor state. + * + * @covers WP_HTML_Processor::set_inner_html + */ + public function test_set_inner_html_failure_does_not_poison_processor_state(): void { + $html = '

After

'; + $processor = WP_HTML_Processor::create_fragment( $html ); + + $this->assertTrue( $processor->next_tag( 'DIV' ), 'Failed to find the DIV opener.' ); + $this->assertFalse( $processor->set_inner_html( '

New

' ), 'Should reject when the target end cannot be safely found.' ); + $this->assertNull( $processor->get_last_error(), 'Rejected replacement should not poison the live processor.' ); + $this->assertSame( $html, $processor->get_updated_html(), 'HTML should be unchanged.' ); + } + + /** + * Ensures safe parser repairs inside the target are accepted. + * + * @covers WP_HTML_Processor::set_inner_html + */ + public function test_set_inner_html_allows_repairs_inside_target(): void { + $processor = WP_HTML_Processor::create_fragment( '
Old
After' ); + + $this->assertTrue( $processor->next_tag( 'DIV' ), 'Failed to find the DIV opener.' ); + $this->assertTrue( $processor->set_inner_html( '

One

Two' ), 'Should set inner HTML when repairs stay inside the target.' ); + $this->assertSame( + '

One

Two

After', + $processor->get_updated_html(), + 'Should preserve the raw inner HTML replacement.' + ); + } + + /** + * Ensures inner HTML replacement works when the target closer is implicit. + * + * @dataProvider data_set_inner_html_with_implicit_closer + * + * @covers WP_HTML_Processor::set_inner_html + * + * @param string $html Original HTML. + * @param string $target_tag Target tag to update. + * @param string $replacement Proposed inner HTML. + * @param string $expected_html Expected updated HTML. + */ + public function test_set_inner_html_with_implicit_closer( + string $html, + string $target_tag, + string $replacement, + string $expected_html + ): void { + $processor = WP_HTML_Processor::create_fragment( $html ); + + $this->assertTrue( $processor->next_tag( $target_tag ), "Failed to find {$target_tag}." ); + $this->assertTrue( $processor->set_inner_html( $replacement ), 'Should set inner HTML before the implicit closer.' ); + $this->assertSame( $expected_html, $processor->get_updated_html(), 'Should replace only the implicit target inner span.' ); + } + + /** + * Data provider. + * + * @return array + */ + public static function data_set_inner_html_with_implicit_closer(): array { + return array( + 'closed by following P opener' => array( + '

Old

After', + 'P', + 'New', + '

New

After', + ), + 'closed by EOF' => array( + '

Old', + 'DIV', + 'New', + '
New', + ), + 'self-closing flag ignored' => array( + '
Old', + 'DIV', + 'New', + '
New', + ), + ); + } +} diff --git a/tools/html-api-fuzz/oracles/lexbor/README.md b/tools/html-api-fuzz/oracles/lexbor/README.md new file mode 100644 index 0000000000000..4baf4ede5736e --- /dev/null +++ b/tools/html-api-fuzz/oracles/lexbor/README.md @@ -0,0 +1,74 @@ +# Lexbor Source Oracle + +This directory contains a standalone oracle binary used by the +`set_inner_html` fuzzer to self-check accepted updates against a source-built +Lexbor checkout. + +Build upstream `master`: + +```sh +tools/html-api-fuzz/oracles/lexbor/build.sh +``` + +The script clones Lexbor under `.cache/lexbor/<ref>/source`, builds and +installs a static Lexbor library under the same cache entry, then writes: + +```text +tools/html-api-fuzz/oracles/lexbor/build/lexbor-tree-oracle +``` + +Use it from the `set_inner_html` fuzzer by passing the binary path, or by +building it at the default location shown above: + +```sh +php tools/html-api-fuzz/set-inner-html.php \ + --iterations 100 \ + --lexbor-oracle-bin tools/html-api-fuzz/oracles/lexbor/build/lexbor-tree-oracle \ + --output-dir artifacts/html-api-fuzz/set-inner-html-lexbor +``` + +Pass `--lexbor-oracle-bin PATH` or set `HTML_API_FUZZ_LEXBOR_ORACLE` when the +binary is not at the default build path above. + +When present, the oracle parses the original and updated HTML after an accepted +`set_inner_html()` call and compares the rendered tree outside the target +element. A changed outside tree is a fuzzer failure: it indicates a replacement +that should have been rejected by the HTML API. + +The oracle also self-checks each Lexbor parse by serializing the parsed tree, +parsing that serialization again in the same mode, and comparing the rendered +tree bytes. The fuzzer records this as `originalSelfCheck` and +`updatedSelfCheck` in Lexbor failure details. The self-check is diagnostic +metadata; the original-vs-updated outside-tree comparison remains the oracle +signal. 
+ +The binary records the resolved Lexbor commit in its JSON metadata, even when +building from a moving ref such as `master`. + +Use a different checkout or commit when bisecting upstream behavior: + +```sh +LEXBOR_SOURCE_DIR=/path/to/lexbor \ +LEXBOR_COMMIT=481c444261a132190a3fb746d6d2f60824af3717 \ +tools/html-api-fuzz/oracles/lexbor/build.sh +``` + +Direct CLI examples: + +```sh +tools/html-api-fuzz/oracles/lexbor/build/lexbor-tree-oracle \ + --mode full-document \ + --max-nodes 3000 \ + --input /path/to/input.bin + +tools/html-api-fuzz/oracles/lexbor/build/lexbor-tree-oracle \ + --mode fragment-body \ + --context body \ + --max-nodes 3000 \ + --input /path/to/input.bin +``` + +The oracle returns JSON with `status`, `oracle` metadata, `selfCheck`, `tree`, +`treeBase64`, and `nodeCount`. The `treeBase64` field is the exact +html5lib-style tree bytes consumed by the PHP adapter; `tree` is the same tree +as a JSON-safe display string. Neither field is serialized HTML. diff --git a/tools/html-api-fuzz/oracles/lexbor/build.sh b/tools/html-api-fuzz/oracles/lexbor/build.sh new file mode 100755 index 0000000000000..b7dd09f5f19d9 --- /dev/null +++ b/tools/html-api-fuzz/oracles/lexbor/build.sh @@ -0,0 +1,54 @@ +#!/usr/bin/env sh +set -eu + +ref="${LEXBOR_COMMIT:-master}" +script_dir="$(CDPATH= cd -- "$(dirname -- "$0")" && pwd)" +repo_root="$(CDPATH= cd -- "$script_dir/../../../.." && pwd)" +cache_dir="${LEXBOR_CACHE_DIR:-$repo_root/.cache/lexbor/$ref}" +source_dir="${LEXBOR_SOURCE_DIR:-$cache_dir/source}" +build_dir="${LEXBOR_BUILD_DIR:-$cache_dir/build}" +install_dir="${LEXBOR_INSTALL_DIR:-$cache_dir/install}" +oracle_build_dir="$script_dir/build" +oracle_bin="$oracle_build_dir/lexbor-tree-oracle" + +if [ ! 
-d "$source_dir/.git" ]; then + mkdir -p "$(dirname "$source_dir")" + git clone https://github.com/lexbor/lexbor.git "$source_dir" +fi + +git -C "$source_dir" fetch --tags origin + +checkout_ref="$ref" +if git -C "$source_dir" rev-parse --verify --quiet "origin/$ref^{commit}" >/dev/null; then + checkout_ref="origin/$ref" +fi +git -C "$source_dir" checkout --detach "$checkout_ref" +commit="$(git -C "$source_dir" rev-parse HEAD)" + +cmake -S "$source_dir" -B "$build_dir" \ + -DLEXBOR_BUILD_SHARED=OFF \ + -DLEXBOR_BUILD_STATIC=ON \ + -DLEXBOR_BUILD_SEPARATELY=OFF \ + -DLEXBOR_BUILD_EXAMPLES=OFF \ + -DLEXBOR_BUILD_TESTS=OFF \ + -DLEXBOR_BUILD_UTILS=OFF \ + -DCMAKE_INSTALL_PREFIX="$install_dir" + +cmake --build "$build_dir" --target lexbor_static +cmake --install "$build_dir" --prefix "$install_dir" + +mkdir -p "$oracle_build_dir" + +cc ${CFLAGS:-} \ + -std=c99 \ + -Wall \ + -Wextra \ + -Werror \ + -I"$install_dir/include" \ + -DHTML_API_FUZZ_LEXBOR_COMMIT="\"$commit\"" \ + "$script_dir/lexbor-tree-oracle.c" \ + "$install_dir/lib/liblexbor_static.a" \ + -o "$oracle_bin" \ + ${LDFLAGS:-} + +printf '%s\n' "$oracle_bin" diff --git a/tools/html-api-fuzz/oracles/lexbor/lexbor-tree-oracle.c b/tools/html-api-fuzz/oracles/lexbor/lexbor-tree-oracle.c new file mode 100644 index 0000000000000..9b3cb6fa5b4b6 --- /dev/null +++ b/tools/html-api-fuzz/oracles/lexbor/lexbor-tree-oracle.c @@ -0,0 +1,1358 @@ +/* + * Source-built Lexbor tree oracle for the HTML API fuzzer. + * + * Parses one input with Lexbor and emits a JSON result whose "tree" field uses + * the same html5lib-style text format as HtmlApiFuzz\TreeRenderer. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#ifndef HTML_API_FUZZ_LEXBOR_COMMIT +#define HTML_API_FUZZ_LEXBOR_COMMIT "unknown" +#endif + +typedef struct { + char *data; + size_t length; + size_t capacity; + bool failed; +} buffer_t; + +typedef struct { + char *sort_name; + char *render_name; + char *value; +} attr_record_t; + +typedef enum { + ORACLE_OK, + ORACLE_UNSUPPORTED, + ORACLE_ERROR, +} oracle_status_t; + +typedef struct { + oracle_status_t status; + const char *failure_class; + const char *message; + buffer_t tree; + size_t node_count; + size_t max_nodes; + bool self_check_performed; + bool self_check_stable; +} render_ctx_t; + +typedef struct { + const char *mode; + const char *context; + const char *input_path; + size_t max_nodes; + bool show_help; + bool show_version; +} cli_options_t; + +static void buffer_init(buffer_t *buf); +static void buffer_destroy(buffer_t *buf); +static bool buffer_reserve(buffer_t *buf, size_t extra); +static bool buffer_append_mem(buffer_t *buf, const char *data, size_t len); +static bool buffer_append_cstr(buffer_t *buf, const char *data); +static bool buffer_append_char(buffer_t *buf, char ch); +static bool buffer_append_repeat(buffer_t *buf, const char *data, size_t len, size_t count); +static char *buffer_take_cstr(buffer_t *buf); +static bool append_escaped_scalar(buffer_t *buf, const lxb_char_t *data, size_t len, bool scrub); +static bool append_json_string(buffer_t *buf, const char *data, size_t len); +static bool append_json_base64(buffer_t *buf, const char *data, size_t len); +static bool append_tree_line_indent(buffer_t *buf, int indent_level); +static bool append_display_element_name(buffer_t *buf, lxb_dom_element_t *element); +static bool append_escaped_display_element_name(buffer_t *buf, lxb_dom_element_t *element); +static bool append_display_attribute_name(buffer_t *buf, lxb_dom_attr_t *attr); +static int 
compare_attr_records(const void *a_ptr, const void *b_ptr); +static bool render_attributes(render_ctx_t *ctx, lxb_dom_element_t *element, int indent_level); +static void destroy_attr_records(attr_record_t *records, size_t count); +static void render_node(render_ctx_t *ctx, lxb_dom_node_t *node, int indent_level); +static void render_children(render_ctx_t *ctx, lxb_dom_node_t *first, int indent_level); +static lxb_status_t serialize_buffer_cb(const lxb_char_t *data, size_t len, void *ctx); +static bool read_file(const char *path, lxb_char_t **data, size_t *len, const char **message); +static bool parse_size(const char *value, size_t *out); +static bool parse_args(int argc, char **argv, cli_options_t *options, const char **message); +static void print_usage(FILE *stream); +static void print_version(void); +static void print_result(render_ctx_t *ctx); +static void print_cli_error(const char *message); +static bool context_to_tag(const char *context, lxb_tag_id_t *tag_id, lxb_ns_id_t *ns_id); +static void init_render_ctx(render_ctx_t *ctx, size_t max_nodes); +static void destroy_render_ctx(render_ctx_t *ctx); +static bool tree_buffers_equal(buffer_t *a, buffer_t *b); +static bool render_and_serialize_full_document(render_ctx_t *ctx, const lxb_char_t *input, size_t input_len, buffer_t *serialized); +static bool render_and_serialize_fragment(render_ctx_t *ctx, const lxb_char_t *input, size_t input_len, const char *context, buffer_t *serialized); +static void self_check_rendered_tree(render_ctx_t *ctx, buffer_t *serialized, const char *context); +static void render_full_document(render_ctx_t *ctx, const lxb_char_t *input, size_t input_len); +static void render_fragment(render_ctx_t *ctx, const lxb_char_t *input, size_t input_len, const char *context); + +static void +buffer_init(buffer_t *buf) +{ + buf->data = NULL; + buf->length = 0; + buf->capacity = 0; + buf->failed = false; +} + +static void +buffer_destroy(buffer_t *buf) +{ + free(buf->data); + buffer_init(buf); +} + 
+static bool +buffer_reserve(buffer_t *buf, size_t extra) +{ + size_t needed; + size_t next_capacity; + char *next; + + if (buf->failed) { + return false; + } + + if (extra > SIZE_MAX - buf->length - 1) { + buf->failed = true; + return false; + } + + needed = buf->length + extra + 1; + if (needed <= buf->capacity) { + return true; + } + + next_capacity = buf->capacity == 0 ? 256 : buf->capacity; + while (next_capacity < needed) { + if (next_capacity > SIZE_MAX / 2) { + next_capacity = needed; + break; + } + next_capacity *= 2; + } + + next = (char *) realloc(buf->data, next_capacity); + if (next == NULL) { + buf->failed = true; + return false; + } + + buf->data = next; + buf->capacity = next_capacity; + buf->data[buf->length] = '\0'; + return true; +} + +static bool +buffer_append_mem(buffer_t *buf, const char *data, size_t len) +{ + if (!buffer_reserve(buf, len)) { + return false; + } + + if (len > 0) { + memcpy(buf->data + buf->length, data, len); + buf->length += len; + } + + buf->data[buf->length] = '\0'; + return true; +} + +static bool +buffer_append_cstr(buffer_t *buf, const char *data) +{ + return buffer_append_mem(buf, data, strlen(data)); +} + +static bool +buffer_append_char(buffer_t *buf, char ch) +{ + return buffer_append_mem(buf, &ch, 1); +} + +static bool +buffer_append_repeat(buffer_t *buf, const char *data, size_t len, size_t count) +{ + size_t i; + + for (i = 0; i < count; i++) { + if (!buffer_append_mem(buf, data, len)) { + return false; + } + } + + return true; +} + +static char * +buffer_take_cstr(buffer_t *buf) +{ + char *data; + + if (!buffer_reserve(buf, 0)) { + return NULL; + } + + data = buf->data; + buf->data = NULL; + buf->length = 0; + buf->capacity = 0; + return data; +} + +static bool +append_escaped_byte(buffer_t *buf, unsigned char byte) +{ + char hex[5]; + + switch (byte) { + case '\n': + return buffer_append_cstr(buf, "\\n"); + case '\r': + return buffer_append_cstr(buf, "\\r"); + case '\t': + return buffer_append_cstr(buf, 
"\\t"); + case '\0': + return buffer_append_cstr(buf, "\\0"); + case '\\': + return buffer_append_cstr(buf, "\\\\"); + case '"': + return buffer_append_cstr(buf, "\\\""); + default: + if (byte < 0x20 || byte == 0x7f) { + snprintf(hex, sizeof(hex), "\\x%02X", byte); + return buffer_append_cstr(buf, hex); + } + return buffer_append_char(buf, (char) byte); + } +} + +static bool +append_escaped_scalar(buffer_t *buf, const lxb_char_t *data, size_t len, bool scrub) +{ + size_t i; + static const char replacement[] = "\xEF\xBF\xBD"; + + if (data == NULL) { + len = 0; + } + + for (i = 0; i < len; i++) { + unsigned char byte = (unsigned char) data[i]; + + if (scrub) { + if (byte == '\0') { + if (!buffer_append_mem(buf, replacement, sizeof(replacement) - 1)) { + return false; + } + continue; + } + + if (byte == '\r') { + if (i + 1 < len && data[i + 1] == '\n') { + i++; + } + byte = '\n'; + } + } + + if (!append_escaped_byte(buf, byte)) { + return false; + } + } + + return true; +} + +static bool +append_json_string(buffer_t *buf, const char *data, size_t len) +{ + size_t i; + char hex[7]; + static const char replacement[] = "\\uFFFD"; + + if (!buffer_append_char(buf, '"')) { + return false; + } + + for (i = 0; i < len; i++) { + unsigned char byte = (unsigned char) data[i]; + + switch (byte) { + case '"': + if (!buffer_append_cstr(buf, "\\\"")) { + return false; + } + break; + case '\\': + if (!buffer_append_cstr(buf, "\\\\")) { + return false; + } + break; + case '\b': + if (!buffer_append_cstr(buf, "\\b")) { + return false; + } + break; + case '\f': + if (!buffer_append_cstr(buf, "\\f")) { + return false; + } + break; + case '\n': + if (!buffer_append_cstr(buf, "\\n")) { + return false; + } + break; + case '\r': + if (!buffer_append_cstr(buf, "\\r")) { + return false; + } + break; + case '\t': + if (!buffer_append_cstr(buf, "\\t")) { + return false; + } + break; + default: + if (byte < 0x20) { + snprintf(hex, sizeof(hex), "\\u%04X", byte); + if (!buffer_append_cstr(buf, 
hex)) { + return false; + } + } else if (byte < 0x80) { + if (!buffer_append_char(buf, (char) byte)) { + return false; + } + } else { + size_t sequence_len = 0; + bool valid = false; + + if (byte >= 0xC2 && byte <= 0xDF) { + sequence_len = 2; + } else if (byte >= 0xE0 && byte <= 0xEF) { + sequence_len = 3; + } else if (byte >= 0xF0 && byte <= 0xF4) { + sequence_len = 4; + } + + if (sequence_len > 0 && i + sequence_len <= len) { + unsigned char b1 = sequence_len > 1 ? (unsigned char) data[i + 1] : 0; + unsigned char b2 = sequence_len > 2 ? (unsigned char) data[i + 2] : 0; + unsigned char b3 = sequence_len > 3 ? (unsigned char) data[i + 3] : 0; + valid = true; + if (sequence_len >= 2 && (b1 < 0x80 || b1 > 0xBF)) { + valid = false; + } + if (sequence_len >= 3 && (b2 < 0x80 || b2 > 0xBF)) { + valid = false; + } + if (sequence_len >= 4 && (b3 < 0x80 || b3 > 0xBF)) { + valid = false; + } + if (byte == 0xE0 && b1 < 0xA0) { + valid = false; + } + if (byte == 0xED && b1 > 0x9F) { + valid = false; + } + if (byte == 0xF0 && b1 < 0x90) { + valid = false; + } + if (byte == 0xF4 && b1 > 0x8F) { + valid = false; + } + } + + if (valid) { + if (!buffer_append_mem(buf, data + i, sequence_len)) { + return false; + } + i += sequence_len - 1; + } else if (!buffer_append_cstr(buf, replacement)) { + return false; + } + } + break; + } + } + + return buffer_append_char(buf, '"'); +} + +static bool +append_json_base64(buffer_t *buf, const char *data, size_t len) +{ + static const char alphabet[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; + size_t i; + + if (!buffer_append_char(buf, '"')) { + return false; + } + + for (i = 0; i < len; i += 3) { + unsigned int b0 = (unsigned char) data[i]; + unsigned int b1 = i + 1 < len ? (unsigned char) data[i + 1] : 0; + unsigned int b2 = i + 2 < len ? (unsigned char) data[i + 2] : 0; + char encoded[4]; + + encoded[0] = alphabet[b0 >> 2]; + encoded[1] = alphabet[((b0 & 0x03) << 4) | (b1 >> 4)]; + encoded[2] = i + 1 < len ? 
alphabet[((b1 & 0x0F) << 2) | (b2 >> 6)] : '='; + encoded[3] = i + 2 < len ? alphabet[b2 & 0x3F] : '='; + + if (!buffer_append_mem(buf, encoded, sizeof(encoded))) { + return false; + } + } + + return buffer_append_char(buf, '"'); +} + +static bool +append_tree_line_indent(buffer_t *buf, int indent_level) +{ + return buffer_append_repeat(buf, " ", 2, (size_t) indent_level); +} + +static bool +append_ascii_lower(buffer_t *buf, const lxb_char_t *data, size_t len) +{ + size_t i; + + for (i = 0; i < len; i++) { + unsigned char byte = (unsigned char) data[i]; + if (byte >= 'A' && byte <= 'Z') { + byte = (unsigned char) tolower(byte); + } + if (!buffer_append_char(buf, (char) byte)) { + return false; + } + } + + return true; +} + +static bool +append_display_element_name(buffer_t *buf, lxb_dom_element_t *element) +{ + size_t len = 0; + const lxb_char_t *name; + lxb_ns_id_t ns = lxb_dom_element_ns_id(element); + + name = lxb_dom_element_local_name(element, &len); + if (ns == LXB_NS_HTML) { + return append_ascii_lower(buf, name, len); + } + + if (ns == LXB_NS_SVG) { + name = lxb_dom_element_qualified_name(element, &len); + return buffer_append_cstr(buf, "svg ") && buffer_append_mem(buf, (const char *) name, len); + } + + if (ns == LXB_NS_MATH) { + return buffer_append_cstr(buf, "math ") && buffer_append_mem(buf, (const char *) name, len); + } + + name = lxb_dom_element_qualified_name(element, &len); + return buffer_append_mem(buf, (const char *) name, len); +} + +static bool +append_escaped_display_element_name(buffer_t *buf, lxb_dom_element_t *element) +{ + buffer_t display; + bool ok; + + buffer_init(&display); + ok = append_display_element_name(&display, element) + && append_escaped_scalar(buf, (const lxb_char_t *) display.data, display.length, false); + buffer_destroy(&display); + + return ok; +} + +static bool +append_display_attribute_name(buffer_t *buf, lxb_dom_attr_t *attr) +{ + size_t len = 0; + const lxb_char_t *name; + lxb_ns_id_t ns = (lxb_ns_id_t) 
lxb_dom_interface_node(attr)->ns; + + if (ns == LXB_NS_XLINK) { + name = lxb_dom_attr_local_name(attr, &len); + return buffer_append_cstr(buf, "xlink ") && buffer_append_mem(buf, (const char *) name, len); + } + + if (ns == LXB_NS_XML) { + name = lxb_dom_attr_local_name(attr, &len); + return buffer_append_cstr(buf, "xml ") && buffer_append_mem(buf, (const char *) name, len); + } + + if (ns == LXB_NS_XMLNS) { + name = lxb_dom_attr_local_name(attr, &len); + return buffer_append_cstr(buf, "xmlns ") && buffer_append_mem(buf, (const char *) name, len); + } + + name = lxb_dom_attr_qualified_name(attr, &len); + return buffer_append_mem(buf, (const char *) name, len); +} + +static int +compare_display_names(const char *a, const char *b) +{ + bool a_has_colon = strchr(a, ':') != NULL; + bool b_has_colon = strchr(b, ':') != NULL; + bool a_has_space = strchr(a, ' ') != NULL; + bool b_has_space = strchr(b, ' ') != NULL; + int compared; + + if (a_has_colon != b_has_colon) { + return a_has_colon ? 1 : -1; + } + + if (a_has_space != b_has_space) { + return a_has_space ? 
1 : -1; + } + + compared = strcmp(a, b); + if (compared < 0) { + return -1; + } + if (compared > 0) { + return 1; + } + return 0; +} + +static int +compare_attr_records(const void *a_ptr, const void *b_ptr) +{ + const attr_record_t *a = (const attr_record_t *) a_ptr; + const attr_record_t *b = (const attr_record_t *) b_ptr; + int compared = compare_display_names(a->sort_name, b->sort_name); + + if (compared != 0) { + return compared; + } + + return compare_display_names(a->render_name, b->render_name); +} + +static bool +render_attributes(render_ctx_t *ctx, lxb_dom_element_t *element, int indent_level) +{ + lxb_dom_attr_t *attr; + attr_record_t *records = NULL; + size_t count = 0; + size_t index = 0; + size_t i; + bool ok = false; + + for (attr = lxb_dom_element_first_attribute(element); attr != NULL; attr = lxb_dom_element_next_attribute(attr)) { + count++; + } + + if (count == 0) { + return true; + } + + records = (attr_record_t *) calloc(count, sizeof(attr_record_t)); + if (records == NULL) { + ctx->status = ORACLE_ERROR; + ctx->failure_class = "oracle-renderer-error"; + ctx->message = "Could not allocate attribute records."; + return false; + } + + for (attr = lxb_dom_element_first_attribute(element); attr != NULL; attr = lxb_dom_element_next_attribute(attr)) { + buffer_t display; + buffer_t sort; + buffer_t render; + buffer_t value; + size_t value_len = 0; + const lxb_char_t *value_data; + + buffer_init(&display); + buffer_init(&sort); + buffer_init(&render); + buffer_init(&value); + + value_data = lxb_dom_attr_value(attr, &value_len); + if ( + !append_display_attribute_name(&display, attr) || + !append_escaped_scalar(&sort, (const lxb_char_t *) display.data, display.length, true) || + !append_escaped_scalar(&render, (const lxb_char_t *) display.data, display.length, false) || + !append_escaped_scalar(&value, value_data, value_len, false) + ) { + buffer_destroy(&display); + buffer_destroy(&sort); + buffer_destroy(&render); + buffer_destroy(&value); + 
ctx->status = ORACLE_ERROR; + ctx->failure_class = "oracle-renderer-error"; + ctx->message = "Could not render attributes."; + goto cleanup; + } + + records[index].sort_name = buffer_take_cstr(&sort); + records[index].render_name = buffer_take_cstr(&render); + records[index].value = buffer_take_cstr(&value); + + buffer_destroy(&display); + buffer_destroy(&sort); + buffer_destroy(&render); + buffer_destroy(&value); + + if (records[index].sort_name == NULL || records[index].render_name == NULL || records[index].value == NULL) { + ctx->status = ORACLE_ERROR; + ctx->failure_class = "oracle-renderer-error"; + ctx->message = "Could not store attribute records."; + goto cleanup; + } + + index++; + } + + qsort(records, count, sizeof(attr_record_t), compare_attr_records); + + for (i = 0; i < count; i++) { + if ( + !append_tree_line_indent(&ctx->tree, indent_level) || + !buffer_append_cstr(&ctx->tree, records[i].render_name) || + !buffer_append_cstr(&ctx->tree, "=\"") || + !buffer_append_cstr(&ctx->tree, records[i].value) || + !buffer_append_cstr(&ctx->tree, "\"\n") + ) { + ctx->status = ORACLE_ERROR; + ctx->failure_class = "oracle-renderer-error"; + ctx->message = "Could not append attribute lines."; + goto cleanup; + } + } + + ok = true; + +cleanup: + destroy_attr_records(records, count); + return ok; +} + +static void +destroy_attr_records(attr_record_t *records, size_t count) +{ + size_t i; + + if (records == NULL) { + return; + } + + for (i = 0; i < count; i++) { + free(records[i].sort_name); + free(records[i].render_name); + free(records[i].value); + } + + free(records); +} + +static bool +increment_node_count(render_ctx_t *ctx) +{ + ctx->node_count++; + if (ctx->node_count > ctx->max_nodes) { + ctx->status = ORACLE_ERROR; + ctx->failure_class = "node-limit-exceeded"; + ctx->message = "DOM node limit exceeded."; + return false; + } + + return true; +} + +static void +render_node(render_ctx_t *ctx, lxb_dom_node_t *node, int indent_level) +{ + if (ctx->status != 
ORACLE_OK || node == NULL) { + return; + } + + if (!increment_node_count(ctx)) { + return; + } + + switch (node->type) { + case LXB_DOM_NODE_TYPE_DOCUMENT_TYPE: { + lxb_dom_document_type_t *doctype = lxb_dom_interface_document_type(node); + size_t name_len = 0; + size_t public_len = 0; + size_t system_len = 0; + const lxb_char_t *name = lxb_dom_document_type_name(doctype, &name_len); + const lxb_char_t *public_id = lxb_dom_document_type_public_id(doctype, &public_len); + const lxb_char_t *system_id = lxb_dom_document_type_system_id(doctype, &system_len); + + if ( + !buffer_append_cstr(&ctx->tree, "tree, name, name_len, false) + ) { + ctx->status = ORACLE_ERROR; + ctx->failure_class = "oracle-renderer-error"; + ctx->message = "Could not render doctype."; + return; + } + + if (public_len > 0 || system_len > 0) { + if ( + !buffer_append_cstr(&ctx->tree, " \"") || + !append_escaped_scalar(&ctx->tree, public_id, public_len, false) || + !buffer_append_cstr(&ctx->tree, "\" \"") || + !append_escaped_scalar(&ctx->tree, system_id, system_len, false) || + !buffer_append_char(&ctx->tree, '"') + ) { + ctx->status = ORACLE_ERROR; + ctx->failure_class = "oracle-renderer-error"; + ctx->message = "Could not render doctype identifiers."; + return; + } + } + + if (!buffer_append_cstr(&ctx->tree, ">\n")) { + ctx->status = ORACLE_ERROR; + ctx->failure_class = "oracle-renderer-error"; + ctx->message = "Could not finish doctype."; + } + return; + } + + case LXB_DOM_NODE_TYPE_ELEMENT: { + lxb_dom_element_t *element = lxb_dom_interface_element(node); + + if ( + !append_tree_line_indent(&ctx->tree, indent_level) || + !buffer_append_char(&ctx->tree, '<') || + !append_escaped_display_element_name(&ctx->tree, element) || + !buffer_append_cstr(&ctx->tree, ">\n") || + !render_attributes(ctx, element, indent_level + 1) + ) { + if (ctx->status == ORACLE_OK) { + ctx->status = ORACLE_ERROR; + ctx->failure_class = "oracle-renderer-error"; + ctx->message = "Could not render element."; + } + return; + 
} + + if (node->local_name == LXB_TAG_TEMPLATE && node->ns == LXB_NS_HTML) { + lxb_html_template_element_t *template_element = lxb_html_interface_template(node); + if (!append_tree_line_indent(&ctx->tree, indent_level + 1) || !buffer_append_cstr(&ctx->tree, "content\n")) { + ctx->status = ORACLE_ERROR; + ctx->failure_class = "oracle-renderer-error"; + ctx->message = "Could not render template content marker."; + return; + } + if (template_element->content != NULL) { + render_children(ctx, template_element->content->node.first_child, indent_level + 2); + } + return; + } + + render_children(ctx, node->first_child, indent_level + 1); + return; + } + + case LXB_DOM_NODE_TYPE_TEXT: + case LXB_DOM_NODE_TYPE_CDATA_SECTION: { + lxb_dom_character_data_t *character_data = lxb_dom_interface_character_data(node); + if (character_data->data.length == 0) { + return; + } + if ( + !append_tree_line_indent(&ctx->tree, indent_level) || + !buffer_append_char(&ctx->tree, '"') || + !append_escaped_scalar(&ctx->tree, character_data->data.data, character_data->data.length, false) || + !buffer_append_cstr(&ctx->tree, "\"\n") + ) { + ctx->status = ORACLE_ERROR; + ctx->failure_class = "oracle-renderer-error"; + ctx->message = "Could not render text."; + } + return; + } + + case LXB_DOM_NODE_TYPE_COMMENT: { + lxb_dom_character_data_t *character_data = lxb_dom_interface_character_data(node); + if ( + !append_tree_line_indent(&ctx->tree, indent_level) || + !buffer_append_cstr(&ctx->tree, "\n") + ) { + ctx->status = ORACLE_ERROR; + ctx->failure_class = "oracle-renderer-error"; + ctx->message = "Could not render comment."; + } + return; + } + + default: + return; + } +} + +static void +render_children(render_ctx_t *ctx, lxb_dom_node_t *first, int indent_level) +{ + lxb_dom_node_t *child; + + for (child = first; child != NULL && ctx->status == ORACLE_OK; child = child->next) { + render_node(ctx, child, indent_level); + } +} + +static lxb_status_t +serialize_buffer_cb(const lxb_char_t *data, 
/*
 * Parses a strictly positive base-10 integer into *out.
 *
 * The whole string must consist of decimal digits. strtoul() on its own
 * skips leading whitespace and accepts a sign, so "-1" would be negated in
 * unsigned arithmetic and come back as ULONG_MAX without setting errno;
 * requiring a leading digit rejects those inputs before conversion.
 *
 * Returns true and stores the value on success. Returns false, leaving *out
 * untouched, when `value` is NULL, empty, not purely numeric, zero, out of
 * range for unsigned long, or not representable as size_t.
 */
static bool
parse_size(const char *value, size_t *out)
{
    char *end = NULL;
    unsigned long parsed;
    size_t narrowed;

    /* Reject NULL, "", leading whitespace, and '+' / '-' signs. */
    if (value == NULL || *value < '0' || *value > '9') {
        return false;
    }

    errno = 0;
    parsed = strtoul(value, &end, 10);
    if (errno != 0 || end == value || *end != '\0' || parsed == 0) {
        return false;
    }

    /* Guard against truncation where size_t is narrower than unsigned long. */
    narrowed = (size_t) parsed;
    if ((unsigned long) narrowed != parsed) {
        return false;
    }

    *out = narrowed;
    return true;
}
options->show_help = true; + return true; + } + if (strcmp(arg, "--version") == 0) { + options->show_version = true; + return true; + } + + if (i + 1 >= argc) { + *message = "Missing option value."; + return false; + } + + if (strcmp(arg, "--mode") == 0) { + options->mode = argv[++i]; + } else if (strcmp(arg, "--context") == 0) { + options->context = argv[++i]; + } else if (strcmp(arg, "--input") == 0) { + options->input_path = argv[++i]; + } else if (strcmp(arg, "--max-nodes") == 0) { + if (!parse_size(argv[++i], &options->max_nodes)) { + *message = "Expected --max-nodes to be a positive integer."; + return false; + } + } else { + *message = "Unknown option."; + return false; + } + } + + if (options->mode == NULL) { + *message = "Missing --mode."; + return false; + } + if (strcmp(options->mode, "full-document") != 0 && strcmp(options->mode, "fragment-body") != 0) { + *message = "Expected --mode full-document or fragment-body."; + return false; + } + if (options->input_path == NULL) { + *message = "Missing --input."; + return false; + } + + return true; +} + +static void +print_usage(FILE *stream) +{ + fprintf( + stream, + "Usage: lexbor-tree-oracle --mode full-document|fragment-body --input PATH [--context TAG] [--max-nodes N]\n" + ); +} + +static void +print_version(void) +{ + printf( + "{\"status\":\"ok\",\"oracle\":{\"kind\":\"lexbor-source\",\"lexborCommit\":\"%s\",\"lexborVersion\":\"%s\"}}\n", + HTML_API_FUZZ_LEXBOR_COMMIT, + LXB_HTML_VERSION_STRING + ); +} + +static void +print_result(render_ctx_t *ctx) +{ + buffer_t json; + const char *status_text = ctx->status == ORACLE_OK + ? "ok" + : (ctx->status == ORACLE_UNSUPPORTED ? 
"unsupported" : "error"); + + buffer_init(&json); + buffer_append_cstr(&json, "{\n \"status\": "); + append_json_string(&json, status_text, strlen(status_text)); + buffer_append_cstr(&json, ",\n \"oracle\": {\n \"kind\": \"lexbor-source\",\n \"lexborCommit\": "); + append_json_string(&json, HTML_API_FUZZ_LEXBOR_COMMIT, strlen(HTML_API_FUZZ_LEXBOR_COMMIT)); + buffer_append_cstr(&json, ",\n \"lexborVersion\": "); + append_json_string(&json, LXB_HTML_VERSION_STRING, strlen(LXB_HTML_VERSION_STRING)); + buffer_append_cstr(&json, "\n }"); + buffer_append_cstr(&json, ",\n \"selfCheck\": {\n \"roundTripStable\": "); + buffer_append_cstr(&json, ctx->self_check_performed && ctx->self_check_stable ? "true" : "false"); + buffer_append_cstr(&json, ",\n \"performed\": "); + buffer_append_cstr(&json, ctx->self_check_performed ? "true" : "false"); + buffer_append_cstr(&json, "\n }"); + + if (ctx->status == ORACLE_OK) { + if (ctx->tree.length == 0) { + buffer_append_char(&ctx->tree, '\n'); + } else { + if (ctx->tree.data[ctx->tree.length - 1] != '\n') { + buffer_append_char(&ctx->tree, '\n'); + } + buffer_append_char(&ctx->tree, '\n'); + } + buffer_append_cstr(&json, ",\n \"tree\": "); + append_json_string(&json, ctx->tree.data == NULL ? "" : ctx->tree.data, ctx->tree.length); + buffer_append_cstr(&json, ",\n \"treeBase64\": "); + append_json_base64(&json, ctx->tree.data == NULL ? "" : ctx->tree.data, ctx->tree.length); + } + + buffer_append_cstr(&json, ",\n \"nodeCount\": "); + { + char count[32]; + snprintf(count, sizeof(count), "%zu", ctx->node_count); + buffer_append_cstr(&json, count); + } + + if (ctx->failure_class != NULL) { + buffer_append_cstr(&json, ",\n \"failureClass\": "); + append_json_string(&json, ctx->failure_class, strlen(ctx->failure_class)); + } + + if (ctx->message != NULL) { + const char *key = ctx->status == ORACLE_UNSUPPORTED ? 
"unsupported" : "error"; + buffer_append_cstr(&json, ",\n \""); + buffer_append_cstr(&json, key); + if (ctx->status == ORACLE_UNSUPPORTED) { + buffer_append_cstr(&json, "\": {\n \"message\": "); + append_json_string(&json, ctx->message, strlen(ctx->message)); + buffer_append_cstr(&json, "\n }"); + } else { + buffer_append_cstr(&json, "\": "); + append_json_string(&json, ctx->message, strlen(ctx->message)); + } + } + + buffer_append_cstr(&json, "\n}\n"); + + if (json.failed) { + fputs("{\"status\":\"error\",\"failureClass\":\"oracle-renderer-error\",\"error\":\"Could not encode JSON result.\"}\n", stdout); + } else { + fwrite(json.data, 1, json.length, stdout); + } + + buffer_destroy(&json); +} + +static void +print_cli_error(const char *message) +{ + render_ctx_t ctx; + + init_render_ctx(&ctx, 0); + ctx.status = ORACLE_ERROR; + ctx.failure_class = "oracle-cli-error"; + ctx.message = message; + print_result(&ctx); + destroy_render_ctx(&ctx); +} + +static bool +context_to_tag(const char *context, lxb_tag_id_t *tag_id, lxb_ns_id_t *ns_id) +{ + *ns_id = LXB_NS_HTML; + + if (strcmp(context, "body") == 0) { + *tag_id = LXB_TAG_BODY; + } else if (strcmp(context, "div") == 0) { + *tag_id = LXB_TAG_DIV; + } else if (strcmp(context, "p") == 0) { + *tag_id = LXB_TAG_P; + } else if (strcmp(context, "td") == 0) { + *tag_id = LXB_TAG_TD; + } else if (strcmp(context, "tr") == 0) { + *tag_id = LXB_TAG_TR; + } else if (strcmp(context, "table") == 0) { + *tag_id = LXB_TAG_TABLE; + } else if (strcmp(context, "caption") == 0) { + *tag_id = LXB_TAG_CAPTION; + } else if (strcmp(context, "colgroup") == 0) { + *tag_id = LXB_TAG_COLGROUP; + } else if (strcmp(context, "select") == 0) { + *tag_id = LXB_TAG_SELECT; + } else if (strcmp(context, "option") == 0) { + *tag_id = LXB_TAG_OPTION; + } else if (strcmp(context, "template") == 0) { + *tag_id = LXB_TAG_TEMPLATE; + } else if (strcmp(context, "title") == 0) { + *tag_id = LXB_TAG_TITLE; + } else if (strcmp(context, "textarea") == 0) { + 
*tag_id = LXB_TAG_TEXTAREA; + } else if (strcmp(context, "script") == 0) { + *tag_id = LXB_TAG_SCRIPT; + } else if (strcmp(context, "style") == 0) { + *tag_id = LXB_TAG_STYLE; + } else if (strcmp(context, "svg") == 0) { + *tag_id = LXB_TAG_SVG; + *ns_id = LXB_NS_SVG; + } else if (strcmp(context, "math") == 0) { + *tag_id = LXB_TAG_MATH; + *ns_id = LXB_NS_MATH; + } else { + return false; + } + + return true; +} + +static void +init_render_ctx(render_ctx_t *ctx, size_t max_nodes) +{ + buffer_init(&ctx->tree); + ctx->status = ORACLE_OK; + ctx->failure_class = NULL; + ctx->message = NULL; + ctx->node_count = 0; + ctx->max_nodes = max_nodes; + ctx->self_check_performed = false; + ctx->self_check_stable = false; +} + +static void +destroy_render_ctx(render_ctx_t *ctx) +{ + buffer_destroy(&ctx->tree); +} + +static bool +tree_buffers_equal(buffer_t *a, buffer_t *b) +{ + return a->length == b->length && 0 == memcmp(a->data, b->data, a->length); +} + +static bool +render_and_serialize_full_document(render_ctx_t *ctx, const lxb_char_t *input, size_t input_len, buffer_t *serialized) +{ + lxb_status_t status; + lxb_html_document_t *document = lxb_html_document_create(); + + if (document == NULL) { + ctx->status = ORACLE_ERROR; + ctx->failure_class = "oracle-renderer-error"; + ctx->message = "Could not create Lexbor document."; + return false; + } + + status = lxb_html_document_parse(document, input, input_len); + if (status != LXB_STATUS_OK) { + lxb_html_document_destroy(document); + ctx->status = ORACLE_ERROR; + ctx->failure_class = "oracle-parse-error"; + ctx->message = "Lexbor could not parse the input."; + return false; + } + + render_children(ctx, lxb_dom_interface_node(document)->first_child, 0); + if (ctx->status == ORACLE_OK && serialized != NULL) { + status = lxb_html_serialize_deep_cb( + lxb_dom_interface_node(document), + serialize_buffer_cb, + serialized + ); + if (status != LXB_STATUS_OK || serialized->failed) { + ctx->status = ORACLE_ERROR; + ctx->failure_class = 
"oracle-renderer-error"; + ctx->message = "Could not serialize the parsed tree."; + } + } + + lxb_html_document_destroy(document); + + return ctx->status == ORACLE_OK; +} + +static bool +render_and_serialize_fragment(render_ctx_t *ctx, const lxb_char_t *input, size_t input_len, const char *context, buffer_t *serialized) +{ + lxb_status_t status; + lxb_html_parser_t *parser = NULL; + lxb_html_document_t *document = NULL; + lxb_dom_node_t *fragment = NULL; + lxb_tag_id_t tag_id; + lxb_ns_id_t ns_id; + + if (!context_to_tag(context, &tag_id, &ns_id)) { + ctx->status = ORACLE_UNSUPPORTED; + ctx->failure_class = "oracle-unsupported"; + ctx->message = "Unsupported fragment context."; + return false; + } + + parser = lxb_html_parser_create(); + if (parser == NULL) { + ctx->status = ORACLE_ERROR; + ctx->failure_class = "oracle-renderer-error"; + ctx->message = "Could not create Lexbor parser."; + return false; + } + + status = lxb_html_parser_init(parser); + if (status != LXB_STATUS_OK) { + lxb_html_parser_destroy(parser); + ctx->status = ORACLE_ERROR; + ctx->failure_class = "oracle-renderer-error"; + ctx->message = "Could not initialize Lexbor parser."; + return false; + } + + document = lxb_html_document_create(); + if (document == NULL) { + lxb_html_parser_destroy(parser); + ctx->status = ORACLE_ERROR; + ctx->failure_class = "oracle-renderer-error"; + ctx->message = "Could not create Lexbor document."; + return false; + } + + fragment = lxb_html_parse_fragment_by_tag_id(parser, document, tag_id, ns_id, input, input_len); + if (fragment == NULL || lxb_html_parser_status(parser) != LXB_STATUS_OK) { + lxb_html_document_destroy(document); + lxb_html_parser_destroy(parser); + ctx->status = ORACLE_ERROR; + ctx->failure_class = "oracle-parse-error"; + ctx->message = "Lexbor could not parse the fragment."; + return false; + } + + render_children(ctx, fragment->first_child, 0); + if (ctx->status == ORACLE_OK && serialized != NULL) { + status = 
lxb_html_serialize_deep_cb(fragment, serialize_buffer_cb, serialized); + if (status != LXB_STATUS_OK || serialized->failed) { + ctx->status = ORACLE_ERROR; + ctx->failure_class = "oracle-renderer-error"; + ctx->message = "Could not serialize the parsed tree."; + } + } + + lxb_html_document_destroy(document); + lxb_html_parser_destroy(parser); + + return ctx->status == ORACLE_OK; +} + +static void +self_check_rendered_tree(render_ctx_t *ctx, buffer_t *serialized, const char *context) +{ + render_ctx_t check; + + if (ctx->status != ORACLE_OK) { + return; + } + + init_render_ctx(&check, ctx->max_nodes); + if (context == NULL) { + render_and_serialize_full_document(&check, (const lxb_char_t *) serialized->data, serialized->length, NULL); + } else { + render_and_serialize_fragment(&check, (const lxb_char_t *) serialized->data, serialized->length, context, NULL); + } + + ctx->self_check_performed = true; + ctx->self_check_stable = check.status == ORACLE_OK && tree_buffers_equal(&ctx->tree, &check.tree); + + destroy_render_ctx(&check); +} + +static void +render_full_document(render_ctx_t *ctx, const lxb_char_t *input, size_t input_len) +{ + buffer_t serialized; + + buffer_init(&serialized); + if (render_and_serialize_full_document(ctx, input, input_len, &serialized)) { + self_check_rendered_tree(ctx, &serialized, NULL); + } + buffer_destroy(&serialized); +} + +static void +render_fragment(render_ctx_t *ctx, const lxb_char_t *input, size_t input_len, const char *context) +{ + buffer_t serialized; + + buffer_init(&serialized); + if (render_and_serialize_fragment(ctx, input, input_len, context, &serialized)) { + self_check_rendered_tree(ctx, &serialized, context); + } + buffer_destroy(&serialized); +} + +int +main(int argc, char **argv) +{ + cli_options_t options; + const char *message = NULL; + lxb_char_t *input = NULL; + size_t input_len = 0; + render_ctx_t ctx; + + if (!parse_args(argc, argv, &options, &message)) { + print_cli_error(message); + return EXIT_FAILURE; + } + 
+ if (options.show_help) { + print_usage(stdout); + return EXIT_SUCCESS; + } + + if (options.show_version) { + print_version(); + return EXIT_SUCCESS; + } + + init_render_ctx(&ctx, options.max_nodes); + + if (!read_file(options.input_path, &input, &input_len, &message)) { + ctx.status = ORACLE_ERROR; + ctx.failure_class = "oracle-cli-error"; + ctx.message = message; + print_result(&ctx); + destroy_render_ctx(&ctx); + return EXIT_FAILURE; + } + + if (strcmp(options.mode, "full-document") == 0) { + render_full_document(&ctx, input, input_len); + } else { + render_fragment(&ctx, input, input_len, options.context); + } + + if (ctx.tree.failed && ctx.status == ORACLE_OK) { + ctx.status = ORACLE_ERROR; + ctx.failure_class = "oracle-renderer-error"; + ctx.message = "Could not allocate tree output."; + } + + print_result(&ctx); + free(input); + destroy_render_ctx(&ctx); + + return ctx.status == ORACLE_OK || ctx.status == ORACLE_UNSUPPORTED ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/tools/html-api-fuzz/set-inner-html.php b/tools/html-api-fuzz/set-inner-html.php new file mode 100644 index 0000000000000..4d51dc6bc95c4 --- /dev/null +++ b/tools/html-api-fuzz/set-inner-html.php @@ -0,0 +1,2270 @@ +#!/usr/bin/env php +seed = (string) $seed; + } + + /** + * Returns pseudo-random bytes. + * + * @param int $length Byte count. + * @return string Bytes. + */ + public function bytes( int $length ): string { + while ( strlen( $this->buffer ) < $length ) { + $this->buffer .= hash( 'sha256', $this->seed . ':' . $this->counter++, true ); + } + + $out = substr( $this->buffer, 0, $length ); + $this->buffer = substr( $this->buffer, $length ); + return $out; + } + + /** + * Returns an integer in a closed interval. + * + * @param int $min Minimum. + * @param int $max Maximum. + * @return int Number. 
+ */ + public function int( int $min, int $max ): int { + if ( $max <= $min ) { + return $min; + } + + $parts = unpack( 'Nvalue', $this->bytes( 4 ) ); + return $min + ( (int) $parts['value'] % ( $max - $min + 1 ) ); + } + + /** + * Returns one value from a list. + * + * @param array $values Values. + * @return mixed Value. + */ + public function choice( array $values ) { + return $values[ $this->int( 0, count( $values ) - 1 ) ]; + } + + /** + * Returns true with the given percentage chance. + * + * @param int $percent Percent chance. + * @return bool Whether selected. + */ + public function chance( int $percent ): bool { + return $this->int( 1, 100 ) <= $percent; + } +} + +/** + * Prints usage. + */ +function wp_html_set_inner_html_fuzzer_usage(): void { + echo "Usage: php tools/html-api-fuzz/set-inner-html.php [--iterations N] [--start-seed N] [--output-dir DIR] [--stop-on-failure] [--lexbor-oracle-bin PATH] [--coverage-details]\n"; +} + +/** + * Parses simple CLI options. + * + * @param string[] $argv Arguments. + * @return array Options. + */ +function wp_html_set_inner_html_fuzzer_parse_options( array $argv ): array { + $options = array(); + $count = count( $argv ); + + for ( $i = 1; $i < $count; ++$i ) { + $arg = $argv[ $i ]; + if ( 0 !== strpos( $arg, '--' ) ) { + continue; + } + + $arg = substr( $arg, 2 ); + if ( false !== strpos( $arg, '=' ) ) { + list( $name, $value ) = explode( '=', $arg, 2 ); + $options[ $name ] = $value; + continue; + } + + if ( $i + 1 < $count && 0 !== strpos( $argv[ $i + 1 ], '--' ) ) { + $options[ $arg ] = $argv[ ++$i ]; + } else { + $options[ $arg ] = true; + } + } + + return $options; +} + +/** + * Returns an integer option. + * + * @param array $options Options. + * @param string $name Option name. + * @param int $default Default value. + * @return int Option value. + */ +function wp_html_set_inner_html_fuzzer_int_option( array $options, string $name, int $default ): int { + if ( ! 
array_key_exists( $name, $options ) || true === $options[ $name ] ) { + return $default; + } + + $value = filter_var( $options[ $name ], FILTER_VALIDATE_INT ); + if ( false === $value ) { + throw new InvalidArgumentException( "Expected --{$name} to be an integer." ); + } + + return (int) $value; +} + +/** + * Returns a string option. + * + * @param array $options Options. + * @param string $name Option name. + * @param string|null $default Default value. + * @return string|null Option value. + */ +function wp_html_set_inner_html_fuzzer_string_option( array $options, string $name, ?string $default ): ?string { + return array_key_exists( $name, $options ) && true !== $options[ $name ] + ? (string) $options[ $name ] + : $default; +} + +/** + * Returns the optional Lexbor oracle binary path. + * + * @param array $options Options. + * @return string|null Binary path, or null when unavailable. + */ +function wp_html_set_inner_html_fuzzer_lexbor_oracle_bin( array $options ): ?string { + $root = dirname( __DIR__, 2 ); + $from_env = getenv( 'HTML_API_FUZZ_LEXBOR_ORACLE' ); + $candidate = wp_html_set_inner_html_fuzzer_string_option( + $options, + 'lexbor-oracle-bin', + false !== $from_env && '' !== $from_env + ? $from_env + : $root . '/tools/html-api-fuzz/oracles/lexbor/build/lexbor-tree-oracle' + ); + + return is_string( $candidate ) && is_file( $candidate ) && is_executable( $candidate ) + ? $candidate + : null; +} + +/** + * Loads the HTML API without bootstrapping WordPress. 
+ */ +function wp_html_set_inner_html_fuzzer_bootstrap(): void { + $root = dirname( __DIR__, 2 ); + $files = array( + 'src/wp-includes/compat.php', + 'src/wp-includes/compat-utf8.php', + 'src/wp-includes/utf8.php', + 'src/wp-includes/class-wp-token-map.php', + 'src/wp-includes/html-api/html5-named-character-references.php', + 'src/wp-includes/html-api/class-wp-html-attribute-token.php', + 'src/wp-includes/html-api/class-wp-html-span.php', + 'src/wp-includes/html-api/class-wp-html-doctype-info.php', + 'src/wp-includes/html-api/class-wp-html-text-replacement.php', + 'src/wp-includes/html-api/class-wp-html-decoder.php', + 'src/wp-includes/html-api/class-wp-html-tag-processor.php', + 'src/wp-includes/html-api/class-wp-html-unsupported-exception.php', + 'src/wp-includes/html-api/class-wp-html-active-formatting-elements.php', + 'src/wp-includes/html-api/class-wp-html-open-elements.php', + 'src/wp-includes/html-api/class-wp-html-token.php', + 'src/wp-includes/html-api/class-wp-html-stack-event.php', + 'src/wp-includes/html-api/class-wp-html-processor-state.php', + 'src/wp-includes/html-api/class-wp-html-processor.php', + ); + + foreach ( $files as $file ) { + require_once $root . DIRECTORY_SEPARATOR . $file; + } +} + +/** + * Returns current HTML elements. + * + * @return string[] Element names. 
+ */ +function wp_html_set_inner_html_fuzzer_html_elements(): array { + return array( + 'a', + 'abbr', + 'address', + 'area', + 'article', + 'aside', + 'audio', + 'b', + 'base', + 'bdi', + 'bdo', + 'blockquote', + 'body', + 'br', + 'button', + 'canvas', + 'caption', + 'cite', + 'code', + 'col', + 'colgroup', + 'data', + 'datalist', + 'dd', + 'del', + 'details', + 'dfn', + 'dialog', + 'div', + 'dl', + 'dt', + 'em', + 'embed', + 'fieldset', + 'figcaption', + 'figure', + 'footer', + 'form', + 'h1', + 'h2', + 'h3', + 'h4', + 'h5', + 'h6', + 'head', + 'header', + 'hgroup', + 'hr', + 'html', + 'i', + 'iframe', + 'img', + 'input', + 'ins', + 'kbd', + 'label', + 'legend', + 'li', + 'link', + 'main', + 'map', + 'mark', + 'menu', + 'meta', + 'meter', + 'nav', + 'noscript', + 'object', + 'ol', + 'optgroup', + 'option', + 'output', + 'p', + 'picture', + 'pre', + 'progress', + 'q', + 'rp', + 'rt', + 'ruby', + 's', + 'samp', + 'script', + 'search', + 'section', + 'select', + 'selectedcontent', + 'slot', + 'small', + 'source', + 'span', + 'strong', + 'style', + 'sub', + 'summary', + 'sup', + 'table', + 'tbody', + 'td', + 'template', + 'textarea', + 'tfoot', + 'th', + 'thead', + 'time', + 'title', + 'tr', + 'track', + 'u', + 'ul', + 'var', + 'video', + 'wbr', + ); +} + +/** + * Returns historical HTML elements that remain useful parser coverage. + * + * @return string[] Element names. + */ +function wp_html_set_inner_html_fuzzer_deprecated_html_elements(): array { + return array( + 'acronym', + 'applet', + 'basefont', + 'bgsound', + 'big', + 'blink', + 'center', + 'command', + 'content', + 'dir', + 'font', + 'frame', + 'frameset', + 'image', + 'isindex', + 'keygen', + 'listing', + 'marquee', + 'menuitem', + 'multicol', + 'nextid', + 'nobr', + 'noembed', + 'noframes', + 'param', + 'plaintext', + 'rb', + 'rtc', + 'shadow', + 'spacer', + 'strike', + 'tt', + 'xmp', + ); +} + +/** + * Returns HTML elements that do not have normal inner HTML. + * + * @return string[] Element names. 
+ */ +function wp_html_set_inner_html_fuzzer_html_void_elements(): array { + return array( + 'area', + 'base', + 'basefont', + 'bgsound', + 'br', + 'col', + 'command', + 'embed', + 'frame', + 'hr', + 'image', + 'img', + 'input', + 'isindex', + 'keygen', + 'link', + 'meta', + 'param', + 'source', + 'track', + 'wbr', + ); +} + +/** + * Returns HTML elements the processor treats as void elements. + * + * This mirrors WP_HTML_Processor::is_void(). + * + * @return string[] Element names. + */ +function wp_html_set_inner_html_fuzzer_processor_void_elements(): array { + return array( + 'area', + 'base', + 'basefont', + 'bgsound', + 'br', + 'col', + 'embed', + 'frame', + 'hr', + 'img', + 'input', + 'keygen', + 'link', + 'meta', + 'param', + 'source', + 'track', + 'wbr', + ); +} + +/** + * Returns SVG elements. + * + * @return string[] Element names. + */ +function wp_html_set_inner_html_fuzzer_svg_elements(): array { + return array( + 'a', + 'altGlyph', + 'altGlyphDef', + 'altGlyphItem', + 'animate', + 'animateColor', + 'animateMotion', + 'animateTransform', + 'circle', + 'clipPath', + 'color-profile', + 'cursor', + 'defs', + 'desc', + 'discard', + 'ellipse', + 'feBlend', + 'feColorMatrix', + 'feComponentTransfer', + 'feComposite', + 'feConvolveMatrix', + 'feDiffuseLighting', + 'feDisplacementMap', + 'feDistantLight', + 'feDropShadow', + 'feFlood', + 'feFuncA', + 'feFuncB', + 'feFuncG', + 'feFuncR', + 'feGaussianBlur', + 'feImage', + 'feMerge', + 'feMergeNode', + 'feMorphology', + 'feOffset', + 'fePointLight', + 'feSpecularLighting', + 'feSpotLight', + 'feTile', + 'feTurbulence', + 'filter', + 'font', + 'font-face', + 'font-face-format', + 'font-face-name', + 'font-face-src', + 'font-face-uri', + 'foreignObject', + 'g', + 'glyph', + 'glyphRef', + 'hatch', + 'hatchpath', + 'hkern', + 'image', + 'line', + 'linearGradient', + 'marker', + 'mask', + 'metadata', + 'mesh', + 'meshgradient', + 'meshpatch', + 'meshrow', + 'missing-glyph', + 'mpath', + 'path', + 'pattern', + 
'polygon', + 'polyline', + 'radialGradient', + 'rect', + 'script', + 'set', + 'solidcolor', + 'stop', + 'style', + 'svg', + 'switch', + 'symbol', + 'text', + 'textPath', + 'title', + 'tref', + 'tspan', + 'use', + 'view', + 'vkern', + ); +} + +/** + * Returns MathML elements from MathML Core and MathML 3. + * + * @return string[] Element names. + */ +function wp_html_set_inner_html_fuzzer_mathml_elements(): array { + return array( + 'abs', + 'and', + 'annotation', + 'annotation-xml', + 'apply', + 'approx', + 'arccos', + 'arccosh', + 'arccot', + 'arccoth', + 'arccsc', + 'arccsch', + 'arcsec', + 'arcsech', + 'arcsin', + 'arcsinh', + 'arctan', + 'arctanh', + 'arg', + 'bind', + 'bvar', + 'card', + 'cartesianproduct', + 'cbytes', + 'ceiling', + 'cerror', + 'ci', + 'cn', + 'codomain', + 'complexes', + 'compose', + 'condition', + 'conjugate', + 'cos', + 'cosh', + 'cot', + 'coth', + 'cs', + 'csc', + 'csch', + 'csymbol', + 'curl', + 'declare', + 'degree', + 'determinant', + 'diff', + 'divergence', + 'divide', + 'domain', + 'domainofapplication', + 'emptyset', + 'eq', + 'equivalent', + 'eulergamma', + 'exists', + 'exp', + 'exponentiale', + 'factorial', + 'factorof', + 'false', + 'floor', + 'fn', + 'forall', + 'gcd', + 'geq', + 'grad', + 'gt', + 'ident', + 'image', + 'imaginary', + 'imaginaryi', + 'implies', + 'in', + 'infinity', + 'int', + 'integers', + 'intersect', + 'interval', + 'inverse', + 'lambda', + 'laplacian', + 'lcm', + 'leq', + 'limit', + 'list', + 'ln', + 'log', + 'logbase', + 'lowlimit', + 'lt', + 'maction', + 'maligngroup', + 'malignmark', + 'math', + 'matrix', + 'matrixrow', + 'max', + 'mean', + 'median', + 'menclose', + 'merror', + 'mfenced', + 'mfrac', + 'mglyph', + 'mi', + 'min', + 'minus', + 'mlabeledtr', + 'mlongdiv', + 'mmultiscripts', + 'mn', + 'mo', + 'mode', + 'moment', + 'momentabout', + 'mover', + 'mpadded', + 'mphantom', + 'mprescripts', + 'mroot', + 'mrow', + 'ms', + 'mscarry', + 'mscarries', + 'msgroup', + 'msline', + 'mspace', + 'msqrt', + 
'msrow', + 'mstack', + 'mstyle', + 'msub', + 'msubsup', + 'msup', + 'mtable', + 'mtd', + 'mtext', + 'mtr', + 'munder', + 'munderover', + 'naturalnumbers', + 'neq', + 'none', + 'not', + 'notanumber', + 'notin', + 'notsubset', + 'notprsubset', + 'or', + 'otherwise', + 'outerproduct', + 'partialdiff', + 'piece', + 'piecewise', + 'pi', + 'plus', + 'power', + 'primes', + 'product', + 'prsubset', + 'quotient', + 'rationals', + 'reals', + 'real', + 'reln', + 'rem', + 'root', + 'scalarproduct', + 'sdev', + 'sec', + 'sech', + 'selector', + 'semantics', + 'sep', + 'set', + 'setdiff', + 'share', + 'sin', + 'sinh', + 'subset', + 'sum', + 'tan', + 'tanh', + 'tendsto', + 'times', + 'transpose', + 'true', + 'union', + 'uplimit', + 'variance', + 'vector', + 'vectorproduct', + 'xor', + ); +} + +/** + * Returns all HTML element names the fuzzer should cover. + * + * @return string[] Element names. + */ +function wp_html_set_inner_html_fuzzer_all_html_elements(): array { + return array_values( + array_unique( + array_merge( + wp_html_set_inner_html_fuzzer_html_elements(), + wp_html_set_inner_html_fuzzer_deprecated_html_elements() + ) + ) + ); +} + +/** + * Returns a deterministic custom element name. + * + * @param WP_HTML_Set_Inner_HTML_Fuzzer_PRNG $rng PRNG. + * @return string Custom element name. + */ +function wp_html_set_inner_html_fuzzer_custom_element_name( WP_HTML_Set_Inner_HTML_Fuzzer_PRNG $rng ): string { + $prefix = $rng->choice( array( 'x', 'wp', 'codex', 'fuzz', 'html-api' ) ); + $suffix = $rng->choice( array( 'alpha', 'beta', 'panel', 'card', 'thing', 'node' ) ); + return "{$prefix}-{$suffix}-" . $rng->int( 0, 999 ); +} + +/** + * Returns HTML elements that are atomic for inner HTML updates. + * + * These elements either never have an end tag, or their contents are handled + * as text-like data rather than parsed HTML children. + * + * @return string[] Element names. 
+ */ +function wp_html_set_inner_html_fuzzer_atomic_html_elements(): array { + return array_values( + array_unique( + array_merge( + wp_html_set_inner_html_fuzzer_processor_void_elements(), + array( + 'iframe', + 'noembed', + 'noframes', + 'script', + 'style', + 'textarea', + 'title', + 'xmp', + ) + ) + ) + ); +} + +/** + * Returns context-aware target markup for one HTML element. + * + * @param string $tag Element name. + * @param string $inner Original inner HTML. + * @param string $replacement Replacement inner HTML. + * @return array{full: bool, targetTag: string, html: string, replacement: string, expected: string, expectSet: bool|null} Target case. + */ +function wp_html_set_inner_html_fuzzer_html_target_markup( string $tag, string $inner, string $replacement ): array { + $target_tag = strtoupper( $tag ); + + if ( in_array( $tag, wp_html_set_inner_html_fuzzer_atomic_html_elements(), true ) ) { + $html = in_array( $tag, wp_html_set_inner_html_fuzzer_processor_void_elements(), true ) + ? "<{$tag} data-fuzz-target=\"1\">After" + : "<{$tag} data-fuzz-target=\"1\">{$inner}After"; + + return array( + 'full' => false, + 'targetTag' => $target_tag, + 'html' => $html, + 'replacement' => $replacement, + 'expected' => $html, + 'expectSet' => false, + ); + } + + $full = false; + switch ( $tag ) { + case 'html': + $full = true; + $html = 'Old' . $inner . ''; + $expected = '' . $replacement . ''; + break; + + case 'head': + $full = true; + $html = '' . $inner . 'After'; + $expected = '' . $replacement . 'After'; + break; + + case 'body': + $full = true; + $html = '' . $inner . ''; + $expected = '' . $replacement . ''; + break; + + case 'frameset': + $full = true; + $replacement = ''; + $html = ''; + $expected = '' . $replacement . ''; + break; + + case 'table': + $replacement = 'target'; + $html = '
' . $inner . '
After'; + $expected = '' . $replacement . '
After'; + break; + + case 'caption': + $html = '
' . $inner . '
After
After'; + $expected = '
' . $replacement . '
After
After'; + break; + + case 'colgroup': + $replacement = ''; + $html = '
After
After'; + $expected = '' . $replacement . '
After
After'; + break; + + case 'thead': + case 'tbody': + case 'tfoot': + $replacement = 'target'; + $html = '<' . $tag . ' data-fuzz-target="1">
' . $inner . '
After'; + $expected = '<' . $tag . ' data-fuzz-target="1">' . $replacement . '
After'; + break; + + case 'tr': + $replacement = 'target'; + $html = '
' . $inner . '
After'; + $expected = '' . $replacement . '
After'; + break; + + case 'td': + case 'th': + $html = '<' . $tag . ' data-fuzz-target="1">' . $inner . '
After'; + $expected = '<' . $tag . ' data-fuzz-target="1">' . $replacement . '
After'; + break; + + case 'select': + $replacement = ''; + $html = 'After'; + $expected = 'After'; + break; + + case 'optgroup': + $replacement = ''; + $html = 'After'; + $expected = 'After'; + break; + + case 'option': + $replacement = 'target'; + $html = 'After'; + $expected = 'After'; + break; + + case 'ul': + case 'ol': + case 'menu': + $replacement = '
  • target
  • '; + $html = '<' . $tag . ' data-fuzz-target="1">
  • ' . $inner . '
  • After'; + $expected = '<' . $tag . ' data-fuzz-target="1">' . $replacement . 'After'; + break; + + case 'dl': + $replacement = '
    target
    value
    '; + $html = '
    ' . $inner . '
    value
    After'; + $expected = '
    ' . $replacement . '
    After'; + break; + + case 'ruby': + $replacement = 'basetarget'; + $html = '' . $inner . 'oldAfter'; + $expected = '' . $replacement . 'After'; + break; + + case 'rtc': + $replacement = 'target'; + $html = 'base' . $inner . 'After'; + $expected = 'base' . $replacement . 'After'; + break; + + default: + $html = "<{$tag} data-fuzz-target=\"1\">{$inner}After"; + $expected = "<{$tag} data-fuzz-target=\"1\">{$replacement}After"; + break; + } + + return array( + 'full' => $full, + 'targetTag' => $target_tag, + 'html' => $html, + 'replacement' => $replacement, + 'expected' => $expected, + 'expectSet' => null, + ); +} + +/** + * Returns target markup for one SVG element. + * + * @param string $tag Element name. + * @param string $inner Original inner HTML. + * @param string $replacement Replacement inner HTML. + * @return array{full: bool, targetTag: string, html: string, replacement: string, expected: string, expectSet: bool|null} Target case. + */ +function wp_html_set_inner_html_fuzzer_svg_target_markup( string $tag, string $inner, string $replacement ): array { + $target = "<{$tag} data-fuzz-target=\"1\">{$inner}"; + $updated = "<{$tag} data-fuzz-target=\"1\">{$replacement}"; + + return array( + 'full' => false, + 'targetTag' => strtoupper( $tag ), + 'html' => '' . $target . 'After', + 'replacement' => $replacement, + 'expected' => '' . $updated . 'After', + 'expectSet' => null, + ); +} + +/** + * Returns target markup for one MathML element. + * + * @param string $tag Element name. + * @param string $inner Original inner HTML. + * @param string $replacement Replacement inner HTML. + * @return array{full: bool, targetTag: string, html: string, replacement: string, expected: string, expectSet: bool|null} Target case. 
+ */ +function wp_html_set_inner_html_fuzzer_mathml_target_markup( string $tag, string $inner, string $replacement ): array { + $target = "<{$tag} data-fuzz-target=\"1\">{$inner}"; + $updated = "<{$tag} data-fuzz-target=\"1\">{$replacement}"; + + return array( + 'full' => false, + 'targetTag' => strtoupper( $tag ), + 'html' => '' . $target . 'After', + 'replacement' => $replacement, + 'expected' => '' . $updated . 'After', + 'expectSet' => null, + ); +} + +/** + * Returns randomized attributes. + * + * @param WP_HTML_Set_Inner_HTML_Fuzzer_PRNG $rng PRNG. + * @param string $namespace Element namespace. + * @return string Attribute text. + */ +function wp_html_set_inner_html_fuzzer_attrs( WP_HTML_Set_Inner_HTML_Fuzzer_PRNG $rng, string $namespace = 'html' ): string { + $attributes = array( + 'id' => 'fuzz-' . $rng->int( 0, 99 ), + 'class' => $rng->choice( array( 'alpha beta', 'one', 'two', 'targetish' ) ), + 'data-fuzz' => (string) $rng->int( 0, 999 ), + 'title' => $rng->choice( array( 'title', 'a & b', '' ) ), + 'aria-label' => 'label', + 'hidden' => null, + 'xml:space' => 'preserve', + 'xlink:href' => '#fuzz', + 'encoding' => $rng->choice( array( 'text/html', 'application/xhtml+xml', 'application/xml' ) ), + 'xmlns' => 'svg' === $namespace ? 'http://www.w3.org/2000/svg' : 'http://www.w3.org/1998/Math/MathML', + ); + + $out = ''; + $count = $rng->int( 0, 4 ); + $keys = array_keys( $attributes ); + for ( $i = 0; $i < $count; ++$i ) { + $name = $rng->choice( $keys ); + $value = $attributes[ $name ]; + if ( null === $value ) { + $out .= " {$name}"; + continue; + } + $quote = $rng->choice( array( '"', "'" ) ); + $out .= " {$name}={$quote}{$value}{$quote}"; + } + + if ( $rng->chance( 8 ) ) { + $out .= ' data-fuzz data-fuzz="duplicate"'; + } + + return $out; +} + +/** + * Renders one HTML element. + * + * @param string $tag Element name. + * @param WP_HTML_Set_Inner_HTML_Fuzzer_PRNG $rng PRNG. + * @param string $content Element contents. + * @return string HTML. 
+ */ +function wp_html_set_inner_html_fuzzer_render_html_element( string $tag, WP_HTML_Set_Inner_HTML_Fuzzer_PRNG $rng, string $content = 'x' ): string { + $attrs = wp_html_set_inner_html_fuzzer_attrs( $rng, 'html' ); + if ( in_array( $tag, wp_html_set_inner_html_fuzzer_html_void_elements(), true ) ) { + return "<{$tag}{$attrs}>"; + } + + if ( in_array( $tag, array( 'script', 'style', 'xmp', 'iframe', 'noembed', 'noframes', 'plaintext' ), true ) ) { + $content = 'style' === $tag ? 'a{color:red}' : '1 < 2 & 3'; + } + + if ( in_array( $tag, array( 'textarea', 'title' ), true ) ) { + $content = 'rcdata & text'; + } + + return "<{$tag}{$attrs}>{$content}"; +} + +/** + * Renders one SVG element inside an SVG container. + * + * @param string $tag Element name. + * @param WP_HTML_Set_Inner_HTML_Fuzzer_PRNG $rng PRNG. + * @return string HTML. + */ +function wp_html_set_inner_html_fuzzer_render_svg_element( string $tag, WP_HTML_Set_Inner_HTML_Fuzzer_PRNG $rng ): string { + $attrs = wp_html_set_inner_html_fuzzer_attrs( $rng, 'svg' ); + $content = in_array( $tag, array( 'script', 'style' ), true ) ? '1 < 2' : 'svg'; + return "<{$tag}{$attrs}>{$content}"; +} + +/** + * Renders one MathML element inside a MathML container. + * + * @param string $tag Element name. + * @param WP_HTML_Set_Inner_HTML_Fuzzer_PRNG $rng PRNG. + * @return string HTML. + */ +function wp_html_set_inner_html_fuzzer_render_mathml_element( string $tag, WP_HTML_Set_Inner_HTML_Fuzzer_PRNG $rng ): string { + $attrs = wp_html_set_inner_html_fuzzer_attrs( $rng, 'math' ); + $content = 'annotation-xml' === $tag ? '

    html integration

    ' : 'x'; + return "<{$tag}{$attrs}>{$content}"; +} + +/** + * Returns HTML element tags suitable for structurally safe source interiors. + * + * @return string[] Element names. + */ +function wp_html_set_inner_html_fuzzer_safe_html_elements(): array { + return array_values( + array_diff( + wp_html_set_inner_html_fuzzer_all_html_elements(), + array( + 'body', + 'frame', + 'frameset', + 'head', + 'html', + 'plaintext', + ) + ) + ); +} + +/** + * Returns one random balanced tree. + * + * @param WP_HTML_Set_Inner_HTML_Fuzzer_PRNG $rng PRNG. + * @param int $depth Remaining depth. + * @param bool $allow_leaks Whether leak-prone syntax is allowed. + * @return string HTML. + */ +function wp_html_set_inner_html_fuzzer_tree( WP_HTML_Set_Inner_HTML_Fuzzer_PRNG $rng, int $depth, bool $allow_leaks ): string { + if ( $depth <= 0 ) { + return $rng->choice( array( '', 'text', ' & ', '' ) ); + } + + $count = $rng->int( 1, 4 ); + $html = ''; + for ( $i = 0; $i < $count; ++$i ) { + $kind = $rng->choice( + $allow_leaks + ? array( 'text', 'html', 'svg', 'math', 'custom', 'template', 'table', 'leak' ) + : array( 'text', 'html', 'svg', 'math', 'custom', 'template', 'table' ) + ); + + switch ( $kind ) { + case 'text': + $html .= $rng->choice( array( 'text', '0', "line\nbreak", '', ' & ' ) ); + break; + + case 'html': + $tags = $allow_leaks + ? 
wp_html_set_inner_html_fuzzer_all_html_elements() + : wp_html_set_inner_html_fuzzer_safe_html_elements(); + $tag = $rng->choice( $tags ); + $html .= wp_html_set_inner_html_fuzzer_render_html_element( + $tag, + $rng, + wp_html_set_inner_html_fuzzer_tree( $rng, $depth - 1, false ) + ); + break; + + case 'svg': + $html .= wp_html_set_inner_html_fuzzer_render_svg_element( + $rng->choice( wp_html_set_inner_html_fuzzer_svg_elements() ), + $rng + ); + break; + + case 'math': + $html .= wp_html_set_inner_html_fuzzer_render_mathml_element( + $rng->choice( wp_html_set_inner_html_fuzzer_mathml_elements() ), + $rng + ); + break; + + case 'custom': + $tag = wp_html_set_inner_html_fuzzer_custom_element_name( $rng ); + $html .= "<{$tag}" . wp_html_set_inner_html_fuzzer_attrs( $rng ) . '>' . + wp_html_set_inner_html_fuzzer_tree( $rng, $depth - 1, false ) . + ""; + break; + + case 'template': + $html .= ''; + break; + + case 'table': + $html .= $rng->choice( + array( + '
    c
    cell
    ', + '
    h
    c
    ', + '
    c
    ', + ) + ); + break; + + case 'leak': + $html .= $rng->choice( + array( + '

    leak

    ', + 'leak', + 'nested', + 'unclosed', + 'x', + 'x', + 'tail', + ) + ); + break; + } + } + + return $html; +} + +/** + * Returns a generated HTML fragment. + * + * @param WP_HTML_Set_Inner_HTML_Fuzzer_PRNG $rng PRNG. + * @param int $max_snippets Maximum snippets. + * @param bool $allow_leaks Whether to include snippets intended to leak. + * @return string HTML. + */ +function wp_html_set_inner_html_fuzzer_fragment( WP_HTML_Set_Inner_HTML_Fuzzer_PRNG $rng, int $max_snippets = 5, bool $allow_leaks = true ): string { + $texts = array( '', 'text', ' &amp; ', '0', "line\nbreak", '<!--comment-->' ); + $snippets = array( + 'plain' => static function () use ( $rng, $texts ): string { + return $rng->choice( $texts ); + }, + 'element' => static function () use ( $rng ): string { + $tag = $rng->choice( wp_html_set_inner_html_fuzzer_all_html_elements() ); + return wp_html_set_inner_html_fuzzer_render_html_element( + $tag, + $rng, + $rng->choice( array( 'x', 'y', '<em>z</em>', '' ) ) + ); + }, + 'omitted' => static function () use ( $rng ): string { + return $rng->choice( array( '<p>one<p>two', '<ul><li>one<li>two</ul>', '<dl><dt>a<dd>b' ) ); + }, + 'foreign' => static function () use ( $rng ): string { + return $rng->choice( + array( + wp_html_set_inner_html_fuzzer_render_svg_element( + $rng->choice( wp_html_set_inner_html_fuzzer_svg_elements() ), + $rng + ), + '<svg><html lang="fr"></html></svg>', + wp_html_set_inner_html_fuzzer_render_mathml_element( + $rng->choice( wp_html_set_inner_html_fuzzer_mathml_elements() ), + $rng + ), + ) + ); + }, + 'template' => static function () use ( $rng ): string { + return $rng->choice( array( '<template><body add-class>t</template>', '<template></body><p>x</p></template>' ) ); + }, + 'rawtext' => static function () use ( $rng ): string { + return $rng->choice( array( '<script>1 < 2</script>', '<style>a{color:red}</style>', '<textarea>x</textarea>' ) ); + }, + 'leak' => static function () use ( $rng ): string { + return $rng->choice( 
array( '</div><p>leak</p>', '</section><span>leak</span>', '<a>nested</a>', '<b>unclosed', '<body add-class>x', '<html lang="en">x', '<plaintext>tail' ) ); + }, + 'table' => static function () use ( $rng ): string { + return $rng->choice( array( '<table><tr><td>c</td></tr></table>', '<table><td>c</table>' ) ); + }, + 'tree' => static function () use ( $rng, $allow_leaks ): string { + return wp_html_set_inner_html_fuzzer_tree( $rng, 2, $allow_leaks ); + }, + 'custom' => static function () use ( $rng ): string { + $tag = wp_html_set_inner_html_fuzzer_custom_element_name( $rng ); + return "<{$tag}" . wp_html_set_inner_html_fuzzer_attrs( $rng ) . '>custom</' . $tag . '>'; + }, + ); + + if ( ! $allow_leaks ) { + unset( $snippets['leak'] ); + unset( $snippets['omitted'] ); + } + + $html = ''; + $count = $rng->int( 0, $max_snippets ); + for ( $i = 0; $i < $count; ++$i ) { + $factory = $rng->choice( array_values( $snippets ) ); + $html .= $factory(); + } + + return $html; +} + +/** + * Builds one fuzz case. + * + * @param int $seed Seed. + * @return array<string, string|bool|int> Case data. 
+ */ +function wp_html_set_inner_html_fuzzer_case( int $seed ): array { + $rng = new WP_HTML_Set_Inner_HTML_Fuzzer_PRNG( $seed ); + + if ( $rng->chance( 35 ) ) { + $inner = 'Old'; + $replacement = wp_html_set_inner_html_fuzzer_fragment( $rng, 5 ); + $kind = $rng->choice( array( 'html', 'html', 'svg', 'math', 'custom' ) ); + + switch ( $kind ) { + case 'svg': + $target_case = wp_html_set_inner_html_fuzzer_svg_target_markup( + $rng->choice( wp_html_set_inner_html_fuzzer_svg_elements() ), + $inner, + $replacement + ); + break; + + case 'math': + $target_case = wp_html_set_inner_html_fuzzer_mathml_target_markup( + $rng->choice( wp_html_set_inner_html_fuzzer_mathml_elements() ), + $inner, + $replacement + ); + break; + + case 'custom': + $target_case = wp_html_set_inner_html_fuzzer_html_target_markup( + wp_html_set_inner_html_fuzzer_custom_element_name( $rng ), + $inner, + $replacement + ); + break; + + case 'html': + default: + $target_case = wp_html_set_inner_html_fuzzer_html_target_markup( + $rng->choice( wp_html_set_inner_html_fuzzer_all_html_elements() ), + $inner, + $replacement + ); + break; + } + + return array( + 'seed' => $seed, + 'full' => $target_case['full'], + 'targetTag' => $target_case['targetTag'], + 'html' => $target_case['html'], + 'replacement' => $target_case['replacement'], + 'expected' => $target_case['expected'], + ); + } + + $full = $rng->chance( 35 ); + $target_tag = $rng->choice( array( 'div', 'section', 'main', 'article' ) ); + $prefix = wp_html_set_inner_html_fuzzer_tree( $rng, 2, false ); + $inner = wp_html_set_inner_html_fuzzer_tree( $rng, 3, false ); + $suffix = wp_html_set_inner_html_fuzzer_tree( $rng, 2, false ); + $replace = wp_html_set_inner_html_fuzzer_fragment( $rng, 5 ); + $opener = "<{$target_tag} data-fuzz-target=\"1\">"; + $closer = "</{$target_tag}>"; + $fragment = $prefix . $opener . $inner . $closer . $suffix; + $expected = $prefix . $opener . $replace . $closer . 
$suffix; + + if ( $full ) { + $fragment = '<!DOCTYPE html><html><body>' . $fragment . '</body></html>'; + $expected = '<!DOCTYPE html><html><body>' . $expected . '</body></html>'; + } + + return array( + 'seed' => $seed, + 'full' => $full, + 'targetTag' => strtoupper( $target_tag ), + 'html' => $fragment, + 'replacement' => $replace, + 'expected' => $expected, + ); +} + +/** + * Returns deterministic regression cases to run before random fuzz cases. + * + * @return array<int, array<string, string|bool|int|null>> Corpus cases. + */ +function wp_html_set_inner_html_fuzzer_corpus_cases(): array { + $cases = array( + array( + 'seed' => 0, + 'name' => 'fragment-body-attribute-hoist', + 'full' => false, + 'targetTag' => 'MAIN', + 'html' => '<main data-fuzz-target="1">Old</main><span>After</span>', + 'replacement' => '<body add-class>New', + 'expected' => '<main data-fuzz-target="1"><body add-class>New</main><span>After</span>', + 'expectSet' => false, + ), + array( + 'seed' => 0, + 'name' => 'fragment-html-attribute-hoist', + 'full' => false, + 'targetTag' => 'MAIN', + 'html' => '<main data-fuzz-target="1">Old</main><span>After</span>', + 'replacement' => '<html lang="en">New', + 'expected' => '<main data-fuzz-target="1"><html lang="en">New</main><span>After</span>', + 'expectSet' => false, + ), + array( + 'seed' => 0, + 'name' => 'escaped-target-body-attribute-hoist', + 'full' => false, + 'targetTag' => 'DIV', + 'html' => '<div data-fuzz-target="1">Old</div><span>After</span>', + 'replacement' => '</div><body add-class>', + 'expected' => '<div data-fuzz-target="1"></div><body add-class></div><span>After</span>', + 'expectSet' => false, + ), + array( + 'seed' => 0, + 'name' => 'escaped-target-html-attribute-hoist', + 'full' => false, + 'targetTag' => 'DIV', + 'html' => '<div data-fuzz-target="1">Old</div><span>After</span>', + 'replacement' => '</div><html lang="en">', + 'expected' => '<div data-fuzz-target="1"></div><html lang="en"></div><span>After</span>', + 
'expectSet' => false, + ), + array( + 'seed' => 0, + 'name' => 'original-body-attribute-hoist-would-be-removed', + 'full' => false, + 'targetTag' => 'DIV', + 'html' => '<div data-fuzz-target="1"><body add-class>Old</div><span>After</span>', + 'replacement' => '<p>New</p>', + 'expected' => '<div data-fuzz-target="1"><p>New</p></div><span>After</span>', + 'expectSet' => false, + ), + array( + 'seed' => 0, + 'name' => 'original-html-attribute-hoist-would-be-removed', + 'full' => false, + 'targetTag' => 'DIV', + 'html' => '<div data-fuzz-target="1"><html lang="en">Old</div><span>After</span>', + 'replacement' => '<p>New</p>', + 'expected' => '<div data-fuzz-target="1"><p>New</p></div><span>After</span>', + 'expectSet' => false, + ), + array( + 'seed' => 0, + 'name' => 'template-body-tag-does-not-hoist', + 'full' => false, + 'targetTag' => 'DIV', + 'html' => '<div data-fuzz-target="1">Old</div><span>After</span>', + 'replacement' => '<template><body add-class>New</template>', + 'expected' => '<div data-fuzz-target="1"><template><body add-class>New</template></div><span>After</span>', + 'expectSet' => true, + ), + array( + 'seed' => 0, + 'name' => 'foreign-html-tag-does-not-hoist', + 'full' => false, + 'targetTag' => 'SVG', + 'html' => '<svg data-fuzz-target="1"><title>Old</title></svg><span>After</span>', + 'replacement' => '<html lang="fr"></html>', + 'expected' => '<svg data-fuzz-target="1"><html lang="fr"></html></svg><span>After</span>', + 'expectSet' => true, + ), + array( + 'seed' => 0, + 'name' => 'full-document-body-attribute-hoist', + 'full' => true, + 'targetTag' => 'MAIN', + 'html' => '<!DOCTYPE html><html><body><main data-fuzz-target="1">Old</main><span>After</span></body></html>', + 'replacement' => '<body add-class>New', + 'expected' => '<!DOCTYPE html><html><body><main data-fuzz-target="1"><body add-class>New</main><span>After</span></body></html>', + 'expectSet' => false, + ), + array( + 'seed' => 0, + 'name' => 'full-document-html-attribute-hoist', + 
'full' => true, + 'targetTag' => 'MAIN', + 'html' => '<!DOCTYPE html><html><body><main data-fuzz-target="1">Old</main><span>After</span></body></html>', + 'replacement' => '<html lang="en">New', + 'expected' => '<!DOCTYPE html><html><body><main data-fuzz-target="1"><html lang="en">New</main><span>After</span></body></html>', + 'expectSet' => false, + ), + ); + + foreach ( wp_html_set_inner_html_fuzzer_all_html_elements() as $tag ) { + $rng = new WP_HTML_Set_Inner_HTML_Fuzzer_PRNG( 'corpus-html-' . $tag ); + $replacement = wp_html_set_inner_html_fuzzer_render_html_element( $tag, $rng, '<span>html</span>' ); + $cases[] = array( + 'seed' => 0, + 'name' => 'coverage-html-' . $tag, + 'full' => false, + 'targetTag' => 'DIV', + 'html' => '<div data-fuzz-target="1">Old</div><span>After</span>', + 'replacement' => $replacement, + 'expected' => '<div data-fuzz-target="1">' . $replacement . '</div><span>After</span>', + 'expectSet' => null, + ); + } + + foreach ( wp_html_set_inner_html_fuzzer_svg_elements() as $tag ) { + $rng = new WP_HTML_Set_Inner_HTML_Fuzzer_PRNG( 'corpus-svg-' . $tag ); + $replacement = wp_html_set_inner_html_fuzzer_render_svg_element( $tag, $rng ); + $cases[] = array( + 'seed' => 0, + 'name' => 'coverage-svg-' . strtolower( $tag ), + 'full' => false, + 'targetTag' => 'DIV', + 'html' => '<div data-fuzz-target="1">Old</div><span>After</span>', + 'replacement' => $replacement, + 'expected' => '<div data-fuzz-target="1">' . $replacement . '</div><span>After</span>', + 'expectSet' => null, + ); + } + + foreach ( wp_html_set_inner_html_fuzzer_mathml_elements() as $tag ) { + $rng = new WP_HTML_Set_Inner_HTML_Fuzzer_PRNG( 'corpus-mathml-' . $tag ); + $replacement = wp_html_set_inner_html_fuzzer_render_mathml_element( $tag, $rng ); + $cases[] = array( + 'seed' => 0, + 'name' => 'coverage-mathml-' . 
$tag, + 'full' => false, + 'targetTag' => 'DIV', + 'html' => '<div data-fuzz-target="1">Old</div><span>After</span>', + 'replacement' => $replacement, + 'expected' => '<div data-fuzz-target="1">' . $replacement . '</div><span>After</span>', + 'expectSet' => null, + ); + } + + for ( $i = 0; $i < 32; ++$i ) { + $rng = new WP_HTML_Set_Inner_HTML_Fuzzer_PRNG( 'corpus-custom-' . $i ); + $tag = wp_html_set_inner_html_fuzzer_custom_element_name( $rng ); + $replacement = "<{$tag}" . wp_html_set_inner_html_fuzzer_attrs( $rng ) . '>custom</' . $tag . '>'; + $cases[] = array( + 'seed' => 0, + 'name' => 'coverage-custom-' . $i . '-' . $tag, + 'full' => false, + 'targetTag' => 'DIV', + 'html' => '<div data-fuzz-target="1">Old</div><span>After</span>', + 'replacement' => $replacement, + 'expected' => '<div data-fuzz-target="1">' . $replacement . '</div><span>After</span>', + 'expectSet' => null, + ); + } + + foreach ( wp_html_set_inner_html_fuzzer_all_html_elements() as $tag ) { + $target_case = wp_html_set_inner_html_fuzzer_html_target_markup( + $tag, + 'Old', + '<span>target</span>' + ); + + $cases[] = array( + 'seed' => 0, + 'name' => 'target-html-' . $tag, + 'full' => $target_case['full'], + 'targetTag' => $target_case['targetTag'], + 'html' => $target_case['html'], + 'replacement' => $target_case['replacement'], + 'expected' => $target_case['expected'], + 'expectSet' => $target_case['expectSet'], + ); + } + + foreach ( wp_html_set_inner_html_fuzzer_svg_elements() as $tag ) { + $target_case = wp_html_set_inner_html_fuzzer_svg_target_markup( + $tag, + '<title>Old</title>', + '<title>target</title>' + ); + + $cases[] = array( + 'seed' => 0, + 'name' => 'target-svg-' . 
strtolower( $tag ), + 'full' => $target_case['full'], + 'targetTag' => $target_case['targetTag'], + 'html' => $target_case['html'], + 'replacement' => $target_case['replacement'], + 'expected' => $target_case['expected'], + 'expectSet' => $target_case['expectSet'], + ); + } + + foreach ( wp_html_set_inner_html_fuzzer_mathml_elements() as $tag ) { + $target_case = wp_html_set_inner_html_fuzzer_mathml_target_markup( + $tag, + '<mi>Old</mi>', + '<mi>target</mi>' + ); + + $cases[] = array( + 'seed' => 0, + 'name' => 'target-mathml-' . $tag, + 'full' => $target_case['full'], + 'targetTag' => $target_case['targetTag'], + 'html' => $target_case['html'], + 'replacement' => $target_case['replacement'], + 'expected' => $target_case['expected'], + 'expectSet' => $target_case['expectSet'], + ); + } + + for ( $i = 0; $i < 32; ++$i ) { + $rng = new WP_HTML_Set_Inner_HTML_Fuzzer_PRNG( 'target-custom-' . $i ); + $tag = wp_html_set_inner_html_fuzzer_custom_element_name( $rng ); + $replacement = '<span>target</span>'; + $target_case = wp_html_set_inner_html_fuzzer_html_target_markup( $tag, 'Old', $replacement ); + $cases[] = array( + 'seed' => 0, + 'name' => 'target-custom-' . $i . '-' . $tag, + 'full' => $target_case['full'], + 'targetTag' => $target_case['targetTag'], + 'html' => $target_case['html'], + 'replacement' => $target_case['replacement'], + 'expected' => $target_case['expected'], + 'expectSet' => $target_case['expectSet'], + ); + } + + return $cases; +} + +/** + * Creates a processor for a case. + * + * @param string $html HTML. + * @param bool $full Whether to create a full parser. + * @return WP_HTML_Processor|null Processor. + */ +function wp_html_set_inner_html_fuzzer_create_processor( string $html, bool $full ): ?WP_HTML_Processor { + return $full ? WP_HTML_Processor::create_full_parser( $html ) : WP_HTML_Processor::create_fragment( $html ); +} + +/** + * Moves a processor to the fuzz target. + * + * @param WP_HTML_Processor $processor Processor. 
+ * @return bool Whether the target was found. + */ +function wp_html_set_inner_html_fuzzer_seek_target( WP_HTML_Processor $processor ): bool { + while ( $processor->next_tag() ) { + if ( '1' === $processor->get_attribute( 'data-fuzz-target' ) ) { + return true; + } + } + + return false; +} + +/** + * Returns a signature for the current token. + * + * @param WP_HTML_Processor $processor Processor. + * @return array<string, mixed> Token signature. + */ +function wp_html_set_inner_html_fuzzer_token_signature( WP_HTML_Processor $processor ): array { + return array( + 'type' => $processor->get_token_type(), + 'name' => $processor->get_token_name(), + 'namespace' => $processor->get_namespace(), + 'isCloser' => $processor->is_tag_closer(), + 'breadcrumbs' => $processor->get_breadcrumbs(), + 'html' => $processor->serialize_token(), + ); +} + +/** + * Returns a signature for parser continuation from the current token. + * + * @param WP_HTML_Processor $processor Processor. + * @return array{tokens: array<int, array<string, mixed>>, lastError: string|null} Continuation signature. + */ +function wp_html_set_inner_html_fuzzer_continuation_signature( WP_HTML_Processor $processor ): array { + $signature = array(); + + while ( $processor->next_token() ) { + $signature[] = wp_html_set_inner_html_fuzzer_token_signature( $processor ); + } + + return array( + 'tokens' => $signature, + 'lastError' => $processor->get_last_error(), + ); +} + +/** + * Returns a token signature outside the fuzz target. + * + * @param string $html HTML. + * @param bool $full Whether to create a full parser. + * @return array<int, array<string, mixed>>|null Signature, or null if unsupported. 
+ */ +function wp_html_set_inner_html_fuzzer_outer_signature( string $html, bool $full ): ?array { + $processor = wp_html_set_inner_html_fuzzer_create_processor( $html, $full ); + if ( null === $processor ) { + return null; + } + + $signature = array(); + $skipping = false; + $target_tag = null; + $target_parent_depth = null; + + while ( $processor->next_token() ) { + if ( ! $skipping ) { + $signature[] = wp_html_set_inner_html_fuzzer_token_signature( $processor ); + + if ( '#tag' === $processor->get_token_type() && ! $processor->is_tag_closer() && '1' === $processor->get_attribute( 'data-fuzz-target' ) ) { + $skipping = true; + $target_tag = $processor->get_tag(); + $target_parent_depth = count( $processor->get_breadcrumbs() ) - 1; + } + continue; + } + + if ( + '#tag' === $processor->get_token_type() && + $processor->is_tag_closer() && + $processor->get_tag() === $target_tag && + count( $processor->get_breadcrumbs() ) === $target_parent_depth + ) { + $signature[] = wp_html_set_inner_html_fuzzer_token_signature( $processor ); + $skipping = false; + } + } + + if ( null !== $processor->get_last_error() || $skipping ) { + return null; + } + + return $signature; +} + +/** + * Renders a tree with the optional Lexbor oracle. + * + * @param string $html HTML. + * @param bool $full Whether to parse a full document. + * @param string $lexbor_oracle_bin Oracle binary path. + * @return array<string, mixed> Oracle result. + */ +function wp_html_set_inner_html_fuzzer_lexbor_tree( string $html, bool $full, string $lexbor_oracle_bin ): array { + $input = tempnam( sys_get_temp_dir(), 'wp-html-set-inner-html-' ); + if ( false === $input ) { + return array( + 'status' => 'error', + 'error' => 'Could not create temporary oracle input.', + ); + } + + file_put_contents( $input, $html ); + $mode = $full ? 'full-document' : 'fragment-body'; + $command = escapeshellarg( $lexbor_oracle_bin ) . + ' --mode ' . escapeshellarg( $mode ) . + ' --context body --max-nodes 10000 --input ' . 
escapeshellarg( $input ); + $output = array(); + $status = 0; + exec( $command, $output, $status ); + @unlink( $input ); + + if ( 0 !== $status ) { + return array( + 'status' => 'error', + 'error' => 'Lexbor oracle exited with status ' . $status, + 'output' => implode( "\n", $output ), + ); + } + + $result = json_decode( implode( "\n", $output ), true ); + if ( ! is_array( $result ) ) { + return array( + 'status' => 'error', + 'error' => 'Lexbor oracle returned invalid JSON.', + 'output' => implode( "\n", $output ), + ); + } + + return $result; +} + +/** + * Counts the leading spaces in a rendered tree line. + * + * @param string $line Rendered tree line. + * @return int Leading spaces. + */ +function wp_html_set_inner_html_fuzzer_tree_indent( string $line ): int { + return strspn( $line, ' ' ); +} + +/** + * Returns an outside-target signature from an html5lib-style rendered tree. + * + * @param string $tree Rendered tree. + * @return string|null Signature, or null when the target marker is absent. + */ +function wp_html_set_inner_html_fuzzer_lexbor_outer_tree_signature( string $tree ): ?string { + $lines = preg_split( "/\r\n|\n|\r/", trim( $tree ) ); + $signature = array(); + $target_found = false; + $count = count( $lines ); + + for ( $i = 0; $i < $count; ++$i ) { + $line = $lines[ $i ]; + if ( ! preg_match( '/^(\s*)<[^>]+>$/', $line, $matches ) ) { + $signature[] = $line; + continue; + } + + $indent = strlen( $matches[1] ); + $is_target = false; + $lookahead_i = $i + 1; + while ( $lookahead_i < $count ) { + $lookahead = $lines[ $lookahead_i ]; + $lookahead_indent = wp_html_set_inner_html_fuzzer_tree_indent( $lookahead ); + if ( $lookahead_indent <= $indent ) { + break; + } + if ( $lookahead_indent === $indent + 2 && preg_match( '/^\s*data-fuzz-target="1"$/', $lookahead ) ) { + $is_target = true; + break; + } + ++$lookahead_i; + } + + if ( ! 
$is_target ) { + $signature[] = $line; + continue; + } + + $target_found = true; + $signature[] = $line; + for ( $j = $i + 1; $j < $count; ++$j ) { + $child_line = $lines[ $j ]; + $child_indent = wp_html_set_inner_html_fuzzer_tree_indent( $child_line ); + if ( $child_indent <= $indent ) { + $i = $j - 1; + break; + } + if ( $child_indent === $indent + 2 && preg_match( '/^\s*[^<"\s][^=]*=".*"$/', $child_line ) ) { + $signature[] = $child_line; + } + if ( $j === $count - 1 ) { + $i = $j; + } + } + } + + return $target_found ? implode( "\n", $signature ) : null; +} + +/** + * Checks accepted updates with the optional Lexbor oracle. + * + * @param string $original Original HTML. + * @param string $updated Updated HTML. + * @param bool $full Whether to parse a full document. + * @param string|null $lexbor_oracle_bin Optional Lexbor oracle binary. + * @return array<string, mixed> Check result. + */ +function wp_html_set_inner_html_fuzzer_check_lexbor_outside_tree( string $original, string $updated, bool $full, ?string $lexbor_oracle_bin ): array { + if ( null === $lexbor_oracle_bin ) { + return array( 'status' => 'skipped' ); + } + + $original_tree = wp_html_set_inner_html_fuzzer_lexbor_tree( $original, $full, $lexbor_oracle_bin ); + $updated_tree = wp_html_set_inner_html_fuzzer_lexbor_tree( $updated, $full, $lexbor_oracle_bin ); + + if ( 'ok' !== ( $original_tree['status'] ?? null ) || 'ok' !== ( $updated_tree['status'] ?? 
null ) ) { + return array( + 'status' => 'skipped', + 'originalTree' => $original_tree, + 'updatedTree' => $updated_tree, + ); + } + + $original_signature = wp_html_set_inner_html_fuzzer_lexbor_outer_tree_signature( (string) $original_tree['tree'] ); + $updated_signature = wp_html_set_inner_html_fuzzer_lexbor_outer_tree_signature( (string) $updated_tree['tree'] ); + if ( null === $original_signature || null === $updated_signature ) { + return array( + 'status' => 'skipped', + 'originalSignature' => $original_signature, + 'updatedSignature' => $updated_signature, + ); + } + + return array( + 'status' => $original_signature === $updated_signature ? 'ok' : 'changed', + 'originalSignature' => $original_signature, + 'updatedSignature' => $updated_signature, + 'originalOracle' => $original_tree['oracle'] ?? null, + 'updatedOracle' => $updated_tree['oracle'] ?? null, + 'originalSelfCheck' => $original_tree['selfCheck'] ?? null, + 'updatedSelfCheck' => $updated_tree['selfCheck'] ?? null, + ); +} + +/** + * Returns a compact coverage inventory summary. + * + * @return array<string, int> Coverage counts. + */ +function wp_html_set_inner_html_fuzzer_coverage_summary(): array { + return array( + 'htmlElements' => count( wp_html_set_inner_html_fuzzer_all_html_elements() ), + 'svgElements' => count( wp_html_set_inner_html_fuzzer_svg_elements() ), + 'mathmlElements' => count( wp_html_set_inner_html_fuzzer_mathml_elements() ), + 'customElementCorpus' => 32, + 'htmlTargetElements' => count( wp_html_set_inner_html_fuzzer_all_html_elements() ), + 'svgTargetElements' => count( wp_html_set_inner_html_fuzzer_svg_elements() ), + 'mathmlTargetElements' => count( wp_html_set_inner_html_fuzzer_mathml_elements() ), + 'customTargetCorpus' => 32, + ); +} + +/** + * Returns exact coverage inventory details. + * + * @return array<string, mixed> Coverage details. 
+ */ +function wp_html_set_inner_html_fuzzer_coverage_details(): array { + $custom_replacement_elements = array(); + $custom_target_elements = array(); + + for ( $i = 0; $i < 32; ++$i ) { + $custom_replacement_elements[] = wp_html_set_inner_html_fuzzer_custom_element_name( + new WP_HTML_Set_Inner_HTML_Fuzzer_PRNG( 'corpus-custom-' . $i ) + ); + $custom_target_elements[] = wp_html_set_inner_html_fuzzer_custom_element_name( + new WP_HTML_Set_Inner_HTML_Fuzzer_PRNG( 'target-custom-' . $i ) + ); + } + + return array( + 'sources' => array( + 'htmlCurrent' => 'https://html.spec.whatwg.org/multipage/indices.html#elements-3', + 'htmlObsolete' => 'https://html.spec.whatwg.org/multipage/obsolete.html', + 'svg' => 'https://svgwg.org/svg2-draft/eltindex.html', + 'mathmlCore' => 'https://www.w3.org/TR/mathml-core/#elements-and-attributes', + 'mathml3RelaxNG' => 'https://www.w3.org/Math/RelaxNG/mathml3/', + ), + 'htmlElements' => wp_html_set_inner_html_fuzzer_all_html_elements(), + 'htmlCurrentElements' => wp_html_set_inner_html_fuzzer_html_elements(), + 'htmlDeprecatedElements' => wp_html_set_inner_html_fuzzer_deprecated_html_elements(), + 'svgElements' => wp_html_set_inner_html_fuzzer_svg_elements(), + 'mathmlElements' => wp_html_set_inner_html_fuzzer_mathml_elements(), + 'customElementCorpus' => $custom_replacement_elements, + 'htmlTargetElements' => wp_html_set_inner_html_fuzzer_all_html_elements(), + 'svgTargetElements' => wp_html_set_inner_html_fuzzer_svg_elements(), + 'mathmlTargetElements' => wp_html_set_inner_html_fuzzer_mathml_elements(), + 'customTargetCorpus' => $custom_target_elements, + ); +} + +/** + * Writes a failing case. + * + * @param string $output_dir Output directory. + * @param array<string, mixed> $failure Failure. + */ +function wp_html_set_inner_html_fuzzer_write_failure( string $output_dir, array $failure ): void { + if ( ! is_dir( $output_dir ) ) { + mkdir( $output_dir, 0777, true ); + } + + $name = isset( $failure['case']['name'] ) + ? '-' . 
preg_replace( '/[^A-Za-z0-9_.-]+/', '-', (string) $failure['case']['name'] ) + : ''; + + file_put_contents( + $output_dir . '/failure-seed-' . $failure['seed'] . $name . '.json', + json_encode( $failure, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_SUBSTITUTE ) . "\n" + ); +} + +/** + * Runs one fuzz case. + * + * @param array<string, string|bool|int|null> $case Case. + * @param string|null $lexbor_oracle_bin Optional Lexbor oracle binary. + * @return array<string, mixed> Result. + */ +function wp_html_set_inner_html_fuzzer_run_case( array $case, ?string $lexbor_oracle_bin = null ): array { + $processor = wp_html_set_inner_html_fuzzer_create_processor( $case['html'], $case['full'] ); + if ( null === $processor || ! wp_html_set_inner_html_fuzzer_seek_target( $processor ) ) { + return array( + 'ok' => true, + 'status' => 'unsupported-original', + ); + } + + $original_signature = wp_html_set_inner_html_fuzzer_outer_signature( $case['html'], $case['full'] ); + $set = $processor->set_inner_html( $case['replacement'] ); + $updated = $processor->get_updated_html(); + $last_error = $processor->get_last_error(); + + if ( array_key_exists( 'expectSet', $case ) && null !== $case['expectSet'] && $set !== $case['expectSet'] ) { + return array( + 'ok' => false, + 'failure' => $case['expectSet'] ? 'expected-acceptance' : 'expected-rejection', + 'updated' => $updated, + 'lastError' => $processor->get_last_error(), + ); + } + + $expected_continuation_html = $set ? 
$updated : $case['html']; + $expected_continuation_processor = wp_html_set_inner_html_fuzzer_create_processor( $expected_continuation_html, $case['full'] ); + if ( null !== $expected_continuation_processor && wp_html_set_inner_html_fuzzer_seek_target( $expected_continuation_processor ) ) { + $expected_continuation = wp_html_set_inner_html_fuzzer_continuation_signature( $expected_continuation_processor ); + $actual_continuation = wp_html_set_inner_html_fuzzer_continuation_signature( $processor ); + + if ( $actual_continuation !== $expected_continuation ) { + return array( + 'ok' => false, + 'failure' => 'set-inner-html-changed-live-continuation', + 'expectedContinuation' => $expected_continuation, + 'actualContinuation' => $actual_continuation, + 'updated' => $updated, + ); + } + } + + if ( ! $set ) { + if ( $updated !== $case['html'] ) { + return array( + 'ok' => false, + 'failure' => 'rejected-update-changed-html', + 'updated' => $updated, + ); + } + + if ( null !== $last_error ) { + return array( + 'ok' => false, + 'failure' => 'rejected-update-poisoned-processor', + 'lastError' => $last_error, + ); + } + + return array( + 'ok' => true, + 'status' => 'rejected', + ); + } + + if ( $updated !== $case['expected'] ) { + return array( + 'ok' => false, + 'failure' => 'accepted-update-did-not-set-raw-inner-html', + 'expected' => $case['expected'], + 'updated' => $updated, + ); + } + + if ( null === $original_signature ) { + return array( + 'ok' => true, + 'status' => 'accepted-original-signature-unsupported', + ); + } + + $updated_signature = wp_html_set_inner_html_fuzzer_outer_signature( $updated, $case['full'] ); + if ( null === $updated_signature ) { + return array( + 'ok' => false, + 'failure' => 'accepted-update-produced-unsupported-output', + 'updated' => $updated, + ); + } + + if ( $original_signature !== $updated_signature ) { + return array( + 'ok' => false, + 'failure' => 'accepted-update-changed-outside-tree', + 'originalSignature' => $original_signature, + 
'updatedSignature' => $updated_signature, + 'updated' => $updated, + ); + } + + $lexbor_check = wp_html_set_inner_html_fuzzer_check_lexbor_outside_tree( + $case['html'], + $updated, + $case['full'], + $lexbor_oracle_bin + ); + if ( 'changed' === $lexbor_check['status'] ) { + return array( + 'ok' => false, + 'failure' => 'accepted-update-changed-lexbor-outside-tree', + 'lexborCheck' => $lexbor_check, + 'updated' => $updated, + ); + } + + if ( 'ok' === $lexbor_check['status'] ) { + return array( + 'ok' => true, + 'status' => 'accepted-lexbor-checked', + ); + } + + if ( 'skipped' === $lexbor_check['status'] && null !== $lexbor_oracle_bin ) { + return array( + 'ok' => true, + 'status' => 'accepted-lexbor-skipped', + ); + } + + return array( + 'ok' => true, + 'status' => 'accepted', + ); +} + +$options = wp_html_set_inner_html_fuzzer_parse_options( $argv ); +if ( isset( $options['help'] ) || isset( $options['h'] ) ) { + wp_html_set_inner_html_fuzzer_usage(); + exit( 0 ); +} + +if ( isset( $options['coverage-details'] ) ) { + echo json_encode( + array( + 'summary' => wp_html_set_inner_html_fuzzer_coverage_summary(), + 'details' => wp_html_set_inner_html_fuzzer_coverage_details(), + ), + JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES + ) . "\n"; + exit( 0 ); +} + +$iterations = wp_html_set_inner_html_fuzzer_int_option( $options, 'iterations', 1000 ); +$start_seed = wp_html_set_inner_html_fuzzer_int_option( $options, 'start-seed', 1 ); +$stop_on_failure = isset( $options['stop-on-failure'] ); +$output_dir = wp_html_set_inner_html_fuzzer_string_option( $options, 'output-dir', dirname( __DIR__, 2 ) . 
'/artifacts/html-api-fuzz/set-inner-html' ); +$lexbor_oracle_bin = wp_html_set_inner_html_fuzzer_lexbor_oracle_bin( $options ); + +wp_html_set_inner_html_fuzzer_bootstrap(); + +$counts = array( + 'corpus' => 0, + 'accepted' => 0, + 'accepted-lexbor-checked' => 0, + 'accepted-lexbor-skipped' => 0, + 'accepted-original-signature-unsupported' => 0, + 'rejected' => 0, + 'unsupported-original' => 0, + 'failures' => 0, +); + +foreach ( wp_html_set_inner_html_fuzzer_corpus_cases() as $case ) { + $result = wp_html_set_inner_html_fuzzer_run_case( $case, $lexbor_oracle_bin ); + ++$counts['corpus']; + + if ( ! $result['ok'] ) { + ++$counts['failures']; + $failure = array( + 'seed' => $case['seed'], + 'case' => $case, + 'result' => $result, + ); + wp_html_set_inner_html_fuzzer_write_failure( $output_dir, $failure ); + fwrite( STDERR, 'Failure in corpus case ' . $case['name'] . ': ' . $result['failure'] . "\n" ); + if ( $stop_on_failure ) { + echo json_encode( + array( + 'ok' => false, + 'startSeed' => $start_seed, + 'iterations' => $iterations, + 'counts' => $counts, + 'outputDir' => $output_dir, + 'coverage' => wp_html_set_inner_html_fuzzer_coverage_summary(), + ), + JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES + ) . "\n"; + exit( 1 ); + } + continue; + } + + $status = $result['status']; + if ( ! isset( $counts[ $status ] ) ) { + $counts[ $status ] = 0; + } + ++$counts[ $status ]; +} + +for ( $i = 0; $i < $iterations; ++$i ) { + $seed = $start_seed + $i; + $case = wp_html_set_inner_html_fuzzer_case( $seed ); + $result = wp_html_set_inner_html_fuzzer_run_case( $case, $lexbor_oracle_bin ); + + if ( ! $result['ok'] ) { + ++$counts['failures']; + $failure = array( + 'seed' => $seed, + 'case' => $case, + 'result' => $result, + ); + wp_html_set_inner_html_fuzzer_write_failure( $output_dir, $failure ); + fwrite( STDERR, 'Failure at seed ' . $seed . ': ' . $result['failure'] . "\n" ); + if ( $stop_on_failure ) { + break; + } + continue; + } + + $status = $result['status']; + if ( ! 
isset( $counts[ $status ] ) ) { + $counts[ $status ] = 0; + } + ++$counts[ $status ]; +} + +echo json_encode( + array( + 'ok' => 0 === $counts['failures'], + 'startSeed' => $start_seed, + 'iterations' => $iterations, + 'counts' => $counts, + 'outputDir' => $output_dir, + 'coverage' => wp_html_set_inner_html_fuzzer_coverage_summary(), + 'lexborOracle' => null === $lexbor_oracle_bin ? null : $lexbor_oracle_bin, + ), + JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES +) . "\n"; + +exit( 0 === $counts['failures'] ? 0 : 1 );