Skip to content

Commit 5303dfb

Browse files
authored
[BlockMarkupURLProcessor] Support URLs in CSS (#195)
## Description

Adds support for migrating URLs within CSS syntax in the `style` HTML attribute during WXR imports. For example, this markup:

```html
<div style="background-image:url(https://oldsite.com/image.png)">
```

Would be rewritten as:

```html
<div style="background-image:url(https://newsite.com/image.png)">
```

## Motivation

When importing WordPress sites via WXR, URLs embedded in CSS (like `background: url("/old-site.com/image.jpg")`) need to be migrated to the new site. Previously, these URLs were missed, leading to broken images and assets after import.

[Cover blocks are a good example](WordPress/wordpress-importer#223). Without this PR, the background image in this cover block would not be rewritten:

```html
<!-- wp:cover {"url":"http://localhost:8881/wp-content/image.jpg"} -->
<div style="background-position:50% 50%;background-image:url(http://localhost:8881/wp-content/uploads/2025/09/image-2-766x1024.jpeg)">
```

### Implementation

The implementation introduces a new `CSSURLProcessor` class that can parse CSS `url()` functions, handle CSS escape sequences, and efficiently skip over large data URIs. It uses the same design principles as `WP_HTML_Tag_Processor`: simple state-machine API, no regexps, minimal allocations.

The `CSSURLProcessor` is integrated with `BlockMarkupUrlProcessor` and can be used as follows:

```php
$markup = '<div style="background: url(&quot;/old.jpg&quot;)">Content</div>';
$processor = new BlockMarkupUrlProcessor( $markup, 'https://new-site.com' );
while ( $processor->next_url() ) {
	// Finds "/old.jpg" in the style attribute
	$processor->set_raw_url( '/new.jpg' );
}
echo $processor->get_updated_html();
// Output: <div style="background: url(&quot;/new.jpg&quot;)">Content</div>
```

## Testing instructions

* Review thoroughly
* Confirm the CI tests pass
1 parent 9cae1f9 commit 5303dfb

File tree

6 files changed

+1161
-38
lines changed

6 files changed

+1161
-38
lines changed

components/DataLiberation/BlockMarkup/class-blockmarkupurlprocessor.php

Lines changed: 98 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,8 @@
44

55
use Rowbot\URL\URL;
66
use WordPress\DataLiberation\URL\URLInTextProcessor;
7+
use WordPress\DataLiberation\URL\CSSURLProcessor;
78
use WordPress\DataLiberation\URL\WPURL;
8-
use WordPress\DataLiberation\URL\ConvertedUrl;
9-
10-
use function WordPress\DataLiberation\URL\urldecode_n;
119

1210
/**
1311
* Reports all the URLs in the imported post and enables rewriting them.
@@ -23,6 +21,8 @@ class BlockMarkupUrlProcessor extends BlockMarkupProcessor {
2321
private $base_url_object;
2422
private $url_in_text_processor;
2523
private $url_in_text_node_updated;
24+
private $css_url_processor;
25+
private $css_url_processor_updated;
2626

2727
/**
2828
* The list of names of URL-related HTML attributes that may be available on
@@ -52,6 +52,14 @@ public function get_updated_html(): string {
5252
$this->url_in_text_node_updated = false;
5353
}
5454

55+
if ( $this->css_url_processor_updated ) {
56+
if ( null !== $this->css_url_processor ) {
57+
$updated_css = $this->css_url_processor->get_updated_css();
58+
$this->set_attribute( 'style', $updated_css );
59+
}
60+
$this->css_url_processor_updated = false;
61+
}
62+
5563
return parent::get_updated_html();
5664
}
5765

@@ -70,8 +78,11 @@ public function next_token(): bool {
7078
$this->parsed_url = null;
7179
$this->inspecting_html_attributes = null;
7280
$this->url_in_text_processor = null;
73-
// Do not reset url_in_text_node_updated – it's reset in get_updated_html() which
74-
// is called in parent::next_token().
81+
$this->css_url_processor = null;
82+
/*
83+
* Do not reset url_in_text_node_updated or css_url_processor_updated – they're reset
84+
* in get_updated_html() which is called in parent::next_token().
85+
*/
7586

7687
return parent::next_token();
7788
}
@@ -111,7 +122,7 @@ private function next_url_in_text_node() {
111122
* way to recognize a substring "WordPress.org" as a URL. We might
112123
* get some false positives this way, e.g. in this string:
113124
*
114-
* > And that's how you build a theme.Now let's take a look at..."
125+
* > And that's how you build a theme. Now let's take a look at..."
115126
*
116127
* `theme.Now` would be recognized as a URL. It's up to the API consumer
117128
* to filter out such false positives e.g. by checking the domain against
@@ -130,20 +141,75 @@ private function next_url_in_text_node() {
130141
return false;
131142
}
132143

144+
/**
145+
* Advances to the next CSS URL in the `style` attribute of the current tag token.
146+
*
147+
* @return bool Whether a CSS URL was found.
148+
*/
149+
private function next_url_in_css() {
150+
if ( '#tag' !== $this->get_token_type() ) {
151+
return false;
152+
}
153+
154+
if ( null === $this->css_url_processor ) {
155+
$css_value = $this->get_attribute( 'style' );
156+
if ( ! is_string( $css_value ) ) {
157+
return false;
158+
}
159+
160+
$this->css_url_processor = new CSSURLProcessor( $css_value );
161+
}
162+
163+
while ( $this->css_url_processor->next_url() ) {
164+
/**
165+
* Skip data URIs. They may be really large and they don't
166+
* have a hostname to migrate.
167+
*/
168+
if ( $this->css_url_processor->is_data_uri() ) {
169+
continue;
170+
}
171+
$this->raw_url = $this->css_url_processor->get_raw_url();
172+
$this->parsed_url = WPURL::parse( $this->raw_url, $this->base_url_string );
173+
if ( false === $this->parsed_url ) {
174+
continue;
175+
}
176+
177+
return true;
178+
}
179+
180+
return false;
181+
}
182+
133183
private function next_url_attribute() {
134184
$tag = $this->get_tag();
135185

136-
if ( ! array_key_exists( $tag, self::HTML_ATTRIBUTES_TO_ACCEPT_RELATIVE_URLS_FROM ) ) {
137-
return false;
186+
// Check if we have a style attribute with CSS URLs to process.
187+
if ( null !== $this->css_url_processor ) {
188+
if ( $this->next_url_in_css() ) {
189+
return true;
190+
}
191+
// Done with CSS URLs in this attribute, apply any pending updates and move on.
192+
$this->get_updated_html();
193+
$this->css_url_processor = null;
138194
}
139195

140196
if ( null === $this->inspecting_html_attributes ) {
141-
/**
142-
* Initialize the list on the first call to next_url_attribute()
143-
* for the current token. The last element is the attribute we'll
144-
* inspect in the while() loop below.
145-
*/
146-
$this->inspecting_html_attributes = self::HTML_ATTRIBUTES_TO_ACCEPT_RELATIVE_URLS_FROM[ $tag ];
197+
if ( array_key_exists( $tag, self::HTML_ATTRIBUTES_TO_ACCEPT_RELATIVE_URLS_FROM ) ) {
198+
/**
199+
* Initialize the list on the first call to next_url_attribute()
200+
* for the current token. The last element is the attribute we'll
201+
* inspect in the while() loop below.
202+
*/
203+
$this->inspecting_html_attributes = self::HTML_ATTRIBUTES_TO_ACCEPT_RELATIVE_URLS_FROM[ $tag ];
204+
// Add style attribute to the list if it exists.
205+
if ( null !== $this->get_attribute( 'style' ) ) {
206+
$this->inspecting_html_attributes[] = 'style';
207+
}
208+
} elseif ( null !== $this->get_attribute( 'style' ) ) {
209+
$this->inspecting_html_attributes = array( 'style' );
210+
} else {
211+
return false;
212+
}
147213
} else {
148214
/**
149215
* Forget the attribute we've inspected on the previous call to
@@ -160,6 +226,18 @@ private function next_url_attribute() {
160226
continue;
161227
}
162228

229+
// Rewrite any CSS `url()` declarations in the `style` attribute.
230+
if ( 'style' === $attr ) {
231+
$this->css_url_processor = new CSSURLProcessor( $url_maybe );
232+
if ( $this->next_url_in_css() ) {
233+
return true;
234+
}
235+
// No CSS URLs found, move to next attribute.
236+
$this->css_url_processor = null;
237+
array_pop( $this->inspecting_html_attributes );
238+
continue;
239+
}
240+
163241
/*
164242
* Use base URL to resolve known URI attributes as we are certain we're
165243
* dealing with URI values.
@@ -277,6 +355,12 @@ public function set_url( $raw_url, $parsed_url ) {
277355
$this->parsed_url = $parsed_url;
278356
switch ( parent::get_token_type() ) {
279357
case '#tag':
358+
// Check if we're processing a CSS URL.
359+
if ( null !== $this->css_url_processor ) {
360+
$this->css_url_processor_updated = true;
361+
return $this->css_url_processor->set_raw_url( $raw_url );
362+
}
363+
280364
$attr = $this->get_inspected_attribute_name();
281365
if ( false === $attr ) {
282366
return false;

components/DataLiberation/URL/class-cssprocessor.php renamed to components/DataLiberation/CSS/class-cssprocessor.php

Lines changed: 46 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
<?php
22

3-
namespace WordPress\DataLiberation\URL;
3+
namespace WordPress\DataLiberation\CSS;
44

55
use function WordPress\Encoding\codepoint_to_utf8_bytes;
66
use function WordPress\Encoding\compat\_wp_scan_utf8;
@@ -742,6 +742,32 @@ public function get_token_value() {
742742
return $this->token_value;
743743
}
744744

745+
/**
746+
* Determines whether the current token is a data URI.
747+
*
748+
* Only meaningful for URL and STRING tokens. Returns false for all other token types.
749+
*
750+
* @return bool Whether the current token value starts with "data:" (case-insensitive).
751+
*/
752+
public function is_data_uri(): bool {
753+
if ( null === $this->token_value_starts_at || null === $this->token_value_length ) {
754+
return false;
755+
}
756+
757+
if ( $this->token_value_length < 5 ) {
758+
return false;
759+
}
760+
761+
$offset = $this->token_value_starts_at;
762+
return (
763+
( 'd' === $this->css[ $offset ] || 'D' === $this->css[ $offset ] ) &&
764+
( 'a' === $this->css[ $offset + 1 ] || 'A' === $this->css[ $offset + 1 ] ) &&
765+
( 't' === $this->css[ $offset + 2 ] || 'T' === $this->css[ $offset + 2 ] ) &&
766+
( 'a' === $this->css[ $offset + 3 ] || 'A' === $this->css[ $offset + 3 ] ) &&
767+
':' === $this->css[ $offset + 4 ]
768+
);
769+
}
770+
745771
/**
746772
* Gets the token start at.
747773
*
@@ -812,27 +838,26 @@ public function get_token_value_length(): ?int {
812838
* @return bool Whether the value was successfully updated.
813839
*/
814840
public function set_token_value( string $new_value ): bool {
815-
// Only URL tokens are currently supported.
816-
if ( self::TOKEN_URL !== $this->token_type ) {
817-
return false;
818-
}
819-
820-
// Ensure we have valid token value boundaries.
821-
if ( null === $this->token_value_starts_at || null === $this->token_value_length ) {
822-
return false;
841+
// Only URL and string tokens are currently supported.
842+
switch ( $this->token_type ) {
843+
case self::TOKEN_URL:
844+
$this->lexical_updates[] = array(
845+
'start' => $this->token_value_starts_at,
846+
'length' => $this->token_value_length,
847+
'text' => $this->escape_url_value( $new_value ),
848+
);
849+
return true;
850+
case self::TOKEN_STRING:
851+
$this->lexical_updates[] = array(
852+
'start' => $this->token_starts_at,
853+
'length' => $this->token_length,
854+
'text' => $this->escape_url_value( $new_value ),
855+
);
856+
return true;
857+
default:
858+
_doing_it_wrong( __METHOD__, 'set_token_value() only supports URL and string tokens. Got token type: ' . $this->token_type, '1.0.0' );
859+
return false;
823860
}
824-
825-
// Escape the URL value for unquoted URL syntax.
826-
$escaped_value = $this->escape_url_value( $new_value );
827-
828-
// Queue the lexical update.
829-
$this->lexical_updates[] = array(
830-
'start' => $this->token_value_starts_at,
831-
'length' => $this->token_value_length,
832-
'text' => $escaped_value,
833-
);
834-
835-
return true;
836861
}
837862

838863
/**

0 commit comments

Comments
 (0)