From 704ca55010148e3cd64b05e92c36135be59e44cc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= <adam.zielinski@automattic.com>
Date: Thu, 29 May 2025 10:43:32 +0200
Subject: [PATCH 1/6] Separate blueprints.phar build from components library
 build

---
 bin/build-libraries-phar.sh                   |  2 +-
 bin/build-phar/smoke-test.php                 |  4 ++-
 .../Importer/StreamImporter.php               |  7 +++++
 composer.json                                 |  2 +-
 phar-box.json => phar-blueprints.json         |  0
 phar-libraries.json                           | 31 +++++++++++++++++++
 6 files changed, 43 insertions(+), 3 deletions(-)
 rename phar-box.json => phar-blueprints.json (100%)
 create mode 100644 phar-libraries.json

diff --git a/bin/build-libraries-phar.sh b/bin/build-libraries-phar.sh
index 839fa0b6..92da6363 100644
--- a/bin/build-libraries-phar.sh
+++ b/bin/build-libraries-phar.sh
@@ -20,7 +20,7 @@ cd $PROJECT_DIR
 mkdir -p $BUILD_DIR
 rm $DIST_DIR/wordpress-libraries.* > /dev/null 2>&1 || true
 export BOX_BASE_PATH=$(type -a box | grep -v 'alias' | awk '{print $3}')
-php $BUILD_DIR/box.php compile -d $PROJECT_DIR -c $PROJECT_DIR/phar-box.json
+php $BUILD_DIR/box.php compile -d $PROJECT_DIR -c $PROJECT_DIR/phar-libraries.json
 php -d 'phar.readonly=0' $BUILD_DIR/truncate-composer-checks.php $DIST_DIR/wordpress-libraries.phar
 cd $DIST_DIR
 php $BUILD_DIR/smoke-test.php
diff --git a/bin/build-phar/smoke-test.php b/bin/build-phar/smoke-test.php
index 6fcbe520..dbe8d8cd 100644
--- a/bin/build-phar/smoke-test.php
+++ b/bin/build-phar/smoke-test.php
@@ -9,7 +9,9 @@
  */
 $c = WordPress\DataLiberation\Importer\StreamImporter::create_for_wxr_file(__DIR__ . '/nosuchfile.xml', [
     'uploads_path' => __DIR__ . '/uploads',
-    'new_site_url' => 'https://smoke-test.org'
+    'new_site_url' => 'https://smoke-test.org',
+    'new_site_content_root_url' => 'https://smoke-test.org',
+    'new_media_root_url' => 'https://smoke-test.org',
 ]);
 
 WordPress\DataLiberation\URL\WPURL::parse('https://example.com');
diff --git a/components/DataLiberation/Importer/StreamImporter.php b/components/DataLiberation/Importer/StreamImporter.php
index 409eeea0..a96be2e8 100644
--- a/components/DataLiberation/Importer/StreamImporter.php
+++ b/components/DataLiberation/Importer/StreamImporter.php
@@ -2,6 +2,7 @@
 
 namespace WordPress\DataLiberation\Importer;
 
+use InvalidArgumentException;
 use WordPress\ByteStream\ReadStream\FileReadStream;
 use WordPress\DataLiberation\BlockMarkup\BlockMarkupUrlProcessor;
 use WordPress\DataLiberation\EntityReader\EntityReaderIterator;
@@ -286,6 +287,9 @@ protected static function parse_options( $options ) {
 			// throw new DataLiberationException( 'The "source_site_url" option is required' );
 		}
 		if ( ! isset( $options['new_site_content_root_url'] ) ) {
+			if ( ! function_exists( 'get_site_url' ) ) { // Outside of WordPress there is no site URL to fall back to.
+				throw new InvalidArgumentException( 'Option "new_site_content_root_url" is required' );
+			}
 			$options['new_site_content_root_url'] = get_site_url();
 		}
 
@@ -296,6 +300,9 @@ protected static function parse_options( $options ) {
 		$options['uploads_path'] = rtrim( $options['uploads_path'], '/' );
 
 		if ( ! isset( $options['new_media_root_url'] ) ) {
+			if ( ! function_exists( 'get_site_url' ) ) { // Outside of WordPress there is no site URL to derive the uploads URL from.
+				throw new InvalidArgumentException( 'Option "new_media_root_url" is required' );
+			}
 			$options['new_media_root_url'] = rtrim( get_site_url(), '/' ) . '/wp-content/uploads';
 		}
 		// Remove the trailing slash to make concatenation easier later.
diff --git a/composer.json b/composer.json
index 992dd0ca..4ef1653f 100644
--- a/composer.json
+++ b/composer.json
@@ -72,7 +72,7 @@
         }
     },
     "scripts": {
-        "build-blueprints-phar": "box compile -c phar-box.json",
+        "build-blueprints-phar": "box compile -c phar-blueprints.json",
         "regenerate-json-schema": "node components/Blueprints/Versions/Version2/json-schema/regenerate-schema.ts",
         "test": "phpunit -c phpunit.xml",
         "lint": "phpcs --standard=WordPress .",
diff --git a/phar-box.json b/phar-blueprints.json
similarity index 100%
rename from phar-box.json
rename to phar-blueprints.json
diff --git a/phar-libraries.json b/phar-libraries.json
new file mode 100644
index 00000000..ef512789
--- /dev/null
+++ b/phar-libraries.json
@@ -0,0 +1,31 @@
+{
+	"$schema": "https://raw.githubusercontent.com/box-project/box/refs/heads/main/res/schema.json",
+	"main": "vendor/autoload.php",
+	"output": "dist/wordpress-libraries.phar",
+	"force-autodiscovery": false,
+	"compactors": [
+		"KevinGH\\Box\\Compactor\\Php"
+	],
+    "check-requirements": false,
+	"annotations": false,
+    "shebang": "#!/usr/bin/env php",
+	"compression": "GZ",
+    "finder": [
+          {
+            "notName": "/.*\\.md|.*\\.dist|Makefile|composer\\.json|composer\\.lock/",
+            "exclude": [
+                "untracked",
+                "test",
+                "test_old",
+                "tests",
+                "Tests",
+                "Test",
+                "vendor-bin"
+            ],
+            "in": "components"
+        }
+    ],
+	"directories": [
+        "vendor/composer"
+	]
+}

From a1053ae83ef17e95c398bee760be500283f6c9b2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= <adam@adamziel.com>
Date: Thu, 29 May 2025 10:43:51 +0200
Subject: [PATCH 2/6] add a cli importer script

---
 .../DataLiberation/bin/cli-importer.php       | 762 ++++++++++++++++++
 1 file changed, 762 insertions(+)
 create mode 100644 components/DataLiberation/bin/cli-importer.php

diff --git a/components/DataLiberation/bin/cli-importer.php b/components/DataLiberation/bin/cli-importer.php
new file mode 100644
index 00000000..348737a2
--- /dev/null
+++ b/components/DataLiberation/bin/cli-importer.php
@@ -0,0 +1,762 @@
+<?php
+
+use Rowbot\URL\URL;
+use WordPress\ByteStream\ReadStream\FileReadStream;
+use WordPress\DataLiberation\EntityReader\EPubEntityReader;
+use WordPress\DataLiberation\EntityReader\FilesystemEntityReader;
+use WordPress\DataLiberation\EntityReader\WXREntityReader;
+use WordPress\DataLiberation\Importer\ImportSession;
+use WordPress\DataLiberation\Importer\ImportUtils;
+use WordPress\DataLiberation\Importer\RetryFrontloadingIterator;
+use WordPress\DataLiberation\Importer\StreamImporter;
+use WordPress\DataLiberation\URL\WPURL;
+use WordPress\Filesystem\Layer\ChrootLayer;
+use WordPress\Filesystem\LocalFilesystem;
+use WordPress\Git\GitFilesystem;
+use WordPress\Git\GitRepository;
+use WordPress\HttpClient\Crawler;
+use WordPress\Zip\ZipFilesystem;
+
+use function WordPress\DataLiberation\URL\is_child_url_of;
+use function WordPress\Filesystem\wp_join_unix_paths;
+
+if ( file_exists( '/wordpress/wp-load.php' ) ) {
+	require_once '/wordpress/wp-load.php';
+}
+
+if ( file_exists( __DIR__ . '/../../vendor/autoload.php' ) ) {
+	require_once __DIR__ . '/../../vendor/autoload.php';
+} elseif ( file_exists( __DIR__ . '/wp-content/vendor/autoload.php' ) ) {
+	require_once __DIR__ . '/wp-content/vendor/autoload.php';
+}
+
+require_once __DIR__ . '/cli/Parser.php';
+require_once __DIR__ . '/playground-protocol/PlaygroundProtocolClient.php';
+require_once __DIR__ . '/cli/ConsoleWriter.php';
+require_once __DIR__ . '/cli/ProgressBar.php';
+
+$console_writer = new PlaygroundConsoleWriter();
+
+/**
+ * Fallback autoloader mapping WordPress\Foo\Bar to
+ * WP_CONTENT_DIR/components/Foo/Bar.php. It should not be needed because
+ * we already have the vendor autoloader in place.
+ * @TODO: Investigate why it's needed and get rid of it.
+ */
+spl_autoload_register(
+	function ( $class ) {
+		// WP_CONTENT_DIR may be missing when /wordpress/wp-load.php was not found above.
+		if ( ! defined( 'WP_CONTENT_DIR' ) ) {
+			return false;
+		}
+
+		// Drop the vendor namespace before converting separators, so the
+		// prefix check does not depend on DIRECTORY_SEPARATOR being "/".
+		$relative_class = $class;
+		if ( str_starts_with( $relative_class, 'WordPress\\' ) ) {
+			$relative_class = substr( $relative_class, strlen( 'WordPress\\' ) );
+		}
+		$file = WP_CONTENT_DIR . '/components/' . str_replace( '\\', DIRECTORY_SEPARATOR, $relative_class ) . '.php';
+
+		if ( file_exists( $file ) ) {
+			require_once $file;
+			return true;
+		}
+
+		return false;
+	}
+);
+
+/** Prints the usage text (followed by $error, when given) through the global console writer, then terminates the script. */
+function help_message_and_die( $error = false ) {
+	global $console_writer;
+	$console_writer->write( "\033[1;32mDescription:\033[0m\n" );
+	$console_writer->write( "  Imports content into a new WordPress site\n\n" );
+
+	$console_writer->write( "\033[1;32mUsage:\033[0m\n" );
+	$console_writer->write( "  php cli-importer.php <mode> [options]\n\n" );
+
+	$console_writer->write( "\033[1;32mModes:\033[0m\n" );
+	$console_writer->write( "  \033[1;33mcrawler\033[0m     Import content by crawling a website\n" );
+	$console_writer->write( "  \033[1;33mlocal-directory\033[0m        Import content from a local directory\n" );
+	$console_writer->write( "  \033[1;33mgit\033[0m         Import content from a git repository\n" );
+	$console_writer->write( "  \033[1;33mwxr\033[0m         Import content from a WordPress eXtended RSS file\n" );
+	$console_writer->write( "  \033[1;33mepub\033[0m        Import content from an EPUB ebook\n\n" );
+
+	$console_writer->write( "\033[1;32mGlobal Options:\033[0m\n" );
+	$console_writer->write( "  \033[1;34m--source-site-url=<url>\033[0m\n" );
+	$console_writer->write( "      Base URL of the source content (required)\n\n" );
+
+	$console_writer->write( "  \033[1;34m--additional-site-urls=<url>\033[0m\n" );
+	$console_writer->write( "      Additional URLs to rewrite links for (multiple allowed)\n\n" );
+
+	$console_writer->write( "  \033[1;34m--media-url=<url>\033[0m\n" );
+	$console_writer->write( "      URLs to download media files from (multiple allowed)\n\n" );
+
+	$console_writer->write( "  \033[1;34m--output-dir=<path>\033[0m\n" );
+	$console_writer->write( "      Create the new WordPress site in this directory\n" );
+	$console_writer->write( "      Must be empty and have write permissions\n\n" );
+
+	$console_writer->write( "\033[1;32mMode-specific Usage:\033[0m\n" );
+
+	$console_writer->write( "\033[1;33mgit\033[0m mode:\n" );
+	$console_writer->write( "  php cli-importer.php git <repo_url>\n" );
+	$console_writer->write( "  Options:\n" );
+	$console_writer->write( "    \033[1;34m--branch=<branch>\033[0m\n" );
+	$console_writer->write( "        Git branch to import from (required)\n" );
+	$console_writer->write( "    \033[1;34m--path-in-repo=<path>\033[0m\n" );
+	$console_writer->write( "        Subdirectory in repository to import from\n\n" );
+
+	$console_writer->write( "\033[1;33mcrawler\033[0m mode:\n" );
+	$console_writer->write( "  php cli-importer.php crawler <url>\n" );
+	$console_writer->write( "  Crawls the website at <url> and imports discovered content\n\n" );
+
+	$console_writer->write( "\033[1;33mlocal-directory\033[0m mode:\n" );
+	$console_writer->write( "  php cli-importer.php local-directory <directory>\n" );
+	$console_writer->write( "  Imports content from local <directory>\n\n" );
+
+	$console_writer->write( "\033[1;33mwxr\033[0m mode:\n" );
+	$console_writer->write( "  php cli-importer.php wxr <url or local path>\n" );
+	$console_writer->write( "  Imports content from a WordPress eXtended RSS file\n\n" );
+
+	$console_writer->write( "\033[1;33mepub\033[0m mode:\n" );
+	$console_writer->write( "  php cli-importer.php epub <url or local path>\n" );
+	$console_writer->write( "  Imports content from an EPUB ebook\n\n" );
+
+	if ( $error ) {
+		$console_writer->write( "\033[1;31mError:\033[0m " );
+		$console_writer->write( $error );
+		$console_writer->write( "\n" );
+		PlaygroundProtocolClient::getInstance()->exit();
+	}
+	exit( $error ? 1 : 0 ); // Report a non-zero status to the shell when we stopped because of an error.
+}
+
+define( 'NEW_SITE_CONTENT_ROOT', get_site_url() );
+$console_writer->write( 'Target site URL: ' . NEW_SITE_CONTENT_ROOT . "\n" );
+
+$parser = new Phalcon\Cop\Parser();
+$args   = $parser->parse( $argv );
+
+$args['mode']     = $args[0] ?? '';
+$args['data_url'] = $args[1] ?? '';
+
+$chrooted_fs     = null;
+$source_site_url = null;
+if ( in_array( $args['mode'], array( 'local-directory', 'git', 'crawler' ) ) ) {
+	// Validate required arguments
+	if ( ! isset( $args['source-site-url'] ) ) {
+		if ( $args['mode'] === 'crawler' ) {
+			$args['source-site-url'] = $args['data_url'];
+		} else {
+			help_message_and_die( 'The --source-site-url argument is required.' );
+		}
+	}
+	$index_file_pattern = '#(?:index|readme)\.(?:md|html|xhtml)$#i';
+	$import_path_prefix = '/imported-content';
+	$source_site_url    = $args['source-site-url'];
+
+	if ( $args['mode'] === 'local-directory' ) {
+		if ( '' === $args['data_url'] ) { // data_url defaults to '' above, so isset() could never fail here.
+			help_message_and_die( 'The "local-directory" argument is required.' );
+		}
+
+		PlaygroundProtocolClient::getInstance()->mountDirectory( $args['data_url'], '/files-to-import' );
+		$chrooted_fs = LocalFilesystem::create( '/files-to-import' );
+
+		$args['source-site-url'] = 'file:///';
+	} elseif ( $args['mode'] === 'git' ) {
+		if ( '' === $args['data_url'] ) { // data_url defaults to '' above, so isset() could never fail here.
+			help_message_and_die( 'The "repo" argument is required.' );
+		}
+
+		$args['repo'] = $args['data_url'];
+		if ( ! str_ends_with( $args['repo'], '.git' ) ) {
+			help_message_and_die( 'The "repo" argument must end with ".git" when mode is "git".' );
+		}
+
+		if ( ! isset( $args['branch'] ) ) {
+			help_message_and_die( 'The "branch" argument is required when mode is "git".' );
+		}
+
+		$console_writer->write( "Sparse checkout of the git repository\n" );
+		$temp_dir  = sys_get_temp_dir() . '/import-static-' . uniqid();
+		$cache_fs  = LocalFilesystem::create( $temp_dir );
+		$docs_repo = new GitRepository( $cache_fs );
+		$docs_repo->add_remote( 'origin', $args['repo'] );
+		$remote       = $docs_repo->get_remote_client( 'origin' );
+		$path_in_repo = $args['path-in-repo'] ?? '';
+		$branch       = $args['branch'] ?? 'trunk';
+		$remote->fetch(
+			$branch,
+			array(
+				'path' => $path_in_repo,
+				'shallow' => true,
+			)
+		);
+		$docs_repo->set_branch_tip( 'refs/heads/' . $branch, $docs_repo->get_branch_tip( 'refs/remotes/origin/' . $branch ) );
+		$docs_repo->checkout( 'refs/heads/' . $branch );
+		$git_fs      = GitFilesystem::create( $docs_repo );
+		$chrooted_fs = new ChrootLayer( $git_fs, $path_in_repo );
+	} elseif ( $args['mode'] === 'crawler' ) {
+		if ( '' === $args['data_url'] ) { // data_url defaults to '' above, so isset() could never fail here.
+			help_message_and_die( 'The "url" argument is required.' );
+		}
+		if ( ! WPURL::parse( $args['data_url'] ) ) {
+			help_message_and_die( 'The "url" argument must be a valid URL.' );
+		}
+		$args['source-site-url'] = $args['data_url'];
+		$tmp_dir                 = sys_get_temp_dir() . '/import-static-' . uniqid();
+		$chrooted_fs             = LocalFilesystem::create( $tmp_dir );
+		$crawler                 = new Crawler(
+			$args['data_url'],
+			array(
+				'preprocess_url' => function ( URL $url ) use ( $args ) {
+					if ( ! is_child_url_of( $url, $args['data_url'] ) ) {
+						return false;
+					}
+					$url->search = '';
+					if ( in_array( $url->pathname, array( '/feed/', '/wp-json/' ) ) ) {
+						return false;
+					}
+					if ( preg_match( '#^/\d{4}/\d{2}/\d{2}/[^/]+/$#', $url->pathname ) ) {
+						return $url;
+					}
+					if ( preg_match( '#^/[^/]+/$#', $url->pathname ) ) {
+						return $url;
+					}
+					return false;
+				},
+			)
+		);
+		$progress                = new ProgressBar( $console_writer, null );
+		$progress->start( 'Crawling website...' );
+		while ( $crawler->crawl_next() ) {
+			$parsed_url = WPURL::parse( $crawler->get_current_url() );
+			$file_path  = $parsed_url->pathname;
+			if ( $file_path === '/' ) {
+				$file_path = '/index.html';
+			} elseif ( str_ends_with( $file_path, '/' ) ) {
+				/**
+				 * Choose to treat /2021/10/03/dont-waste-time-on-boring-programming-lessons/ as
+				 * /2021/10/03/dont-waste-time-on-boring-programming-lessons.html
+				 *
+				 * Another possible choice would be to save it as
+				 * /2021/10/03/dont-waste-time-on-boring-programming-lessons/index.html
+				 */
+				$file_path = rtrim( $file_path, '/' );
+			}
+
+			if ( ! $file_path || strlen( $file_path ) < 1 ) {
+				$file_path = sha1( $crawler->get_current_url() );
+			}
+
+			$extension = pathinfo( $file_path, PATHINFO_EXTENSION );
+			if ( ! $extension ) {
+				$file_path .= '.html';
+			}
+
+			/**
+			 * Replace date-based paths with "posts" directory.
+			 *
+			 * Why? wp_insert_post() seems to mangle the post_name if it consists of a few numbers
+			 * and that messes up the URLs of the imported posts.
+			 *
+			 * @TODO: Investigate the reasons of this behavior.
+			 */
+			$file_path = preg_replace( '#/\d{4}/\d{2}/\d{2}/#', '/posts/', $file_path );
+			$content   = $crawler->get_current_content();
+			// @TODO: This is very naive – we should use the URL processor instead.
+			$content = preg_replace( '#/\d{4}/\d{2}/\d{2}/#', '/posts/', $content );
+
+			$chrooted_fs->mkdir( dirname( $file_path ), array( 'recursive' => true ) );
+			$chrooted_fs->put_contents(
+				$file_path,
+				$content
+			);
+			$progress->setMessage( 'Fetching ' . $parsed_url->pathname );
+			$progress->advance();
+		}
+		$progress->finish();
+	}
+	$entity_reader_factory = function () use ( $chrooted_fs, $source_site_url, $index_file_pattern ) {
+		return new FilesystemEntityReader(
+			$chrooted_fs,
+			array(
+				'index_file_pattern' => $index_file_pattern,
+				'filter_pattern' => '#\.(?:md|html|xhtml)$#',
+				/**
+				 * Use a number so large, there's no chance for wp_table INSERTs
+				 * to interfere with the post IDs generated by the FilesystemEntityReader.
+				 *
+				 * Some inserts are ran even by the importer, e.g. frontloading stubs.
+				 *
+				 * @TODO: Make sure this doesn't automatically bump the AUTOINCREMENT counter in MySQL.
+				 * @TODO: Bump the AUTOINCREMENT counter manually after a finished import.
+				 */
+				'first_post_id' => 10000000,
+				'base_url' => $source_site_url,
+			)
+		);
+	};
+
+	/**
+	 * Maps a filesystem path to a WordPress-friendly URL path we can assign
+	 * to the imported page.
+	 *
+	 * Example: "/docs/intro.md" -> "/imported-content/docs/intro"; index files such as "/docs/README.md" map to their directory.
+	 *
+	 * @param string $path The filesystem path to convert
+	 * @return string The WordPress-friendly URL path
+	 */
+	function map_file_path_to_wordpress_url( $path ) {
+		global $index_file_pattern, $import_path_prefix;
+
+		/**
+		 * Ensure a named top-level parent directory to base the entire
+		 * URL structure on. The goal is to have a consistent way to resolve
+		 * URLs for all the following files:
+		 *
+		 * - README.md
+		 * - chapter-5/README.md
+		 * - chapter-5/section-1.md
+		 * - chapter-5/section-3/readme.md
+		 *
+		 * Without the top-level directory, the best URL we can give the
+		 * /README.md file would be `/readme`. However, the `chapter-5/README.md`
+		 * would get a URL like `/chapter-5` which is inconsistent. However,
+		 * if we transform the path structure as follows, everything becomes
+		 * consistent:
+		 *
+		 * - /imported-content/README.md
+		 * - /imported-content/chapter-5/README.md
+		 * - /imported-content/chapter-5/section-1.md
+		 * - /imported-content/chapter-5/section-3/readme.md
+		 *
+		 * We want to keep all the links working after the import. A single,
+		 * consistent URL mapping strategy makes it much easier. The alternative
+		 * would be to maintain a mapping of parents to paths and use it whenever
+		 * creating pages and rewriting URLs.
+		 *
+		 * This isn't trivial. Having a top-level path prefix is not perfect,
+		 * but it's a sound compromise.
+		 */
+		$path = wp_join_unix_paths( $import_path_prefix, $path );
+
+		if ( 1 === preg_match( $index_file_pattern, $path ) ) {
+			$path = dirname( $path );
+		}
+
+		$extensions = array( '.md', '.html', '.xhtml' );
+		foreach ( $extensions as $ext ) {
+			if ( str_ends_with( $path, $ext ) ) {
+				$path = substr( $path, 0, -strlen( $ext ) );
+				break;
+			}
+		}
+
+		return strtolower( $path );
+	}
+
+	/**
+	 * Transforms links pointing to imported static files (e.g. ./getting-started.md)
+	 * to the format they will have after being imported into WordPress (e.g. /docs/getting-started).
+	 */
+	add_action(
+		'data_liberation.stream_importer.postprocess_url',
+		function (
+			$processor,
+			$context
+		) use (
+			$chrooted_fs,
+			/**
+			 * With &, $import_path_prefix reflects the latest value.
+			 * Without &, it's a local copy of the value from the outer scope.
+			 */
+			&$import_path_prefix
+		) {
+			/**
+			 * If we didn't rewrite the base URL, the URL points outside
+			 * of the imported root directory. Let's keep it as it is.
+			 */
+			if ( ! $context['applied_base_url_mapping'] ) {
+				return;
+			}
+
+			$path_original = $processor->get_parsed_url()->pathname;
+
+			/**
+			 * Remove the site path from the URL path and check:
+			 * Is this URL pointing to a file that exists in the imported
+			 * directory?
+			 */
+			$base_url_path_prefix  = $context['applied_base_url_mapping']['to']->pathname;
+			$path_relative_to_base = substr( $path_original, strlen( $base_url_path_prefix ) );
+			if ( $chrooted_fs->is_file( $path_relative_to_base ) ) {
+				/**
+				 * Yes! We are linking to an imported page. Let's transform the link
+				 * to a WordPress-friendly URL scheme.
+				 */
+				$path_rewritten = map_file_path_to_wordpress_url( $path_relative_to_base );
+				$path_rewritten = wp_join_unix_paths( $base_url_path_prefix, $path_rewritten );
+			} elseif ( $processor->is_url_absolute() ) {
+				/**
+				 * No. We are linking to a content page within our site but there is
+				 * no corresponding static file. This happens e.g. in the Gutenberg
+				 * handbook where the markdown files contain absolute URLs to the deployed
+				 * site, e.g.:
+				 *
+				 *     Start by ensuring you have Node.js and `npm` installed on your computer. Review
+				 *     the [Node.js development environment](https://developer.wordpress.org/block-editor/getting-started/devenv/nodejs-development-environment/) guide if not.
+				 *
+				 * Our best shot is to keep the URL as is, just with the imported
+				 * content root prepended to it.
+				 */
+				$path_rewritten = wp_join_unix_paths( $base_url_path_prefix, $import_path_prefix, $path_relative_to_base );
+			} else {
+				/**
+				 * It's a relative URL pointing somewhere within the URL space we're importing
+				 * to, but there is no corresponding static file. This is unexpected. There is
+				 * nothing we can do at this point – let's just keep the URL as it is.
+				 */
+				return;
+			}
+			$processor->set_url(
+				$path_rewritten,
+				WPURL::parse( $path_rewritten, $processor->get_parsed_url() )
+			);
+		},
+		10,
+		3
+	);
+
+	/**
+	 * Assigns post_name to every imported static page.
+	 */
+	add_filter(
+		'data_liberation.stream_importer.preprocess_entity',
+		function ( $entity ) use ( &$import_path_prefix, $index_file_pattern ) {
+			static $preprocessed_an_entity = false;
+			if ( $entity->get_type() !== 'post' ) {
+				return $entity;
+			}
+
+			$data = $entity->get_data();
+
+			if ( isset( $data['parsed_metadata']['slug'] ) ) {
+				$data['post_name'] = basename( $data['parsed_metadata']['slug'][0] );
+			} elseif ( isset( $data['local_file_path'] ) ) {
+				/**
+				 * The default import content path is "/imported-content". However,
+				 * maybe we can find a friendlier path prefix based on the post
+				 * title of the top-level index file.
+				 *
+				 * For example, a "Getting Started" guide found at "README.md"
+				 * could be imported to "/getting-started".
+				 */
+				if ( ! $preprocessed_an_entity ) {
+					$preprocessed_an_entity           = true;
+					$dirname                          = dirname( $data['local_file_path'] );
+					$dirname_makes_a_bad_slug         = in_array( $dirname, array( '.', '/' ), true ); // Both denote a top-level file.
+					$is_index_file                    = 1 === preg_match( $index_file_pattern, $data['local_file_path'] );
+					$post_title_not_derived_from_path = $data['post_title'] !== ImportUtils::slug_to_title( basename( $data['local_file_path'] ) );
+
+					if (
+						$dirname_makes_a_bad_slug &&
+						$is_index_file &&
+						$post_title_not_derived_from_path &&
+						strlen( $data['post_title'] ) > 1
+					) {
+						$import_path_prefix = wp_import_slugify( $data['post_title'] );
+					}
+				}
+
+				$wordpress_url     = map_file_path_to_wordpress_url( $data['local_file_path'] );
+				$data['post_name'] = basename( $wordpress_url );
+			} else {
+				return $entity;
+			}
+
+			$entity->set_data( $data );
+			return $entity;
+		},
+		10,
+		2
+	);
+} elseif ( $args['mode'] === 'wxr' ) {
+	if ( '' === $args['data_url'] ) { // data_url defaults to '' above, so isset() could never fail here.
+		help_message_and_die( 'The "wxr file" argument is required.' );
+	}
+	$entity_reader_factory = function ( $cursor ) use ( $args ) {
+		return WXREntityReader::create(
+			uri_to_byte_stream( $args['data_url'] ),
+			$cursor
+		);
+	};
+} elseif ( $args['mode'] === 'epub' ) {
+	if ( '' === $args['data_url'] ) { // data_url defaults to '' above, so isset() could never fail here.
+		help_message_and_die( 'The "epub file" argument is required.' );
+	}
+	$zip_fs                = ZipFilesystem::create(
+		uri_to_byte_stream( $args['data_url'] )
+	);
+	$entity_reader_factory = function ( $cursor = null ) use ( $zip_fs ) {
+		return new EPubEntityReader(
+			$zip_fs,
+			1000000 // This is first post ID. We should really also accept a cursor
+		);
+	};
+	$reader                = $entity_reader_factory();
+	$source_site_url       = 'file://' . dirname( $reader->get_manifest_path() );
+
+	// To source the media files from the EPUB bundle:
+	$chrooted_fs = $zip_fs;
+
+	/**
+	 * Drop .xhtml extension from the links.
+	 */
+	add_action(
+		'data_liberation.stream_importer.postprocess_url',
+		function ( $processor ) {
+			$parsed_url = $processor->get_parsed_url();
+			if ( ! str_ends_with( $parsed_url->pathname, '.xhtml' ) ) {
+				return;
+			}
+			$parsed_url->pathname = substr( $parsed_url->pathname, 0, -6 );
+			$processor->set_url(
+				$parsed_url . '',
+				$parsed_url
+			);
+		}
+	);
+} else {
+	help_message_and_die( 'The "mode" argument is required and must be one of: "local-directory", "git", "crawler", "wxr", or "epub".' );
+	exit( 1 );
+}
+
+/** Opens $uri (a local path, or an http(s) URL that is first downloaded to a temp file) as a FileReadStream; throws \Exception on failure. */
+function uri_to_byte_stream( $uri ) {
+	if ( str_starts_with( $uri, 'http://' ) || str_starts_with( $uri, 'https://' ) ) {
+		// @TODO: Use SeekableRequestReadStream here instead of pre-downloading the file to disk.
+		$local_path = tempnam( sys_get_temp_dir(), 'wp-remote-file-' );
+		$contents   = file_get_contents( $uri );
+		if ( false === $contents || false === file_put_contents( $local_path, $contents ) ) {
+			// Fail loudly – an empty temp file would otherwise be read as if it were the real payload.
+			throw new \Exception( "Could not download $uri to a temporary file." );
+		}
+		$uri = $local_path;
+	}
+	if ( file_exists( $uri ) ) {
+		return FileReadStream::from_path( $uri );
+	}
+	throw new \Exception( "Unknown resource type: $uri. If that's a local file, \033[1mplease provide an absolute path to the file\033[0m." );
+}
+
+/**
+ * Naive slugification: lowercases $title, collapses non-alphanumeric ASCII runs into "-", and strips edge dashes.
+ *
+ * @TODO: Use a more sophisticated one with utf-8 support etc.
+ */
+function wp_import_slugify( $title ) {
+	return trim( preg_replace( '/[^a-z0-9]+/', '-', strtolower( $title ) ), '-' );
+}
+
+$data_url = $args['data_url'];
+$console_writer->write( "Importing static files from $data_url\n" );
+
+
+try {
+	// Parse URL mapping arguments
+	$additional_url_mappings = array();
+	foreach ( $parser->getArray( 'additional-site-urls' ) as $url ) {
+		$additional_url_mappings[] = array(
+			'from' => $url,
+			'to' => NEW_SITE_CONTENT_ROOT,
+		);
+	}
+
+	$console_writer->write( "Starting the import\n" );
+	$importer = StreamImporter::create(
+		$entity_reader_factory,
+		array(
+			'source_site_url' => $source_site_url,
+			'new_site_content_root_url' => NEW_SITE_CONTENT_ROOT,
+			'source_media_root_urls' => $parser->getArray( 'media-url' ) ?: array( $source_site_url ),
+			'additional_url_mappings' => $additional_url_mappings,
+			'index_batch_size' => 1,
+			'attachment_downloader_options' => array(
+				'source_from_filesystem' => $chrooted_fs,
+			),
+		)
+	);
+
+	$import_session   = ImportSession::create(
+		array(
+			'data_source' => 'local_directory',
+			// @TODO: the phrase "file_name" doesn't make sense here. We're sourcing
+			// data from a directory, not a file. This string is used to tell
+			// the user in the UI what this they're importing in this import
+			// session. Let's rename it to something more descriptive.
+			'file_name' => $args['data_url'],
+		)
+	);
+	$retries_iterator = new RetryFrontloadingIterator( $import_session->get_id() );
+	$importer->set_frontloading_retries_iterator( $retries_iterator );
+
+	// @TODO: Prettier progress reporting
+	$ignored_message_printed = false;
+	do {
+		$result = data_liberation_import_step_customized( $import_session, $importer, $console_writer );
+		if ( $importer->get_stage() === StreamImporter::STAGE_FINISHED ) {
+			$console_writer->write( "\n" );
+			$console_writer->write( "\033[1;32mImport finished!\033[0m See your imported content at: \n" );
+
+			// Get the first page with non-empty content.
+			$posts = get_posts(
+				array(
+					'numberposts' => 10,
+					'orderby' => 'ID',
+					'order' => 'ASC',
+					'post_type' => 'page',
+					'post_status' => 'publish',
+				)
+			);
+
+			$url = NEW_SITE_CONTENT_ROOT;
+			foreach ( $posts as $post ) {
+				if ( ! empty( $post->post_content ) ) {
+					$url = get_permalink( $post );
+					break;
+				}
+			}
+			$console_writer->write( "\033[1;36m" . $url . "\033[0m\n" );
+			break;
+		} elseif ( false === $result ) {
+			if ( $importer->get_stage() === StreamImporter::STAGE_FRONTLOAD_ASSETS ) {
+				if ( ! $ignored_message_printed ) {
+					$console_writer->write( "\nSome assets could not be downloaded – they will be ignored so we can continue with the import.\n" );
+					$ignored_message_printed = true;
+				}
+				// $import_session->mark_frontloading_errors_as_ignored();
+			} else {
+				$console_writer->write( "Import failed, aborting\n" );
+				break;
+			}
+		} else {
+			// Twiddle our thumbs, importing in progress...
+		}
+	} while ( true );
+} finally {
+	if ( isset( $cache_fs ) ) {
+		$cache_fs->rmdir(
+			'/',
+			array(
+				'recursive' => true,
+			)
+		);
+	}
+}
+
+/**
+ * @TODO: Expose a primitive like the step function below from the
+ *        DataLiberation PHP component. Support all sorts of pause conditions
+ *        such as time limits, retry counts, memory limits, etc.
+ */
+function data_liberation_import_step_customized( ImportSession $session, StreamImporter $importer, ConsoleWriter $console_writer ) {
+	$soft_time_limit_seconds = 15;
+	$hard_time_limit_seconds = 25;
+	$start_time              = microtime( true );
+	$fetched_files           = 0;
+	$progress_bar            = null;
+
+	while ( true ) {
+		$time_taken = microtime( true ) - $start_time;
+		if ( $time_taken >= $soft_time_limit_seconds ) {
+			if ( $importer->get_stage() === StreamImporter::STAGE_FRONTLOAD_ASSETS ) {
+				if ( $fetched_files > 0 ) {
+					return true;
+				}
+			} else {
+				return true;
+			}
+		}
+		if ( $time_taken >= $hard_time_limit_seconds ) {
+			return true;
+		}
+
+		if ( true !== $importer->next_step() ) {
+			$session->set_reentrancy_cursor( $importer->get_reentrancy_cursor() );
+
+			$should_advance_to_next_stage = null !== $importer->get_next_stage();
+			if ( $should_advance_to_next_stage ) {
+				if ( StreamImporter::STAGE_FRONTLOAD_ASSETS === $importer->get_stage() ) {
+					$resolved_all_failures = $session->count_unfinished_frontloading_stubs() === 0;
+					if ( ! $resolved_all_failures ) {
+						// Uncomment once this script's intent becomes exiting on unresolved frontloading failures.
+						// if($progress_bar) {
+						// $progress_bar->finish();
+						// }
+						// return false;
+					}
+				}
+			}
+			if ( ! $importer->advance_to_next_stage() ) {
+				if ( $progress_bar ) {
+					$progress_bar->finish();
+				}
+				return false;
+			}
+			$session->set_stage( $importer->get_stage() );
+			$session->set_reentrancy_cursor( $importer->get_reentrancy_cursor() );
+			$console_writer->clearLine();
+			$progress_bar = null;
+
+			continue;
+		}
+
+		switch ( $importer->get_stage() ) {
+			case StreamImporter::STAGE_INDEX_ENTITIES:
+				$entities_counts = $importer->get_indexed_entities_counts();
+				$session->create_frontloading_stubs( $importer->get_indexed_assets_urls() );
+				$session->bump_total_number_of_entities( $entities_counts );
+				if ( ! $progress_bar ) {
+					$progress_bar = new ProgressBar( $console_writer, null );
+					$progress_bar->setMessage( 'Indexing entities' );
+					$progress_bar->start();
+				}
+				$progress_bar->setCurrent( array_sum( $session->get_total_number_of_entities() ) );
+				break;
+
+			case StreamImporter::STAGE_FRONTLOAD_ASSETS:
+				$progress = $importer->get_frontloading_progress();
+				$session->bump_frontloading_progress(
+					$progress,
+					$importer->get_frontloading_events()
+				);
+
+				if ( ! $progress_bar ) {
+					$progress_bar = new ProgressBar( $console_writer, null );
+					$progress_bar->setMessage( 'Fetching media files' );
+					$progress_bar->start();
+				}
+				$progress_bar->setCurrent( $session->count_unfinished_frontloading_stubs() );
+				break;
+
+			case StreamImporter::STAGE_IMPORT_ENTITIES:
+				$imported_counts = $importer->get_imported_entities_counts();
+
+				$session->bump_imported_entities_counts( $imported_counts );
+
+				if ( ! $progress_bar ) {
+					$progress_bar = new ProgressBar( $console_writer, $session->count_remaining_entities() );
+					$progress_bar->setMessage( 'Importing entities' );
+					$progress_bar->start();
+				}
+				$progress_bar->setCurrent( $session->count_all_imported_entities() );
+				break;
+		}
+
+		$session->set_reentrancy_cursor( $importer->get_reentrancy_cursor() );
+	}
+	return false;
+}

From b369a9626ea4517651a45779143d499928be438e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= <adam@adamziel.com>
Date: Thu, 29 May 2025 10:45:07 +0200
Subject: [PATCH 3/6] Load php-toolkit when importContent step is used

---
 bin/build-phar/smoke-test.php                 |   2 +-
 components/Blueprints/Runner.php              |  22 +-
 components/Blueprints/RunnerConfiguration.php |   4 +
 .../DataLiberation/bin/cli-importer.php       | 762 ------------------
 phar-libraries.json                           |   2 +-
 plugins/data-liberation/plugin.php            |   4 +-
 6 files changed, 27 insertions(+), 769 deletions(-)
 delete mode 100644 components/DataLiberation/bin/cli-importer.php

diff --git a/bin/build-phar/smoke-test.php b/bin/build-phar/smoke-test.php
index dbe8d8cd..81056533 100644
--- a/bin/build-phar/smoke-test.php
+++ b/bin/build-phar/smoke-test.php
@@ -1,6 +1,6 @@
 <?php
 
-require_once __DIR__ . '/../../dist/wordpress-libraries.phar';
+require_once __DIR__ . '/../../dist/php-toolkit.phar';
 
 /**
  * None of this will actually try to parse a file or import
diff --git a/components/Blueprints/Runner.php b/components/Blueprints/Runner.php
index 84be4745..1c943baf 100644
--- a/components/Blueprints/Runner.php
+++ b/components/Blueprints/Runner.php
@@ -618,10 +618,26 @@ private function createExecutionPlan(): array {
 		}
 
 		foreach ( $plan as $step ) {
-			// @TODO: Make sure this doesn't get included twice in the execution plan.
+			// @TODO: Make sure this doesn't get included twice in the execution plan,
+			//        e.g. if the Blueprint specified this step manually.
 			if ( $step instanceof ImportContentStep ) {
-				array_unshift( $plan, $this->createStepObject( 'installPlugin', [
-					'source' => $this->createDataReference( 'https://playground.wordpress.net/wordpress-importer.zip' ),
+				if($this->configuration->isRunningAsPhar()) {
+					throw new InvalidArgumentException( '@TODO: Importing content is not supported when running as phar.' );
+				} else {
+					$libraries_phar_path = __DIR__ . '/../../dist/php-toolkit.phar';
+					if(!file_exists($libraries_phar_path)) {
+						throw new InvalidArgumentException(
+							'In development, you must run `bash bin/build-libraries-phar.sh` to bundle importer libraries before importing content via a Blueprint. '.
+							'It generates a `dist/php-toolkit.phar` file bundling all the libraries required for importing content.'
+						);
+					}
+					$this->configuration->getLogger()->info( 'Loading importer libraries from ' . $libraries_phar_path );
+					$source = $this->createDataReference( new AbsoluteLocalPath( $libraries_phar_path ) );
+				}
+				array_unshift( $plan, $this->createStepObject( 'writeFiles', [
+					'files' => [
+						'php-toolkit.phar' => $source,
+					],
 				] ) );
 				break;
 			}
diff --git a/components/Blueprints/RunnerConfiguration.php b/components/Blueprints/RunnerConfiguration.php
index 0dbd7d97..5f532501 100644
--- a/components/Blueprints/RunnerConfiguration.php
+++ b/components/Blueprints/RunnerConfiguration.php
@@ -235,4 +235,8 @@ public function isAllowedLocalFilesystemAccess(): bool {
 	public static function getPermissionCliFlag( string $permission ): string {
 		return $permission;
 	}
+
+	public function isRunningAsPhar(): bool {
+		return \Phar::running(false) !== ''; // Phar::running() yields '' unless the currently executing file lives inside a phar archive.
+	}
 }
diff --git a/components/DataLiberation/bin/cli-importer.php b/components/DataLiberation/bin/cli-importer.php
deleted file mode 100644
index 348737a2..00000000
--- a/components/DataLiberation/bin/cli-importer.php
+++ /dev/null
@@ -1,762 +0,0 @@
-<?php
-
-use Rowbot\URL\URL;
-use WordPress\ByteStream\ReadStream\FileReadStream;
-use WordPress\DataLiberation\EntityReader\EPubEntityReader;
-use WordPress\DataLiberation\EntityReader\FilesystemEntityReader;
-use WordPress\DataLiberation\EntityReader\WXREntityReader;
-use WordPress\DataLiberation\Importer\ImportSession;
-use WordPress\DataLiberation\Importer\ImportUtils;
-use WordPress\DataLiberation\Importer\RetryFrontloadingIterator;
-use WordPress\DataLiberation\Importer\StreamImporter;
-use WordPress\DataLiberation\URL\WPURL;
-use WordPress\Filesystem\Layer\ChrootLayer;
-use WordPress\Filesystem\LocalFilesystem;
-use WordPress\Git\GitFilesystem;
-use WordPress\Git\GitRepository;
-use WordPress\HttpClient\Crawler;
-use WordPress\Zip\ZipFilesystem;
-
-use function WordPress\DataLiberation\URL\is_child_url_of;
-use function WordPress\Filesystem\wp_join_unix_paths;
-
-if ( file_exists( '/wordpress/wp-load.php' ) ) {
-	require_once '/wordpress/wp-load.php';
-}
-
-if ( file_exists( __DIR__ . '/../../vendor/autoload.php' ) ) {
-	require_once __DIR__ . '/../../vendor/autoload.php';
-} elseif ( file_exists( __DIR__ . '/wp-content/vendor/autoload.php' ) ) {
-	require_once __DIR__ . '/wp-content/vendor/autoload.php';
-}
-
-require_once __DIR__ . '/cli/Parser.php';
-require_once __DIR__ . '/playground-protocol/PlaygroundProtocolClient.php';
-require_once __DIR__ . '/cli/ConsoleWriter.php';
-require_once __DIR__ . '/cli/ProgressBar.php';
-
-$console_writer = new PlaygroundConsoleWriter();
-
-/**
- * Custom autoloader that should not be needed because we already have
- * the vendor autoloader in place.
- *
- * @TODO: Investigate why it's needed and get rid of it.
- */
-spl_autoload_register(
-	function ( $class ) use ( $console_writer ) {
-		// Base directory for components
-		$baseDir = WP_CONTENT_DIR . '/components/';
-
-		// Convert namespace to path
-		$path = str_replace( '\\', DIRECTORY_SEPARATOR, $class ) . '.php';
-		if ( str_starts_with( $path, 'WordPress/' ) ) {
-			$path = substr( $path, 10 );
-		}
-
-		// Full path to the file
-		$file = $baseDir . $path;
-
-		// Check if file exists and include it
-		if ( file_exists( $file ) ) {
-			require_once $file;
-			return true;
-		}
-
-		return false;
-	}
-);
-
-// Parse CLI arguments
-function help_message_and_die( $error = false ) {
-	global $console_writer;
-	$console_writer->write( "\033[1;32mDescription:\033[0m\n" );
-	$console_writer->write( "  Imports content into a new WordPress site\n\n" );
-
-	$console_writer->write( "\033[1;32mUsage:\033[0m\n" );
-	$console_writer->write( "  php import-markdown-directory.php <mode> [options]\n\n" );
-
-	$console_writer->write( "\033[1;32mModes:\033[0m\n" );
-	$console_writer->write( "  \033[1;33mcrawler\033[0m     Import content by crawling a website\n" );
-	$console_writer->write( "  \033[1;33mlocal-directory\033[0m        Import content from a local directory\n" );
-	$console_writer->write( "  \033[1;33mgit\033[0m         Import content from a git repository\n" );
-	$console_writer->write( "  \033[1;33mwxr\033[0m         Import content from a WordPress eXtended RSS file\n" );
-	$console_writer->write( "  \033[1;33mepub\033[0m        Import content from an EPUB ebook\n\n" );
-
-	$console_writer->write( "\033[1;32mGlobal Options:\033[0m\n" );
-	$console_writer->write( "  \033[1;34m--source-site-url=<url>\033[0m\n" );
-	$console_writer->write( "      Base URL of the source content (required)\n\n" );
-
-	$console_writer->write( "  \033[1;34m--additional-site-urls=<url>\033[0m\n" );
-	$console_writer->write( "      Additional URLs to rewrite links for (multiple allowed)\n\n" );
-
-	$console_writer->write( "  \033[1;34m--media-url=<url>\033[0m\n" );
-	$console_writer->write( "      URLs to download media files from (multiple allowed)\n\n" );
-
-	$console_writer->write( "  \033[1;34m--output-dir=<path>\033[0m\n" );
-	$console_writer->write( "      Create the new WordPress site in this directory\n" );
-	$console_writer->write( "      Must be empty and have write permissions\n\n" );
-
-	$console_writer->write( "\033[1;32mMode-specific Usage:\033[0m\n" );
-
-	$console_writer->write( "\033[1;33mgit\033[0m mode:\n" );
-	$console_writer->write( "  php import-markdown-directory.php git <repo_url>\n" );
-	$console_writer->write( "  Options:\n" );
-	$console_writer->write( "    \033[1;34m--branch=<branch>\033[0m\n" );
-	$console_writer->write( "        Git branch to import from (required)\n" );
-	$console_writer->write( "    \033[1;34m--path-in-repo=<path>\033[0m\n" );
-	$console_writer->write( "        Subdirectory in repository to import from\n\n" );
-
-	$console_writer->write( "\033[1;33mcrawler\033[0m mode:\n" );
-	$console_writer->write( "  php import-markdown-directory.php crawler <url>\n" );
-	$console_writer->write( "  Crawls the website at <url> and imports discovered content\n\n" );
-
-	$console_writer->write( "\033[1;33mlocal-directory\033[0m mode:\n" );
-	$console_writer->write( "  php import-markdown-directory.php local-directory <directory>\n" );
-	$console_writer->write( "  Imports content from local <directory>\n\n" );
-
-	$console_writer->write( "\033[1;33mwxr\033[0m mode:\n" );
-	$console_writer->write( "  php import-markdown-directory.php wxr <url or local path>\n" );
-	$console_writer->write( "  Imports content from a WordPress eXtended RSS file\n\n" );
-
-	$console_writer->write( "\033[1;33mepub\033[0m mode:\n" );
-	$console_writer->write( "  php import-markdown-directory.php epub <url or local path>\n" );
-	$console_writer->write( "  Imports content from an EPUB ebook\n\n" );
-
-	if ( $error ) {
-		$console_writer->write( "\033[1;31mError:\033[0m " );
-		$console_writer->write( $error );
-		$console_writer->write( "\n" );
-		PlaygroundProtocolClient::getInstance()->exit();
-	}
-	die();
-}
-
-define( 'NEW_SITE_CONTENT_ROOT', get_site_url() );
-$console_writer->write( 'Target site URL: ' . NEW_SITE_CONTENT_ROOT . "\n" );
-
-$parser = new Phalcon\Cop\Parser();
-$args   = $parser->parse( $argv );
-
-$args['mode']     = $args[0] ?? '';
-$args['data_url'] = $args[1] ?? '';
-
-$chrooted_fs     = null;
-$source_site_url = null;
-if ( in_array( $args['mode'], array( 'local-directory', 'git', 'crawler' ) ) ) {
-	// Validate required arguments
-	if ( ! isset( $args['source-site-url'] ) ) {
-		if ( $args['mode'] === 'crawler' ) {
-			$args['source-site-url'] = $args['data_url'];
-		} else {
-			help_message_and_die( 'The --source-site-url argument is required.' );
-		}
-	}
-	$index_file_pattern = '#(?:index|readme)\.(?:md|html|xhtml)$#i';
-	$import_path_prefix = '/imported-content';
-	$source_site_url    = $args['source-site-url'];
-
-	if ( $args['mode'] === 'local-directory' ) {
-		if ( ! isset( $args['data_url'] ) ) {
-			help_message_and_die( 'The "local-directory" argument is required.' );
-		}
-
-		PlaygroundProtocolClient::getInstance()->mountDirectory( $args['data_url'], '/files-to-import' );
-		$chrooted_fs = LocalFilesystem::create( '/files-to-import' );
-
-		$args['source-site-url'] = 'file:///';
-	} elseif ( $args['mode'] === 'git' ) {
-		if ( ! isset( $args['data_url'] ) ) {
-			help_message_and_die( 'The "repo" argument is required.' );
-		}
-
-		$args['repo'] = $args['data_url'];
-		if ( ! str_ends_with( $args['repo'], '.git' ) ) {
-			help_message_and_die( 'The "repo" argument must end with ".git" when mode is "git".' );
-		}
-
-		if ( ! isset( $args['branch'] ) ) {
-			help_message_and_die( 'The "branch" argument is required when mode is "git".' );
-		}
-
-		$console_writer->write( "Sparse checkout of the git repository\n" );
-		$temp_dir  = sys_get_temp_dir() . '/import-static-' . uniqid();
-		$cache_fs  = LocalFilesystem::create( $temp_dir );
-		$docs_repo = new GitRepository( $cache_fs );
-		$docs_repo->add_remote( 'origin', $args['repo'] );
-		$remote       = $docs_repo->get_remote_client( 'origin' );
-		$path_in_repo = $args['path-in-repo'] ?? '';
-		$branch       = $args['branch'] ?? 'trunk';
-		$remote->fetch(
-			$branch,
-			array(
-				'path' => $path_in_repo,
-				'shallow' => true,
-			)
-		);
-		$docs_repo->set_branch_tip( 'refs/heads/' . $branch, $docs_repo->get_branch_tip( 'refs/remotes/origin/' . $branch ) );
-		$docs_repo->checkout( 'refs/heads/' . $branch );
-		$git_fs      = GitFilesystem::create( $docs_repo );
-		$chrooted_fs = new ChrootLayer( $git_fs, $path_in_repo );
-	} elseif ( $args['mode'] === 'crawler' ) {
-		if ( ! isset( $args['data_url'] ) ) {
-			help_message_and_die( 'The "url" argument is required.' );
-		}
-		if ( ! WPURL::parse( $args['data_url'] ) ) {
-			help_message_and_die( 'The "url" argument must be a valid URL.' );
-		}
-		$args['source-site-url'] = $args['data_url'];
-		$tmp_dir                 = sys_get_temp_dir() . '/import-static-' . uniqid();
-		$chrooted_fs             = LocalFilesystem::create( $tmp_dir );
-		$crawler                 = new Crawler(
-			$args['data_url'],
-			array(
-				'preprocess_url' => function ( URL $url ) use ( $args ) {
-					if ( ! is_child_url_of( $url, $args['data_url'] ) ) {
-						return false;
-					}
-					$url->search = '';
-					if ( in_array( $url->pathname, array( '/feed/', '/wp-json/' ) ) ) {
-						return false;
-					}
-					if ( preg_match( '#^/\d{4}/\d{2}/\d{2}/[^/]+/$#', $url->pathname ) ) {
-						return $url;
-					}
-					if ( preg_match( '#^/[^/]+/$#', $url->pathname ) ) {
-						return $url;
-					}
-					return false;
-				},
-			)
-		);
-		$progress                = new ProgressBar( $console_writer, null );
-		$progress->start( 'Crawling website...' );
-		while ( $crawler->crawl_next() ) {
-			$parsed_url = WPURL::parse( $crawler->get_current_url() );
-			$file_path  = $parsed_url->pathname;
-			if ( $file_path === '/' ) {
-				$file_path = '/index.html';
-			} elseif ( str_ends_with( $file_path, '/' ) ) {
-				/**
-				 * Choose to treat /2021/10/03/dont-waste-time-on-boring-programming-lessons/ as
-				 * /2021/10/03/dont-waste-time-on-boring-programming-lessons.html
-				 *
-				 * Another possible choice would be to save it as
-				 * /2021/10/03/dont-waste-time-on-boring-programming-lessons/index.html
-				 */
-				$file_path = rtrim( $file_path, '/' );
-			}
-
-			if ( ! $file_path || strlen( $file_path ) < 1 ) {
-				$file_path = sha1( $crawler->get_current_url() );
-			}
-
-			$extension = pathinfo( $file_path, PATHINFO_EXTENSION );
-			if ( ! $extension ) {
-				$file_path .= '.html';
-			}
-
-			/**
-			 * Replace date-based paths with "posts" directory.
-			 *
-			 * Why? wp_insert_post() seems to mangle the post_name if it consists of a few numbers
-			 * and that messes up the URLs of the imported posts.
-			 *
-			 * @TODO: Investigate the reasons of this behavior.
-			 */
-			$file_path = preg_replace( '#/\d{4}/\d{2}/\d{2}/#', '/posts/', $file_path );
-			$content   = $crawler->get_current_content();
-			// @TODO: This is very naive – we should use the URL processor instead.
-			$content = preg_replace( '#/\d{4}/\d{2}/\d{2}/#', '/posts/', $content );
-
-			$chrooted_fs->mkdir( dirname( $file_path ), array( 'recursive' => true ) );
-			$chrooted_fs->put_contents(
-				$file_path,
-				$content
-			);
-			$progress->setMessage( 'Fetching ' . $parsed_url->pathname );
-			$progress->advance();
-		}
-		$progress->finish();
-	}
-	$entity_reader_factory = function () use ( $chrooted_fs, $source_site_url, $index_file_pattern ) {
-		return new FilesystemEntityReader(
-			$chrooted_fs,
-			array(
-				'index_file_pattern' => $index_file_pattern,
-				'filter_pattern' => '#\.(?:md|html|xhtml)$#',
-				/**
-				 * Use a number so large, there's no chance for wp_table INSERTs
-				 * to interfere with the post IDs generated by the FilesystemEntityReader.
-				 *
-				 * Some inserts are ran even by the importer, e.g. frontloading stubs.
-				 *
-				 * @TODO: Make sure this doesn't automatically bump the AUTOINCREMENT counter in MySQL.
-				 * @TODO: Bump the AUTOINCREMENT counter manually after a finished import.
-				 */
-				'first_post_id' => 10000000,
-				'base_url' => $source_site_url,
-			)
-		);
-	};
-
-	/**
-	 * Maps a filesystem path to a WordPress-friendly URL path we can assign
-	 * to the imported page.
-	 *
-	 * Example: "/docs/README.md" -> "/docs/readme"
-	 *
-	 * @param string $path The filesystem path to convert
-	 * @return string The WordPress-friendly URL path
-	 */
-	function map_file_path_to_wordpress_url( $path ) {
-		global $index_file_pattern, $import_path_prefix;
-
-		/**
-		 * Ensure a named top-level parent directory to base the entire
-		 * URL structure on. The goal is to have a consistent way to resolve
-		 * URLs for all the following files:
-		 *
-		 * - README.md
-		 * - chapter-5/README.md
-		 * - chapter-5/section-1.md
-		 * - chapter-5/section-3/readme.md
-		 *
-		 * Without the top-level directory, the best URL we can give the
-		 * /README.md file would be `/readme`. However, the `chapter-5/README.md`
-		 * would get a URL like `/chapter-5` which is inconsistent. However,
-		 * if we transform the path structure as follows, everything becomes
-		 * consistent:
-		 *
-		 * - /imported-content/README.md
-		 * - /imported-content/chapter-5/README.md
-		 * - /imported-content/chapter-5/section-1.md
-		 * - /imported-content/chapter-5/section-3/readme.md
-		 *
-		 * We want to keep all the links working after the import. A single,
-		 * consistent URL mapping strategy makes it much easier. The alternative
-		 * would be to maintain a mapping of parents to paths and use it whenever
-		 * creating pages and rewriting URLs.
-		 *
-		 * This isn't trivial. Having a top-level path prefix is not perfect,
-		 * but it's a sound compromise.
-		 */
-		$path = wp_join_unix_paths( $import_path_prefix, $path );
-
-		if ( 1 === preg_match( $index_file_pattern, $path ) ) {
-			$path = dirname( $path );
-		}
-
-		$extensions = array( '.md', '.html', '.xhtml' );
-		foreach ( $extensions as $ext ) {
-			if ( str_ends_with( $path, $ext ) ) {
-				$path = substr( $path, 0, -strlen( $ext ) );
-				break;
-			}
-		}
-
-		return strtolower( $path );
-	}
-
-	/**
-	 * Transforms links pointing to imported static files (e.g. ./getting-started.md)
-	 * to the format they will have after being imported into WordPress (e.g. /docs/getting-started).
-	 */
-	add_action(
-		'data_liberation.stream_importer.postprocess_url',
-		function (
-			$processor,
-			$context
-		) use (
-			$chrooted_fs,
-			/**
-			 * With &, $import_path_prefix reflects the latest value.
-			 * Without &, it's a local copy of the value from the outer scope.
-			 */
-			&$import_path_prefix
-		) {
-			/**
-			 * If we didn't rewrite the base URL, the URL points outside
-			 * of the imported root directory. Let's keep it as it is.
-			 */
-			if ( ! $context['applied_base_url_mapping'] ) {
-				return;
-			}
-
-			$path_original = $processor->get_parsed_url()->pathname;
-
-			/**
-			 * Remove the site path from the URL path and check:
-			 * Is this URL pointing to a file that exists in the imported
-			 * directory?
-			 */
-			$base_url_path_prefix  = $context['applied_base_url_mapping']['to']->pathname;
-			$path_relative_to_base = substr( $path_original, strlen( $base_url_path_prefix ) );
-			if ( $chrooted_fs->is_file( $path_relative_to_base ) ) {
-				/**
-				 * Yes! We are linking to an imported page. Let's transform the link
-				 * to a WordPress-friendly URL scheme.
-				 */
-				$path_rewritten = map_file_path_to_wordpress_url( $path_relative_to_base );
-				$path_rewritten = wp_join_unix_paths( $base_url_path_prefix, $path_rewritten );
-			} elseif ( $processor->is_url_absolute() ) {
-				/**
-				 * No. We are linking to a content page within our site but there is
-				 * no corresponding static file. This happens e.g. in the Gutenberg
-				 * handbook where the markdown files contain absolute URLs to the deployed
-				 * site, e.g.:
-				 *
-				 *     Start by ensuring you have Node.js and `npm` installed on your computer. Review
-				 *     the [Node.js development environment](https://developer.wordpress.org/block-editor/getting-started/devenv/nodejs-development-environment/) guide if not.
-				 *
-				 * Our best shot is to keep the URL as is, just with the imported
-				 * content root prepended to it.
-				 */
-				$path_rewritten = wp_join_unix_paths( $base_url_path_prefix, $import_path_prefix, $path_relative_to_base );
-			} else {
-				/**
-				 * It's a relative URL pointing somewhere within the URL space we're importing
-				 * to, but there is no corresponding static file. This is unexpected. There is
-				 * nothing we can do at this point – let's just keep the URL as it is.
-				 */
-				return;
-			}
-			$processor->set_url(
-				$path_rewritten,
-				WPURL::parse( $path_rewritten, $processor->get_parsed_url() )
-			);
-		},
-		10,
-		3
-	);
-
-	/**
-	 * Assigns post_name to every imported static page.
-	 */
-	add_filter(
-		'data_liberation.stream_importer.preprocess_entity',
-		function ( $entity ) use ( &$import_path_prefix, $index_file_pattern ) {
-			static $preprocessed_an_entity = false;
-			if ( $entity->get_type() !== 'post' ) {
-				return $entity;
-			}
-
-			$data = $entity->get_data();
-
-			if ( isset( $data['parsed_metadata']['slug'] ) ) {
-				$data['post_name'] = basename( $data['parsed_metadata']['slug'][0] );
-			} elseif ( isset( $data['local_file_path'] ) ) {
-				/**
-				 * The default import content path is "/imported-content". However,
-				 * maybe we can find a friendlier path prefix based on the post
-				 * title of the top-level index file.
-				 *
-				 * For example, a "Getting Started" guide found at "README.md"
-				 * could be imported to "/getting-started".
-				 */
-				if ( ! $preprocessed_an_entity ) {
-					$preprocessed_an_entity           = true;
-					$dirname                          = dirname( $data['local_file_path'] );
-					$dirname_makes_a_bad_slug         = $dirname !== '.' && $dirname === '/';
-					$is_index_file                    = 1 === preg_match( $index_file_pattern, $data['local_file_path'] );
-					$post_title_not_derived_from_path = $data['post_title'] !== ImportUtils::slug_to_title( basename( $data['local_file_path'] ) );
-
-					if (
-						$dirname_makes_a_bad_slug &&
-						$is_index_file &&
-						$post_title_not_derived_from_path &&
-						strlen( $data['post_title'] ) > 1
-					) {
-						$import_path_prefix = wp_import_slugify( $data['post_title'] );
-					}
-				}
-
-				$wordpress_url     = map_file_path_to_wordpress_url( $data['local_file_path'] );
-				$data['post_name'] = basename( $wordpress_url );
-			} else {
-				return $entity;
-			}
-
-			$entity->set_data( $data );
-			return $entity;
-		},
-		10,
-		2
-	);
-} elseif ( $args['mode'] === 'wxr' ) {
-	if ( ! isset( $args['data_url'] ) ) {
-		help_message_and_die( 'The "wxr file" argument is required.' );
-	}
-	$entity_reader_factory = function ( $cursor ) use ( $args ) {
-		return WXREntityReader::create(
-			uri_to_byte_stream( $args['data_url'] ),
-			$cursor
-		);
-	};
-} elseif ( $args['mode'] === 'epub' ) {
-	if ( ! isset( $args['data_url'] ) ) {
-		help_message_and_die( 'The "epub file" argument is required.' );
-	}
-	$zip_fs                = ZipFilesystem::create(
-		uri_to_byte_stream( $args['data_url'] )
-	);
-	$entity_reader_factory = function ( $cursor = null ) use ( $zip_fs ) {
-		return new EPubEntityReader(
-			$zip_fs,
-			1000000 // This is first post ID. We should really also accept a cursor
-		);
-	};
-	$reader                = $entity_reader_factory();
-	$source_site_url       = 'file://' . dirname( $reader->get_manifest_path() );
-
-	// To source the media files from the EPUB bundle:
-	$chrooted_fs = $zip_fs;
-
-	/**
-	 * Drop .xhtml extension from the links.
-	 */
-	add_action(
-		'data_liberation.stream_importer.postprocess_url',
-		function ( $processor ) {
-			$parsed_url = $processor->get_parsed_url();
-			if ( ! str_ends_with( $parsed_url->pathname, '.xhtml' ) ) {
-				return;
-			}
-			$parsed_url->pathname = substr( $parsed_url->pathname, 0, -6 );
-			$processor->set_url(
-				$parsed_url . '',
-				$parsed_url
-			);
-		}
-	);
-} else {
-	help_message_and_die( 'The "mode" argument is required and must be one of: "local-directory", "git", "crawler", "wxr", or "epub".' );
-	exit( 1 );
-}
-
-function uri_to_byte_stream( $uri ) {
-	if ( str_starts_with( $uri, 'http://' ) || str_starts_with( $uri, 'https://' ) ) {
-		$local_path = tempnam( sys_get_temp_dir(), 'wp-remote-file-' );
-		file_put_contents( $local_path, file_get_contents( $uri ) );
-		$uri = $local_path;
-
-		// @TODO: Use SeekableRequestReadStream here instead of
-		// pre-downloading the file to disk.
-		// $client = new Client();
-		// $response = $client->fetch($uri);
-	}
-	if ( file_exists( $uri ) ) {
-		return FileReadStream::from_path( $uri );
-	}
-	throw new \Exception( "Unknown resource type: $uri. If that's a local file, \033[1mplease provide an absolute path to the file\033[0m." );
-}
-
-
-/**
- * Naive slugification function.
- *
- * @TODO: Use a more sophisticated one with utf-8 support etc.
- */
-function wp_import_slugify( $title ) {
-	return preg_replace( '/[^a-z0-9]+/i', '-', trim( strtolower( $title ) ) );
-}
-
-$data_url = $args['data_url'];
-$console_writer->write( "Importing static files from $data_url\n" );
-
-
-try {
-	// Parse URL mapping arguments
-	$additional_url_mappings = array();
-	foreach ( $parser->getArray( 'additional-site-urls' ) as $url ) {
-		$additional_url_mappings[] = array(
-			'from' => $url,
-			'to' => NEW_SITE_CONTENT_ROOT,
-		);
-	}
-
-	$console_writer->write( "Starting the import\n" );
-	$importer = StreamImporter::create(
-		$entity_reader_factory,
-		array(
-			'source_site_url' => $source_site_url,
-			'new_site_content_root_url' => NEW_SITE_CONTENT_ROOT,
-			'source_media_root_urls' => $parser->getArray( 'media-url' ) ?: array( $source_site_url ),
-			'additional_url_mappings' => $additional_url_mappings,
-			'index_batch_size' => 1,
-			'attachment_downloader_options' => array(
-				'source_from_filesystem' => $chrooted_fs,
-			),
-		)
-	);
-
-	$import_session   = ImportSession::create(
-		array(
-			'data_source' => 'local_directory',
-			// @TODO: the phrase "file_name" doesn't make sense here. We're sourcing
-			// data from a directory, not a file. This string is used to tell
-			// the user in the UI what this they're importing in this import
-			// session. Let's rename it to something more descriptive.
-			'file_name' => $args['data_url'],
-		)
-	);
-	$retries_iterator = new RetryFrontloadingIterator( $import_session->get_id() );
-	$importer->set_frontloading_retries_iterator( $retries_iterator );
-
-	// @TODO: Prettier progress reporting
-	$ignored_message_printed = false;
-	do {
-		$result = data_liberation_import_step_customized( $import_session, $importer, $console_writer );
-		if ( $importer->get_stage() === StreamImporter::STAGE_FINISHED ) {
-			$console_writer->write( "\n" );
-			$console_writer->write( "\033[1;32mImport finished!\033[0m See your imported content at: \n" );
-
-			// Get the first page with non-empty content.
-			$posts = get_posts(
-				array(
-					'numberposts' => 10,
-					'orderby' => 'ID',
-					'order' => 'ASC',
-					'post_type' => 'page',
-					'post_status' => 'publish',
-				)
-			);
-
-			$url = NEW_SITE_CONTENT_ROOT;
-			foreach ( $posts as $post ) {
-				if ( ! empty( $post->post_content ) ) {
-					$url = get_permalink( $post );
-					break;
-				}
-			}
-			$console_writer->write( "\033[1;36m" . $url . "\033[0m\n" );
-			break;
-		} elseif ( false === $result ) {
-			if ( $importer->get_stage() === StreamImporter::STAGE_FRONTLOAD_ASSETS ) {
-				if ( ! $ignored_message_printed ) {
-					$console_writer->write( "\nSome assets could not be downloaded – they will be ignored so we can continue with the import.\n" );
-					$ignored_message_printed = true;
-				}
-				// $import_session->mark_frontloading_errors_as_ignored();
-			} else {
-				$console_writer->write( "Import failed, aborting\n" );
-				break;
-			}
-		} else {
-			// Twiddle our thumbs, importing in progress...
-		}
-	} while ( true );
-} finally {
-	if ( isset( $cache_fs ) ) {
-		$cache_fs->rmdir(
-			'/',
-			array(
-				'recursive' => true,
-			)
-		);
-	}
-}
-
-/**
- * @TODO: Expose a primitive like the step function below from the
- *        DataLiberation PHP component. Support all sorts of pause conditions
- *        such as time limits, retry counts, memory limits, etc.
- */
-function data_liberation_import_step_customized( ImportSession $session, StreamImporter $importer, ConsoleWriter $console_writer ) {
-	$soft_time_limit_seconds = 15;
-	$hard_time_limit_seconds = 25;
-	$start_time              = microtime( true );
-	$fetched_files           = 0;
-	$progress_bar            = null;
-
-	while ( true ) {
-		$time_taken = microtime( true ) - $start_time;
-		if ( $time_taken >= $soft_time_limit_seconds ) {
-			if ( $importer->get_stage() === StreamImporter::STAGE_FRONTLOAD_ASSETS ) {
-				if ( $fetched_files > 0 ) {
-					return true;
-				}
-			} else {
-				return true;
-			}
-		}
-		if ( $time_taken >= $hard_time_limit_seconds ) {
-			return true;
-		}
-
-		if ( true !== $importer->next_step() ) {
-			$session->set_reentrancy_cursor( $importer->get_reentrancy_cursor() );
-
-			$should_advance_to_next_stage = null !== $importer->get_next_stage();
-			if ( $should_advance_to_next_stage ) {
-				if ( StreamImporter::STAGE_FRONTLOAD_ASSETS === $importer->get_stage() ) {
-					$resolved_all_failures = $session->count_unfinished_frontloading_stubs() === 0;
-					if ( ! $resolved_all_failures ) {
-						// Uncomment once this script's intent becomes exiting on unresolved frontloading failures.
-						// if($progress_bar) {
-						// $progress_bar->finish();
-						// }
-						// return false;
-					}
-				}
-			}
-			if ( ! $importer->advance_to_next_stage() ) {
-				if ( $progress_bar ) {
-					$progress_bar->finish();
-				}
-				return false;
-			}
-			$session->set_stage( $importer->get_stage() );
-			$session->set_reentrancy_cursor( $importer->get_reentrancy_cursor() );
-			$console_writer->clearLine();
-			$progress_bar = null;
-
-			continue;
-		}
-
-		switch ( $importer->get_stage() ) {
-			case StreamImporter::STAGE_INDEX_ENTITIES:
-				$entities_counts = $importer->get_indexed_entities_counts();
-				$session->create_frontloading_stubs( $importer->get_indexed_assets_urls() );
-				$session->bump_total_number_of_entities( $entities_counts );
-				if ( ! $progress_bar ) {
-					$progress_bar = new ProgressBar( $console_writer, null );
-					$progress_bar->setMessage( 'Indexing entities' );
-					$progress_bar->start();
-				}
-				$progress_bar->setCurrent( array_sum( $session->get_total_number_of_entities() ) );
-				break;
-
-			case StreamImporter::STAGE_FRONTLOAD_ASSETS:
-				$progress = $importer->get_frontloading_progress();
-				$session->bump_frontloading_progress(
-					$progress,
-					$importer->get_frontloading_events()
-				);
-
-				if ( ! $progress_bar ) {
-					$progress_bar = new ProgressBar( $console_writer, null );
-					$progress_bar->setMessage( 'Fetching media files' );
-					$progress_bar->start();
-				}
-				$progress_bar->setCurrent( $session->count_unfinished_frontloading_stubs() );
-				break;
-
-			case StreamImporter::STAGE_IMPORT_ENTITIES:
-				$imported_counts = $importer->get_imported_entities_counts();
-
-				$session->bump_imported_entities_counts( $imported_counts );
-
-				if ( ! $progress_bar ) {
-					$progress_bar = new ProgressBar( $console_writer, $session->count_remaining_entities() );
-					$progress_bar->setMessage( 'Importing entities' );
-					$progress_bar->start();
-				}
-				$progress_bar->setCurrent( $session->count_all_imported_entities() );
-				break;
-		}
-
-		$session->set_reentrancy_cursor( $importer->get_reentrancy_cursor() );
-	}
-	return false;
-}
diff --git a/phar-libraries.json b/phar-libraries.json
index ef512789..3c64ec29 100644
--- a/phar-libraries.json
+++ b/phar-libraries.json
@@ -1,7 +1,7 @@
 {
 	"$schema": "https://raw.githubusercontent.com/box-project/box/refs/heads/main/res/schema.json",
 	"main": "vendor/autoload.php",
-	"output": "dist/wordpress-libraries.phar",
+	"output": "dist/php-toolkit.phar",
 	"force-autodiscovery": false,
 	"compactors": [
 		"KevinGH\\Box\\Compactor\\Php"
diff --git a/plugins/data-liberation/plugin.php b/plugins/data-liberation/plugin.php
index b719bcd7..038df052 100644
--- a/plugins/data-liberation/plugin.php
+++ b/plugins/data-liberation/plugin.php
@@ -19,9 +19,9 @@
 use WordPress\HttpClient\Request;
 use WordPress\Markdown\MarkdownImporter;
 
-if(file_exists(__DIR__ . '/wordpress-libraries.phar')) {
+if(file_exists(__DIR__ . '/php-toolkit.phar')) {
     // Production – built and installed plugin
-	require_once __DIR__ . '/wordpress-libraries.phar';
+	require_once __DIR__ . '/php-toolkit.phar';
 } else {
 	// Development – plugin mounted in WordPress via Playground CLI mounts
 	require_once __DIR__ . '/../../vendor/autoload.php';

From 0120ee30e0c2eea8e882e8ed78571c028e3e6a16 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= <adam@adamziel.com>
Date: Thu, 29 May 2025 14:26:01 +0200
Subject: [PATCH 4/6] Use the Data Liberation importer for processing WXR files

---
 components/Blueprints/Runner.php              |   2 +-
 .../Blueprints/Steps/ImportContentStep.php    |  60 +-
 .../Steps/scripts/import-content.php          | 884 ++++++++++++++++++
 components/Blueprints/bin/blueprint.php       |  81 +-
 components/CLI/CLI.php                        | 119 +++
 components/CLI/composer.json                  |  26 +
 components/DataLiberation/URL/functions.php   | 245 ++---
 composer.json                                 |   1 +
 8 files changed, 1181 insertions(+), 237 deletions(-)
 create mode 100644 components/Blueprints/Steps/scripts/import-content.php
 create mode 100644 components/CLI/CLI.php
 create mode 100644 components/CLI/composer.json

diff --git a/components/Blueprints/Runner.php b/components/Blueprints/Runner.php
index 1c943baf..fcfc9c5e 100644
--- a/components/Blueprints/Runner.php
+++ b/components/Blueprints/Runner.php
@@ -632,7 +632,7 @@ private function createExecutionPlan(): array {
 						);
 					}
 					$this->configuration->getLogger()->info( 'Loading importer libraries from ' . $libraries_phar_path );
-					$source = $this->createDataReference( new AbsoluteLocalPath( $libraries_phar_path ) );
+					$source = $this->createDataReference( new InlineFile( 'php-toolkit.phar', file_get_contents( $libraries_phar_path ) ) );
 				}
 				array_unshift( $plan, $this->createStepObject( 'writeFiles', [
 					'files' => [
diff --git a/components/Blueprints/Steps/ImportContentStep.php b/components/Blueprints/Steps/ImportContentStep.php
index 99c9d368..0378d6bf 100644
--- a/components/Blueprints/Steps/ImportContentStep.php
+++ b/components/Blueprints/Steps/ImportContentStep.php
@@ -65,49 +65,31 @@ private function importWxr( Runtime $runtime, array $content_definition ): void
 		}
 
 		$wxrPath = $runtime->saveToTemporaryFile( $resolved );
+		// @TODO: Make it work when Blueprints are running as phar archive
+		$import_script_path = __DIR__ . '/scripts/import-content.php';
+		if ( ! file_exists( $import_script_path ) ) {
+			throw new BlueprintExecutionException( sprintf(
+				'Import script %s does not exist.',
+				$import_script_path
+			) );
+		}
+
+		$importer_script = file_get_contents( $import_script_path );
 		$runtime->evalPhpCodeInSubProcess(
 			<<<'PHP'
 <?php
-require_once getenv('DOCROOT') . '/wp-load.php';
-require_once getenv('DOCROOT') . '/wp-admin/includes/admin.php';
-
-kses_remove_filters();
-$admin_id = get_users(array('role' => 'Administrator') )[0]->ID;
-wp_set_current_user( $admin_id );
-
-wp_set_current_user( $admin_id );
-$importer = new WXR_Importer( array(
-'fetch_attachments' => true,
-// @TODO: Support custom author
-'default_author' => $admin_id
-) );
-$logger = new WP_Importer_Logger_CLI();
-$importer->set_logger( $logger );
-// Slashes from the imported content are lost if we don't call wp_slash here.
-add_action( 'wp_insert_post_data', function( $data ) {
-return wp_slash($data);
-});
-
-// Ensure that Site Editor templates are associated with the correct taxonomy.
-add_filter( 'wp_import_post_terms', function ( $terms, $post_id ) {
-foreach ( $terms as $post_term ) {
-if ( 'wp_theme' !== $term['taxonomy'] ) {continue;}
-$post_term = get_term_by('slug', $term['slug'], $term['taxonomy'] );
-if ( ! $post_term ) {
-$post_term = wp_insert_term(
-$term['slug'],
-$term['taxonomy']
-);
-$term_id = $post_term['term_id'];
-} else {
-$term_id = $post_term->term_id;
-}
-wp_set_object_terms( $post_id, $term_id, $term['taxonomy']) ;
-}
-return $terms;
-}, 10, 2 );
-$result = $importer->import( getenv('WXR_PATH') );
+// @TODO: Just call a function here, do not go through CLI arguments.
+// @TODO: Establish a communication channel between the main process and the subprocess
+//        to report progress and errors.
+// @TODO: Enforce chrooting of the imported static files.
+$_SERVER['argv'] = [
+	'import-wxr.php',
+	'wxr',
+	getenv('WXR_PATH')
+];
+?>
 PHP
+			. $importer_script
 			,
 			[
 				'WXR_PATH' => $wxrPath,
diff --git a/components/Blueprints/Steps/scripts/import-content.php b/components/Blueprints/Steps/scripts/import-content.php
new file mode 100644
index 00000000..cbb7c1b5
--- /dev/null
+++ b/components/Blueprints/Steps/scripts/import-content.php
@@ -0,0 +1,884 @@
+<?php
+
+use WordPress\ByteStream\ReadStream\FileReadStream;
+use WordPress\CLI\CLI;
+use WordPress\DataLiberation\EntityReader\EPubEntityReader;
+use WordPress\DataLiberation\EntityReader\FilesystemEntityReader;
+use WordPress\DataLiberation\EntityReader\WXREntityReader;
+use WordPress\DataLiberation\Importer\ImportSession;
+use WordPress\DataLiberation\Importer\ImportUtils;
+use WordPress\DataLiberation\Importer\RetryFrontloadingIterator;
+use WordPress\DataLiberation\Importer\StreamImporter;
+use WordPress\DataLiberation\URL\WPURL;
+use WordPress\Filesystem\Layer\ChrootLayer;
+use WordPress\Filesystem\LocalFilesystem;
+use WordPress\Git\GitFilesystem;
+use WordPress\Git\GitRepository;
+use WordPress\Zip\ZipFilesystem;
+
+use function WordPress\Filesystem\wp_join_unix_paths;
+
+require_once getenv('DOCROOT') . '/wp-load.php';
+require_once getenv('DOCROOT') . '/php-toolkit.phar';
+
+interface ConsoleWriter {
+    /**
+     * Writes text at the current cursor position.
+     *
+     * @param string $text Text to write.
+     */
+    public function write(string $text): void;
+
+    /**
+     * Moves the cursor to the beginning of the line and clears everything after it.
+     */
+    public function clearLine(): void;
+
+    /**
+     * Replaces the current line with new text.
+     *
+     * @param string $text New text for the line.
+     */
+    public function replaceLine(string $text): void;
+
+    /**
+     * Writes multiple lines, optionally replacing previously written output.
+     *
+     * @param array $lines   Text lines to write.
+     * @param bool  $replace Whether to replace previous output.
+     */
+    public function writeLines(array $lines, bool $replace = false): void;
+}
+
+class PhpConsoleWriter implements ConsoleWriter {
+    // Writes to php://stdout. Cursor-control sequences are only sent when stdout is a TTY.
+    private $stdout;
+
+    public function __construct() {
+        $this->stdout = fopen('php://stdout', 'w');
+    }
+
+    public function __destruct() {
+        fclose($this->stdout);
+    }
+
+    public function write(string $text): void {
+        fwrite($this->stdout, $text);
+    }
+
+    public function clearLine(): void {
+        if (!$this->isTty()) {
+            return;
+        }
+        fwrite($this->stdout, "\r\033[K"); // Return to start + clear to end
+    }
+
+    public function replaceLine(string $text): void {
+        $this->clearLine();
+        $this->write($text);
+    }
+
+    public function writeLines(array $lines, bool $replace = false): void {
+        if ($replace && $this->isTty()) {
+            // Each written line ends with PHP_EOL, so the cursor rests one row below the
+            // previous output: move up once per line, the first included, to erase it all.
+            foreach ($lines as $line) {
+                fwrite($this->stdout, "\033[1A"); // Move up one line
+                $this->clearLine();
+            }
+        }
+
+        foreach ($lines as $line) {
+            $this->write($line . PHP_EOL);
+        }
+    }
+
+    private function isTty(): bool {
+        return stream_isatty($this->stdout);
+    }
+}
+
+/**
+ * Renders a single-line progress bar through a ConsoleWriter. A null total
+ * switches to an indeterminate, bouncing "<=>" marker with an item counter.
+ */
+class ProgressBar {
+    private ConsoleWriter $writer;
+    private ?int $total;
+    private int $current;
+    private int $width;
+    private string $message;
+    private float $startTime;
+    private bool $started = false;
+    private bool $indeterminate = false;
+
+    /**
+     * @param ConsoleWriter $writer Receives the rendered bar.
+     * @param int|null      $total  Number of items; null means indeterminate.
+     * @param int           $width  Width of the bar in characters.
+     */
+    public function __construct(ConsoleWriter $writer, ?int $total = 100, int $width = 50) {
+        $this->writer = $writer;
+        $this->total = $total;
+        $this->indeterminate = ($total === null);
+        $this->current = 0;
+        $this->width = $width;
+        $this->message = '';
+    }
+
+    /** Starts the timer and draws the bar. Nothing is rendered before this call. */
+    public function start(): void {
+        if ($this->started) {
+            return;
+        }
+        $this->started = true;
+        $this->startTime = microtime(true);
+        $this->update();
+    }
+
+    public function advance(int $step = 1): void {
+        $this->setCurrent($this->current + $step);
+    }
+
+    public function setCurrent(int $current): void {
+        $this->current = $this->indeterminate ? $current : min($this->total, max(0, $current));
+        $this->update();
+    }
+
+    public function setMessage(string $message): void {
+        $this->message = $message;
+        $this->update();
+    }
+
+    public function finish(): void {
+        if (!$this->started) {
+            return;
+        }
+        if (!$this->indeterminate) {
+            $this->current = $this->total;
+        }
+        $this->update();
+        $this->writer->write("\n");
+    }
+
+    private function update(): void {
+        if (!$this->started) {
+            return;
+        }
+
+        if ($this->indeterminate) {
+            $this->updateIndeterminate();
+        } else {
+            $this->updateDeterminate();
+        }
+    }
+
+    private function updateIndeterminate(): void {
+        $elapsed = microtime(true) - $this->startTime;
+
+        // Create a "moving" animation for indeterminate progress
+        $position = (int)($elapsed * 5) % ($this->width * 2);
+        if ($position >= $this->width) {
+            $position = $this->width * 2 - $position;
+        }
+
+        $spaces_before = min(max(0, $position), $this->width - 3);
+        $spaces_after = max(0, $this->width - $position - 3);
+
+        $bar = str_repeat(' ', $spaces_before) . '<=>' . str_repeat(' ', $spaces_after);
+        $status = sprintf("[%s] %d items - %s", $bar, $this->current, $this->message);
+
+        $this->writer->replaceLine($status);
+    }
+
+    private function updateDeterminate(): void {
+        // A total of 0 (nothing to process) is drawn as complete rather than dividing by zero.
+        $percentage = $this->total > 0 ? $this->current / $this->total : 1.0;
+        $filled = (int)round($this->width * $percentage);
+        $empty = $this->width - $filled;
+
+        $bar = str_repeat('=', $filled);
+        if ($empty > 0) {
+            $bar .= '>';
+            $bar .= str_repeat(' ', $empty - 1);
+        }
+
+        $status = sprintf("[%s] %d/%d - %s", $bar, $this->current, $this->total, $this->message);
+
+        $this->writer->replaceLine($status);
+    }
+}
+
+$console_writer = new PhpConsoleWriter();
+
+// Parse CLI arguments
+function show_error_message_and_die( $error = false ) {
+	// Prints a bold red "Error:" label followed by $error, then exits with status 1.
+	global $console_writer;
+	foreach ( array( "\033[1;31mError:\033[0m ", $error, "\n" ) as $chunk ) {
+		$console_writer->write( $chunk );
+	}
+	exit( 1 );
+}
+
+/**
+ * Prints the usage text and terminates the script.
+ * @param string|null $error Optional error printed above the usage text; when given, the exit status is 1 instead of 0.
+ */
+function help_message_and_die( $error = null ) {
+	global $console_writer;
+
+	if ( null !== $error ) {
+		$console_writer->write( "\033[1;31mError:\033[0m " . $error . "\n\n" );
+	}
+	$console_writer->write( "\033[1;32mDescription:\033[0m\n" );
+	$console_writer->write( "  Imports content into a new WordPress site\n\n" );
+
+	$console_writer->write( "\033[1;32mUsage:\033[0m\n" );
+	$console_writer->write( "  php import-markdown-directory.php <mode> [options]\n\n" );
+
+	$console_writer->write( "\033[1;32mModes:\033[0m\n" );
+	$console_writer->write( "  \033[1;33mlocal-directory\033[0m        Import content from a local directory\n" );
+	$console_writer->write( "  \033[1;33mgit\033[0m         Import content from a git repository\n" );
+	$console_writer->write( "  \033[1;33mwxr\033[0m         Import content from a WordPress eXtended RSS file\n" );
+	$console_writer->write( "  \033[1;33mepub\033[0m        Import content from an EPUB ebook\n\n" );
+
+	$console_writer->write( "\033[1;32mGlobal Options:\033[0m\n" );
+	$console_writer->write( "  \033[1;34m--source-site-url=<url>\033[0m\n" );
+	$console_writer->write( "      Base URL of the source content (required)\n\n" );
+	$console_writer->write( "  \033[1;34m--additional-site-urls=<url>\033[0m\n" );
+	$console_writer->write( "      Additional URLs to rewrite links for (multiple allowed)\n\n" );
+	$console_writer->write( "  \033[1;34m--media-url=<url>\033[0m\n" );
+	$console_writer->write( "      URLs to download media files from (multiple allowed)\n\n" );
+	$console_writer->write( "  \033[1;34m--output-dir=<path>\033[0m\n" );
+	$console_writer->write( "      Create the new WordPress site in this directory\n" );
+	$console_writer->write( "      Must be empty and have write permissions\n\n" );
+
+	$console_writer->write( "\033[1;32mMode-specific Usage:\033[0m\n" );
+	$console_writer->write( "\033[1;33mgit\033[0m mode:\n" );
+	$console_writer->write( "  php import-markdown-directory.php git <repo_url>\n" );
+	$console_writer->write( "  Options:\n" );
+	$console_writer->write( "    \033[1;34m--branch=<branch>\033[0m\n" );
+	$console_writer->write( "        Git branch to import from (required)\n" );
+	$console_writer->write( "    \033[1;34m--path-in-repo=<path>\033[0m\n" );
+	$console_writer->write( "        Subdirectory in repository to import from\n\n" );
+	$console_writer->write( "\033[1;33mlocal-directory\033[0m mode:\n" );
+	$console_writer->write( "  php import-markdown-directory.php local-directory <directory>\n" );
+	$console_writer->write( "  Imports content from local <directory>\n\n" );
+	$console_writer->write( "\033[1;33mwxr\033[0m mode:\n" );
+	$console_writer->write( "  php import-markdown-directory.php wxr <url or local path>\n" );
+	$console_writer->write( "  Imports content from a WordPress eXtended RSS file\n\n" );
+	$console_writer->write( "\033[1;33mepub\033[0m mode:\n" );
+	$console_writer->write( "  php import-markdown-directory.php epub <url or local path>\n" );
+	$console_writer->write( "  Imports content from an EPUB ebook\n\n" );
+
+	exit( null === $error ? 0 : 1 );
+}
+
+// Define the option definitions as documented above
+$optionDefs = [
+	// General options
+	'mode'                   => [ 'm', true, null, 'Import mode (git|local-directory|wxr|epub) (required)' ],
+	'output-dir'             => [ 'o', true, null, 'Directory to create the new WordPress site in (required)' ],
+	'source-site-url'        => [ 's', true, null, 'Base URL of the source content (required for most modes)' ],
+	'additional-site-urls'   => [ 'a', true, null, 'Additional URLs to rewrite links for (can be specified multiple times)' ],
+	'media-url'              => [ null, true, null, 'URLs to download media files from (can be specified multiple times)' ],
+
+	// git mode
+	'branch'                 => [ 'b', true, null, 'Git branch to import from (required for git mode)' ],
+	'path-in-repo'           => [ 'p', true, null, 'Subdirectory in repository to import from (optional for git mode)' ],
+
+	// Help
+	'help'                   => [ 'h', false, false, 'Show help' ],
+];
+
+// Parse CLI arguments and options
+try {
+	list( $positionals, $options ) = CLI::parseCommandArgsAndOptions( array_slice( $_SERVER['argv'], 1 ), $optionDefs );
+} catch ( InvalidArgumentException $e ) {
+	show_error_message_and_die( $e->getMessage() );
+}
+
+if ( $options['help'] ?? false ) {
+	help_message_and_die();
+}
+
+define( 'NEW_SITE_CONTENT_ROOT', get_site_url() );
+$console_writer->write( 'Target site URL: ' . NEW_SITE_CONTENT_ROOT . "\n" );
+
+
+// Map positional arguments to their meaning based on mode
+// (first positional is always the mode if not given as --mode)
+if ( empty( $options['mode'] ) && !empty( $positionals ) ) {
+	$options['mode'] = array_shift( $positionals );
+}
+
+// For each mode, map the next positional(s) to the correct option
+switch ( $options['mode'] ?? null ) {
+	case 'git':
+		// git <repo_url>
+		if ( !isset( $options['data_url'] ) && !empty( $positionals ) ) {
+			$options['data_url'] = array_shift( $positionals );
+		}
+		break;
+	case 'local-directory':
+		// local-directory <directory>
+		if ( !isset( $options['data_url'] ) && !empty( $positionals ) ) {
+			$options['data_url'] = array_shift( $positionals );
+		}
+		break;
+	case 'wxr':
+	case 'epub':
+		// wxr <url or local path>
+		// epub <url or local path>
+		if ( !isset( $options['data_url'] ) && !empty( $positionals ) ) {
+			$options['data_url'] = array_shift( $positionals );
+		}
+		break;
+}
+
+// Support multiple --additional-site-urls and --media-url
+foreach ( [ 'additional-site-urls', 'media-url' ] as $multiOpt ) {
+	if ( isset( $options[ $multiOpt ] ) && !is_array( $options[ $multiOpt ] ) ) {
+		$options[ $multiOpt ] = [ $options[ $multiOpt ] ];
+	}
+	// Scan $positionals for repeated --foo=bar style (if CLI parser doesn't already do this)
+}
+
+// For compatibility with legacy code below, map to $args
+$args = $options;
+
+$chrooted_fs     = null;
+$source_site_url = null;
+if ( in_array( $args['mode'], array( 'local-directory', 'git' ) ) ) {
+	// Validate required arguments
+	if ( ! isset( $args['source-site-url'] ) ) {
+		show_error_message_and_die( 'The --source-site-url argument is required.' );
+	}
+	$index_file_pattern = '#(?:index|readme)\.(?:md|html|xhtml)$#i';
+	$import_path_prefix = '/imported-content';
+	$source_site_url    = $args['source-site-url'];
+
+	if ( $args['mode'] === 'local-directory' ) {
+		if ( ! isset( $args['data_url'] ) ) {
+			show_error_message_and_die( 'The "local-directory" positional argument is required.' );
+		}
+
+		$chrooted_fs = LocalFilesystem::create( $args['data_url'] );
+
+		$args['source-site-url'] = 'file:///';
+	} elseif ( $args['mode'] === 'git' ) {
+		if ( ! isset( $args['data_url'] ) ) {
+			show_error_message_and_die( 'The "repo" argument is required.' );
+		}
+
+		$args['repo'] = $args['data_url'];
+		if ( ! str_ends_with( $args['repo'], '.git' ) ) {
+			show_error_message_and_die( 'The "repo" argument must end with ".git" when mode is "git".' );
+		}
+
+		if ( ! isset( $args['branch'] ) ) {
+			show_error_message_and_die( 'The "branch" argument is required when mode is "git".' );
+		}
+
+		$console_writer->write( "Sparse checkout of the git repository\n" );
+		$temp_dir  = sys_get_temp_dir() . '/import-static-' . uniqid();
+		$cache_fs  = LocalFilesystem::create( $temp_dir );
+		$docs_repo = new GitRepository( $cache_fs );
+		$docs_repo->add_remote( 'origin', $args['repo'] );
+		$remote       = $docs_repo->get_remote_client( 'origin' );
+		$path_in_repo = $args['path-in-repo'] ?? '';
+		$branch       = $args['branch'] ?? 'trunk';
+		$remote->fetch(
+			$branch,
+			array(
+				'path' => $path_in_repo,
+				'shallow' => true,
+			)
+		);
+		$docs_repo->set_branch_tip( 'refs/heads/' . $branch, $docs_repo->get_branch_tip( 'refs/remotes/origin/' . $branch ) );
+		$docs_repo->checkout( 'refs/heads/' . $branch );
+		$git_fs      = GitFilesystem::create( $docs_repo );
+		$chrooted_fs = new ChrootLayer( $git_fs, $path_in_repo );
+	}
+	$entity_reader_factory = function () use ( $chrooted_fs, $source_site_url, $index_file_pattern ) {
+		return new FilesystemEntityReader(
+			$chrooted_fs,
+			array(
+				'index_file_pattern' => $index_file_pattern,
+				'filter_pattern' => '#\.(?:md|html|xhtml)$#',
+				/**
+				 * Use a number so large, there's no chance for wp_table INSERTs
+				 * to interfere with the post IDs generated by the FilesystemEntityReader.
+				 *
+				 * Some inserts are ran even by the importer, e.g. frontloading stubs.
+				 *
+				 * @TODO: Make sure this doesn't automatically bump the AUTOINCREMENT counter in MySQL.
+				 * @TODO: Bump the AUTOINCREMENT counter manually after a finished import.
+				 */
+				'first_post_id' => 10000000,
+				'base_url' => $source_site_url,
+			)
+		);
+	};
+
+	/**
+	 * Maps a filesystem path to a WordPress-friendly URL path we can assign
+	 * to the imported page.
+	 *
+	 * Example: "/docs/README.md" -> "/docs/readme"
+	 *
+	 * @param string $path The filesystem path to convert
+	 * @return string The WordPress-friendly URL path
+	 */
+	function map_file_path_to_wordpress_url( $path ) {
+		global $index_file_pattern, $import_path_prefix;
+
+		/**
+		 * Root every imported path under a single named top-level directory
+		 * so that one rule can produce the URLs for all of these files:
+		 *
+		 * - README.md
+		 * - chapter-5/README.md
+		 * - chapter-5/section-1.md
+		 * - chapter-5/section-3/readme.md
+		 *
+		 * Without that directory, the best URL for the top-level README.md
+		 * would be `/readme`, while `chapter-5/README.md` would get
+		 * `/chapter-5`, which is inconsistent. Prefixing every path removes
+		 * the special case:
+		 *
+		 * - /imported-content/README.md
+		 * - /imported-content/chapter-5/README.md
+		 * - /imported-content/chapter-5/section-1.md
+		 * - /imported-content/chapter-5/section-3/readme.md
+		 *
+		 * A single, consistent mapping keeps links working after the import.
+		 * The alternative would be to maintain a map of parents to paths and
+		 * consult it whenever pages are created or URLs are rewritten.
+		 *
+		 * A top-level path prefix is not perfect, but it is a sound compromise.
+		 */
+		$prefixed_path = wp_join_unix_paths( $import_path_prefix, $path );
+
+		// Index files (paths matching $index_file_pattern) are addressed by
+		// the URL of the directory that contains them.
+		$is_index_file = 1 === preg_match( $index_file_pattern, $prefixed_path );
+		$url_path      = $is_index_file ? dirname( $prefixed_path ) : $prefixed_path;
+
+		// Strip at most one known extension. Suffixes are tried in order and
+		// the first match ends the loop.
+		foreach ( array( '.md', '.html', '.xhtml' ) as $suffix ) {
+			if ( ! str_ends_with( $url_path, $suffix ) ) {
+				continue;
+			}
+			$url_path = substr( $url_path, 0, -strlen( $suffix ) );
+			break;
+		}
+
+		// The resulting URL path is always lowercase.
+		return strtolower( $url_path );
+	}
+
+	/**
+	 * Transforms links pointing to imported static files (e.g. ./getting-started.md)
+	 * to the format they will have after being imported into WordPress (e.g. /docs/getting-started).
+	 */
+	add_action(
+		'data_liberation.stream_importer.postprocess_url',
+		function (
+			$processor,
+			$context
+		) use (
+			$chrooted_fs,
+			/**
+			 * With &, $import_path_prefix reflects the latest value.
+			 * Without &, it's a local copy of the value from the outer scope.
+			 */
+			&$import_path_prefix
+		) {
+			/**
+			 * If we didn't rewrite the base URL, the URL points outside
+			 * of the imported root directory. Let's keep it as it is.
+			 */
+			if ( ! $context['applied_base_url_mapping'] ) {
+				return;
+			}
+
+			$path_original = $processor->get_parsed_url()->pathname;
+
+			/**
+			 * Remove the site path from the URL path and check:
+			 * Is this URL pointing to a file that exists in the imported
+			 * directory?
+			 */
+			$base_url_path_prefix  = $context['applied_base_url_mapping']['to']->pathname;
+			$path_relative_to_base = substr( $path_original, strlen( $base_url_path_prefix ) );
+			if ( $chrooted_fs->is_file( $path_relative_to_base ) ) {
+				/**
+				 * Yes! We are linking to an imported page. Let's transform the link
+				 * to a WordPress-friendly URL scheme.
+				 */
+				$path_rewritten = map_file_path_to_wordpress_url( $path_relative_to_base );
+				$path_rewritten = wp_join_unix_paths( $base_url_path_prefix, $path_rewritten );
+			} elseif ( $processor->is_url_absolute() ) {
+				/**
+				 * No. We are linking to a content page within our site but there is
+				 * no corresponding static file. This happens e.g. in the Gutenberg
+				 * handbook where the markdown files contain absolute URLs to the deployed
+				 * site, e.g.:
+				 *
+				 *     Start by ensuring you have Node.js and `npm` installed on your computer. Review
+				 *     the [Node.js development environment](https://developer.wordpress.org/block-editor/getting-started/devenv/nodejs-development-environment/) guide if not.
+				 *
+				 * Our best shot is to keep the URL as is, just with the imported
+				 * content root prepended to it.
+				 */
+				$path_rewritten = wp_join_unix_paths( $base_url_path_prefix, $import_path_prefix, $path_relative_to_base );
+			} else {
+				/**
+				 * It's a relative URL pointing somewhere within the URL space we're importing
+				 * to, but there is no corresponding static file. This is unexpected. There is
+				 * nothing we can do at this point – let's just keep the URL as it is.
+				 */
+				return;
+			}
+			$processor->set_url(
+				$path_rewritten,
+				WPURL::parse( $path_rewritten, $processor->get_parsed_url() )
+			);
+		},
+		10,
+		3
+	);
+
+	/**
+	 * Assigns post_name to every imported static page.
+	 */
+	add_filter(
+		'data_liberation.stream_importer.preprocess_entity',
+		function ( $entity ) use ( &$import_path_prefix, $index_file_pattern ) {
+			static $preprocessed_an_entity = false;
+			if ( $entity->get_type() !== 'post' ) {
+				return $entity;
+			}
+
+			$data = $entity->get_data();
+
+			if ( isset( $data['parsed_metadata']['slug'] ) ) {
+				$data['post_name'] = basename( $data['parsed_metadata']['slug'][0] );
+			} elseif ( isset( $data['local_file_path'] ) ) {
+				/**
+				 * The default import content path is "/imported-content". However,
+				 * maybe we can find a friendlier path prefix based on the post
+				 * title of the top-level index file.
+				 *
+				 * For example, a "Getting Started" guide found at "README.md"
+				 * could be imported to "/getting-started".
+				 */
+				if ( ! $preprocessed_an_entity ) {
+					$preprocessed_an_entity           = true;
+					$dirname                          = dirname( $data['local_file_path'] );
+					$dirname_makes_a_bad_slug         = $dirname !== '.' && $dirname === '/';
+					$is_index_file                    = 1 === preg_match( $index_file_pattern, $data['local_file_path'] );
+					$post_title_not_derived_from_path = $data['post_title'] !== ImportUtils::slug_to_title( basename( $data['local_file_path'] ) );
+
+					if (
+						$dirname_makes_a_bad_slug &&
+						$is_index_file &&
+						$post_title_not_derived_from_path &&
+						strlen( $data['post_title'] ) > 1
+					) {
+						$import_path_prefix = wp_import_slugify( $data['post_title'] );
+					}
+				}
+
+				$wordpress_url     = map_file_path_to_wordpress_url( $data['local_file_path'] );
+				$data['post_name'] = basename( $wordpress_url );
+			} else {
+				return $entity;
+			}
+
+			$entity->set_data( $data );
+			return $entity;
+		},
+		10,
+		2
+	);
+} elseif ( $args['mode'] === 'wxr' ) {
+	if ( ! isset( $args['data_url'] ) ) {
+		help_message_and_die( 'The "wxr file" argument is required.' );
+	}
+	$entity_reader_factory = function ( $cursor ) use ( $args ) {
+		return WXREntityReader::create(
+			uri_to_byte_stream( $args['data_url'] ),
+			$cursor
+		);
+	};
+} elseif ( $args['mode'] === 'epub' ) {
+	if ( ! isset( $args['data_url'] ) ) {
+		help_message_and_die( 'The "epub file" argument is required.' );
+	}
+	$zip_fs                = ZipFilesystem::create(
+		uri_to_byte_stream( $args['data_url'] )
+	);
+	$entity_reader_factory = function ( $cursor = null ) use ( $zip_fs ) {
+		return new EPubEntityReader(
+			$zip_fs,
+			1000000 // This is first post ID. We should really also accept a cursor
+		);
+	};
+	$reader                = $entity_reader_factory();
+	$source_site_url       = 'file://' . dirname( $reader->get_manifest_path() );
+
+	// To source the media files from the EPUB bundle:
+	$chrooted_fs = $zip_fs;
+
+	/**
+	 * Drop .xhtml extension from the links.
+	 */
+	add_action(
+		'data_liberation.stream_importer.postprocess_url',
+		function ( $processor ) {
+			$parsed_url = $processor->get_parsed_url();
+			if ( ! str_ends_with( $parsed_url->pathname, '.xhtml' ) ) {
+				return;
+			}
+			$parsed_url->pathname = substr( $parsed_url->pathname, 0, -6 );
+			$processor->set_url(
+				$parsed_url . '',
+				$parsed_url
+			);
+		}
+	);
+} else {
+	help_message_and_die( 'The "mode" argument is required and must be one of: "local-directory", "git", "wxr", or "epub".' );
+	exit( 1 );
+}
+
+/** Opens a local file, or an http(s) URL downloaded to a temporary file, as a FileReadStream. Throws \Exception when the download fails or the path does not exist. */
+function uri_to_byte_stream( $uri ) {
+	if ( str_starts_with( $uri, 'http://' ) || str_starts_with( $uri, 'https://' ) ) {
+		// @TODO: Use SeekableRequestReadStream here instead of pre-downloading the file to disk.
+		$contents = file_get_contents( $uri );
+		if ( false === $contents ) {
+			throw new \Exception( "Could not download $uri." );
+		}
+		$uri = tempnam( sys_get_temp_dir(), 'wp-remote-file-' );
+		file_put_contents( $uri, $contents );
+	}
+	if ( file_exists( $uri ) ) {
+		return FileReadStream::from_path( $uri );
+	}
+	throw new \Exception( "Unknown resource type: $uri. If that's a local file, \033[1mplease provide an absolute path to the file\033[0m." );
+}
+
+
+/**
+ * Naive slugifier: lowercases and trims $title, then turns every run of
+ * characters other than a-z and 0-9 into one "-". @TODO: Add utf-8 support etc.
+ */
+function wp_import_slugify( $title ) {
+	$lowercased = trim( strtolower( $title ) );
+	return preg_replace( '/[^a-z0-9]+/i', '-', $lowercased );
+}
+
+$data_url = $args['data_url'];
+$console_writer->write( "Importing static files from $data_url\n" );
+
+
+try {
+	// Parse URL mapping arguments
+	$additional_url_mappings = array();
+	foreach ( $args['additional-site-urls'] ?? [] as $url ) {
+		$additional_url_mappings[] = array(
+			'from' => $url,
+			'to' => NEW_SITE_CONTENT_ROOT,
+		);
+	}
+
+	$console_writer->write( "Starting the import\n" );
+	$importer = StreamImporter::create(
+		$entity_reader_factory,
+		array(
+			'source_site_url' => $source_site_url,
+			'new_site_content_root_url' => NEW_SITE_CONTENT_ROOT,
+			'source_media_root_urls' => $args['media-url'] ?? array( $source_site_url ),
+			'additional_url_mappings' => $additional_url_mappings,
+			'index_batch_size' => 1,
+			'attachment_downloader_options' => array(
+				'source_from_filesystem' => $chrooted_fs,
+			),
+		)
+	);
+
+	$import_session   = ImportSession::create(
+		array(
+			'data_source' => 'local_directory',
+			// @TODO: the phrase "file_name" doesn't make sense here. We're sourcing
+			// data from a directory, not a file. This string is used to tell
+			// the user in the UI what this they're importing in this import
+			// session. Let's rename it to something more descriptive.
+			'file_name' => $args['data_url'],
+		)
+	);
+	$retries_iterator = new RetryFrontloadingIterator( $import_session->get_id() );
+	$importer->set_frontloading_retries_iterator( $retries_iterator );
+
+	// @TODO: Prettier progress reporting
+	$ignored_message_printed = false;
+	do {
+		$result = data_liberation_import_step_customized( $import_session, $importer, $console_writer );
+		if ( $importer->get_stage() === StreamImporter::STAGE_FINISHED ) {
+			$console_writer->write( "\n" );
+			$console_writer->write( "\033[1;32mImport finished!\033[0m See your imported content at: \n" );
+
+			// Get the first page with non-empty content.
+			$posts = get_posts(
+				array(
+					'numberposts' => 10,
+					'orderby' => 'ID',
+					'order' => 'ASC',
+					'post_type' => 'page',
+					'post_status' => 'publish',
+				)
+			);
+
+			$url = NEW_SITE_CONTENT_ROOT;
+			foreach ( $posts as $post ) {
+				if ( ! empty( $post->post_content ) ) {
+					$url = get_permalink( $post );
+					break;
+				}
+			}
+			$console_writer->write( "\033[1;36m" . $url . "\033[0m\n" );
+			break;
+		} elseif ( false === $result ) {
+			if ( $importer->get_stage() === StreamImporter::STAGE_FRONTLOAD_ASSETS ) {
+				if ( ! $ignored_message_printed ) {
+					$console_writer->write( "\nSome assets could not be downloaded – they will be ignored so we can continue with the import.\n" );
+					$ignored_message_printed = true;
+				}
+				// $import_session->mark_frontloading_errors_as_ignored();
+			} else {
+				$console_writer->write( "Import failed, aborting\n" );
+				break;
+			}
+		} else {
+			// Twiddle our thumbs, importing in progress...
+		}
+	} while ( true );
+} finally {
+	if ( isset( $cache_fs ) ) {
+		$cache_fs->rmdir(
+			'/',
+			array(
+				'recursive' => true,
+			)
+		);
+	}
+}
+
+/**
+ * Runs importer steps until a time limit is reached (returns true) or advance_to_next_stage() fails (returns false).
+ * @TODO: Expose a primitive like this step function from the DataLiberation PHP component.
+ *        Support all sorts of pause conditions such as time limits, retry counts, memory limits, etc.
+ */
+function data_liberation_import_step_customized( ImportSession $session, StreamImporter $importer, ConsoleWriter $console_writer ) {
+	$soft_time_limit_seconds = 15;
+	$hard_time_limit_seconds = 25;
+	$start_time              = microtime( true );
+	$fetched_files           = 0;
+	$progress_bar            = null;
+	// NOTE(review): $fetched_files is never changed below, so during STAGE_FRONTLOAD_ASSETS only the hard limit pauses the run.
+	while ( true ) {
+		$time_taken = microtime( true ) - $start_time;
+		if ( $time_taken >= $soft_time_limit_seconds ) {
+			if ( $importer->get_stage() === StreamImporter::STAGE_FRONTLOAD_ASSETS ) {
+				if ( $fetched_files > 0 ) {
+					return true;
+				}
+			} else {
+				return true;
+			}
+		}
+		if ( $time_taken >= $hard_time_limit_seconds ) {
+			return true;
+		}
+		// Anything other than true from next_step() is handled as "the current stage has no more steps".
+		if ( true !== $importer->next_step() ) {
+			$session->set_reentrancy_cursor( $importer->get_reentrancy_cursor() );
+
+			$should_advance_to_next_stage = null !== $importer->get_next_stage();
+			if ( $should_advance_to_next_stage ) {
+				if ( StreamImporter::STAGE_FRONTLOAD_ASSETS === $importer->get_stage() ) {
+					$resolved_all_failures = $session->count_unfinished_frontloading_stubs() === 0;
+					if ( ! $resolved_all_failures ) {
+						// Uncomment once this script's intent becomes exiting on unresolved frontloading failures.
+						// if($progress_bar) {
+						// $progress_bar->finish();
+						// }
+						// return false;
+					}
+				}
+			}
+			if ( ! $importer->advance_to_next_stage() ) {
+				if ( $progress_bar ) {
+					$progress_bar->finish();
+				}
+				return false;
+			}
+			$session->set_stage( $importer->get_stage() );
+			$session->set_reentrancy_cursor( $importer->get_reentrancy_cursor() );
+			$console_writer->clearLine();
+			$progress_bar = null;
+
+			continue;
+		}
+		// The step ran: record stage-specific progress in the session and refresh the progress bar.
+		switch ( $importer->get_stage() ) {
+			case StreamImporter::STAGE_INDEX_ENTITIES:
+				$entities_counts = $importer->get_indexed_entities_counts();
+				$session->create_frontloading_stubs( $importer->get_indexed_assets_urls() );
+				$session->bump_total_number_of_entities( $entities_counts );
+				if ( ! $progress_bar ) {
+					$progress_bar = new ProgressBar( $console_writer, null );
+					$progress_bar->setMessage( 'Indexing entities' );
+					$progress_bar->start();
+				}
+				$progress_bar->setCurrent( array_sum( $session->get_total_number_of_entities() ) );
+				break;
+
+			case StreamImporter::STAGE_FRONTLOAD_ASSETS:
+				$progress = $importer->get_frontloading_progress();
+				$session->bump_frontloading_progress(
+					$progress,
+					$importer->get_frontloading_events()
+				);
+
+				if ( ! $progress_bar ) {
+					$progress_bar = new ProgressBar( $console_writer, null );
+					$progress_bar->setMessage( 'Fetching media files' );
+					$progress_bar->start();
+				}
+				$progress_bar->setCurrent( $session->count_unfinished_frontloading_stubs() );
+				break;
+
+			case StreamImporter::STAGE_IMPORT_ENTITIES:
+				$imported_counts = $importer->get_imported_entities_counts();
+
+				$session->bump_imported_entities_counts( $imported_counts );
+
+				if ( ! $progress_bar ) {
+					$progress_bar = new ProgressBar( $console_writer, $session->count_remaining_entities() );
+					$progress_bar->setMessage( 'Importing entities' );
+					$progress_bar->start();
+				}
+				$progress_bar->setCurrent( $session->count_all_imported_entities() );
+				break;
+		}
+		// Store the importer's reentrancy cursor in the session after each step.
+		$session->set_reentrancy_cursor( $importer->get_reentrancy_cursor() );
+	}
+	return false; // Not reached: the loop above only exits through its return statements.
+}
diff --git a/components/Blueprints/bin/blueprint.php b/components/Blueprints/bin/blueprint.php
index dadde86d..d96fe88d 100644
--- a/components/Blueprints/bin/blueprint.php
+++ b/components/Blueprints/bin/blueprint.php
@@ -33,6 +33,7 @@
 
 require __DIR__ . '/../../../vendor/autoload.php';
 
+use WordPress\CLI\CLI;
 use WordPress\Blueprints\DataReference\AbsoluteLocalPath;
 use WordPress\Blueprints\DataReference\DataReference;
 use WordPress\Blueprints\Exception\BlueprintExecutionException;
@@ -102,84 +103,6 @@
 	],
 ];
 
-// -----------------------------------------------------------------------------
-//   Custom command‑line parser (POSIX‑ish but without getopt dependency)
-// -----------------------------------------------------------------------------
-function parseCommandArgsAndOptions( array $argv, array $optionDefs ): array {
-	$positionals = [];
-	$options     = [];
-	$short2long  = [];
-
-	// Initialise defaults & maps
-	foreach ( $optionDefs as $long => $def ) {
-		[ $short, , $default ] = $def;
-		$options[ $long ] = $default;
-		if ( $short ) {
-			$short2long[ $short ] = $long;
-		}
-	}
-
-	$i = 0; // Start from the first command argument
-	while ( $i < count( $argv ) ) {
-		$token = $argv[ $i ];
-
-		// Long option --foo or --foo=bar
-		if ( preg_match( '/^--([^=]+)(=(.*))?$/', $token, $m ) ) {
-			$long = $m[1];
-			if ( ! isset( $optionDefs[ $long ] ) ) {
-				throw new InvalidArgumentException( "Unknown option --$long" );
-			}
-			[ $short, $hasVal ] = $optionDefs[ $long ];
-			if ( $hasVal ) {
-				$val = $m[3] ?? ( $argv[ ++ $i ] ?? null );
-				if ( $val === null ) {
-					throw new InvalidArgumentException( "Option --$long requires a value" );
-				}
-				$options[ $long ] = $val;
-			} else {
-				$options[ $long ] = true;
-			}
-			$i ++;
-			continue;
-		}
-
-		// Short option(s): -abc or -e mysql or -e=mysql
-		if ( preg_match( '/^-([A-Za-z]{1,})(=(.*))?$/', $token, $m ) ) {
-			$bundle    = str_split( $m[1] );
-			$inlineVal = $m[3] ?? null;
-			foreach ( $bundle as $idx => $short ) {
-				if ( ! isset( $short2long[ $short ] ) ) {
-					throw new InvalidArgumentException( "Unknown option -$short" );
-				}
-				$long   = $short2long[ $short ];
-				$hasVal = $optionDefs[ $long ][1];
-				if ( $hasVal ) {
-					if ( $inlineVal !== null && $idx === 0 ) {
-						$options[ $long ] = $inlineVal;
-					} else {
-						$val = ( $idx === count( $bundle ) - 1 ) ? ( $argv[ ++ $i ] ?? null ) : null;
-						if ( $val === null ) {
-							throw new InvalidArgumentException( "Option -$short requires a value" );
-						}
-						$options[ $long ] = $val;
-					}
-					break; // value‑bearing short stops bundle processing
-				} else {
-					$options[ $long ] = true;
-				}
-			}
-			$i ++;
-			continue;
-		}
-
-		// Positional argument
-		$positionals[] = $token;
-		$i ++;
-	}
-
-	return [ $positionals, $options ];
-}
-
 // Get the command name from arguments, accounting for aliases
 function resolveCommand( $commandArg, array $commandConfigurations ): ?string {
 	// Direct command match
@@ -509,7 +432,7 @@ function reportProgress( $progress, $caption ) {
 	
 	// Parse command arguments and options
 	$commandArgv = array_slice( $_SERVER['argv'], 2 ); // Skip "php script.php command"
-	[ $positionalArgs, $options ] = parseCommandArgsAndOptions( $commandArgv, $commandConfigurations[ $command ]['options'] );
+	[ $positionalArgs, $options ] = CLI::parseCommandArgsAndOptions( $commandArgv, $commandConfigurations[ $command ]['options'] );
 	
 	// Dispatch to appropriate command handler
 	switch ( $command ) {
diff --git a/components/CLI/CLI.php b/components/CLI/CLI.php
new file mode 100644
index 00000000..2e9bee7d
--- /dev/null
+++ b/components/CLI/CLI.php
@@ -0,0 +1,119 @@
+<?php
+
+namespace WordPress\CLI;
+
+use InvalidArgumentException;
+
+class CLI {
+	/**
+	 * Parses command-line arguments and options in a POSIX-like style.
+	 *
+	 * This method processes an array of CLI arguments and an option definition array,
+	 * returning a tuple of positional arguments and an associative array of options.
+	 * It supports long options (e.g., --foo or --foo=bar), short options (e.g., -f or -f=bar),
+	 * and bundled short options (e.g., -abc, or -ab=bar when the last option takes a value).
+	 * Options that are not passed keep the default value from their definition.
+	 * Option definitions should be in the form:
+	 *   [
+	 *     'longname' => [ 'short', hasValue, defaultValue, description ],
+	 *     // ...
+	 *   ]
+	 *
+	 * Example:
+	 *   $optionDefs = [
+	 *     'site-url'  => [ 'u', true, null, 'Public site URL' ],
+	 *     'site-path' => [ null, true, null, 'Target directory' ],
+	 *     'help'      => [ 'h', false, false, 'Show help' ],
+	 *   ];
+	 *   $argv = ['--site-url=https://mysite.test', '--site-path', '/var/www', '-h', 'blueprint.json'];
+	 *   [$positionals, $options] = CLI::parseCommandArgsAndOptions($argv, $optionDefs);
+	 *   // $positionals = ['blueprint.json']
+	 *   // $options = [
+	 *   //   'site-url'  => 'https://mysite.test',
+	 *   //   'site-path' => '/var/www',
+	 *   //   'help'      => true,
+	 *   // ]
+	 *
+	 * This is used in the Blueprint Runner CLI to parse command-line input, e.g.:
+	 *   php blueprint.php exec my-blueprint.json --site-url https://mysite.test --site-path ./mysite --help
+	 *
+	 * @param array $argv       The CLI arguments (excluding the script name and command).
+	 * @param array $optionDefs Option definitions as described above.
+	 * @return array            [ $positionals, $options ]
+	 * @throws InvalidArgumentException for unknown options or missing required values.
+	 */
+	public static function parseCommandArgsAndOptions( array $argv, array $optionDefs ): array {
+		$positionals = [];
+		$options     = [];
+		$short2long  = [];
+
+		// Initialise defaults & build the short => long lookup map
+		foreach ( $optionDefs as $long => $def ) {
+			[ $short, , $default ] = $def;
+			$options[ $long ] = $default;
+			if ( $short ) {
+				$short2long[ $short ] = $long;
+			}
+		}
+
+		$i = 0; // Start from the first command argument
+		while ( $i < count( $argv ) ) {
+			$token = $argv[ $i ];
+
+			// Long option --foo or --foo=bar
+			if ( preg_match( '/^--([^=]+)(=(.*))?$/', $token, $m ) ) {
+				$long = $m[1];
+				if ( ! isset( $optionDefs[ $long ] ) ) {
+					throw new InvalidArgumentException( "Unknown option --$long" );
+				}
+				[ , $hasVal ] = $optionDefs[ $long ];
+				if ( $hasVal ) {
+					$val = $m[3] ?? ( $argv[ ++ $i ] ?? null );
+					if ( $val === null ) {
+						throw new InvalidArgumentException( "Option --$long requires a value" );
+					}
+					$options[ $long ] = $val;
+				} else {
+					$options[ $long ] = true;
+				}
+				$i ++;
+				continue;
+			}
+
+			// Short option(s): -abc, -e mysql, -e=mysql, or -ve=mysql (flags followed by a value option)
+			if ( preg_match( '/^-([A-Za-z]{1,})(=(.*))?$/', $token, $m ) ) {
+				$bundle    = str_split( $m[1] );
+				$inlineVal = $m[3] ?? null;
+				foreach ( $bundle as $idx => $short ) {
+					if ( ! isset( $short2long[ $short ] ) ) {
+						throw new InvalidArgumentException( "Unknown option -$short" );
+					}
+					$long   = $short2long[ $short ];
+					$hasVal = $optionDefs[ $long ][1];
+					if ( $hasVal ) {
+						if ( $inlineVal !== null && ( $idx === 0 || $idx === count( $bundle ) - 1 ) ) {
+							$options[ $long ] = $inlineVal;
+						} else {
+							$val = ( $idx === count( $bundle ) - 1 ) ? ( $argv[ ++ $i ] ?? null ) : null;
+							if ( $val === null ) {
+								throw new InvalidArgumentException( "Option -$short requires a value" );
+							}
+							$options[ $long ] = $val;
+						}
+						break; // a value-taking short option ends the bundle
+					} else {
+						$options[ $long ] = true;
+					}
+				}
+				$i ++;
+				continue;
+			}
+
+			// Positional argument
+			$positionals[] = $token;
+			$i ++;
+		}
+
+		return [ $positionals, $options ];
+	}
+}
\ No newline at end of file
diff --git a/components/CLI/composer.json b/components/CLI/composer.json
new file mode 100644
index 00000000..73dfda50
--- /dev/null
+++ b/components/CLI/composer.json
@@ -0,0 +1,26 @@
+{
+	"name": "wordpress/cli",
+	"description": "CLI component for WordPress.",
+	"type": "library",
+	"authors": [
+		{
+			"name": "Adam Zielinski",
+			"email": "adam@adamziel.com"
+		},
+		{
+			"name": "WordPress Team",
+			"email": "wordpress@wordpress.org"
+		}
+	],
+	"require": {
+		"php": ">=7.2"
+	},
+	"autoload": {
+		"psr-4": {
+			"WordPress\\CLI\\": ""
+		},
+		"exclude-from-classmap": [
+			"/Tests/"
+		]
+	}
+}
diff --git a/components/DataLiberation/URL/functions.php b/components/DataLiberation/URL/functions.php
index 4180ab97..99cbf8cb 100644
--- a/components/DataLiberation/URL/functions.php
+++ b/components/DataLiberation/URL/functions.php
@@ -6,143 +6,152 @@
 use WordPress\DataLiberation\BlockMarkup\BlockMarkupUrlProcessor;
 
 /**
- * Migrate URLs in post content. See WPRewriteUrlsTests for
- * specific examples. TODO: A better description.
+ * We have a weird composer autoloading issue. Sometimes it requires
+ * this file twice. And only this file! The function_exists check is
+ * a quick workaround until we figure out what's going on.
  *
- * Example:
- *
- * ```php
- * php > wp_rewrite_urls([
- *   'block_markup' => '<!-- wp:image {"src": "http://legacy-blog.com/image.jpg"} -->',
- *   'url-mapping' => [
- *     'http://legacy-blog.com' => 'https://modern-webstore.org'
- *   ]
- * ])
- * <!-- wp:image {"src":"https:\/\/modern-webstore.org\/image.jpg"} -->
- * ```
- *
- * @TODO Use a proper JSON parser and encoder to:
- * * Support UTF-16 characters
- * * Gracefully handle recoverable encoding issues
- * * Avoid changing the whitespace in the same manner as
- *   we do in WP_HTML_Tag_Processor
+ * @TODO: Fix this.
  */
-function wp_rewrite_urls( $options ) {
-	if ( empty( $options['base_url'] ) ) {
-		// Use first from-url as base_url if not specified
-		$from_urls           = array_keys( $options['url-mapping'] );
-		$options['base_url'] = $from_urls[0];
-	}
+if ( ! function_exists('\WordPress\DataLiberation\URL\wp_rewrite_urls') ) {
+	/**
+	 * Migrate URLs in post content. See WPRewriteUrlsTests for
+	 * specific examples. TODO: A better description.
+	 *
+	 * Example:
+	 *
+	 * ```php
+	 * php > wp_rewrite_urls([
+	 *   'block_markup' => '<!-- wp:image {"src": "http://legacy-blog.com/image.jpg"} -->',
+	 *   'url-mapping' => [
+	 *     'http://legacy-blog.com' => 'https://modern-webstore.org'
+	 *   ]
+	 * ])
+	 * <!-- wp:image {"src":"https:\/\/modern-webstore.org\/image.jpg"} -->
+	 * ```
+	 *
+	 * @TODO Use a proper JSON parser and encoder to:
+	 * * Support UTF-16 characters
+	 * * Gracefully handle recoverable encoding issues
+	 * * Avoid changing the whitespace in the same manner as
+	 *   we do in WP_HTML_Tag_Processor
+	 */
+	function wp_rewrite_urls( $options ) {
+		if ( empty( $options['base_url'] ) ) {
+			// Use first from-url as base_url if not specified
+			$from_urls           = array_keys( $options['url-mapping'] );
+			$options['base_url'] = $from_urls[0];
+		}
 
-	$url_mapping = array();
-	foreach ( $options['url-mapping'] as $from_url_string => $to_url_string ) {
-		$url_mapping[] = array(
-			'from_url' => WPURL::parse( $from_url_string ),
-			'to_url'   => WPURL::parse( $to_url_string ),
-		);
-	}
+		$url_mapping = array();
+		foreach ( $options['url-mapping'] as $from_url_string => $to_url_string ) {
+			$url_mapping[] = array(
+				'from_url' => WPURL::parse( $from_url_string ),
+				'to_url'   => WPURL::parse( $to_url_string ),
+			);
+		}
 
-	$p = new BlockMarkupUrlProcessor( $options['block_markup'], $options['base_url'] );
-	while ( $p->next_url() ) {
-		$parsed_url = $p->get_parsed_url();
-		foreach ( $url_mapping as $mapping ) {
-			if ( is_child_url_of( $parsed_url, $mapping['from_url'] ) ) {
-				$p->replace_base_url( $mapping['to_url'] );
-				break;
+		$p = new BlockMarkupUrlProcessor( $options['block_markup'], $options['base_url'] );
+		while ( $p->next_url() ) {
+			$parsed_url = $p->get_parsed_url();
+			foreach ( $url_mapping as $mapping ) {
+				if ( is_child_url_of( $parsed_url, $mapping['from_url'] ) ) {
+					$p->replace_base_url( $mapping['to_url'] );
+					break;
+				}
 			}
 		}
+
+		return $p->get_updated_html();
 	}
 
-	return $p->get_updated_html();
-}
+	/**
+	 * Check if a given URL matches the current site URL.
+	 *
+	 * @param  URL  $parent  The URL to check.
+	 * @param  string  $child  The current site URL to compare against.
+	 *
+	 * @return bool Whether the URL matches the current site URL.
+	 */
+	function is_child_url_of( $child, $parent_url ) {
+		$parent_url                       = is_string( $parent_url ) ? WPURL::parse( $parent_url ) : $parent_url;
+		$child                            = is_string( $child ) ? WPURL::parse( $child ) : $child;
+		$child_pathname_no_trailing_slash = rtrim( urldecode( $child->pathname ), '/' );
+
+		if ( false === $child || false === $parent_url ) {
+			return false;
+		}
 
-/**
- * Check if a given URL matches the current site URL.
- *
- * @param  URL  $parent  The URL to check.
- * @param  string  $child  The current site URL to compare against.
- *
- * @return bool Whether the URL matches the current site URL.
- */
-function is_child_url_of( $child, $parent_url ) {
-	$parent_url                       = is_string( $parent_url ) ? WPURL::parse( $parent_url ) : $parent_url;
-	$child                            = is_string( $child ) ? WPURL::parse( $child ) : $child;
-	$child_pathname_no_trailing_slash = rtrim( urldecode( $child->pathname ), '/' );
+		if ( $parent_url->hostname !== $child->hostname ) {
+			return false;
+		}
 
-	if ( false === $child || false === $parent_url ) {
-		return false;
-	}
+		if ( $parent_url->protocol !== $child->protocol ) {
+			return false;
+		}
 
-	if ( $parent_url->hostname !== $child->hostname ) {
-		return false;
-	}
+		$parent_pathname = urldecode( $parent_url->pathname );
 
-	if ( $parent_url->protocol !== $child->protocol ) {
-		return false;
+		return (
+			// Direct match
+			$parent_pathname === $child_pathname_no_trailing_slash ||
+			$parent_pathname === $child_pathname_no_trailing_slash . '/' ||
+			// Path prefix
+			strncmp( $child_pathname_no_trailing_slash . '/', $parent_pathname, strlen( $parent_pathname ) ) === 0
+		);
 	}
 
-	$parent_pathname = urldecode( $parent_url->pathname );
-
-	return (
-		// Direct match
-		$parent_pathname === $child_pathname_no_trailing_slash ||
-		$parent_pathname === $child_pathname_no_trailing_slash . '/' ||
-		// Path prefix
-		strncmp( $child_pathname_no_trailing_slash . '/', $parent_pathname, strlen( $parent_pathname ) ) === 0
-	);
-}
+	/**
+	 * Decodes the first n **encoded bytes** a URL-encoded string.
+	 *
+	 * For example, `urldecode_n( '%22is 6 %3C 6?%22 – asked Achilles', 1 )` returns
+	 * '"is 6 %3C 6?%22 – asked Achilles' because only the first encoded byte is decoded.
+	 *
+	 * @param  string  $string  The string to decode.
+	 * @param  int  $decode_n  The number of bytes to decode in $input
+	 *
+	 * @return string The decoded string.
+	 */
+	function urldecode_n( $input, $decode_n ) {
+		$result = '';
+		$at     = 0;
+		while ( true ) {
+			if ( $at + 3 > strlen( $input ) ) {
+				break;
+			}
 
-/**
- * Decodes the first n **encoded bytes** a URL-encoded string.
- *
- * For example, `urldecode_n( '%22is 6 %3C 6?%22 – asked Achilles', 1 )` returns
- * '"is 6 %3C 6?%22 – asked Achilles' because only the first encoded byte is decoded.
- *
- * @param  string  $string  The string to decode.
- * @param  int  $decode_n  The number of bytes to decode in $input
- *
- * @return string The decoded string.
- */
-function urldecode_n( $input, $decode_n ) {
-	$result = '';
-	$at     = 0;
-	while ( true ) {
-		if ( $at + 3 > strlen( $input ) ) {
-			break;
-		}
+			$last_at = $at;
+			$at      += strcspn( $input, '%', $at );
+			// Consume bytes except for the percent sign.
+			$result .= substr( $input, $last_at, $at - $last_at );
 
-		$last_at = $at;
-		$at      += strcspn( $input, '%', $at );
-		// Consume bytes except for the percent sign.
-		$result .= substr( $input, $last_at, $at - $last_at );
+			// If we've already decoded the requested number of bytes, stop.
+			if ( strlen( $result ) >= $decode_n ) {
+				break;
+			}
 
-		// If we've already decoded the requested number of bytes, stop.
-		if ( strlen( $result ) >= $decode_n ) {
-			break;
-		}
+			++ $at;
+			if ( $at > strlen( $input ) ) {
+				break;
+			}
 
-		++ $at;
-		if ( $at > strlen( $input ) ) {
-			break;
+			$decodable_length = strspn(
+				$input,
+				'0123456789ABCDEFabcdef',
+				$at,
+				2
+			);
+
+			if ( $decodable_length === 2 ) {
+				// Decode the hex sequence.
+				$result .= chr( hexdec( $input[ $at ] . $input[ $at + 1 ] ) );
+				$at     += 2;
+			} else {
+				// Consume the next byte and move on.
+				$result .= '%';
+			}
 		}
+		$result .= substr( $input, $at );
 
-		$decodable_length = strspn(
-			$input,
-			'0123456789ABCDEFabcdef',
-			$at,
-			2
-		);
-
-		if ( $decodable_length === 2 ) {
-			// Decode the hex sequence.
-			$result .= chr( hexdec( $input[ $at ] . $input[ $at + 1 ] ) );
-			$at     += 2;
-		} else {
-			// Consume the next byte and move on.
-			$result .= '%';
-		}
+		return $result;
 	}
-	$result .= substr( $input, $at );
-
-	return $result;
 }
diff --git a/composer.json b/composer.json
index 4ef1653f..d6a37423 100644
--- a/composer.json
+++ b/composer.json
@@ -56,6 +56,7 @@
         ],
         "psr-4": {
             "WordPress\\Blueprints\\": "components/Blueprints/",
+            "WordPress\\CLI\\": "components/CLI/",
             "WordPress\\DataLiberation\\": "components/DataLiberation/",
             "Rowbot\\": "components/DataLiberation/vendor-patched/",
             "Brick\\": "components/DataLiberation/vendor-patched/",

From c134840ae84081dcca1a44e1584661b14ef71e3d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= <adam@adamziel.com>
Date: Thu, 29 May 2025 14:50:52 +0200
Subject: [PATCH 5/6] Accept URLs found outside of img src as assets

---
 .../Blueprints/Steps/ImportContentStep.php    |   4 +-
 .../Importer/StreamImporter.php               |  32 ++-
 components/DataLiberation/URL/functions.php   | 246 +++++++++---------
 3 files changed, 148 insertions(+), 134 deletions(-)

diff --git a/components/Blueprints/Steps/ImportContentStep.php b/components/Blueprints/Steps/ImportContentStep.php
index 0378d6bf..9b92fcf1 100644
--- a/components/Blueprints/Steps/ImportContentStep.php
+++ b/components/Blueprints/Steps/ImportContentStep.php
@@ -85,7 +85,9 @@ private function importWxr( Runtime $runtime, array $content_definition ): void
 $_SERVER['argv'] = [
 	'import-wxr.php',
 	'wxr',
-	getenv('WXR_PATH')
+	getenv('WXR_PATH'),
+	'--media-url',
+	'https://pd.w.org/'
 ];
 ?>
 PHP
diff --git a/components/DataLiberation/Importer/StreamImporter.php b/components/DataLiberation/Importer/StreamImporter.php
index a96be2e8..cbf86b62 100644
--- a/components/DataLiberation/Importer/StreamImporter.php
+++ b/components/DataLiberation/Importer/StreamImporter.php
@@ -452,7 +452,6 @@ protected function index_next_entities() {
 			$entity = $this->get_current_entity();
 
 			$type = $entity->get_type();
-			var_dump( $type );
 
 			// Count entities by type.
 			if ( ! isset( $this->indexed_entities_counts[ $type ] ) ) {
@@ -1001,12 +1000,33 @@ protected function rewrite_attachment_url( string $raw_url, $base_url = null ) {
 	 * @TODO: What other asset types are there?
 	 */
 	protected function url_processor_matched_asset_url( BlockMarkupUrlProcessor $p ) {
-		if ( $p->get_tag() !== 'IMG' ) {
-			return false;
-		}
-		if ( $p->get_inspected_attribute_name() !== 'src' ) {
-			return false;
+		/**
+		 * Decide whether the URL is an asset URL worth downloading.
+		 * 
+		 * All URLs with an image-like extension are treated as images,
+		 * regardless of the tag and attribute they were found in.
+		 * For example, the background image in the following block would be accepted:
+		 *
+		 *     <div style="background-image: url(https://example.com/image.jpg)">
+		 */
+		$path = $p->get_parsed_url()->pathname;
+		$extension = pathinfo( $path, PATHINFO_EXTENSION );
+		if ( ! in_array($extension, array('jpg', 'jpeg', 'png', 'gif', 'webp', 'svg') ) ) {
+			/**
+			 * Absent an extension, try to guess whether it's a static asset based
+			 * on its location in the document. For now, we only accept images.
+			 */
+			if ( $p->get_tag() !== 'IMG' ) {
+				return false;
+			}
+			if ( $p->get_inspected_attribute_name() !== 'src' ) {
+				return false;
+			}
 		}
+
+		/**
+		 * Finally, confirm it comes from one of the allowed media root URLs.
+		 */
 		foreach ( $this->source_media_root_urls as $source_media_root_url ) {
 			if ( is_child_url_of( $p->get_parsed_url(), $source_media_root_url ) ) {
 				return true;
diff --git a/components/DataLiberation/URL/functions.php b/components/DataLiberation/URL/functions.php
index 99cbf8cb..4cb1c166 100644
--- a/components/DataLiberation/URL/functions.php
+++ b/components/DataLiberation/URL/functions.php
@@ -5,153 +5,145 @@
 use Rowbot\URL\URL;
 use WordPress\DataLiberation\BlockMarkup\BlockMarkupUrlProcessor;
 
+
 /**
- * We have a weird composer autoloading issue. Sometimes it requires
- * this file twice. And only this file! The function_exists check is
- * a quick workaround until we figure out what's going on.
+ * Migrate URLs in post content. See WPRewriteUrlsTests for
+ * specific examples. TODO: A better description.
+ *
+ * Example:
+ *
+ * ```php
+ * php > wp_rewrite_urls([
+ *   'block_markup' => '<!-- wp:image {"src": "http://legacy-blog.com/image.jpg"} -->',
+ *   'url-mapping' => [
+ *     'http://legacy-blog.com' => 'https://modern-webstore.org'
+ *   ]
+ * ])
+ * <!-- wp:image {"src":"https:\/\/modern-webstore.org\/image.jpg"} -->
+ * ```
  *
- * @TODO: Fix this.
+ * @TODO Use a proper JSON parser and encoder to:
+ * * Support UTF-16 characters
+ * * Gracefully handle recoverable encoding issues
+ * * Avoid changing the whitespace in the same manner as
+ *   we do in WP_HTML_Tag_Processor
  */
-if ( ! function_exists('\WordPress\DataLiberation\URL\wp_rewrite_urls') ) {
-	/**
-	 * Migrate URLs in post content. See WPRewriteUrlsTests for
-	 * specific examples. TODO: A better description.
-	 *
-	 * Example:
-	 *
-	 * ```php
-	 * php > wp_rewrite_urls([
-	 *   'block_markup' => '<!-- wp:image {"src": "http://legacy-blog.com/image.jpg"} -->',
-	 *   'url-mapping' => [
-	 *     'http://legacy-blog.com' => 'https://modern-webstore.org'
-	 *   ]
-	 * ])
-	 * <!-- wp:image {"src":"https:\/\/modern-webstore.org\/image.jpg"} -->
-	 * ```
-	 *
-	 * @TODO Use a proper JSON parser and encoder to:
-	 * * Support UTF-16 characters
-	 * * Gracefully handle recoverable encoding issues
-	 * * Avoid changing the whitespace in the same manner as
-	 *   we do in WP_HTML_Tag_Processor
-	 */
-	function wp_rewrite_urls( $options ) {
-		if ( empty( $options['base_url'] ) ) {
-			// Use first from-url as base_url if not specified
-			$from_urls           = array_keys( $options['url-mapping'] );
-			$options['base_url'] = $from_urls[0];
-		}
+function wp_rewrite_urls( $options ) {
+	if ( empty( $options['base_url'] ) ) {
+		// Use first from-url as base_url if not specified
+		$from_urls           = array_keys( $options['url-mapping'] );
+		$options['base_url'] = $from_urls[0];
+	}
 
-		$url_mapping = array();
-		foreach ( $options['url-mapping'] as $from_url_string => $to_url_string ) {
-			$url_mapping[] = array(
-				'from_url' => WPURL::parse( $from_url_string ),
-				'to_url'   => WPURL::parse( $to_url_string ),
-			);
-		}
+	$url_mapping = array();
+	foreach ( $options['url-mapping'] as $from_url_string => $to_url_string ) {
+		$url_mapping[] = array(
+			'from_url' => WPURL::parse( $from_url_string ),
+			'to_url'   => WPURL::parse( $to_url_string ),
+		);
+	}
 
-		$p = new BlockMarkupUrlProcessor( $options['block_markup'], $options['base_url'] );
-		while ( $p->next_url() ) {
-			$parsed_url = $p->get_parsed_url();
-			foreach ( $url_mapping as $mapping ) {
-				if ( is_child_url_of( $parsed_url, $mapping['from_url'] ) ) {
-					$p->replace_base_url( $mapping['to_url'] );
-					break;
-				}
+	$p = new BlockMarkupUrlProcessor( $options['block_markup'], $options['base_url'] );
+	while ( $p->next_url() ) {
+		$parsed_url = $p->get_parsed_url();
+		foreach ( $url_mapping as $mapping ) {
+			if ( is_child_url_of( $parsed_url, $mapping['from_url'] ) ) {
+				$p->replace_base_url( $mapping['to_url'] );
+				break;
 			}
 		}
-
-		return $p->get_updated_html();
 	}
 
-	/**
-	 * Check if a given URL matches the current site URL.
-	 *
-	 * @param  URL  $parent  The URL to check.
-	 * @param  string  $child  The current site URL to compare against.
-	 *
-	 * @return bool Whether the URL matches the current site URL.
-	 */
-	function is_child_url_of( $child, $parent_url ) {
-		$parent_url                       = is_string( $parent_url ) ? WPURL::parse( $parent_url ) : $parent_url;
-		$child                            = is_string( $child ) ? WPURL::parse( $child ) : $child;
-		$child_pathname_no_trailing_slash = rtrim( urldecode( $child->pathname ), '/' );
-
-		if ( false === $child || false === $parent_url ) {
-			return false;
-		}
+	return $p->get_updated_html();
+}
 
-		if ( $parent_url->hostname !== $child->hostname ) {
-			return false;
-		}
+/**
+ * Check whether $child is the same URL as $parent_url or is nested under its path.
+ *
+ * @param  URL|string  $child       The URL to check. Strings are parsed with WPURL::parse().
+ * @param  URL|string  $parent_url  The URL to compare against. Strings are parsed with WPURL::parse().
+ * @return bool True when protocol and hostname are identical and the URL-decoded child path equals or starts with the parent path.
+ */
+function is_child_url_of( $child, $parent_url ) {
+	$parent_url = is_string( $parent_url ) ? WPURL::parse( $parent_url ) : $parent_url;
+	$child      = is_string( $child ) ? WPURL::parse( $child ) : $child;
 
-		if ( $parent_url->protocol !== $child->protocol ) {
-			return false;
-		}
+	// Bail out before touching any property: either value may be false at this point.
+	if ( false === $child || false === $parent_url ) {
+		return false;
+	}
 
-		$parent_pathname = urldecode( $parent_url->pathname );
+	if ( $parent_url->hostname !== $child->hostname ) {
+		return false;
+	}
 
-		return (
-			// Direct match
-			$parent_pathname === $child_pathname_no_trailing_slash ||
-			$parent_pathname === $child_pathname_no_trailing_slash . '/' ||
-			// Path prefix
-			strncmp( $child_pathname_no_trailing_slash . '/', $parent_pathname, strlen( $parent_pathname ) ) === 0
-		);
+	if ( $parent_url->protocol !== $child->protocol ) {
+		return false;
 	}
 
-	/**
-	 * Decodes the first n **encoded bytes** a URL-encoded string.
-	 *
-	 * For example, `urldecode_n( '%22is 6 %3C 6?%22 – asked Achilles', 1 )` returns
-	 * '"is 6 %3C 6?%22 – asked Achilles' because only the first encoded byte is decoded.
-	 *
-	 * @param  string  $string  The string to decode.
-	 * @param  int  $decode_n  The number of bytes to decode in $input
-	 *
-	 * @return string The decoded string.
-	 */
-	function urldecode_n( $input, $decode_n ) {
-		$result = '';
-		$at     = 0;
-		while ( true ) {
-			if ( $at + 3 > strlen( $input ) ) {
-				break;
-			}
+	$parent_pathname                  = urldecode( $parent_url->pathname );
+	$child_pathname_no_trailing_slash = rtrim( urldecode( $child->pathname ), '/' );
 
-			$last_at = $at;
-			$at      += strcspn( $input, '%', $at );
-			// Consume bytes except for the percent sign.
-			$result .= substr( $input, $last_at, $at - $last_at );
+	return (
+		// Direct match
+		$parent_pathname === $child_pathname_no_trailing_slash ||
+		$parent_pathname === $child_pathname_no_trailing_slash . '/' ||
+		// Path prefix
+		strncmp( $child_pathname_no_trailing_slash . '/', $parent_pathname, strlen( $parent_pathname ) ) === 0
+	);
+}
 
-			// If we've already decoded the requested number of bytes, stop.
-			if ( strlen( $result ) >= $decode_n ) {
-				break;
-			}
+/**
+ * Decodes the first n **encoded bytes** of a URL-encoded string.
+ *
+ * For example, `urldecode_n( '%22is 6 %3C 6?%22 – asked Achilles', 1 )` returns
+ * '"is 6 %3C 6?%22 – asked Achilles' because only the first encoded byte is decoded.
+ *
+ * @param  string  $input  The string to decode.
+ * @param  int  $decode_n  The number of bytes to decode in $input.
+ *
+ * @return string The decoded string.
+ */
+function urldecode_n( $input, $decode_n ) {
+	$result = '';
+	$at     = 0;
+	while ( true ) {
+		if ( $at + 3 > strlen( $input ) ) {
+			break;
+		}
 
-			++ $at;
-			if ( $at > strlen( $input ) ) {
-				break;
-			}
+		$last_at = $at;
+		$at      += strcspn( $input, '%', $at );
+		// Consume bytes except for the percent sign.
+		$result .= substr( $input, $last_at, $at - $last_at );
 
-			$decodable_length = strspn(
-				$input,
-				'0123456789ABCDEFabcdef',
-				$at,
-				2
-			);
-
-			if ( $decodable_length === 2 ) {
-				// Decode the hex sequence.
-				$result .= chr( hexdec( $input[ $at ] . $input[ $at + 1 ] ) );
-				$at     += 2;
-			} else {
-				// Consume the next byte and move on.
-				$result .= '%';
-			}
+		// If we've already decoded the requested number of bytes, stop.
+		if ( strlen( $result ) >= $decode_n ) {
+			break;
+		}
+
+		++ $at;
+		if ( $at > strlen( $input ) ) {
+			break;
 		}
-		$result .= substr( $input, $at );
 
-		return $result;
+		$decodable_length = strspn(
+			$input,
+			'0123456789ABCDEFabcdef',
+			$at,
+			2
+		);
+
+		if ( $decodable_length === 2 ) {
+			// Decode the hex sequence.
+			$result .= chr( hexdec( $input[ $at ] . $input[ $at + 1 ] ) );
+			$at     += 2;
+		} else {
+			// Consume the next byte and move on.
+			$result .= '%';
+		}
 	}
+	$result .= substr( $input, $at );
+
+	return $result;
 }

From 1371578fa2d62bf0e3f7c1400a26f4340dbc4a1f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= <adam@adamziel.com>
Date: Thu, 29 May 2025 15:08:39 +0200
Subject: [PATCH 6/6] Start migrating posts importing step to data liberation
 importer

---
 .../Blueprints/Steps/ImportContentStep.php    | 35 ++++++++++++++-----
 1 file changed, 26 insertions(+), 9 deletions(-)

diff --git a/components/Blueprints/Steps/ImportContentStep.php b/components/Blueprints/Steps/ImportContentStep.php
index 9b92fcf1..e724c629 100644
--- a/components/Blueprints/Steps/ImportContentStep.php
+++ b/components/Blueprints/Steps/ImportContentStep.php
@@ -44,7 +44,7 @@ public function run( Runtime $runtime, Tracker $progress ) {
 				$this->importWxr( $runtime, $content_definition );
 			} elseif ( $content_definition['type'] === 'posts' ) {
 				$progress[ $i ]->setCaption( 'Importing a post ' );
-				$this->importPosts( $runtime, $content_definition );
+				$this->importPosts( $runtime, $content_definition['source'] );
 			} else {
 				throw new RuntimeException( 'Unsupported content type: ' . $content_definition['type'] );
 			}
@@ -64,7 +64,9 @@ private function importWxr( Runtime $runtime, array $content_definition ): void
 			) );
 		}
 
+		// @TODO: Pass the data reference to the import script to enable streaming.
 		$wxrPath = $runtime->saveToTemporaryFile( $resolved );
+
 		// @TODO: Make it work when Blueprints are running as phar archive
 		$import_script_path = __DIR__ . '/scripts/import-content.php';
 		if ( ! file_exists( $import_script_path ) ) {
@@ -86,8 +88,9 @@ private function importWxr( Runtime $runtime, array $content_definition ): void
 	'import-wxr.php',
 	'wxr',
 	getenv('WXR_PATH'),
-	'--media-url',
-	'https://pd.w.org/'
+	// @TODO: Support arbitrary media URLs to enable fetching assets during import.
+	// '--media-url',
+	// 'https://pd.w.org/'
 ];
 ?>
 PHP
@@ -99,10 +102,14 @@ private function importWxr( Runtime $runtime, array $content_definition ): void
 		);
 	}
 
-	private function importPosts( Runtime $runtime, array $content_definition ): void {
-		$posts = $content_definition['source'];
-		if ( ! is_array( $posts ) ) {
-			throw new RuntimeException( 'Invalid posts data.' );
+	private function importPosts( Runtime $runtime, $post ): void {
+		// @TODO: Use the Data Liberation importer here.
+		$resolved = $runtime->resolve( $post );
+		if ( ! $resolved instanceof File ) {
+			throw new BlueprintExecutionException( sprintf(
+				'Imported content reference must be a file, but %s was a Directory.',
+				$post->get_human_readable_name()
+			) );
 		}
 
 		$runtime->evalPhpCodeInSubProcess(
@@ -110,12 +117,22 @@ private function importPosts( Runtime $runtime, array $content_definition ): voi
 <?php
 require_once getenv('DOCROOT') . '/wp-load.php';
 foreach (json_decode(getenv('POSTS'), true) as $post) {
-wp_insert_post(wp_slash($post));
+	$result = wp_insert_post(wp_slash($post));
+	if (is_wp_error($result)) {
+		throw new Exception( $result->get_error_message() );
+	}
 }
 PHP
 			,
 			[
-				'POSTS' => json_encode( $posts ),
+				'POSTS' => json_encode( [
+					[
+						'post_title'   => 'Test Post',
+						'post_content' => $resolved->getStream()->consume_all(),
+						'post_status'  => 'publish',
+						'post_type'    => 'post',
+					],
+				] ),
 			]
 		);
 	}