From 704ca55010148e3cd64b05e92c36135be59e44cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Thu, 29 May 2025 10:43:32 +0200 Subject: [PATCH 1/6] Separate blueprints.phar build from components library build --- bin/build-libraries-phar.sh | 2 +- bin/build-phar/smoke-test.php | 4 ++- .../Importer/StreamImporter.php | 7 +++++ composer.json | 2 +- phar-box.json => phar-blueprints.json | 0 phar-libraries.json | 31 +++++++++++++++++++ 6 files changed, 43 insertions(+), 3 deletions(-) rename phar-box.json => phar-blueprints.json (100%) create mode 100644 phar-libraries.json diff --git a/bin/build-libraries-phar.sh b/bin/build-libraries-phar.sh index 839fa0b6..92da6363 100644 --- a/bin/build-libraries-phar.sh +++ b/bin/build-libraries-phar.sh @@ -20,7 +20,7 @@ cd $PROJECT_DIR mkdir -p $BUILD_DIR rm $DIST_DIR/wordpress-libraries.* > /dev/null 2>&1 || true export BOX_BASE_PATH=$(type -a box | grep -v 'alias' | awk '{print $3}') -php $BUILD_DIR/box.php compile -d $PROJECT_DIR -c $PROJECT_DIR/phar-box.json +php $BUILD_DIR/box.php compile -d $PROJECT_DIR -c $PROJECT_DIR/phar-libraries.json php -d 'phar.readonly=0' $BUILD_DIR/truncate-composer-checks.php $DIST_DIR/wordpress-libraries.phar cd $DIST_DIR php $BUILD_DIR/smoke-test.php diff --git a/bin/build-phar/smoke-test.php b/bin/build-phar/smoke-test.php index 6fcbe520..dbe8d8cd 100644 --- a/bin/build-phar/smoke-test.php +++ b/bin/build-phar/smoke-test.php @@ -9,7 +9,9 @@ */ $c = WordPress\DataLiberation\Importer\StreamImporter::create_for_wxr_file(__DIR__ . '/nosuchfile.xml', [ 'uploads_path' => __DIR__ . '/uploads', - 'new_site_url' => 'https://smoke-test.org' + 'new_site_url' => 'https://smoke-test.org', + 'new_site_content_root_url' => 'https://smoke-test.org', + 'new_media_root_url' => 'https://smoke-test.org', ]); WordPress\DataLiberation\URL\WPURL::parse('https://example.com'); diff --git a/components/DataLiberation/Importer/StreamImporter.php b/components/DataLiberation/Importer/StreamImporter.php index 409eeea0..a96be2e8 100644 --- a/components/DataLiberation/Importer/StreamImporter.php +++ b/components/DataLiberation/Importer/StreamImporter.php @@ -2,6 +2,7 @@ namespace WordPress\DataLiberation\Importer; +use InvalidArgumentException; use WordPress\ByteStream\ReadStream\FileReadStream; use WordPress\DataLiberation\BlockMarkup\BlockMarkupUrlProcessor; use WordPress\DataLiberation\EntityReader\EntityReaderIterator; @@ -286,6 +287,9 @@ protected static function parse_options( $options ) { // throw new DataLiberationException( 'The "source_site_url" option is required' ); } if ( ! isset( $options['new_site_content_root_url'] ) ) { + if(!function_exists('get_site_url')) { + throw new InvalidArgumentException('Option "new_site_content_root_url" is required'); + } $options['new_site_content_root_url'] = get_site_url(); } @@ -296,6 +300,9 @@ protected static function parse_options( $options ) { $options['uploads_path'] = rtrim( $options['uploads_path'], '/' ); if ( ! isset( $options['new_media_root_url'] ) ) { + if(!function_exists('get_site_url')) { + throw new InvalidArgumentException('Option "new_media_root_url" is required'); + } $options['new_media_root_url'] = rtrim( get_site_url(), '/' ) . '/wp-content/uploads'; } // Remove the trailing slash to make concatenation easier later. diff --git a/composer.json b/composer.json index 992dd0ca..4ef1653f 100644 --- a/composer.json +++ b/composer.json @@ -72,7 +72,7 @@ } }, "scripts": { - "build-blueprints-phar": "box compile -c phar-box.json", + "build-blueprints-phar": "box compile -c phar-blueprints.json", "regenerate-json-schema": "node components/Blueprints/Versions/Version2/json-schema/regenerate-schema.ts", "test": "phpunit -c phpunit.xml", "lint": "phpcs --standard=WordPress .", diff --git a/phar-box.json b/phar-blueprints.json similarity index 100% rename from phar-box.json rename to phar-blueprints.json diff --git a/phar-libraries.json b/phar-libraries.json new file mode 100644 index 00000000..ef512789 --- /dev/null +++ b/phar-libraries.json @@ -0,0 +1,31 @@ +{ + "$schema": "https://raw.githubusercontent.com/box-project/box/refs/heads/main/res/schema.json", + "main": "vendor/autoload.php", + "output": "dist/wordpress-libraries.phar", + "force-autodiscovery": false, + "compactors": [ + "KevinGH\\Box\\Compactor\\Php" + ], + "check-requirements": false, + "annotations": false, + "shebang": "#!/usr/bin/env php", + "compression": "GZ", + "finder": [ + { + "notName": "/.*\\.md|.*\\.dist|Makefile|composer\\.json|composer\\.lock/", + "exclude": [ + "untracked", + "test", + "test_old", + "tests", + "Tests", + "Test", + "vendor-bin" + ], + "in": "components" + } + ], + "directories": [ + "vendor/composer" + ] +} From a1053ae83ef17e95c398bee760be500283f6c9b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Thu, 29 May 2025 10:43:51 +0200 Subject: [PATCH 2/6] add a cli importer script --- .../DataLiberation/bin/cli-importer.php | 762 ++++++++++++++++++ 1 file changed, 762 insertions(+) create mode 100644 components/DataLiberation/bin/cli-importer.php diff --git a/components/DataLiberation/bin/cli-importer.php b/components/DataLiberation/bin/cli-importer.php new file mode 100644 index 00000000..348737a2 --- /dev/null +++ b/components/DataLiberation/bin/cli-importer.php @@ -0,0 +1,762 @@ +write( "\033[1;32mDescription:\033[0m\n" ); + $console_writer->write( " Imports content into a new WordPress site\n\n" ); + + $console_writer->write( "\033[1;32mUsage:\033[0m\n" ); + $console_writer->write( " php import-markdown-directory.php [options]\n\n" ); + + $console_writer->write( "\033[1;32mModes:\033[0m\n" ); + $console_writer->write( " \033[1;33mcrawler\033[0m Import content by crawling a website\n" ); + $console_writer->write( " \033[1;33mlocal-directory\033[0m Import content from a local directory\n" ); + $console_writer->write( " \033[1;33mgit\033[0m Import content from a git repository\n" ); + $console_writer->write( " \033[1;33mwxr\033[0m Import content from a WordPress eXtended RSS file\n" ); + $console_writer->write( " \033[1;33mepub\033[0m Import content from an EPUB ebook\n\n" ); + + $console_writer->write( "\033[1;32mGlobal Options:\033[0m\n" ); + $console_writer->write( " \033[1;34m--source-site-url=\033[0m\n" ); + $console_writer->write( " Base URL of the source content (required)\n\n" ); + + $console_writer->write( " \033[1;34m--additional-site-urls=\033[0m\n" ); + $console_writer->write( " Additional URLs to rewrite links for (multiple allowed)\n\n" ); + + $console_writer->write( " \033[1;34m--media-url=\033[0m\n" ); + $console_writer->write( " URLs to download media files from (multiple allowed)\n\n" ); + + $console_writer->write( " \033[1;34m--output-dir=\033[0m\n" ); + $console_writer->write( " Create the new WordPress site in this directory\n" ); + $console_writer->write( " Must be empty and have write permissions\n\n" ); + + $console_writer->write( "\033[1;32mMode-specific Usage:\033[0m\n" ); + + $console_writer->write( "\033[1;33mgit\033[0m mode:\n" ); + $console_writer->write( " php import-markdown-directory.php git \n" ); + $console_writer->write( " Options:\n" ); + $console_writer->write( " \033[1;34m--branch=\033[0m\n" ); + $console_writer->write( " Git branch to import from (required)\n" ); + $console_writer->write( " \033[1;34m--path-in-repo=\033[0m\n" ); + $console_writer->write( " Subdirectory in repository to import from\n\n" ); + + $console_writer->write( "\033[1;33mcrawler\033[0m mode:\n" ); + $console_writer->write( " php import-markdown-directory.php crawler \n" ); + $console_writer->write( " Crawls the website at and imports discovered content\n\n" ); + + $console_writer->write( "\033[1;33mlocal-directory\033[0m mode:\n" ); + $console_writer->write( " php import-markdown-directory.php local-directory \n" ); + $console_writer->write( " Imports content from local \n\n" ); + + $console_writer->write( "\033[1;33mwxr\033[0m mode:\n" ); + $console_writer->write( " php import-markdown-directory.php wxr \n" ); + $console_writer->write( " Imports content from a WordPress eXtended RSS file\n\n" ); + + $console_writer->write( "\033[1;33mepub\033[0m mode:\n" ); + $console_writer->write( " php import-markdown-directory.php epub \n" ); + $console_writer->write( " Imports content from an EPUB ebook\n\n" ); + + if ( $error ) { + $console_writer->write( "\033[1;31mError:\033[0m " ); + $console_writer->write( $error ); + $console_writer->write( "\n" ); + PlaygroundProtocolClient::getInstance()->exit(); + } + die(); +} + +define( 'NEW_SITE_CONTENT_ROOT', get_site_url() ); +$console_writer->write( 'Target site URL: ' . NEW_SITE_CONTENT_ROOT . "\n" ); + +$parser = new Phalcon\Cop\Parser(); +$args = $parser->parse( $argv ); + +$args['mode'] = $args[0] ?? ''; +$args['data_url'] = $args[1] ?? ''; + +$chrooted_fs = null; +$source_site_url = null; +if ( in_array( $args['mode'], array( 'local-directory', 'git', 'crawler' ) ) ) { + // Validate required arguments + if ( ! isset( $args['source-site-url'] ) ) { + if ( $args['mode'] === 'crawler' ) { + $args['source-site-url'] = $args['data_url']; + } else { + help_message_and_die( 'The --source-site-url argument is required.' ); + } + } + $index_file_pattern = '#(?:index|readme)\.(?:md|html|xhtml)$#i'; + $import_path_prefix = '/imported-content'; + $source_site_url = $args['source-site-url']; + + if ( $args['mode'] === 'local-directory' ) { + if ( ! isset( $args['data_url'] ) ) { + help_message_and_die( 'The "local-directory" argument is required.' ); + } + + PlaygroundProtocolClient::getInstance()->mountDirectory( $args['data_url'], '/files-to-import' ); + $chrooted_fs = LocalFilesystem::create( '/files-to-import' ); + + $args['source-site-url'] = 'file:///'; + } elseif ( $args['mode'] === 'git' ) { + if ( ! isset( $args['data_url'] ) ) { + help_message_and_die( 'The "repo" argument is required.' ); + } + + $args['repo'] = $args['data_url']; + if ( ! str_ends_with( $args['repo'], '.git' ) ) { + help_message_and_die( 'The "repo" argument must end with ".git" when mode is "git".' ); + } + + if ( ! isset( $args['branch'] ) ) { + help_message_and_die( 'The "branch" argument is required when mode is "git".' ); + } + + $console_writer->write( "Sparse checkout of the git repository\n" ); + $temp_dir = sys_get_temp_dir() . '/import-static-' . uniqid(); + $cache_fs = LocalFilesystem::create( $temp_dir ); + $docs_repo = new GitRepository( $cache_fs ); + $docs_repo->add_remote( 'origin', $args['repo'] ); + $remote = $docs_repo->get_remote_client( 'origin' ); + $path_in_repo = $args['path-in-repo'] ?? ''; + $branch = $args['branch'] ?? 'trunk'; + $remote->fetch( + $branch, + array( + 'path' => $path_in_repo, + 'shallow' => true, + ) + ); + $docs_repo->set_branch_tip( 'refs/heads/' . $branch, $docs_repo->get_branch_tip( 'refs/remotes/origin/' . $branch ) ); + $docs_repo->checkout( 'refs/heads/' . $branch ); + $git_fs = GitFilesystem::create( $docs_repo ); + $chrooted_fs = new ChrootLayer( $git_fs, $path_in_repo ); + } elseif ( $args['mode'] === 'crawler' ) { + if ( ! isset( $args['data_url'] ) ) { + help_message_and_die( 'The "url" argument is required.' ); + } + if ( ! WPURL::parse( $args['data_url'] ) ) { + help_message_and_die( 'The "url" argument must be a valid URL.' ); + } + $args['source-site-url'] = $args['data_url']; + $tmp_dir = sys_get_temp_dir() . '/import-static-' . uniqid(); + $chrooted_fs = LocalFilesystem::create( $tmp_dir ); + $crawler = new Crawler( + $args['data_url'], + array( + 'preprocess_url' => function ( URL $url ) use ( $args ) { + if ( ! is_child_url_of( $url, $args['data_url'] ) ) { + return false; + } + $url->search = ''; + if ( in_array( $url->pathname, array( '/feed/', '/wp-json/' ) ) ) { + return false; + } + if ( preg_match( '#^/\d{4}/\d{2}/\d{2}/[^/]+/$#', $url->pathname ) ) { + return $url; + } + if ( preg_match( '#^/[^/]+/$#', $url->pathname ) ) { + return $url; + } + return false; + }, + ) + ); + $progress = new ProgressBar( $console_writer, null ); + $progress->start( 'Crawling website...' ); + while ( $crawler->crawl_next() ) { + $parsed_url = WPURL::parse( $crawler->get_current_url() ); + $file_path = $parsed_url->pathname; + if ( $file_path === '/' ) { + $file_path = '/index.html'; + } elseif ( str_ends_with( $file_path, '/' ) ) { + /** + * Choose to treat /2021/10/03/dont-waste-time-on-boring-programming-lessons/ as + * /2021/10/03/dont-waste-time-on-boring-programming-lessons.html + * + * Another possible choice would be to save it as + * /2021/10/03/dont-waste-time-on-boring-programming-lessons/index.html + */ + $file_path = rtrim( $file_path, '/' ); + } + + if ( ! $file_path || strlen( $file_path ) < 1 ) { + $file_path = sha1( $crawler->get_current_url() ); + } + + $extension = pathinfo( $file_path, PATHINFO_EXTENSION ); + if ( ! $extension ) { + $file_path .= '.html'; + } + + /** + * Replace date-based paths with "posts" directory. + * + * Why? wp_insert_post() seems to mangle the post_name if it consists of a few numbers + * and that messes up the URLs of the imported posts. + * + * @TODO: Investigate the reasons of this behavior. + */ + $file_path = preg_replace( '#/\d{4}/\d{2}/\d{2}/#', '/posts/', $file_path ); + $content = $crawler->get_current_content(); + // @TODO: This is very naive – we should use the URL processor instead. + $content = preg_replace( '#/\d{4}/\d{2}/\d{2}/#', '/posts/', $content ); + + $chrooted_fs->mkdir( dirname( $file_path ), array( 'recursive' => true ) ); + $chrooted_fs->put_contents( + $file_path, + $content + ); + $progress->setMessage( 'Fetching ' . $parsed_url->pathname ); + $progress->advance(); + } + $progress->finish(); + } + $entity_reader_factory = function () use ( $chrooted_fs, $source_site_url, $index_file_pattern ) { + return new FilesystemEntityReader( + $chrooted_fs, + array( + 'index_file_pattern' => $index_file_pattern, + 'filter_pattern' => '#\.(?:md|html|xhtml)$#', + /** + * Use a number so large, there's no chance for wp_table INSERTs + * to interfere with the post IDs generated by the FilesystemEntityReader. + * + * Some inserts are ran even by the importer, e.g. frontloading stubs. + * + * @TODO: Make sure this doesn't automatically bump the AUTOINCREMENT counter in MySQL. + * @TODO: Bump the AUTOINCREMENT counter manually after a finished import. + */ + 'first_post_id' => 10000000, + 'base_url' => $source_site_url, + ) + ); + }; + + /** + * Maps a filesystem path to a WordPress-friendly URL path we can assign + * to the imported page. + * + * Example: "/docs/README.md" -> "/docs/readme" + * + * @param string $path The filesystem path to convert + * @return string The WordPress-friendly URL path + */ + function map_file_path_to_wordpress_url( $path ) { + global $index_file_pattern, $import_path_prefix; + + /** + * Ensure a named top-level parent directory to base the entire + * URL structure on. The goal is to have a consistent way to resolve + * URLs for all the following files: + * + * - README.md + * - chapter-5/README.md + * - chapter-5/section-1.md + * - chapter-5/section-3/readme.md + * + * Without the top-level directory, the best URL we can give the + * /README.md file would be `/readme`. However, the `chapter-5/README.md` + * would get a URL like `/chapter-5` which is inconsistent. However, + * if we transform the path structure as follows, everything becomes + * consistent: + * + * - /imported-content/README.md + * - /imported-content/chapter-5/README.md + * - /imported-content/chapter-5/section-1.md + * - /imported-content/chapter-5/section-3/readme.md + * + * We want to keep all the links working after the import. A single, + * consistent URL mapping strategy makes it much easier. The alternative + * would be to maintain a mapping of parents to paths and use it whenever + * creating pages and rewriting URLs. + * + * This isn't trivial. Having a top-level path prefix is not perfect, + * but it's a sound compromise. + */ + $path = wp_join_unix_paths( $import_path_prefix, $path ); + + if ( 1 === preg_match( $index_file_pattern, $path ) ) { + $path = dirname( $path ); + } + + $extensions = array( '.md', '.html', '.xhtml' ); + foreach ( $extensions as $ext ) { + if ( str_ends_with( $path, $ext ) ) { + $path = substr( $path, 0, -strlen( $ext ) ); + break; + } + } + + return strtolower( $path ); + } + + /** + * Transforms links pointing to imported static files (e.g. ./getting-started.md) + * to the format they will have after being imported into WordPress (e.g. /docs/getting-started). + */ + add_action( + 'data_liberation.stream_importer.postprocess_url', + function ( + $processor, + $context + ) use ( + $chrooted_fs, + /** + * With &, $import_path_prefix reflects the latest value. + * Without &, it's a local copy of the value from the outer scope. + */ + &$import_path_prefix + ) { + /** + * If we didn't rewrite the base URL, the URL points outside + * of the imported root directory. Let's keep it as it is. + */ + if ( ! $context['applied_base_url_mapping'] ) { + return; + } + + $path_original = $processor->get_parsed_url()->pathname; + + /** + * Remove the site path from the URL path and check: + * Is this URL pointing to a file that exists in the imported + * directory? + */ + $base_url_path_prefix = $context['applied_base_url_mapping']['to']->pathname; + $path_relative_to_base = substr( $path_original, strlen( $base_url_path_prefix ) ); + if ( $chrooted_fs->is_file( $path_relative_to_base ) ) { + /** + * Yes! We are linking to an imported page. Let's transform the link + * to a WordPress-friendly URL scheme. + */ + $path_rewritten = map_file_path_to_wordpress_url( $path_relative_to_base ); + $path_rewritten = wp_join_unix_paths( $base_url_path_prefix, $path_rewritten ); + } elseif ( $processor->is_url_absolute() ) { + /** + * No. We are linking to a content page within our site but there is + * no corresponding static file. This happens e.g. in the Gutenberg + * handbook where the markdown files contain absolute URLs to the deployed + * site, e.g.: + * + * Start by ensuring you have Node.js and `npm` installed on your computer. Review + * the [Node.js development environment](https://developer.wordpress.org/block-editor/getting-started/devenv/nodejs-development-environment/) guide if not. + * + * Our best shot is to keep the URL as is, just with the imported + * content root prepended to it. + */ + $path_rewritten = wp_join_unix_paths( $base_url_path_prefix, $import_path_prefix, $path_relative_to_base ); + } else { + /** + * It's a relative URL pointing somewhere within the URL space we're importing + * to, but there is no corresponding static file. This is unexpected. There is + * nothing we can do at this point – let's just keep the URL as it is. + */ + return; + } + $processor->set_url( + $path_rewritten, + WPURL::parse( $path_rewritten, $processor->get_parsed_url() ) + ); + }, + 10, + 3 + ); + + /** + * Assigns post_name to every imported static page. + */ + add_filter( + 'data_liberation.stream_importer.preprocess_entity', + function ( $entity ) use ( &$import_path_prefix, $index_file_pattern ) { + static $preprocessed_an_entity = false; + if ( $entity->get_type() !== 'post' ) { + return $entity; + } + + $data = $entity->get_data(); + + if ( isset( $data['parsed_metadata']['slug'] ) ) { + $data['post_name'] = basename( $data['parsed_metadata']['slug'][0] ); + } elseif ( isset( $data['local_file_path'] ) ) { + /** + * The default import content path is "/imported-content". However, + * maybe we can find a friendlier path prefix based on the post + * title of the top-level index file. + * + * For example, a "Getting Started" guide found at "README.md" + * could be imported to "/getting-started". + */ + if ( ! $preprocessed_an_entity ) { + $preprocessed_an_entity = true; + $dirname = dirname( $data['local_file_path'] ); + $dirname_makes_a_bad_slug = $dirname !== '.' && $dirname === '/'; + $is_index_file = 1 === preg_match( $index_file_pattern, $data['local_file_path'] ); + $post_title_not_derived_from_path = $data['post_title'] !== ImportUtils::slug_to_title( basename( $data['local_file_path'] ) ); + + if ( + $dirname_makes_a_bad_slug && + $is_index_file && + $post_title_not_derived_from_path && + strlen( $data['post_title'] ) > 1 + ) { + $import_path_prefix = wp_import_slugify( $data['post_title'] ); + } + } + + $wordpress_url = map_file_path_to_wordpress_url( $data['local_file_path'] ); + $data['post_name'] = basename( $wordpress_url ); + } else { + return $entity; + } + + $entity->set_data( $data ); + return $entity; + }, + 10, + 2 + ); +} elseif ( $args['mode'] === 'wxr' ) { + if ( ! isset( $args['data_url'] ) ) { + help_message_and_die( 'The "wxr file" argument is required.' ); + } + $entity_reader_factory = function ( $cursor ) use ( $args ) { + return WXREntityReader::create( + uri_to_byte_stream( $args['data_url'] ), + $cursor + ); + }; +} elseif ( $args['mode'] === 'epub' ) { + if ( ! isset( $args['data_url'] ) ) { + help_message_and_die( 'The "epub file" argument is required.' ); + } + $zip_fs = ZipFilesystem::create( + uri_to_byte_stream( $args['data_url'] ) + ); + $entity_reader_factory = function ( $cursor = null ) use ( $zip_fs ) { + return new EPubEntityReader( + $zip_fs, + 1000000 // This is first post ID. We should really also accept a cursor + ); + }; + $reader = $entity_reader_factory(); + $source_site_url = 'file://' . dirname( $reader->get_manifest_path() ); + + // To source the media files from the EPUB bundle: + $chrooted_fs = $zip_fs; + + /** + * Drop .xhtml extension from the links. + */ + add_action( + 'data_liberation.stream_importer.postprocess_url', + function ( $processor ) { + $parsed_url = $processor->get_parsed_url(); + if ( ! str_ends_with( $parsed_url->pathname, '.xhtml' ) ) { + return; + } + $parsed_url->pathname = substr( $parsed_url->pathname, 0, -6 ); + $processor->set_url( + $parsed_url . '', + $parsed_url + ); + } + ); +} else { + help_message_and_die( 'The "mode" argument is required and must be one of: "local-directory", "git", "crawler", "wxr", or "epub".' ); + exit( 1 ); +} + +function uri_to_byte_stream( $uri ) { + if ( str_starts_with( $uri, 'http://' ) || str_starts_with( $uri, 'https://' ) ) { + $local_path = tempnam( sys_get_temp_dir(), 'wp-remote-file-' ); + file_put_contents( $local_path, file_get_contents( $uri ) ); + $uri = $local_path; + + // @TODO: Use SeekableRequestReadStream here instead of + // pre-downloading the file to disk. + // $client = new Client(); + // $response = $client->fetch($uri); + } + if ( file_exists( $uri ) ) { + return FileReadStream::from_path( $uri ); + } + throw new \Exception( "Unknown resource type: $uri. If that's a local file, \033[1mplease provide an absolute path to the file\033[0m." ); +} + + +/** + * Naive slugification function. + * + * @TODO: Use a more sophisticated one with utf-8 support etc. + */ +function wp_import_slugify( $title ) { + return preg_replace( '/[^a-z0-9]+/i', '-', trim( strtolower( $title ) ) ); +} + +$data_url = $args['data_url']; +$console_writer->write( "Importing static files from $data_url\n" ); + + +try { + // Parse URL mapping arguments + $additional_url_mappings = array(); + foreach ( $parser->getArray( 'additional-site-urls' ) as $url ) { + $additional_url_mappings[] = array( + 'from' => $url, + 'to' => NEW_SITE_CONTENT_ROOT, + ); + } + + $console_writer->write( "Starting the import\n" ); + $importer = StreamImporter::create( + $entity_reader_factory, + array( + 'source_site_url' => $source_site_url, + 'new_site_content_root_url' => NEW_SITE_CONTENT_ROOT, + 'source_media_root_urls' => $parser->getArray( 'media-url' ) ?: array( $source_site_url ), + 'additional_url_mappings' => $additional_url_mappings, + 'index_batch_size' => 1, + 'attachment_downloader_options' => array( + 'source_from_filesystem' => $chrooted_fs, + ), + ) + ); + + $import_session = ImportSession::create( + array( + 'data_source' => 'local_directory', + // @TODO: the phrase "file_name" doesn't make sense here. We're sourcing + // data from a directory, not a file. This string is used to tell + // the user in the UI what this they're importing in this import + // session. Let's rename it to something more descriptive. + 'file_name' => $args['data_url'], + ) + ); + $retries_iterator = new RetryFrontloadingIterator( $import_session->get_id() ); + $importer->set_frontloading_retries_iterator( $retries_iterator ); + + // @TODO: Prettier progress reporting + $ignored_message_printed = false; + do { + $result = data_liberation_import_step_customized( $import_session, $importer, $console_writer ); + if ( $importer->get_stage() === StreamImporter::STAGE_FINISHED ) { + $console_writer->write( "\n" ); + $console_writer->write( "\033[1;32mImport finished!\033[0m See your imported content at: \n" ); + + // Get the first page with non-empty content. + $posts = get_posts( + array( + 'numberposts' => 10, + 'orderby' => 'ID', + 'order' => 'ASC', + 'post_type' => 'page', + 'post_status' => 'publish', + ) + ); + + $url = NEW_SITE_CONTENT_ROOT; + foreach ( $posts as $post ) { + if ( ! empty( $post->post_content ) ) { + $url = get_permalink( $post ); + break; + } + } + $console_writer->write( "\033[1;36m" . $url . "\033[0m\n" ); + break; + } elseif ( false === $result ) { + if ( $importer->get_stage() === StreamImporter::STAGE_FRONTLOAD_ASSETS ) { + if ( ! $ignored_message_printed ) { + $console_writer->write( "\nSome assets could not be downloaded – they will be ignored so we can continue with the import.\n" ); + $ignored_message_printed = true; + } + // $import_session->mark_frontloading_errors_as_ignored(); + } else { + $console_writer->write( "Import failed, aborting\n" ); + break; + } + } else { + // Twiddle our thumbs, importing in progress... + } + } while ( true ); +} finally { + if ( isset( $cache_fs ) ) { + $cache_fs->rmdir( + '/', + array( + 'recursive' => true, + ) + ); + } +} + +/** + * @TODO: Expose a primitive like the step function below from the + * DataLiberation PHP component. Support all sorts of pause conditions + * such as time limits, retry counts, memory limits, etc. + */ +function data_liberation_import_step_customized( ImportSession $session, StreamImporter $importer, ConsoleWriter $console_writer ) { + $soft_time_limit_seconds = 15; + $hard_time_limit_seconds = 25; + $start_time = microtime( true ); + $fetched_files = 0; + $progress_bar = null; + + while ( true ) { + $time_taken = microtime( true ) - $start_time; + if ( $time_taken >= $soft_time_limit_seconds ) { + if ( $importer->get_stage() === StreamImporter::STAGE_FRONTLOAD_ASSETS ) { + if ( $fetched_files > 0 ) { + return true; + } + } else { + return true; + } + } + if ( $time_taken >= $hard_time_limit_seconds ) { + return true; + } + + if ( true !== $importer->next_step() ) { + $session->set_reentrancy_cursor( $importer->get_reentrancy_cursor() ); + + $should_advance_to_next_stage = null !== $importer->get_next_stage(); + if ( $should_advance_to_next_stage ) { + if ( StreamImporter::STAGE_FRONTLOAD_ASSETS === $importer->get_stage() ) { + $resolved_all_failures = $session->count_unfinished_frontloading_stubs() === 0; + if ( ! $resolved_all_failures ) { + // Uncomment once this script's intent becomes exiting on unresolved frontloading failures. + // if($progress_bar) { + // $progress_bar->finish(); + // } + // return false; + } + } + } + if ( ! $importer->advance_to_next_stage() ) { + if ( $progress_bar ) { + $progress_bar->finish(); + } + return false; + } + $session->set_stage( $importer->get_stage() ); + $session->set_reentrancy_cursor( $importer->get_reentrancy_cursor() ); + $console_writer->clearLine(); + $progress_bar = null; + + continue; + } + + switch ( $importer->get_stage() ) { + case StreamImporter::STAGE_INDEX_ENTITIES: + $entities_counts = $importer->get_indexed_entities_counts(); + $session->create_frontloading_stubs( $importer->get_indexed_assets_urls() ); + $session->bump_total_number_of_entities( $entities_counts ); + if ( ! $progress_bar ) { + $progress_bar = new ProgressBar( $console_writer, null ); + $progress_bar->setMessage( 'Indexing entities' ); + $progress_bar->start(); + } + $progress_bar->setCurrent( array_sum( $session->get_total_number_of_entities() ) ); + break; + + case StreamImporter::STAGE_FRONTLOAD_ASSETS: + $progress = $importer->get_frontloading_progress(); + $session->bump_frontloading_progress( + $progress, + $importer->get_frontloading_events() + ); + + if ( ! $progress_bar ) { + $progress_bar = new ProgressBar( $console_writer, null ); + $progress_bar->setMessage( 'Fetching media files' ); + $progress_bar->start(); + } + $progress_bar->setCurrent( $session->count_unfinished_frontloading_stubs() ); + break; + + case StreamImporter::STAGE_IMPORT_ENTITIES: + $imported_counts = $importer->get_imported_entities_counts(); + + $session->bump_imported_entities_counts( $imported_counts ); + + if ( ! $progress_bar ) { + $progress_bar = new ProgressBar( $console_writer, $session->count_remaining_entities() ); + $progress_bar->setMessage( 'Importing entities' ); + $progress_bar->start(); + } + $progress_bar->setCurrent( $session->count_all_imported_entities() ); + break; + } + + $session->set_reentrancy_cursor( $importer->get_reentrancy_cursor() ); + } + return false; +} From b369a9626ea4517651a45779143d499928be438e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Thu, 29 May 2025 10:45:07 +0200 Subject: [PATCH 3/6] Load php-toolkit when importContent step is used --- bin/build-phar/smoke-test.php | 2 +- components/Blueprints/Runner.php | 22 +- components/Blueprints/RunnerConfiguration.php | 4 + .../DataLiberation/bin/cli-importer.php | 762 ------------------ phar-libraries.json | 2 +- plugins/data-liberation/plugin.php | 4 +- 6 files changed, 27 insertions(+), 769 deletions(-) delete mode 100644 components/DataLiberation/bin/cli-importer.php diff --git a/bin/build-phar/smoke-test.php b/bin/build-phar/smoke-test.php index dbe8d8cd..81056533 100644 --- a/bin/build-phar/smoke-test.php +++ b/bin/build-phar/smoke-test.php @@ -1,6 +1,6 @@ createStepObject( 'installPlugin', [ - 'source' => $this->createDataReference( 'https://playground.wordpress.net/wordpress-importer.zip' ), + if($this->configuration->isRunningAsPhar()) { + throw new InvalidArgumentException( '@TODO: Importing content is not supported when running as phar.' ); + } else { + $libraries_phar_path = __DIR__ . '/../../dist/php-toolkit.phar'; + if(!file_exists($libraries_phar_path)) { + throw new InvalidArgumentException( + 'In development, you must run `bash bin/build-libraries-phar.sh` to bundle importer libraries before importing content via a Blueprint. '. + 'It generates a `dist/php-toolkit.phar` file bundling all the libraries required for importing content.' + ); + } + $this->configuration->getLogger()->info( 'Loading importer libraries from ' . $libraries_phar_path ); + $source = $this->createDataReference( new AbsoluteLocalPath( $libraries_phar_path ) ); + } + array_unshift( $plan, $this->createStepObject( 'writeFiles', [ + 'files' => [ + 'php-toolkit.phar' => $source, + ], ] ) ); break; } diff --git a/components/Blueprints/RunnerConfiguration.php b/components/Blueprints/RunnerConfiguration.php index 0dbd7d97..5f532501 100644 --- a/components/Blueprints/RunnerConfiguration.php +++ b/components/Blueprints/RunnerConfiguration.php @@ -235,4 +235,8 @@ public function isAllowedLocalFilesystemAccess(): bool { public static function getPermissionCliFlag( string $permission ): string { return $permission; } + + public function isRunningAsPhar(): bool { + return \Phar::running(false) !== ''; + } } diff --git a/components/DataLiberation/bin/cli-importer.php b/components/DataLiberation/bin/cli-importer.php deleted file mode 100644 index 348737a2..00000000 --- a/components/DataLiberation/bin/cli-importer.php +++ /dev/null @@ -1,762 +0,0 @@ -write( "\033[1;32mDescription:\033[0m\n" ); - $console_writer->write( " Imports content into a new WordPress site\n\n" ); - - $console_writer->write( "\033[1;32mUsage:\033[0m\n" ); - $console_writer->write( " php import-markdown-directory.php [options]\n\n" ); - - $console_writer->write( "\033[1;32mModes:\033[0m\n" ); - $console_writer->write( " \033[1;33mcrawler\033[0m Import content by crawling a website\n" ); - $console_writer->write( " \033[1;33mlocal-directory\033[0m Import content from a local directory\n" ); - $console_writer->write( " \033[1;33mgit\033[0m Import content from a git repository\n" ); - $console_writer->write( " \033[1;33mwxr\033[0m Import content from a WordPress eXtended RSS file\n" ); - $console_writer->write( " \033[1;33mepub\033[0m Import content from an EPUB ebook\n\n" ); - - $console_writer->write( "\033[1;32mGlobal Options:\033[0m\n" ); - $console_writer->write( " \033[1;34m--source-site-url=\033[0m\n" ); - $console_writer->write( " Base URL of the source content (required)\n\n" ); - - $console_writer->write( " \033[1;34m--additional-site-urls=\033[0m\n" ); - $console_writer->write( " Additional URLs to rewrite links for (multiple allowed)\n\n" ); - - $console_writer->write( " \033[1;34m--media-url=\033[0m\n" ); - $console_writer->write( " URLs to download media files from (multiple allowed)\n\n" ); - - $console_writer->write( " \033[1;34m--output-dir=\033[0m\n" ); - $console_writer->write( " Create the new WordPress site in this directory\n" ); - $console_writer->write( " Must be empty and have write permissions\n\n" ); - - $console_writer->write( "\033[1;32mMode-specific Usage:\033[0m\n" ); - - $console_writer->write( "\033[1;33mgit\033[0m mode:\n" ); - $console_writer->write( " php import-markdown-directory.php git \n" ); - $console_writer->write( " Options:\n" ); - $console_writer->write( " \033[1;34m--branch=\033[0m\n" ); - $console_writer->write( " Git branch to import from (required)\n" ); - $console_writer->write( " \033[1;34m--path-in-repo=\033[0m\n" ); - $console_writer->write( " Subdirectory in repository to import from\n\n" ); - - $console_writer->write( "\033[1;33mcrawler\033[0m mode:\n" ); - $console_writer->write( " php import-markdown-directory.php crawler \n" ); - $console_writer->write( " Crawls the website at and imports discovered content\n\n" ); - - $console_writer->write( "\033[1;33mlocal-directory\033[0m mode:\n" ); - $console_writer->write( " php import-markdown-directory.php local-directory \n" ); - $console_writer->write( " Imports content from local \n\n" ); - - $console_writer->write( "\033[1;33mwxr\033[0m mode:\n" ); - $console_writer->write( " php import-markdown-directory.php wxr \n" ); - $console_writer->write( " Imports content from a WordPress eXtended RSS file\n\n" ); - - $console_writer->write( "\033[1;33mepub\033[0m mode:\n" ); - $console_writer->write( " php import-markdown-directory.php epub \n" ); - $console_writer->write( " Imports content from an EPUB ebook\n\n" ); - - if ( $error ) { - $console_writer->write( "\033[1;31mError:\033[0m " ); - $console_writer->write( $error ); - $console_writer->write( "\n" ); - PlaygroundProtocolClient::getInstance()->exit(); - } - die(); -} - -define( 'NEW_SITE_CONTENT_ROOT', get_site_url() ); -$console_writer->write( 'Target site URL: ' . NEW_SITE_CONTENT_ROOT . "\n" ); - -$parser = new Phalcon\Cop\Parser(); -$args = $parser->parse( $argv ); - -$args['mode'] = $args[0] ?? ''; -$args['data_url'] = $args[1] ?? ''; - -$chrooted_fs = null; -$source_site_url = null; -if ( in_array( $args['mode'], array( 'local-directory', 'git', 'crawler' ) ) ) { - // Validate required arguments - if ( ! isset( $args['source-site-url'] ) ) { - if ( $args['mode'] === 'crawler' ) { - $args['source-site-url'] = $args['data_url']; - } else { - help_message_and_die( 'The --source-site-url argument is required.' ); - } - } - $index_file_pattern = '#(?:index|readme)\.(?:md|html|xhtml)$#i'; - $import_path_prefix = '/imported-content'; - $source_site_url = $args['source-site-url']; - - if ( $args['mode'] === 'local-directory' ) { - if ( ! isset( $args['data_url'] ) ) { - help_message_and_die( 'The "local-directory" argument is required.' ); - } - - PlaygroundProtocolClient::getInstance()->mountDirectory( $args['data_url'], '/files-to-import' ); - $chrooted_fs = LocalFilesystem::create( '/files-to-import' ); - - $args['source-site-url'] = 'file:///'; - } elseif ( $args['mode'] === 'git' ) { - if ( ! isset( $args['data_url'] ) ) { - help_message_and_die( 'The "repo" argument is required.' ); - } - - $args['repo'] = $args['data_url']; - if ( ! str_ends_with( $args['repo'], '.git' ) ) { - help_message_and_die( 'The "repo" argument must end with ".git" when mode is "git".' ); - } - - if ( ! isset( $args['branch'] ) ) { - help_message_and_die( 'The "branch" argument is required when mode is "git".' ); - } - - $console_writer->write( "Sparse checkout of the git repository\n" ); - $temp_dir = sys_get_temp_dir() . '/import-static-' . uniqid(); - $cache_fs = LocalFilesystem::create( $temp_dir ); - $docs_repo = new GitRepository( $cache_fs ); - $docs_repo->add_remote( 'origin', $args['repo'] ); - $remote = $docs_repo->get_remote_client( 'origin' ); - $path_in_repo = $args['path-in-repo'] ?? ''; - $branch = $args['branch'] ?? 'trunk'; - $remote->fetch( - $branch, - array( - 'path' => $path_in_repo, - 'shallow' => true, - ) - ); - $docs_repo->set_branch_tip( 'refs/heads/' . $branch, $docs_repo->get_branch_tip( 'refs/remotes/origin/' . $branch ) ); - $docs_repo->checkout( 'refs/heads/' . $branch ); - $git_fs = GitFilesystem::create( $docs_repo ); - $chrooted_fs = new ChrootLayer( $git_fs, $path_in_repo ); - } elseif ( $args['mode'] === 'crawler' ) { - if ( ! isset( $args['data_url'] ) ) { - help_message_and_die( 'The "url" argument is required.' ); - } - if ( ! WPURL::parse( $args['data_url'] ) ) { - help_message_and_die( 'The "url" argument must be a valid URL.' ); - } - $args['source-site-url'] = $args['data_url']; - $tmp_dir = sys_get_temp_dir() . '/import-static-' . uniqid(); - $chrooted_fs = LocalFilesystem::create( $tmp_dir ); - $crawler = new Crawler( - $args['data_url'], - array( - 'preprocess_url' => function ( URL $url ) use ( $args ) { - if ( ! is_child_url_of( $url, $args['data_url'] ) ) { - return false; - } - $url->search = ''; - if ( in_array( $url->pathname, array( '/feed/', '/wp-json/' ) ) ) { - return false; - } - if ( preg_match( '#^/\d{4}/\d{2}/\d{2}/[^/]+/$#', $url->pathname ) ) { - return $url; - } - if ( preg_match( '#^/[^/]+/$#', $url->pathname ) ) { - return $url; - } - return false; - }, - ) - ); - $progress = new ProgressBar( $console_writer, null ); - $progress->start( 'Crawling website...' ); - while ( $crawler->crawl_next() ) { - $parsed_url = WPURL::parse( $crawler->get_current_url() ); - $file_path = $parsed_url->pathname; - if ( $file_path === '/' ) { - $file_path = '/index.html'; - } elseif ( str_ends_with( $file_path, '/' ) ) { - /** - * Choose to treat /2021/10/03/dont-waste-time-on-boring-programming-lessons/ as - * /2021/10/03/dont-waste-time-on-boring-programming-lessons.html - * - * Another possible choice would be to save it as - * /2021/10/03/dont-waste-time-on-boring-programming-lessons/index.html - */ - $file_path = rtrim( $file_path, '/' ); - } - - if ( ! $file_path || strlen( $file_path ) < 1 ) { - $file_path = sha1( $crawler->get_current_url() ); - } - - $extension = pathinfo( $file_path, PATHINFO_EXTENSION ); - if ( ! $extension ) { - $file_path .= '.html'; - } - - /** - * Replace date-based paths with "posts" directory. - * - * Why? wp_insert_post() seems to mangle the post_name if it consists of a few numbers - * and that messes up the URLs of the imported posts. - * - * @TODO: Investigate the reasons of this behavior. - */ - $file_path = preg_replace( '#/\d{4}/\d{2}/\d{2}/#', '/posts/', $file_path ); - $content = $crawler->get_current_content(); - // @TODO: This is very naive – we should use the URL processor instead. - $content = preg_replace( '#/\d{4}/\d{2}/\d{2}/#', '/posts/', $content ); - - $chrooted_fs->mkdir( dirname( $file_path ), array( 'recursive' => true ) ); - $chrooted_fs->put_contents( - $file_path, - $content - ); - $progress->setMessage( 'Fetching ' . $parsed_url->pathname ); - $progress->advance(); - } - $progress->finish(); - } - $entity_reader_factory = function () use ( $chrooted_fs, $source_site_url, $index_file_pattern ) { - return new FilesystemEntityReader( - $chrooted_fs, - array( - 'index_file_pattern' => $index_file_pattern, - 'filter_pattern' => '#\.(?:md|html|xhtml)$#', - /** - * Use a number so large, there's no chance for wp_table INSERTs - * to interfere with the post IDs generated by the FilesystemEntityReader. - * - * Some inserts are ran even by the importer, e.g. frontloading stubs. - * - * @TODO: Make sure this doesn't automatically bump the AUTOINCREMENT counter in MySQL. - * @TODO: Bump the AUTOINCREMENT counter manually after a finished import. - */ - 'first_post_id' => 10000000, - 'base_url' => $source_site_url, - ) - ); - }; - - /** - * Maps a filesystem path to a WordPress-friendly URL path we can assign - * to the imported page. - * - * Example: "/docs/README.md" -> "/docs/readme" - * - * @param string $path The filesystem path to convert - * @return string The WordPress-friendly URL path - */ - function map_file_path_to_wordpress_url( $path ) { - global $index_file_pattern, $import_path_prefix; - - /** - * Ensure a named top-level parent directory to base the entire - * URL structure on. The goal is to have a consistent way to resolve - * URLs for all the following files: - * - * - README.md - * - chapter-5/README.md - * - chapter-5/section-1.md - * - chapter-5/section-3/readme.md - * - * Without the top-level directory, the best URL we can give the - * /README.md file would be `/readme`. However, the `chapter-5/README.md` - * would get a URL like `/chapter-5` which is inconsistent. However, - * if we transform the path structure as follows, everything becomes - * consistent: - * - * - /imported-content/README.md - * - /imported-content/chapter-5/README.md - * - /imported-content/chapter-5/section-1.md - * - /imported-content/chapter-5/section-3/readme.md - * - * We want to keep all the links working after the import. A single, - * consistent URL mapping strategy makes it much easier. The alternative - * would be to maintain a mapping of parents to paths and use it whenever - * creating pages and rewriting URLs. - * - * This isn't trivial. Having a top-level path prefix is not perfect, - * but it's a sound compromise. - */ - $path = wp_join_unix_paths( $import_path_prefix, $path ); - - if ( 1 === preg_match( $index_file_pattern, $path ) ) { - $path = dirname( $path ); - } - - $extensions = array( '.md', '.html', '.xhtml' ); - foreach ( $extensions as $ext ) { - if ( str_ends_with( $path, $ext ) ) { - $path = substr( $path, 0, -strlen( $ext ) ); - break; - } - } - - return strtolower( $path ); - } - - /** - * Transforms links pointing to imported static files (e.g. ./getting-started.md) - * to the format they will have after being imported into WordPress (e.g. /docs/getting-started). - */ - add_action( - 'data_liberation.stream_importer.postprocess_url', - function ( - $processor, - $context - ) use ( - $chrooted_fs, - /** - * With &, $import_path_prefix reflects the latest value. - * Without &, it's a local copy of the value from the outer scope. - */ - &$import_path_prefix - ) { - /** - * If we didn't rewrite the base URL, the URL points outside - * of the imported root directory. Let's keep it as it is. - */ - if ( ! $context['applied_base_url_mapping'] ) { - return; - } - - $path_original = $processor->get_parsed_url()->pathname; - - /** - * Remove the site path from the URL path and check: - * Is this URL pointing to a file that exists in the imported - * directory? - */ - $base_url_path_prefix = $context['applied_base_url_mapping']['to']->pathname; - $path_relative_to_base = substr( $path_original, strlen( $base_url_path_prefix ) ); - if ( $chrooted_fs->is_file( $path_relative_to_base ) ) { - /** - * Yes! We are linking to an imported page. Let's transform the link - * to a WordPress-friendly URL scheme. - */ - $path_rewritten = map_file_path_to_wordpress_url( $path_relative_to_base ); - $path_rewritten = wp_join_unix_paths( $base_url_path_prefix, $path_rewritten ); - } elseif ( $processor->is_url_absolute() ) { - /** - * No. We are linking to a content page within our site but there is - * no corresponding static file. This happens e.g. in the Gutenberg - * handbook where the markdown files contain absolute URLs to the deployed - * site, e.g.: - * - * Start by ensuring you have Node.js and `npm` installed on your computer. Review - * the [Node.js development environment](https://developer.wordpress.org/block-editor/getting-started/devenv/nodejs-development-environment/) guide if not. - * - * Our best shot is to keep the URL as is, just with the imported - * content root prepended to it. - */ - $path_rewritten = wp_join_unix_paths( $base_url_path_prefix, $import_path_prefix, $path_relative_to_base ); - } else { - /** - * It's a relative URL pointing somewhere within the URL space we're importing - * to, but there is no corresponding static file. This is unexpected. There is - * nothing we can do at this point – let's just keep the URL as it is. - */ - return; - } - $processor->set_url( - $path_rewritten, - WPURL::parse( $path_rewritten, $processor->get_parsed_url() ) - ); - }, - 10, - 3 - ); - - /** - * Assigns post_name to every imported static page. - */ - add_filter( - 'data_liberation.stream_importer.preprocess_entity', - function ( $entity ) use ( &$import_path_prefix, $index_file_pattern ) { - static $preprocessed_an_entity = false; - if ( $entity->get_type() !== 'post' ) { - return $entity; - } - - $data = $entity->get_data(); - - if ( isset( $data['parsed_metadata']['slug'] ) ) { - $data['post_name'] = basename( $data['parsed_metadata']['slug'][0] ); - } elseif ( isset( $data['local_file_path'] ) ) { - /** - * The default import content path is "/imported-content". However, - * maybe we can find a friendlier path prefix based on the post - * title of the top-level index file. - * - * For example, a "Getting Started" guide found at "README.md" - * could be imported to "/getting-started". - */ - if ( ! $preprocessed_an_entity ) { - $preprocessed_an_entity = true; - $dirname = dirname( $data['local_file_path'] ); - $dirname_makes_a_bad_slug = $dirname !== '.' && $dirname === '/'; - $is_index_file = 1 === preg_match( $index_file_pattern, $data['local_file_path'] ); - $post_title_not_derived_from_path = $data['post_title'] !== ImportUtils::slug_to_title( basename( $data['local_file_path'] ) ); - - if ( - $dirname_makes_a_bad_slug && - $is_index_file && - $post_title_not_derived_from_path && - strlen( $data['post_title'] ) > 1 - ) { - $import_path_prefix = wp_import_slugify( $data['post_title'] ); - } - } - - $wordpress_url = map_file_path_to_wordpress_url( $data['local_file_path'] ); - $data['post_name'] = basename( $wordpress_url ); - } else { - return $entity; - } - - $entity->set_data( $data ); - return $entity; - }, - 10, - 2 - ); -} elseif ( $args['mode'] === 'wxr' ) { - if ( ! isset( $args['data_url'] ) ) { - help_message_and_die( 'The "wxr file" argument is required.' ); - } - $entity_reader_factory = function ( $cursor ) use ( $args ) { - return WXREntityReader::create( - uri_to_byte_stream( $args['data_url'] ), - $cursor - ); - }; -} elseif ( $args['mode'] === 'epub' ) { - if ( ! isset( $args['data_url'] ) ) { - help_message_and_die( 'The "epub file" argument is required.' ); - } - $zip_fs = ZipFilesystem::create( - uri_to_byte_stream( $args['data_url'] ) - ); - $entity_reader_factory = function ( $cursor = null ) use ( $zip_fs ) { - return new EPubEntityReader( - $zip_fs, - 1000000 // This is first post ID. We should really also accept a cursor - ); - }; - $reader = $entity_reader_factory(); - $source_site_url = 'file://' . dirname( $reader->get_manifest_path() ); - - // To source the media files from the EPUB bundle: - $chrooted_fs = $zip_fs; - - /** - * Drop .xhtml extension from the links. - */ - add_action( - 'data_liberation.stream_importer.postprocess_url', - function ( $processor ) { - $parsed_url = $processor->get_parsed_url(); - if ( ! str_ends_with( $parsed_url->pathname, '.xhtml' ) ) { - return; - } - $parsed_url->pathname = substr( $parsed_url->pathname, 0, -6 ); - $processor->set_url( - $parsed_url . '', - $parsed_url - ); - } - ); -} else { - help_message_and_die( 'The "mode" argument is required and must be one of: "local-directory", "git", "crawler", "wxr", or "epub".' ); - exit( 1 ); -} - -function uri_to_byte_stream( $uri ) { - if ( str_starts_with( $uri, 'http://' ) || str_starts_with( $uri, 'https://' ) ) { - $local_path = tempnam( sys_get_temp_dir(), 'wp-remote-file-' ); - file_put_contents( $local_path, file_get_contents( $uri ) ); - $uri = $local_path; - - // @TODO: Use SeekableRequestReadStream here instead of - // pre-downloading the file to disk. - // $client = new Client(); - // $response = $client->fetch($uri); - } - if ( file_exists( $uri ) ) { - return FileReadStream::from_path( $uri ); - } - throw new \Exception( "Unknown resource type: $uri. If that's a local file, \033[1mplease provide an absolute path to the file\033[0m." ); -} - - -/** - * Naive slugification function. - * - * @TODO: Use a more sophisticated one with utf-8 support etc. - */ -function wp_import_slugify( $title ) { - return preg_replace( '/[^a-z0-9]+/i', '-', trim( strtolower( $title ) ) ); -} - -$data_url = $args['data_url']; -$console_writer->write( "Importing static files from $data_url\n" ); - - -try { - // Parse URL mapping arguments - $additional_url_mappings = array(); - foreach ( $parser->getArray( 'additional-site-urls' ) as $url ) { - $additional_url_mappings[] = array( - 'from' => $url, - 'to' => NEW_SITE_CONTENT_ROOT, - ); - } - - $console_writer->write( "Starting the import\n" ); - $importer = StreamImporter::create( - $entity_reader_factory, - array( - 'source_site_url' => $source_site_url, - 'new_site_content_root_url' => NEW_SITE_CONTENT_ROOT, - 'source_media_root_urls' => $parser->getArray( 'media-url' ) ?: array( $source_site_url ), - 'additional_url_mappings' => $additional_url_mappings, - 'index_batch_size' => 1, - 'attachment_downloader_options' => array( - 'source_from_filesystem' => $chrooted_fs, - ), - ) - ); - - $import_session = ImportSession::create( - array( - 'data_source' => 'local_directory', - // @TODO: the phrase "file_name" doesn't make sense here. We're sourcing - // data from a directory, not a file. This string is used to tell - // the user in the UI what this they're importing in this import - // session. Let's rename it to something more descriptive. - 'file_name' => $args['data_url'], - ) - ); - $retries_iterator = new RetryFrontloadingIterator( $import_session->get_id() ); - $importer->set_frontloading_retries_iterator( $retries_iterator ); - - // @TODO: Prettier progress reporting - $ignored_message_printed = false; - do { - $result = data_liberation_import_step_customized( $import_session, $importer, $console_writer ); - if ( $importer->get_stage() === StreamImporter::STAGE_FINISHED ) { - $console_writer->write( "\n" ); - $console_writer->write( "\033[1;32mImport finished!\033[0m See your imported content at: \n" ); - - // Get the first page with non-empty content. - $posts = get_posts( - array( - 'numberposts' => 10, - 'orderby' => 'ID', - 'order' => 'ASC', - 'post_type' => 'page', - 'post_status' => 'publish', - ) - ); - - $url = NEW_SITE_CONTENT_ROOT; - foreach ( $posts as $post ) { - if ( ! empty( $post->post_content ) ) { - $url = get_permalink( $post ); - break; - } - } - $console_writer->write( "\033[1;36m" . $url . "\033[0m\n" ); - break; - } elseif ( false === $result ) { - if ( $importer->get_stage() === StreamImporter::STAGE_FRONTLOAD_ASSETS ) { - if ( ! $ignored_message_printed ) { - $console_writer->write( "\nSome assets could not be downloaded – they will be ignored so we can continue with the import.\n" ); - $ignored_message_printed = true; - } - // $import_session->mark_frontloading_errors_as_ignored(); - } else { - $console_writer->write( "Import failed, aborting\n" ); - break; - } - } else { - // Twiddle our thumbs, importing in progress... - } - } while ( true ); -} finally { - if ( isset( $cache_fs ) ) { - $cache_fs->rmdir( - '/', - array( - 'recursive' => true, - ) - ); - } -} - -/** - * @TODO: Expose a primitive like the step function below from the - * DataLiberation PHP component. Support all sorts of pause conditions - * such as time limits, retry counts, memory limits, etc. - */ -function data_liberation_import_step_customized( ImportSession $session, StreamImporter $importer, ConsoleWriter $console_writer ) { - $soft_time_limit_seconds = 15; - $hard_time_limit_seconds = 25; - $start_time = microtime( true ); - $fetched_files = 0; - $progress_bar = null; - - while ( true ) { - $time_taken = microtime( true ) - $start_time; - if ( $time_taken >= $soft_time_limit_seconds ) { - if ( $importer->get_stage() === StreamImporter::STAGE_FRONTLOAD_ASSETS ) { - if ( $fetched_files > 0 ) { - return true; - } - } else { - return true; - } - } - if ( $time_taken >= $hard_time_limit_seconds ) { - return true; - } - - if ( true !== $importer->next_step() ) { - $session->set_reentrancy_cursor( $importer->get_reentrancy_cursor() ); - - $should_advance_to_next_stage = null !== $importer->get_next_stage(); - if ( $should_advance_to_next_stage ) { - if ( StreamImporter::STAGE_FRONTLOAD_ASSETS === $importer->get_stage() ) { - $resolved_all_failures = $session->count_unfinished_frontloading_stubs() === 0; - if ( ! $resolved_all_failures ) { - // Uncomment once this script's intent becomes exiting on unresolved frontloading failures. - // if($progress_bar) { - // $progress_bar->finish(); - // } - // return false; - } - } - } - if ( ! $importer->advance_to_next_stage() ) { - if ( $progress_bar ) { - $progress_bar->finish(); - } - return false; - } - $session->set_stage( $importer->get_stage() ); - $session->set_reentrancy_cursor( $importer->get_reentrancy_cursor() ); - $console_writer->clearLine(); - $progress_bar = null; - - continue; - } - - switch ( $importer->get_stage() ) { - case StreamImporter::STAGE_INDEX_ENTITIES: - $entities_counts = $importer->get_indexed_entities_counts(); - $session->create_frontloading_stubs( $importer->get_indexed_assets_urls() ); - $session->bump_total_number_of_entities( $entities_counts ); - if ( ! $progress_bar ) { - $progress_bar = new ProgressBar( $console_writer, null ); - $progress_bar->setMessage( 'Indexing entities' ); - $progress_bar->start(); - } - $progress_bar->setCurrent( array_sum( $session->get_total_number_of_entities() ) ); - break; - - case StreamImporter::STAGE_FRONTLOAD_ASSETS: - $progress = $importer->get_frontloading_progress(); - $session->bump_frontloading_progress( - $progress, - $importer->get_frontloading_events() - ); - - if ( ! $progress_bar ) { - $progress_bar = new ProgressBar( $console_writer, null ); - $progress_bar->setMessage( 'Fetching media files' ); - $progress_bar->start(); - } - $progress_bar->setCurrent( $session->count_unfinished_frontloading_stubs() ); - break; - - case StreamImporter::STAGE_IMPORT_ENTITIES: - $imported_counts = $importer->get_imported_entities_counts(); - - $session->bump_imported_entities_counts( $imported_counts ); - - if ( ! $progress_bar ) { - $progress_bar = new ProgressBar( $console_writer, $session->count_remaining_entities() ); - $progress_bar->setMessage( 'Importing entities' ); - $progress_bar->start(); - } - $progress_bar->setCurrent( $session->count_all_imported_entities() ); - break; - } - - $session->set_reentrancy_cursor( $importer->get_reentrancy_cursor() ); - } - return false; -} diff --git a/phar-libraries.json b/phar-libraries.json index ef512789..3c64ec29 100644 --- a/phar-libraries.json +++ b/phar-libraries.json @@ -1,7 +1,7 @@ { "$schema": "https://raw.githubusercontent.com/box-project/box/refs/heads/main/res/schema.json", "main": "vendor/autoload.php", - "output": "dist/wordpress-libraries.phar", + "output": "dist/php-toolkit.phar", "force-autodiscovery": false, "compactors": [ "KevinGH\\Box\\Compactor\\Php" diff --git a/plugins/data-liberation/plugin.php b/plugins/data-liberation/plugin.php index b719bcd7..038df052 100644 --- a/plugins/data-liberation/plugin.php +++ b/plugins/data-liberation/plugin.php @@ -19,9 +19,9 @@ use WordPress\HttpClient\Request; use WordPress\Markdown\MarkdownImporter; -if(file_exists(__DIR__ . '/wordpress-libraries.phar')) { +if(file_exists(__DIR__ . '/php-toolkit.phar')) { // Production – built and installed plugin - require_once __DIR__ . '/wordpress-libraries.phar'; + require_once __DIR__ . '/php-toolkit.phar'; } else { // Development – plugin mounted in WordPress via Playground CLI mounts require_once __DIR__ . '/../../vendor/autoload.php'; From 0120ee30e0c2eea8e882e8ed78571c028e3e6a16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Thu, 29 May 2025 14:26:01 +0200 Subject: [PATCH 4/6] Use the Data Liberation importer for processing WXR files --- components/Blueprints/Runner.php | 2 +- .../Blueprints/Steps/ImportContentStep.php | 60 +- .../Steps/scripts/import-content.php | 884 ++++++++++++++++++ components/Blueprints/bin/blueprint.php | 81 +- components/CLI/CLI.php | 119 +++ components/CLI/composer.json | 26 + components/DataLiberation/URL/functions.php | 245 ++--- composer.json | 1 + 8 files changed, 1181 insertions(+), 237 deletions(-) create mode 100644 components/Blueprints/Steps/scripts/import-content.php create mode 100644 components/CLI/CLI.php create mode 100644 components/CLI/composer.json diff --git a/components/Blueprints/Runner.php b/components/Blueprints/Runner.php index 1c943baf..fcfc9c5e 100644 --- a/components/Blueprints/Runner.php +++ b/components/Blueprints/Runner.php @@ -632,7 +632,7 @@ private function createExecutionPlan(): array { ); } $this->configuration->getLogger()->info( 'Loading importer libraries from ' . $libraries_phar_path ); - $source = $this->createDataReference( new AbsoluteLocalPath( $libraries_phar_path ) ); + $source = $this->createDataReference( new InlineFile( 'php-toolkit.phar', file_get_contents( $libraries_phar_path ) ) ); } array_unshift( $plan, $this->createStepObject( 'writeFiles', [ 'files' => [ diff --git a/components/Blueprints/Steps/ImportContentStep.php b/components/Blueprints/Steps/ImportContentStep.php index 99c9d368..0378d6bf 100644 --- a/components/Blueprints/Steps/ImportContentStep.php +++ b/components/Blueprints/Steps/ImportContentStep.php @@ -65,49 +65,31 @@ private function importWxr( Runtime $runtime, array $content_definition ): void } $wxrPath = $runtime->saveToTemporaryFile( $resolved ); + // @TODO: Make it work when Blueprints are running as phar archive + $import_script_path = __DIR__ . '/scripts/import-content.php'; + if ( ! file_exists( $import_script_path ) ) { + throw new BlueprintExecutionException( sprintf( + 'Import script %s does not exist.', + $import_script_path + ) ); + } + + $importer_script = file_get_contents( $import_script_path ); $runtime->evalPhpCodeInSubProcess( <<<'PHP' 'Administrator') )[0]->ID; -wp_set_current_user( $admin_id ); - -wp_set_current_user( $admin_id ); -$importer = new WXR_Importer( array( -'fetch_attachments' => true, -// @TODO: Support custom author -'default_author' => $admin_id -) ); -$logger = new WP_Importer_Logger_CLI(); -$importer->set_logger( $logger ); -// Slashes from the imported content are lost if we don't call wp_slash here. -add_action( 'wp_insert_post_data', function( $data ) { -return wp_slash($data); -}); - -// Ensure that Site Editor templates are associated with the correct taxonomy. -add_filter( 'wp_import_post_terms', function ( $terms, $post_id ) { -foreach ( $terms as $post_term ) { -if ( 'wp_theme' !== $term['taxonomy'] ) {continue;} -$post_term = get_term_by('slug', $term['slug'], $term['taxonomy'] ); -if ( ! $post_term ) { -$post_term = wp_insert_term( -$term['slug'], -$term['taxonomy'] -); -$term_id = $post_term['term_id']; -} else { -$term_id = $post_term->term_id; -} -wp_set_object_terms( $post_id, $term_id, $term['taxonomy']) ; -} -return $terms; -}, 10, 2 ); -$result = $importer->import( getenv('WXR_PATH') ); +// @TODO: Just call a function here, do not go through CLI arguments. +// @TODO: Establish a communication channel between the main process and the subprocess +// to report progress and errors. +// @TODO: Enforce chrooting of the imported static files. +$_SERVER['argv'] = [ + 'import-wxr.php', + 'wxr', + getenv('WXR_PATH') +]; +?> PHP + . $importer_script , [ 'WXR_PATH' => $wxrPath, diff --git a/components/Blueprints/Steps/scripts/import-content.php b/components/Blueprints/Steps/scripts/import-content.php new file mode 100644 index 00000000..cbb7c1b5 --- /dev/null +++ b/components/Blueprints/Steps/scripts/import-content.php @@ -0,0 +1,884 @@ +stdout = fopen('php://stdout', 'w'); + } + + public function __destruct() { + fclose($this->stdout); + } + + public function write(string $text): void { + fwrite($this->stdout, $text); + } + + public function clearLine(): void { + if (!$this->isTty()) { + return; + } + fwrite($this->stdout, "\r\033[K"); // Return to start + clear to end + } + + public function replaceLine(string $text): void { + $this->clearLine(); + $this->write($text); + } + + public function writeLines(array $lines, bool $replace = false): void { + if ($replace && $this->isTty()) { + // Move up by number of lines and clear them + foreach ($lines as $i => $line) { + if ($i > 0) { + fwrite($this->stdout, "\033[1A"); // Move up one line + } + $this->clearLine(); + } + } + + foreach ($lines as $line) { + $this->write($line . PHP_EOL); + } + } + + private function isTty(): bool { + return stream_isatty($this->stdout); + } +} + +class ProgressBar { + private ConsoleWriter $writer; + private ?int $total; + private int $current; + private int $width; + private string $message; + private float $startTime; + private bool $started = false; + private bool $indeterminate = false; + + public function __construct(ConsoleWriter $writer, ?int $total = 100, int $width = 50) { + $this->writer = $writer; + $this->total = $total; + $this->indeterminate = ($total === null); + $this->current = 0; + $this->width = $width; + $this->message = ''; + } + + public function start(): void { + if ($this->started) { + return; + } + $this->started = true; + $this->startTime = microtime(true); + $this->update(); + } + + public function advance(int $step = 1): void { + $this->setCurrent($this->current + $step); + } + + public function setCurrent(int $current): void { + $this->current = $this->indeterminate ? $current : min($this->total, max(0, $current)); + $this->update(); + } + + public function setMessage(string $message): void { + $this->message = $message; + $this->update(); + } + + public function finish(): void { + if (!$this->started) { + return; + } + if (!$this->indeterminate) { + $this->current = $this->total; + } + $this->update(); + $this->writer->write("\n"); + } + + private function update(): void { + if (!$this->started) { + return; + } + + if ($this->indeterminate) { + $this->updateIndeterminate(); + } else { + $this->updateDeterminate(); + } + } + + private function updateIndeterminate(): void { + $elapsed = microtime(true) - $this->startTime; + + // Create a "moving" animation for indeterminate progress + $position = (int)($elapsed * 5) % ($this->width * 2); + if ($position >= $this->width) { + $position = $this->width * 2 - $position; + } + + $spaces_before = min(max(0, $position), $this->width - 3); + $spaces_after = max(0, $this->width - $position - 3); + + $bar = str_repeat(' ', $spaces_before) . '<=>' . str_repeat(' ', $spaces_after); + $status = sprintf( + "[%s] %d items - %s", + $bar, + $this->current, + $this->message + ); + + $this->writer->replaceLine($status); + } + + private function updateDeterminate(): void { + $percentage = $this->current / $this->total; + $filled = (int)round($this->width * $percentage); + $empty = $this->width - $filled; + + $bar = str_repeat('=', $filled); + if ($empty > 0) { + $bar .= '>'; + $bar .= str_repeat(' ', $empty - 1); + } + + $status = sprintf( + "[%s] %d/%d - %s", + $bar, + $this->current, + $this->total, + $this->message + ); + + $this->writer->replaceLine($status); + } +} + +$console_writer = new PhpConsoleWriter(); + +// Parse CLI arguments +function show_error_message_and_die( $error = false ) { + global $console_writer; + + $console_writer->write( "\033[1;31mError:\033[0m " ); + $console_writer->write( $error ); + $console_writer->write( "\n" ); + exit( 1 ); +} + +function help_message_and_die() { + global $console_writer; + + $console_writer->write( "\033[1;32mDescription:\033[0m\n" ); + $console_writer->write( " Imports content into a new WordPress site\n\n" ); + + $console_writer->write( "\033[1;32mUsage:\033[0m\n" ); + $console_writer->write( " php import-markdown-directory.php [options]\n\n" ); + + $console_writer->write( "\033[1;32mModes:\033[0m\n" ); + $console_writer->write( " \033[1;33mlocal-directory\033[0m Import content from a local directory\n" ); + $console_writer->write( " \033[1;33mgit\033[0m Import content from a git repository\n" ); + $console_writer->write( " \033[1;33mwxr\033[0m Import content from a WordPress eXtended RSS file\n" ); + $console_writer->write( " \033[1;33mepub\033[0m Import content from an EPUB ebook\n\n" ); + + $console_writer->write( "\033[1;32mGlobal Options:\033[0m\n" ); + $console_writer->write( " \033[1;34m--source-site-url=\033[0m\n" ); + $console_writer->write( " Base URL of the source content (required)\n\n" ); + + $console_writer->write( " \033[1;34m--additional-site-urls=\033[0m\n" ); + $console_writer->write( " Additional URLs to rewrite links for (multiple allowed)\n\n" ); + + $console_writer->write( " \033[1;34m--media-url=\033[0m\n" ); + $console_writer->write( " URLs to download media files from (multiple allowed)\n\n" ); + + $console_writer->write( " \033[1;34m--output-dir=\033[0m\n" ); + $console_writer->write( " Create the new WordPress site in this directory\n" ); + $console_writer->write( " Must be empty and have write permissions\n\n" ); + + $console_writer->write( "\033[1;32mMode-specific Usage:\033[0m\n" ); + + $console_writer->write( "\033[1;33mgit\033[0m mode:\n" ); + $console_writer->write( " php import-markdown-directory.php git \n" ); + $console_writer->write( " Options:\n" ); + $console_writer->write( " \033[1;34m--branch=\033[0m\n" ); + $console_writer->write( " Git branch to import from (required)\n" ); + $console_writer->write( " \033[1;34m--path-in-repo=\033[0m\n" ); + $console_writer->write( " Subdirectory in repository to import from\n\n" ); + + $console_writer->write( "\033[1;33mlocal-directory\033[0m mode:\n" ); + $console_writer->write( " php import-markdown-directory.php local-directory \n" ); + $console_writer->write( " Imports content from local \n\n" ); + + $console_writer->write( "\033[1;33mwxr\033[0m mode:\n" ); + $console_writer->write( " php import-markdown-directory.php wxr \n" ); + $console_writer->write( " Imports content from a WordPress eXtended RSS file\n\n" ); + + $console_writer->write( "\033[1;33mepub\033[0m mode:\n" ); + $console_writer->write( " php import-markdown-directory.php epub \n" ); + $console_writer->write( " Imports content from an EPUB ebook\n\n" ); + + die(); +} + +// Define the option definitions as documented above +$optionDefs = [ + // General options + 'mode' => [ 'm', true, null, 'Import mode (git|local-directory|wxr|epub) (required)' ], + 'output-dir' => [ 'o', true, null, 'Directory to create the new WordPress site in (required)' ], + 'source-site-url' => [ 's', true, null, 'Base URL of the source content (required for most modes)' ], + 'additional-site-urls' => [ 'a', true, null, 'Additional URLs to rewrite links for (can be specified multiple times)' ], + 'media-url' => [ null, true, null, 'URLs to download media files from (can be specified multiple times)' ], + + // git mode + 'branch' => [ 'b', true, null, 'Git branch to import from (required for git mode)' ], + 'path-in-repo' => [ 'p', true, null, 'Subdirectory in repository to import from (optional for git mode)' ], + + // Help + 'help' => [ 'h', false, false, 'Show help' ], +]; + +// Parse CLI arguments and options +try { + list( $positionals, $options ) = CLI::parseCommandArgsAndOptions( array_slice( $_SERVER['argv'], 1 ), $optionDefs ); +} catch ( InvalidArgumentException $e ) { + show_error_message_and_die( $e->getMessage() ); +} + +if ( $options['help'] ?? false ) { + help_message_and_die(); +} + +define( 'NEW_SITE_CONTENT_ROOT', get_site_url() ); +$console_writer->write( 'Target site URL: ' . NEW_SITE_CONTENT_ROOT . "\n" ); + + +// Map positional arguments to their meaning based on mode +// (first positional is always the mode if not given as --mode) +if ( empty( $options['mode'] ) && !empty( $positionals ) ) { + $options['mode'] = array_shift( $positionals ); +} + +// For each mode, map the next positional(s) to the correct option +switch ( $options['mode'] ?? null ) { + case 'git': + // git + if ( !isset( $options['data_url'] ) && !empty( $positionals ) ) { + $options['data_url'] = array_shift( $positionals ); + } + break; + case 'local-directory': + // local-directory + if ( !isset( $options['data_url'] ) && !empty( $positionals ) ) { + $options['data_url'] = array_shift( $positionals ); + } + break; + case 'wxr': + case 'epub': + // wxr + // epub + if ( !isset( $options['data_url'] ) && !empty( $positionals ) ) { + $options['data_url'] = array_shift( $positionals ); + } + break; +} + +// Support multiple --additional-site-urls and --media-url +foreach ( [ 'additional-site-urls', 'media-url' ] as $multiOpt ) { + if ( isset( $options[ $multiOpt ] ) && !is_array( $options[ $multiOpt ] ) ) { + $options[ $multiOpt ] = [ $options[ $multiOpt ] ]; + } + // Scan $positionals for repeated --foo=bar style (if CLI parser doesn't already do this) +} + +// For compatibility with legacy code below, map to $args +$args = $options; + +$chrooted_fs = null; +$source_site_url = null; +if ( in_array( $args['mode'], array( 'local-directory', 'git' ) ) ) { + // Validate required arguments + if ( ! isset( $args['source-site-url'] ) ) { + show_error_message_and_die( 'The --source-site-url argument is required.' ); + } + $index_file_pattern = '#(?:index|readme)\.(?:md|html|xhtml)$#i'; + $import_path_prefix = '/imported-content'; + $source_site_url = $args['source-site-url']; + + if ( $args['mode'] === 'local-directory' ) { + if ( ! isset( $args['data_url'] ) ) { + show_error_message_and_die( 'The "local-directory" positional argument is required.' ); + } + + $chrooted_fs = LocalFilesystem::create( $args['data_url'] ); + + $args['source-site-url'] = 'file:///'; + } elseif ( $args['mode'] === 'git' ) { + if ( ! isset( $args['data_url'] ) ) { + show_error_message_and_die( 'The "repo" argument is required.' ); + } + + $args['repo'] = $args['data_url']; + if ( ! str_ends_with( $args['repo'], '.git' ) ) { + show_error_message_and_die( 'The "repo" argument must end with ".git" when mode is "git".' ); + } + + if ( ! isset( $args['branch'] ) ) { + show_error_message_and_die( 'The "branch" argument is required when mode is "git".' ); + } + + $console_writer->write( "Sparse checkout of the git repository\n" ); + $temp_dir = sys_get_temp_dir() . '/import-static-' . uniqid(); + $cache_fs = LocalFilesystem::create( $temp_dir ); + $docs_repo = new GitRepository( $cache_fs ); + $docs_repo->add_remote( 'origin', $args['repo'] ); + $remote = $docs_repo->get_remote_client( 'origin' ); + $path_in_repo = $args['path-in-repo'] ?? ''; + $branch = $args['branch'] ?? 'trunk'; + $remote->fetch( + $branch, + array( + 'path' => $path_in_repo, + 'shallow' => true, + ) + ); + $docs_repo->set_branch_tip( 'refs/heads/' . $branch, $docs_repo->get_branch_tip( 'refs/remotes/origin/' . $branch ) ); + $docs_repo->checkout( 'refs/heads/' . $branch ); + $git_fs = GitFilesystem::create( $docs_repo ); + $chrooted_fs = new ChrootLayer( $git_fs, $path_in_repo ); + } + $entity_reader_factory = function () use ( $chrooted_fs, $source_site_url, $index_file_pattern ) { + return new FilesystemEntityReader( + $chrooted_fs, + array( + 'index_file_pattern' => $index_file_pattern, + 'filter_pattern' => '#\.(?:md|html|xhtml)$#', + /** + * Use a number so large, there's no chance for wp_table INSERTs + * to interfere with the post IDs generated by the FilesystemEntityReader. + * + * Some inserts are ran even by the importer, e.g. frontloading stubs. + * + * @TODO: Make sure this doesn't automatically bump the AUTOINCREMENT counter in MySQL. + * @TODO: Bump the AUTOINCREMENT counter manually after a finished import. + */ + 'first_post_id' => 10000000, + 'base_url' => $source_site_url, + ) + ); + }; + + /** + * Maps a filesystem path to a WordPress-friendly URL path we can assign + * to the imported page. + * + * Example: "/docs/README.md" -> "/docs/readme" + * + * @param string $path The filesystem path to convert + * @return string The WordPress-friendly URL path + */ + function map_file_path_to_wordpress_url( $path ) { + global $index_file_pattern, $import_path_prefix; + + /** + * Ensure a named top-level parent directory to base the entire + * URL structure on. The goal is to have a consistent way to resolve + * URLs for all the following files: + * + * - README.md + * - chapter-5/README.md + * - chapter-5/section-1.md + * - chapter-5/section-3/readme.md + * + * Without the top-level directory, the best URL we can give the + * /README.md file would be `/readme`. However, the `chapter-5/README.md` + * would get a URL like `/chapter-5` which is inconsistent. However, + * if we transform the path structure as follows, everything becomes + * consistent: + * + * - /imported-content/README.md + * - /imported-content/chapter-5/README.md + * - /imported-content/chapter-5/section-1.md + * - /imported-content/chapter-5/section-3/readme.md + * + * We want to keep all the links working after the import. A single, + * consistent URL mapping strategy makes it much easier. The alternative + * would be to maintain a mapping of parents to paths and use it whenever + * creating pages and rewriting URLs. + * + * This isn't trivial. Having a top-level path prefix is not perfect, + * but it's a sound compromise. + */ + $path = wp_join_unix_paths( $import_path_prefix, $path ); + + if ( 1 === preg_match( $index_file_pattern, $path ) ) { + $path = dirname( $path ); + } + + $extensions = array( '.md', '.html', '.xhtml' ); + foreach ( $extensions as $ext ) { + if ( str_ends_with( $path, $ext ) ) { + $path = substr( $path, 0, -strlen( $ext ) ); + break; + } + } + + return strtolower( $path ); + } + + /** + * Transforms links pointing to imported static files (e.g. ./getting-started.md) + * to the format they will have after being imported into WordPress (e.g. /docs/getting-started). + */ + add_action( + 'data_liberation.stream_importer.postprocess_url', + function ( + $processor, + $context + ) use ( + $chrooted_fs, + /** + * With &, $import_path_prefix reflects the latest value. + * Without &, it's a local copy of the value from the outer scope. + */ + &$import_path_prefix + ) { + /** + * If we didn't rewrite the base URL, the URL points outside + * of the imported root directory. Let's keep it as it is. + */ + if ( ! $context['applied_base_url_mapping'] ) { + return; + } + + $path_original = $processor->get_parsed_url()->pathname; + + /** + * Remove the site path from the URL path and check: + * Is this URL pointing to a file that exists in the imported + * directory? + */ + $base_url_path_prefix = $context['applied_base_url_mapping']['to']->pathname; + $path_relative_to_base = substr( $path_original, strlen( $base_url_path_prefix ) ); + if ( $chrooted_fs->is_file( $path_relative_to_base ) ) { + /** + * Yes! We are linking to an imported page. Let's transform the link + * to a WordPress-friendly URL scheme. + */ + $path_rewritten = map_file_path_to_wordpress_url( $path_relative_to_base ); + $path_rewritten = wp_join_unix_paths( $base_url_path_prefix, $path_rewritten ); + } elseif ( $processor->is_url_absolute() ) { + /** + * No. We are linking to a content page within our site but there is + * no corresponding static file. This happens e.g. in the Gutenberg + * handbook where the markdown files contain absolute URLs to the deployed + * site, e.g.: + * + * Start by ensuring you have Node.js and `npm` installed on your computer. Review + * the [Node.js development environment](https://developer.wordpress.org/block-editor/getting-started/devenv/nodejs-development-environment/) guide if not. + * + * Our best shot is to keep the URL as is, just with the imported + * content root prepended to it. + */ + $path_rewritten = wp_join_unix_paths( $base_url_path_prefix, $import_path_prefix, $path_relative_to_base ); + } else { + /** + * It's a relative URL pointing somewhere within the URL space we're importing + * to, but there is no corresponding static file. This is unexpected. There is + * nothing we can do at this point – let's just keep the URL as it is. + */ + return; + } + $processor->set_url( + $path_rewritten, + WPURL::parse( $path_rewritten, $processor->get_parsed_url() ) + ); + }, + 10, + 3 + ); + + /** + * Assigns post_name to every imported static page. + */ + add_filter( + 'data_liberation.stream_importer.preprocess_entity', + function ( $entity ) use ( &$import_path_prefix, $index_file_pattern ) { + static $preprocessed_an_entity = false; + if ( $entity->get_type() !== 'post' ) { + return $entity; + } + + $data = $entity->get_data(); + + if ( isset( $data['parsed_metadata']['slug'] ) ) { + $data['post_name'] = basename( $data['parsed_metadata']['slug'][0] ); + } elseif ( isset( $data['local_file_path'] ) ) { + /** + * The default import content path is "/imported-content". However, + * maybe we can find a friendlier path prefix based on the post + * title of the top-level index file. + * + * For example, a "Getting Started" guide found at "README.md" + * could be imported to "/getting-started". + */ + if ( ! $preprocessed_an_entity ) { + $preprocessed_an_entity = true; + $dirname = dirname( $data['local_file_path'] ); + $dirname_makes_a_bad_slug = $dirname !== '.' && $dirname === '/'; + $is_index_file = 1 === preg_match( $index_file_pattern, $data['local_file_path'] ); + $post_title_not_derived_from_path = $data['post_title'] !== ImportUtils::slug_to_title( basename( $data['local_file_path'] ) ); + + if ( + $dirname_makes_a_bad_slug && + $is_index_file && + $post_title_not_derived_from_path && + strlen( $data['post_title'] ) > 1 + ) { + $import_path_prefix = wp_import_slugify( $data['post_title'] ); + } + } + + $wordpress_url = map_file_path_to_wordpress_url( $data['local_file_path'] ); + $data['post_name'] = basename( $wordpress_url ); + } else { + return $entity; + } + + $entity->set_data( $data ); + return $entity; + }, + 10, + 2 + ); +} elseif ( $args['mode'] === 'wxr' ) { + if ( ! isset( $args['data_url'] ) ) { + help_message_and_die( 'The "wxr file" argument is required.' ); + } + $entity_reader_factory = function ( $cursor ) use ( $args ) { + return WXREntityReader::create( + uri_to_byte_stream( $args['data_url'] ), + $cursor + ); + }; +} elseif ( $args['mode'] === 'epub' ) { + if ( ! isset( $args['data_url'] ) ) { + help_message_and_die( 'The "epub file" argument is required.' ); + } + $zip_fs = ZipFilesystem::create( + uri_to_byte_stream( $args['data_url'] ) + ); + $entity_reader_factory = function ( $cursor = null ) use ( $zip_fs ) { + return new EPubEntityReader( + $zip_fs, + 1000000 // This is first post ID. We should really also accept a cursor + ); + }; + $reader = $entity_reader_factory(); + $source_site_url = 'file://' . dirname( $reader->get_manifest_path() ); + + // To source the media files from the EPUB bundle: + $chrooted_fs = $zip_fs; + + /** + * Drop .xhtml extension from the links. + */ + add_action( + 'data_liberation.stream_importer.postprocess_url', + function ( $processor ) { + $parsed_url = $processor->get_parsed_url(); + if ( ! str_ends_with( $parsed_url->pathname, '.xhtml' ) ) { + return; + } + $parsed_url->pathname = substr( $parsed_url->pathname, 0, -6 ); + $processor->set_url( + $parsed_url . '', + $parsed_url + ); + } + ); +} else { + help_message_and_die( 'The "mode" argument is required and must be one of: "local-directory", "git", "wxr", or "epub".' ); + exit( 1 ); +} + +function uri_to_byte_stream( $uri ) { + if ( str_starts_with( $uri, 'http://' ) || str_starts_with( $uri, 'https://' ) ) { + $local_path = tempnam( sys_get_temp_dir(), 'wp-remote-file-' ); + file_put_contents( $local_path, file_get_contents( $uri ) ); + $uri = $local_path; + + // @TODO: Use SeekableRequestReadStream here instead of + // pre-downloading the file to disk. + // $client = new Client(); + // $response = $client->fetch($uri); + } + if ( file_exists( $uri ) ) { + return FileReadStream::from_path( $uri ); + } + throw new \Exception( "Unknown resource type: $uri. If that's a local file, \033[1mplease provide an absolute path to the file\033[0m." ); +} + + +/** + * Naive slugification function. + * + * @TODO: Use a more sophisticated one with utf-8 support etc. + */ +function wp_import_slugify( $title ) { + return preg_replace( '/[^a-z0-9]+/i', '-', trim( strtolower( $title ) ) ); +} + +$data_url = $args['data_url']; +$console_writer->write( "Importing static files from $data_url\n" ); + + +try { + // Parse URL mapping arguments + $additional_url_mappings = array(); + foreach ( $args['additional-site-urls'] ?? [] as $url ) { + $additional_url_mappings[] = array( + 'from' => $url, + 'to' => NEW_SITE_CONTENT_ROOT, + ); + } + + $console_writer->write( "Starting the import\n" ); + $importer = StreamImporter::create( + $entity_reader_factory, + array( + 'source_site_url' => $source_site_url, + 'new_site_content_root_url' => NEW_SITE_CONTENT_ROOT, + 'source_media_root_urls' => $args['media-url'] ?? array( $source_site_url ), + 'additional_url_mappings' => $additional_url_mappings, + 'index_batch_size' => 1, + 'attachment_downloader_options' => array( + 'source_from_filesystem' => $chrooted_fs, + ), + ) + ); + + $import_session = ImportSession::create( + array( + 'data_source' => 'local_directory', + // @TODO: the phrase "file_name" doesn't make sense here. We're sourcing + // data from a directory, not a file. This string is used to tell + // the user in the UI what this they're importing in this import + // session. Let's rename it to something more descriptive. + 'file_name' => $args['data_url'], + ) + ); + $retries_iterator = new RetryFrontloadingIterator( $import_session->get_id() ); + $importer->set_frontloading_retries_iterator( $retries_iterator ); + + // @TODO: Prettier progress reporting + $ignored_message_printed = false; + do { + $result = data_liberation_import_step_customized( $import_session, $importer, $console_writer ); + if ( $importer->get_stage() === StreamImporter::STAGE_FINISHED ) { + $console_writer->write( "\n" ); + $console_writer->write( "\033[1;32mImport finished!\033[0m See your imported content at: \n" ); + + // Get the first page with non-empty content. + $posts = get_posts( + array( + 'numberposts' => 10, + 'orderby' => 'ID', + 'order' => 'ASC', + 'post_type' => 'page', + 'post_status' => 'publish', + ) + ); + + $url = NEW_SITE_CONTENT_ROOT; + foreach ( $posts as $post ) { + if ( ! empty( $post->post_content ) ) { + $url = get_permalink( $post ); + break; + } + } + $console_writer->write( "\033[1;36m" . $url . "\033[0m\n" ); + break; + } elseif ( false === $result ) { + if ( $importer->get_stage() === StreamImporter::STAGE_FRONTLOAD_ASSETS ) { + if ( ! $ignored_message_printed ) { + $console_writer->write( "\nSome assets could not be downloaded – they will be ignored so we can continue with the import.\n" ); + $ignored_message_printed = true; + } + // $import_session->mark_frontloading_errors_as_ignored(); + } else { + $console_writer->write( "Import failed, aborting\n" ); + break; + } + } else { + // Twiddle our thumbs, importing in progress... + } + } while ( true ); +} finally { + if ( isset( $cache_fs ) ) { + $cache_fs->rmdir( + '/', + array( + 'recursive' => true, + ) + ); + } +} + +/** + * @TODO: Expose a primitive like the step function below from the + * DataLiberation PHP component. Support all sorts of pause conditions + * such as time limits, retry counts, memory limits, etc. + */ +function data_liberation_import_step_customized( ImportSession $session, StreamImporter $importer, ConsoleWriter $console_writer ) { + $soft_time_limit_seconds = 15; + $hard_time_limit_seconds = 25; + $start_time = microtime( true ); + $fetched_files = 0; + $progress_bar = null; + + while ( true ) { + $time_taken = microtime( true ) - $start_time; + if ( $time_taken >= $soft_time_limit_seconds ) { + if ( $importer->get_stage() === StreamImporter::STAGE_FRONTLOAD_ASSETS ) { + if ( $fetched_files > 0 ) { + return true; + } + } else { + return true; + } + } + if ( $time_taken >= $hard_time_limit_seconds ) { + return true; + } + + if ( true !== $importer->next_step() ) { + $session->set_reentrancy_cursor( $importer->get_reentrancy_cursor() ); + + $should_advance_to_next_stage = null !== $importer->get_next_stage(); + if ( $should_advance_to_next_stage ) { + if ( StreamImporter::STAGE_FRONTLOAD_ASSETS === $importer->get_stage() ) { + $resolved_all_failures = $session->count_unfinished_frontloading_stubs() === 0; + if ( ! $resolved_all_failures ) { + // Uncomment once this script's intent becomes exiting on unresolved frontloading failures. + // if($progress_bar) { + // $progress_bar->finish(); + // } + // return false; + } + } + } + if ( ! $importer->advance_to_next_stage() ) { + if ( $progress_bar ) { + $progress_bar->finish(); + } + return false; + } + $session->set_stage( $importer->get_stage() ); + $session->set_reentrancy_cursor( $importer->get_reentrancy_cursor() ); + $console_writer->clearLine(); + $progress_bar = null; + + continue; + } + + switch ( $importer->get_stage() ) { + case StreamImporter::STAGE_INDEX_ENTITIES: + $entities_counts = $importer->get_indexed_entities_counts(); + $session->create_frontloading_stubs( $importer->get_indexed_assets_urls() ); + $session->bump_total_number_of_entities( $entities_counts ); + if ( ! $progress_bar ) { + $progress_bar = new ProgressBar( $console_writer, null ); + $progress_bar->setMessage( 'Indexing entities' ); + $progress_bar->start(); + } + $progress_bar->setCurrent( array_sum( $session->get_total_number_of_entities() ) ); + break; + + case StreamImporter::STAGE_FRONTLOAD_ASSETS: + $progress = $importer->get_frontloading_progress(); + $session->bump_frontloading_progress( + $progress, + $importer->get_frontloading_events() + ); + + if ( ! $progress_bar ) { + $progress_bar = new ProgressBar( $console_writer, null ); + $progress_bar->setMessage( 'Fetching media files' ); + $progress_bar->start(); + } + $progress_bar->setCurrent( $session->count_unfinished_frontloading_stubs() ); + break; + + case StreamImporter::STAGE_IMPORT_ENTITIES: + $imported_counts = $importer->get_imported_entities_counts(); + + $session->bump_imported_entities_counts( $imported_counts ); + + if ( ! $progress_bar ) { + $progress_bar = new ProgressBar( $console_writer, $session->count_remaining_entities() ); + $progress_bar->setMessage( 'Importing entities' ); + $progress_bar->start(); + } + $progress_bar->setCurrent( $session->count_all_imported_entities() ); + break; + } + + $session->set_reentrancy_cursor( $importer->get_reentrancy_cursor() ); + } + return false; +} diff --git a/components/Blueprints/bin/blueprint.php b/components/Blueprints/bin/blueprint.php index dadde86d..d96fe88d 100644 --- a/components/Blueprints/bin/blueprint.php +++ b/components/Blueprints/bin/blueprint.php @@ -33,6 +33,7 @@ require __DIR__ . '/../../../vendor/autoload.php'; +use WordPress\CLI\CLI; use WordPress\Blueprints\DataReference\AbsoluteLocalPath; use WordPress\Blueprints\DataReference\DataReference; use WordPress\Blueprints\Exception\BlueprintExecutionException; @@ -102,84 +103,6 @@ ], ]; -// ----------------------------------------------------------------------------- -// Custom command‑line parser (POSIX‑ish but without getopt dependency) -// ----------------------------------------------------------------------------- -function parseCommandArgsAndOptions( array $argv, array $optionDefs ): array { - $positionals = []; - $options = []; - $short2long = []; - - // Initialise defaults & maps - foreach ( $optionDefs as $long => $def ) { - [ $short, , $default ] = $def; - $options[ $long ] = $default; - if ( $short ) { - $short2long[ $short ] = $long; - } - } - - $i = 0; // Start from the first command argument - while ( $i < count( $argv ) ) { - $token = $argv[ $i ]; - - // Long option --foo or --foo=bar - if ( preg_match( '/^--([^=]+)(=(.*))?$/', $token, $m ) ) { - $long = $m[1]; - if ( ! isset( $optionDefs[ $long ] ) ) { - throw new InvalidArgumentException( "Unknown option --$long" ); - } - [ $short, $hasVal ] = $optionDefs[ $long ]; - if ( $hasVal ) { - $val = $m[3] ?? ( $argv[ ++ $i ] ?? null ); - if ( $val === null ) { - throw new InvalidArgumentException( "Option --$long requires a value" ); - } - $options[ $long ] = $val; - } else { - $options[ $long ] = true; - } - $i ++; - continue; - } - - // Short option(s): -abc or -e mysql or -e=mysql - if ( preg_match( '/^-([A-Za-z]{1,})(=(.*))?$/', $token, $m ) ) { - $bundle = str_split( $m[1] ); - $inlineVal = $m[3] ?? null; - foreach ( $bundle as $idx => $short ) { - if ( ! isset( $short2long[ $short ] ) ) { - throw new InvalidArgumentException( "Unknown option -$short" ); - } - $long = $short2long[ $short ]; - $hasVal = $optionDefs[ $long ][1]; - if ( $hasVal ) { - if ( $inlineVal !== null && $idx === 0 ) { - $options[ $long ] = $inlineVal; - } else { - $val = ( $idx === count( $bundle ) - 1 ) ? ( $argv[ ++ $i ] ?? null ) : null; - if ( $val === null ) { - throw new InvalidArgumentException( "Option -$short requires a value" ); - } - $options[ $long ] = $val; - } - break; // value‑bearing short stops bundle processing - } else { - $options[ $long ] = true; - } - } - $i ++; - continue; - } - - // Positional argument - $positionals[] = $token; - $i ++; - } - - return [ $positionals, $options ]; -} - // Get the command name from arguments, accounting for aliases function resolveCommand( $commandArg, array $commandConfigurations ): ?string { // Direct command match @@ -509,7 +432,7 @@ function reportProgress( $progress, $caption ) { // Parse command arguments and options $commandArgv = array_slice( $_SERVER['argv'], 2 ); // Skip "php script.php command" - [ $positionalArgs, $options ] = parseCommandArgsAndOptions( $commandArgv, $commandConfigurations[ $command ]['options'] ); + [ $positionalArgs, $options ] = CLI::parseCommandArgsAndOptions( $commandArgv, $commandConfigurations[ $command ]['options'] ); // Dispatch to appropriate command handler switch ( $command ) { diff --git a/components/CLI/CLI.php b/components/CLI/CLI.php new file mode 100644 index 00000000..2e9bee7d --- /dev/null +++ b/components/CLI/CLI.php @@ -0,0 +1,119 @@ + [ 'short', hasValue, defaultValue, description ], + * // ... + * ] + * + * Example: + * $optionDefs = [ + * 'site-url' => [ 'u', true, null, 'Public site URL' ], + * 'site-path' => [ null, true, null, 'Target directory' ], + * 'help' => [ 'h', false, false, 'Show help' ], + * ]; + * $argv = ['--site-url=https://mysite.test', '--site-path', '/var/www', '-h', 'blueprint.json']; + * [$positionals, $options] = CLI::parseCommandArgsAndOptions($argv, $optionDefs); + * // $positionals = ['blueprint.json'] + * // $options = [ + * // 'site-url' => 'https://mysite.test', + * // 'site-path' => '/var/www', + * // 'help' => true, + * // ] + * + * This is used in the Blueprint Runner CLI to parse command-line input, e.g.: + * php blueprint.php exec my-blueprint.json --site-url https://mysite.test --site-path ./mysite --help + * + * @param array $argv The CLI arguments (excluding the script name and command). + * @param array $optionDefs Option definitions as described above. + * @return array [ $positionals, $options ] + * @throws InvalidArgumentException for unknown options or missing required values. + */ + public static function parseCommandArgsAndOptions( array $argv, array $optionDefs ): array { + $positionals = []; + $options = []; + $short2long = []; + + // Initialise defaults & maps + foreach ( $optionDefs as $long => $def ) { + [ $short, , $default ] = $def; + $options[ $long ] = $default; + if ( $short ) { + $short2long[ $short ] = $long; + } + } + + $i = 0; // Start from the first command argument + while ( $i < count( $argv ) ) { + $token = $argv[ $i ]; + + // Long option --foo or --foo=bar + if ( preg_match( '/^--([^=]+)(=(.*))?$/', $token, $m ) ) { + $long = $m[1]; + if ( ! isset( $optionDefs[ $long ] ) ) { + throw new InvalidArgumentException( "Unknown option --$long" ); + } + [ $short, $hasVal ] = $optionDefs[ $long ]; + if ( $hasVal ) { + $val = $m[3] ?? ( $argv[ ++ $i ] ?? null ); + if ( $val === null ) { + throw new InvalidArgumentException( "Option --$long requires a value" ); + } + $options[ $long ] = $val; + } else { + $options[ $long ] = true; + } + $i ++; + continue; + } + + // Short option(s): -abc or -e mysql or -e=mysql + if ( preg_match( '/^-([A-Za-z]{1,})(=(.*))?$/', $token, $m ) ) { + $bundle = str_split( $m[1] ); + $inlineVal = $m[3] ?? null; + foreach ( $bundle as $idx => $short ) { + if ( ! isset( $short2long[ $short ] ) ) { + throw new InvalidArgumentException( "Unknown option -$short" ); + } + $long = $short2long[ $short ]; + $hasVal = $optionDefs[ $long ][1]; + if ( $hasVal ) { + if ( $inlineVal !== null && $idx === 0 ) { + $options[ $long ] = $inlineVal; + } else { + $val = ( $idx === count( $bundle ) - 1 ) ? ( $argv[ ++ $i ] ?? null ) : null; + if ( $val === null ) { + throw new InvalidArgumentException( "Option -$short requires a value" ); + } + $options[ $long ] = $val; + } + break; // value‑bearing short stops bundle processing + } else { + $options[ $long ] = true; + } + } + $i ++; + continue; + } + + // Positional argument + $positionals[] = $token; + $i ++; + } + + return [ $positionals, $options ]; + } +} \ No newline at end of file diff --git a/components/CLI/composer.json b/components/CLI/composer.json new file mode 100644 index 00000000..73dfda50 --- /dev/null +++ b/components/CLI/composer.json @@ -0,0 +1,26 @@ +{ + "name": "wordpress/cli", + "description": "CLI component for WordPress.", + "type": "library", + "authors": [ + { + "name": "Adam Zielinski", + "email": "adam@adamziel.com" + }, + { + "name": "WordPress Team", + "email": "wordpress@wordpress.org" + } + ], + "require": { + "php": ">=7.2" + }, + "autoload": { + "psr-4": { + "WordPress\\CLI\\": "" + }, + "exclude-from-classmap": [ + "/Tests/" + ] + } +} diff --git a/components/DataLiberation/URL/functions.php b/components/DataLiberation/URL/functions.php index 4180ab97..99cbf8cb 100644 --- a/components/DataLiberation/URL/functions.php +++ b/components/DataLiberation/URL/functions.php @@ -6,143 +6,152 @@ use WordPress\DataLiberation\BlockMarkup\BlockMarkupUrlProcessor; /** - * Migrate URLs in post content. See WPRewriteUrlsTests for - * specific examples. TODO: A better description. + * We have a weird composer autoloading issue. Sometimes it requires + * this file twice. And only this file! The function_exists check is + * a quick workaround until we figure out what's going on. * - * Example: - * - * ```php - * php > wp_rewrite_urls([ - * 'block_markup' => '', - * 'url-mapping' => [ - * 'http://legacy-blog.com' => 'https://modern-webstore.org' - * ] - * ]) - * - * ``` - * - * @TODO Use a proper JSON parser and encoder to: - * * Support UTF-16 characters - * * Gracefully handle recoverable encoding issues - * * Avoid changing the whitespace in the same manner as - * we do in WP_HTML_Tag_Processor + * @TODO: Fix this. */ -function wp_rewrite_urls( $options ) { - if ( empty( $options['base_url'] ) ) { - // Use first from-url as base_url if not specified - $from_urls = array_keys( $options['url-mapping'] ); - $options['base_url'] = $from_urls[0]; - } +if ( ! function_exists('\WordPress\DataLiberation\URL\wp_rewrite_urls') ) { + /** + * Migrate URLs in post content. See WPRewriteUrlsTests for + * specific examples. TODO: A better description. + * + * Example: + * + * ```php + * php > wp_rewrite_urls([ + * 'block_markup' => '', + * 'url-mapping' => [ + * 'http://legacy-blog.com' => 'https://modern-webstore.org' + * ] + * ]) + * + * ``` + * + * @TODO Use a proper JSON parser and encoder to: + * * Support UTF-16 characters + * * Gracefully handle recoverable encoding issues + * * Avoid changing the whitespace in the same manner as + * we do in WP_HTML_Tag_Processor + */ + function wp_rewrite_urls( $options ) { + if ( empty( $options['base_url'] ) ) { + // Use first from-url as base_url if not specified + $from_urls = array_keys( $options['url-mapping'] ); + $options['base_url'] = $from_urls[0]; + } - $url_mapping = array(); - foreach ( $options['url-mapping'] as $from_url_string => $to_url_string ) { - $url_mapping[] = array( - 'from_url' => WPURL::parse( $from_url_string ), - 'to_url' => WPURL::parse( $to_url_string ), - ); - } + $url_mapping = array(); + foreach ( $options['url-mapping'] as $from_url_string => $to_url_string ) { + $url_mapping[] = array( + 'from_url' => WPURL::parse( $from_url_string ), + 'to_url' => WPURL::parse( $to_url_string ), + ); + } - $p = new BlockMarkupUrlProcessor( $options['block_markup'], $options['base_url'] ); - while ( $p->next_url() ) { - $parsed_url = $p->get_parsed_url(); - foreach ( $url_mapping as $mapping ) { - if ( is_child_url_of( $parsed_url, $mapping['from_url'] ) ) { - $p->replace_base_url( $mapping['to_url'] ); - break; + $p = new BlockMarkupUrlProcessor( $options['block_markup'], $options['base_url'] ); + while ( $p->next_url() ) { + $parsed_url = $p->get_parsed_url(); + foreach ( $url_mapping as $mapping ) { + if ( is_child_url_of( $parsed_url, $mapping['from_url'] ) ) { + $p->replace_base_url( $mapping['to_url'] ); + break; + } } } + + return $p->get_updated_html(); } - return $p->get_updated_html(); -} + /** + * Check if a given URL matches the current site URL. + * + * @param URL $parent The URL to check. + * @param string $child The current site URL to compare against. + * + * @return bool Whether the URL matches the current site URL. + */ + function is_child_url_of( $child, $parent_url ) { + $parent_url = is_string( $parent_url ) ? WPURL::parse( $parent_url ) : $parent_url; + $child = is_string( $child ) ? WPURL::parse( $child ) : $child; + $child_pathname_no_trailing_slash = rtrim( urldecode( $child->pathname ), '/' ); + + if ( false === $child || false === $parent_url ) { + return false; + } -/** - * Check if a given URL matches the current site URL. - * - * @param URL $parent The URL to check. - * @param string $child The current site URL to compare against. - * - * @return bool Whether the URL matches the current site URL. - */ -function is_child_url_of( $child, $parent_url ) { - $parent_url = is_string( $parent_url ) ? WPURL::parse( $parent_url ) : $parent_url; - $child = is_string( $child ) ? WPURL::parse( $child ) : $child; - $child_pathname_no_trailing_slash = rtrim( urldecode( $child->pathname ), '/' ); + if ( $parent_url->hostname !== $child->hostname ) { + return false; + } - if ( false === $child || false === $parent_url ) { - return false; - } + if ( $parent_url->protocol !== $child->protocol ) { + return false; + } - if ( $parent_url->hostname !== $child->hostname ) { - return false; - } + $parent_pathname = urldecode( $parent_url->pathname ); - if ( $parent_url->protocol !== $child->protocol ) { - return false; + return ( + // Direct match + $parent_pathname === $child_pathname_no_trailing_slash || + $parent_pathname === $child_pathname_no_trailing_slash . '/' || + // Path prefix + strncmp( $child_pathname_no_trailing_slash . '/', $parent_pathname, strlen( $parent_pathname ) ) === 0 + ); } - $parent_pathname = urldecode( $parent_url->pathname ); - - return ( - // Direct match - $parent_pathname === $child_pathname_no_trailing_slash || - $parent_pathname === $child_pathname_no_trailing_slash . '/' || - // Path prefix - strncmp( $child_pathname_no_trailing_slash . '/', $parent_pathname, strlen( $parent_pathname ) ) === 0 - ); -} + /** + * Decodes the first n **encoded bytes** a URL-encoded string. + * + * For example, `urldecode_n( '%22is 6 %3C 6?%22 – asked Achilles', 1 )` returns + * '"is 6 %3C 6?%22 – asked Achilles' because only the first encoded byte is decoded. + * + * @param string $string The string to decode. + * @param int $decode_n The number of bytes to decode in $input + * + * @return string The decoded string. + */ + function urldecode_n( $input, $decode_n ) { + $result = ''; + $at = 0; + while ( true ) { + if ( $at + 3 > strlen( $input ) ) { + break; + } -/** - * Decodes the first n **encoded bytes** a URL-encoded string. - * - * For example, `urldecode_n( '%22is 6 %3C 6?%22 – asked Achilles', 1 )` returns - * '"is 6 %3C 6?%22 – asked Achilles' because only the first encoded byte is decoded. - * - * @param string $string The string to decode. - * @param int $decode_n The number of bytes to decode in $input - * - * @return string The decoded string. - */ -function urldecode_n( $input, $decode_n ) { - $result = ''; - $at = 0; - while ( true ) { - if ( $at + 3 > strlen( $input ) ) { - break; - } + $last_at = $at; + $at += strcspn( $input, '%', $at ); + // Consume bytes except for the percent sign. + $result .= substr( $input, $last_at, $at - $last_at ); - $last_at = $at; - $at += strcspn( $input, '%', $at ); - // Consume bytes except for the percent sign. - $result .= substr( $input, $last_at, $at - $last_at ); + // If we've already decoded the requested number of bytes, stop. + if ( strlen( $result ) >= $decode_n ) { + break; + } - // If we've already decoded the requested number of bytes, stop. - if ( strlen( $result ) >= $decode_n ) { - break; - } + ++ $at; + if ( $at > strlen( $input ) ) { + break; + } - ++ $at; - if ( $at > strlen( $input ) ) { - break; + $decodable_length = strspn( + $input, + '0123456789ABCDEFabcdef', + $at, + 2 + ); + + if ( $decodable_length === 2 ) { + // Decode the hex sequence. + $result .= chr( hexdec( $input[ $at ] . $input[ $at + 1 ] ) ); + $at += 2; + } else { + // Consume the next byte and move on. + $result .= '%'; + } } + $result .= substr( $input, $at ); - $decodable_length = strspn( - $input, - '0123456789ABCDEFabcdef', - $at, - 2 - ); - - if ( $decodable_length === 2 ) { - // Decode the hex sequence. - $result .= chr( hexdec( $input[ $at ] . $input[ $at + 1 ] ) ); - $at += 2; - } else { - // Consume the next byte and move on. - $result .= '%'; - } + return $result; } - $result .= substr( $input, $at ); - - return $result; } diff --git a/composer.json b/composer.json index 4ef1653f..d6a37423 100644 --- a/composer.json +++ b/composer.json @@ -56,6 +56,7 @@ ], "psr-4": { "WordPress\\Blueprints\\": "components/Blueprints/", + "WordPress\\CLI\\": "components/CLI/", "WordPress\\DataLiberation\\": "components/DataLiberation/", "Rowbot\\": "components/DataLiberation/vendor-patched/", "Brick\\": "components/DataLiberation/vendor-patched/", From c134840ae84081dcca1a44e1584661b14ef71e3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Thu, 29 May 2025 14:50:52 +0200 Subject: [PATCH 5/6] Accept URLs found outside of img src as assets --- .../Blueprints/Steps/ImportContentStep.php | 4 +- .../Importer/StreamImporter.php | 32 ++- components/DataLiberation/URL/functions.php | 246 +++++++++--------- 3 files changed, 148 insertions(+), 134 deletions(-) diff --git a/components/Blueprints/Steps/ImportContentStep.php b/components/Blueprints/Steps/ImportContentStep.php index 0378d6bf..9b92fcf1 100644 --- a/components/Blueprints/Steps/ImportContentStep.php +++ b/components/Blueprints/Steps/ImportContentStep.php @@ -85,7 +85,9 @@ private function importWxr( Runtime $runtime, array $content_definition ): void $_SERVER['argv'] = [ 'import-wxr.php', 'wxr', - getenv('WXR_PATH') + getenv('WXR_PATH'), + '--media-url', + 'https://pd.w.org/' ]; ?> PHP diff --git a/components/DataLiberation/Importer/StreamImporter.php b/components/DataLiberation/Importer/StreamImporter.php index a96be2e8..cbf86b62 100644 --- a/components/DataLiberation/Importer/StreamImporter.php +++ b/components/DataLiberation/Importer/StreamImporter.php @@ -452,7 +452,6 @@ protected function index_next_entities() { $entity = $this->get_current_entity(); $type = $entity->get_type(); - var_dump( $type ); // Count entities by type. if ( ! isset( $this->indexed_entities_counts[ $type ] ) ) { @@ -1001,12 +1000,33 @@ protected function rewrite_attachment_url( string $raw_url, $base_url = null ) { * @TODO: What other asset types are there? */ protected function url_processor_matched_asset_url( BlockMarkupUrlProcessor $p ) { - if ( $p->get_tag() !== 'IMG' ) { - return false; - } - if ( $p->get_inspected_attribute_name() !== 'src' ) { - return false; + /** + * Decide whether the URL is an asset URL worth downloading. + * + * All URLs with an image-like extension are treated as images, + * + * For example, the background image in the following block would be accepted: + * + *
+ */ + $path = $p->get_parsed_url()->pathname; + $extension = pathinfo( $path, PATHINFO_EXTENSION ); + if ( ! in_array($extension, array('jpg', 'jpeg', 'png', 'gif', 'webp', 'svg') ) ) { + /** + * Absent an extension, try to guess whether it's a static asset based + * on its location in the document. For now, we only accept images. + */ + if ( $p->get_tag() !== 'IMG' ) { + return false; + } + if ( $p->get_inspected_attribute_name() !== 'src' ) { + return false; + } } + + /** + * Finally, confirm it comes from one of the allowed media root URLs. + */ foreach ( $this->source_media_root_urls as $source_media_root_url ) { if ( is_child_url_of( $p->get_parsed_url(), $source_media_root_url ) ) { return true; diff --git a/components/DataLiberation/URL/functions.php b/components/DataLiberation/URL/functions.php index 99cbf8cb..4cb1c166 100644 --- a/components/DataLiberation/URL/functions.php +++ b/components/DataLiberation/URL/functions.php @@ -5,153 +5,145 @@ use Rowbot\URL\URL; use WordPress\DataLiberation\BlockMarkup\BlockMarkupUrlProcessor; + /** - * We have a weird composer autoloading issue. Sometimes it requires - * this file twice. And only this file! The function_exists check is - * a quick workaround until we figure out what's going on. + * Migrate URLs in post content. See WPRewriteUrlsTests for + * specific examples. TODO: A better description. + * + * Example: + * + * ```php + * php > wp_rewrite_urls([ + * 'block_markup' => '', + * 'url-mapping' => [ + * 'http://legacy-blog.com' => 'https://modern-webstore.org' + * ] + * ]) + * + * ``` * - * @TODO: Fix this. + * @TODO Use a proper JSON parser and encoder to: + * * Support UTF-16 characters + * * Gracefully handle recoverable encoding issues + * * Avoid changing the whitespace in the same manner as + * we do in WP_HTML_Tag_Processor */ -if ( ! function_exists('\WordPress\DataLiberation\URL\wp_rewrite_urls') ) { - /** - * Migrate URLs in post content. See WPRewriteUrlsTests for - * specific examples. TODO: A better description. - * - * Example: - * - * ```php - * php > wp_rewrite_urls([ - * 'block_markup' => '', - * 'url-mapping' => [ - * 'http://legacy-blog.com' => 'https://modern-webstore.org' - * ] - * ]) - * - * ``` - * - * @TODO Use a proper JSON parser and encoder to: - * * Support UTF-16 characters - * * Gracefully handle recoverable encoding issues - * * Avoid changing the whitespace in the same manner as - * we do in WP_HTML_Tag_Processor - */ - function wp_rewrite_urls( $options ) { - if ( empty( $options['base_url'] ) ) { - // Use first from-url as base_url if not specified - $from_urls = array_keys( $options['url-mapping'] ); - $options['base_url'] = $from_urls[0]; - } +function wp_rewrite_urls( $options ) { + if ( empty( $options['base_url'] ) ) { + // Use first from-url as base_url if not specified + $from_urls = array_keys( $options['url-mapping'] ); + $options['base_url'] = $from_urls[0]; + } - $url_mapping = array(); - foreach ( $options['url-mapping'] as $from_url_string => $to_url_string ) { - $url_mapping[] = array( - 'from_url' => WPURL::parse( $from_url_string ), - 'to_url' => WPURL::parse( $to_url_string ), - ); - } + $url_mapping = array(); + foreach ( $options['url-mapping'] as $from_url_string => $to_url_string ) { + $url_mapping[] = array( + 'from_url' => WPURL::parse( $from_url_string ), + 'to_url' => WPURL::parse( $to_url_string ), + ); + } - $p = new BlockMarkupUrlProcessor( $options['block_markup'], $options['base_url'] ); - while ( $p->next_url() ) { - $parsed_url = $p->get_parsed_url(); - foreach ( $url_mapping as $mapping ) { - if ( is_child_url_of( $parsed_url, $mapping['from_url'] ) ) { - $p->replace_base_url( $mapping['to_url'] ); - break; - } + $p = new BlockMarkupUrlProcessor( $options['block_markup'], $options['base_url'] ); + while ( $p->next_url() ) { + $parsed_url = $p->get_parsed_url(); + foreach ( $url_mapping as $mapping ) { + if ( is_child_url_of( $parsed_url, $mapping['from_url'] ) ) { + $p->replace_base_url( $mapping['to_url'] ); + break; } } - - return $p->get_updated_html(); } - /** - * Check if a given URL matches the current site URL. - * - * @param URL $parent The URL to check. - * @param string $child The current site URL to compare against. - * - * @return bool Whether the URL matches the current site URL. - */ - function is_child_url_of( $child, $parent_url ) { - $parent_url = is_string( $parent_url ) ? WPURL::parse( $parent_url ) : $parent_url; - $child = is_string( $child ) ? WPURL::parse( $child ) : $child; - $child_pathname_no_trailing_slash = rtrim( urldecode( $child->pathname ), '/' ); - - if ( false === $child || false === $parent_url ) { - return false; - } + return $p->get_updated_html(); +} - if ( $parent_url->hostname !== $child->hostname ) { - return false; - } +/** + * Check if a given URL matches the current site URL. + * + * @param URL $parent The URL to check. + * @param string $child The current site URL to compare against. + * + * @return bool Whether the URL matches the current site URL. + */ +function is_child_url_of( $child, $parent_url ) { + $parent_url = is_string( $parent_url ) ? WPURL::parse( $parent_url ) : $parent_url; + $child = is_string( $child ) ? WPURL::parse( $child ) : $child; + $child_pathname_no_trailing_slash = rtrim( urldecode( $child->pathname ), '/' ); - if ( $parent_url->protocol !== $child->protocol ) { - return false; - } + if ( false === $child || false === $parent_url ) { + return false; + } - $parent_pathname = urldecode( $parent_url->pathname ); + if ( $parent_url->hostname !== $child->hostname ) { + return false; + } - return ( - // Direct match - $parent_pathname === $child_pathname_no_trailing_slash || - $parent_pathname === $child_pathname_no_trailing_slash . '/' || - // Path prefix - strncmp( $child_pathname_no_trailing_slash . '/', $parent_pathname, strlen( $parent_pathname ) ) === 0 - ); + if ( $parent_url->protocol !== $child->protocol ) { + return false; } - /** - * Decodes the first n **encoded bytes** a URL-encoded string. - * - * For example, `urldecode_n( '%22is 6 %3C 6?%22 – asked Achilles', 1 )` returns - * '"is 6 %3C 6?%22 – asked Achilles' because only the first encoded byte is decoded. - * - * @param string $string The string to decode. - * @param int $decode_n The number of bytes to decode in $input - * - * @return string The decoded string. - */ - function urldecode_n( $input, $decode_n ) { - $result = ''; - $at = 0; - while ( true ) { - if ( $at + 3 > strlen( $input ) ) { - break; - } + $parent_pathname = urldecode( $parent_url->pathname ); - $last_at = $at; - $at += strcspn( $input, '%', $at ); - // Consume bytes except for the percent sign. - $result .= substr( $input, $last_at, $at - $last_at ); + return ( + // Direct match + $parent_pathname === $child_pathname_no_trailing_slash || + $parent_pathname === $child_pathname_no_trailing_slash . '/' || + // Path prefix + strncmp( $child_pathname_no_trailing_slash . '/', $parent_pathname, strlen( $parent_pathname ) ) === 0 + ); +} - // If we've already decoded the requested number of bytes, stop. - if ( strlen( $result ) >= $decode_n ) { - break; - } +/** + * Decodes the first n **encoded bytes** a URL-encoded string. + * + * For example, `urldecode_n( '%22is 6 %3C 6?%22 – asked Achilles', 1 )` returns + * '"is 6 %3C 6?%22 – asked Achilles' because only the first encoded byte is decoded. + * + * @param string $string The string to decode. + * @param int $decode_n The number of bytes to decode in $input + * + * @return string The decoded string. + */ +function urldecode_n( $input, $decode_n ) { + $result = ''; + $at = 0; + while ( true ) { + if ( $at + 3 > strlen( $input ) ) { + break; + } - ++ $at; - if ( $at > strlen( $input ) ) { - break; - } + $last_at = $at; + $at += strcspn( $input, '%', $at ); + // Consume bytes except for the percent sign. + $result .= substr( $input, $last_at, $at - $last_at ); - $decodable_length = strspn( - $input, - '0123456789ABCDEFabcdef', - $at, - 2 - ); - - if ( $decodable_length === 2 ) { - // Decode the hex sequence. - $result .= chr( hexdec( $input[ $at ] . $input[ $at + 1 ] ) ); - $at += 2; - } else { - // Consume the next byte and move on. - $result .= '%'; - } + // If we've already decoded the requested number of bytes, stop. + if ( strlen( $result ) >= $decode_n ) { + break; + } + + ++ $at; + if ( $at > strlen( $input ) ) { + break; } - $result .= substr( $input, $at ); - return $result; + $decodable_length = strspn( + $input, + '0123456789ABCDEFabcdef', + $at, + 2 + ); + + if ( $decodable_length === 2 ) { + // Decode the hex sequence. + $result .= chr( hexdec( $input[ $at ] . $input[ $at + 1 ] ) ); + $at += 2; + } else { + // Consume the next byte and move on. + $result .= '%'; + } } + $result .= substr( $input, $at ); + + return $result; } From 1371578fa2d62bf0e3f7c1400a26f4340dbc4a1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Thu, 29 May 2025 15:08:39 +0200 Subject: [PATCH 6/6] Start migrating posts importing step to data liberation importer --- .../Blueprints/Steps/ImportContentStep.php | 35 ++++++++++++++----- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/components/Blueprints/Steps/ImportContentStep.php b/components/Blueprints/Steps/ImportContentStep.php index 9b92fcf1..e724c629 100644 --- a/components/Blueprints/Steps/ImportContentStep.php +++ b/components/Blueprints/Steps/ImportContentStep.php @@ -44,7 +44,7 @@ public function run( Runtime $runtime, Tracker $progress ) { $this->importWxr( $runtime, $content_definition ); } elseif ( $content_definition['type'] === 'posts' ) { $progress[ $i ]->setCaption( 'Importing a post ' ); - $this->importPosts( $runtime, $content_definition ); + $this->importPosts( $runtime, $content_definition['source'] ); } else { throw new RuntimeException( 'Unsupported content type: ' . $content_definition['type'] ); } @@ -64,7 +64,9 @@ private function importWxr( Runtime $runtime, array $content_definition ): void ) ); } + // @TODO: Pass the data reference to the import script to enable streaming. $wxrPath = $runtime->saveToTemporaryFile( $resolved ); + // @TODO: Make it work when Blueprints are running as phar archive $import_script_path = __DIR__ . '/scripts/import-content.php'; if ( ! file_exists( $import_script_path ) ) { @@ -86,8 +88,9 @@ private function importWxr( Runtime $runtime, array $content_definition ): void 'import-wxr.php', 'wxr', getenv('WXR_PATH'), - '--media-url', - 'https://pd.w.org/' + // @TODO: Support arbitrary media URLs to enable fetching assets during import. + // '--media-url', + // 'https://pd.w.org/' ]; ?> PHP @@ -99,10 +102,14 @@ private function importWxr( Runtime $runtime, array $content_definition ): void ); } - private function importPosts( Runtime $runtime, array $content_definition ): void { - $posts = $content_definition['source']; - if ( ! is_array( $posts ) ) { - throw new RuntimeException( 'Invalid posts data.' ); + private function importPosts( Runtime $runtime, $post ): void { + // @TODO: Use the Data Liberation importer here. + $resolved = $runtime->resolve( $post ); + if ( ! $resolved instanceof File ) { + throw new BlueprintExecutionException( sprintf( + 'Imported content reference must be a file, but %s was a Directory.', + $post->get_human_readable_name() + ) ); } $runtime->evalPhpCodeInSubProcess( @@ -110,12 +117,22 @@ private function importPosts( Runtime $runtime, array $content_definition ): voi get_error_message() ); + } } PHP , [ - 'POSTS' => json_encode( $posts ), + 'POSTS' => json_encode( [ + [ + 'post_title' => 'Test Post', + 'post_content' => $resolved->getStream()->consume_all(), + 'post_status' => 'publish', + 'post_type' => 'post', + ], + ] ), ] ); }