From 46e01645c32482d45f19842e9a7b0c632b009831 Mon Sep 17 00:00:00 2001 From: MK Date: Sat, 30 May 2026 20:52:41 +0800 Subject: [PATCH 1/2] fix(install): retry download body stream, verification, and extraction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CI intermittently failed with three different errors that share one root cause: the HTTP retry logic wrapped only the request setup (`send().await?.error_for_status()`), which returns as soon as the response headers arrive. The body streaming to disk, hash verification, and tar/archive extraction all happened *after* the retry closure returned, so a truncated or corrupt download surfaced immediately as `UnexpectedEof` (tar extraction), `HashMismatch` (verification), or a failed node download — none of which triggered a retry. Affected flaky tests: - vite_install: test_detect_package_manager_with_yarn_config_cjs (UnexpectedEof) - vite_install: test_download_success_package_manager_with_sha1_and_sha224 (HashMismatch) - vite_js_runtime: test_execute_node_version (download failed) Fix: - request.rs: `download_file` now retries the request + body stream as a single unit and detects truncation (bytes written != advertised Content-Length). `download_and_extract_tgz_with_hash` retries the full download → verify → extract pipeline on transient content-integrity errors (Io / HashMismatch), resetting the target dir each attempt. Network errors stay inside `download_file`'s retry and the 404 -> PackageManagerVersionNotFound mapping is preserved. - download.rs (node): `download_file` retries request + body stream with the same Content-Length truncation check. - runtime.rs (node): wraps download → verify → extract in a content-integrity retry mirroring the install path, so a complete-but-corrupt node archive is re-downloaded instead of aborting. Adds deterministic httpmock regression tests proving a corrupt archive and a hash mismatch are retried by the full pipeline. --- crates/vite_install/src/request.rs | 162 +++++++++++++++++++++++-- crates/vite_js_runtime/src/download.rs | 128 +++++++++++-------- crates/vite_js_runtime/src/runtime.rs | 65 +++++++--- 3 files changed, 277 insertions(+), 78 deletions(-) diff --git a/crates/vite_install/src/request.rs b/crates/vite_install/src/request.rs index 43549fee52..c7b20c3bb1 100644 --- a/crates/vite_install/src/request.rs +++ b/crates/vite_install/src/request.rs @@ -110,31 +110,65 @@ impl HttpClient { let target_path = target_path.as_ref(); tracing::debug!("Downloading {} to {:?}", url, target_path); - let response = self.get(url).await?; + let client = vite_shared::shared_http_client(); - self.write_response_to_file(response, target_path).await?; + // Make the request *and* the body stream a single retried unit. Doing + // the request inline (instead of calling `self.get`) avoids a double + // retry layer. A truncated download (bytes written != advertised + // Content-Length) returns an error so the retry re-downloads. + (|| async { + let response = client.get(url).send().await?.error_for_status()?; + Self::write_response_to_file(response, target_path).await + }) + .retry( + ExponentialBuilder::default() + .with_jitter() + .with_min_delay(Duration::from_millis(self.min_delay)) + .with_max_times(self.max_times), + ) + .await?; tracing::debug!("Download completed: {:?}", target_path); Ok(()) } - /// Internal helper to write response body to file - async fn write_response_to_file( - &self, - response: Response, - target_path: &Path, - ) -> Result<(), Error> { + /// Internal helper to write response body to file. + /// + /// Captures the advertised `Content-Length` before streaming and verifies + /// the number of bytes written matches it, so a truncated/short read + /// surfaces as an error that the caller's retry can re-download. + async fn write_response_to_file(response: Response, target_path: &Path) -> Result<(), Error> { + let content_length = response.content_length(); + // Create the target file let mut file = fs::File::create(target_path).await?; // Stream the response body to the file + let mut bytes_written: u64 = 0; let mut stream = response.bytes_stream(); while let Some(chunk_result) = stream.next().await { let chunk = chunk_result?; + bytes_written += chunk.len() as u64; file.write_all(&chunk).await?; } file.flush().await?; + + // Detect truncation: if a Content-Length was advertised and the bytes + // written don't match, the download is incomplete — error out so the + // retry re-downloads from scratch. + if let Some(expected_len) = content_length + && bytes_written != expected_len + { + return Err(Error::Io(std::io::Error::new( + std::io::ErrorKind::UnexpectedEof, + vite_str::format!( + "incomplete download: expected {expected_len} bytes, got {bytes_written}" + ) + .to_string(), + ))); + } + Ok(()) } } @@ -177,10 +211,39 @@ pub async fn download_and_extract_tgz_with_hash( expected_hash ); - // Create target directory - fs::create_dir_all(&target_dir).await?; + // Wrap the download → verify → extract sequence in a content-integrity + // retry. Network errors are already retried inside `download_file`, so the + // predicate below only retries IO / hash-mismatch / extraction failures + // (the symptoms of a transient corrupt or truncated download). A 404 maps + // to a `Reqwest` error that must propagate unchanged so the caller in + // `package_manager.rs` can turn it into `PackageManagerVersionNotFound`. + (|| async { download_and_extract_tgz_with_hash_once(url, &target_dir, expected_hash).await }) + .retry( + ExponentialBuilder::default() + .with_jitter() + .with_min_delay(Duration::from_millis(200)) + .with_max_times(3), + ) + .when(is_retryable_extract_error) + .await +} + +/// A single download → verify → extract attempt. +/// +/// Starts from clean state by removing and recreating `target_dir`, so a +/// partially-extracted or corrupt prior attempt cannot interfere with a retry. +async fn download_and_extract_tgz_with_hash_once( + url: &str, + target_dir: &Path, + expected_hash: Option<&str>, +) -> Result<(), Error> { + // Reset target directory so a partial prior attempt can't interfere. + if fs::try_exists(target_dir).await.unwrap_or(false) { + fs::remove_dir_all(target_dir).await?; + } + fs::create_dir_all(target_dir).await?; - // Download the tgz file with retry logic using HttpClient + // Download the tgz file. `download_file` retries the network layer itself. let tgz_file = target_dir.join("package.tgz"); let client = HttpClient::new(); client.download_file(url, &tgz_file).await?; @@ -192,7 +255,7 @@ pub async fn download_and_extract_tgz_with_hash( // Extract the tgz file to the target directory let tgz_file_for_extract = tgz_file.clone(); - let target_dir_for_extract = target_dir.clone(); + let target_dir_for_extract = target_dir.to_path_buf(); tokio::task::spawn_blocking(move || { extract_tgz(&tgz_file_for_extract, &target_dir_for_extract) }) @@ -204,6 +267,22 @@ pub async fn download_and_extract_tgz_with_hash( Ok(()) } +/// Predicate for the content-integrity retry in +/// [`download_and_extract_tgz_with_hash`]. +/// +/// Retries only transient content-integrity / IO failures that a fresh +/// re-download can fix. Everything else falls through to `false`: +/// - `Reqwest` network errors are deliberately excluded — `download_file` +/// already retries the request + body stream, and not re-retrying them here +/// keeps the caller's 404 → `PackageManagerVersionNotFound` mapping intact. +/// - Permanent config errors (bad hash format, unsupported algorithm, +/// version-not-found) can never succeed on retry. +/// - A `JoinError` (a `spawn_blocking` panic during extraction) is a bug, not a +/// transient fault, so it fails fast rather than re-downloading. +fn is_retryable_extract_error(err: &Error) -> bool { + matches!(err, Error::Io(_) | Error::IoWithPath { .. } | Error::HashMismatch { .. }) +} + /// Computes the hash of the given content using the specified digest algorithm. /// /// # Type Parameters @@ -540,6 +619,65 @@ mod tests { // TempDir automatically cleans up when it goes out of scope } + /// Regression test for flaky package-manager / node downloads. + /// + /// The retry logic used to wrap only the HTTP request setup + /// (`send().await?.error_for_status()`), which returns as soon as the + /// response *headers* arrive. The body stream and tar extraction happened + /// *after* the retry closure returned, so a corrupt or truncated download + /// surfaced immediately as an `UnexpectedEof` extraction error without ever + /// being retried. A corrupt archive served with a 200 status must instead + /// be retried by the full download+extract pipeline. + #[tokio::test] + async fn test_download_and_extract_retries_on_corrupt_archive() { + let server = MockServer::start(); + let mock = server.mock(|when, then| { + when.method(GET).path("/corrupt.tgz"); + then.status(200) + .header("content-type", "application/octet-stream") + .body("this is not a valid gzip archive"); + }); + + let temp_dir = TempDir::new().unwrap(); + let target_dir = temp_dir.path().join("extracted"); + let url = format!("{}/corrupt.tgz", server.base_url()); + + let result = download_and_extract_tgz_with_hash(&url, &target_dir, None).await; + assert!(result.is_err(), "corrupt archive should fail to extract: {result:?}"); + assert!( + mock.hits() > 1, + "a corrupt download must be retried by the full pipeline, but it was only attempted {} time(s)", + mock.hits() + ); + } + + /// A complete-but-wrong download (hash mismatch) must also be retried, + /// since a transient corrupt/truncated download is the most likely cause of + /// an integrity mismatch for an immutable npm tarball. + #[tokio::test] + async fn test_download_and_extract_retries_on_hash_mismatch() { + let server = MockServer::start(); + let mock_tgz = create_mock_package_tgz(); + let mock = server.mock(|when, then| { + when.method(GET).path("/mismatch.tgz"); + then.status(200).header("content-type", "application/octet-stream").body(mock_tgz); + }); + + let temp_dir = TempDir::new().unwrap(); + let target_dir = temp_dir.path().join("extracted"); + let url = format!("{}/mismatch.tgz", server.base_url()); + let wrong_hash = "sha512.0000000000000000000000000000000000000000000000000000000000000000\ + 0000000000000000000000000000000000000000000000000000000000000000"; + + let result = download_and_extract_tgz_with_hash(&url, &target_dir, Some(wrong_hash)).await; + assert!(result.is_err(), "hash mismatch should fail: {result:?}"); + assert!( + mock.hits() > 1, + "a hash mismatch must be retried, but it was only attempted {} time(s)", + mock.hits() + ); + } + #[tokio::test] #[ignore] // Flaky on musl/Alpine — temp file race condition async fn test_verify_file_hash_sha1() { diff --git a/crates/vite_js_runtime/src/download.rs b/crates/vite_js_runtime/src/download.rs index 72c9d9fcdb..22eaafc84d 100644 --- a/crates/vite_js_runtime/src/download.rs +++ b/crates/vite_js_runtime/src/download.rs @@ -41,73 +41,101 @@ pub async fn download_file( tracing::debug!("Downloading {url} to {target_path:?}"); - let response = (|| async { client.get(url).send().await?.error_for_status() }) - .retry( - ExponentialBuilder::default() - .with_jitter() - .with_min_delay(Duration::from_millis(500)) - .with_max_times(3), - ) - .await - .map_err(|e| Error::DownloadFailed { - url: url.into(), - reason: vite_shared::format_error_chain(&e).into(), - })?; - - // Get Content-Length for progress bar - let total_size = response.content_length(); - - // Create progress bar (only in TTY and not in CI) + // Create progress bar (only in TTY and not in CI). Built once and reused + // across retry attempts; its position is reset at the start of every + // attempt so a retried download doesn't double-count bytes. let is_ci = vite_shared::EnvConfig::get().is_ci; let progress = if std::io::stderr().is_terminal() && !is_ci { - let pb = if let Some(size) = total_size { - let pb = ProgressBar::new(size); - pb.set_style( - ProgressStyle::default_bar() - .template( - "{msg}\n{spinner:.green} [{elapsed_precise}] [{bar:40.blue/white}] \ - {bytes}/{total_bytes} ({bytes_per_sec}, {eta})", - ) - .expect("valid progress bar template") - .progress_chars("#>-"), - ); - pb - } else { - let pb = ProgressBar::new_spinner(); - pb.set_style( - ProgressStyle::default_spinner() - .template( - "{msg}\n{spinner:.green} [{elapsed_precise}] {bytes} ({bytes_per_sec})", - ) - .expect("valid spinner template"), - ); - pb.enable_steady_tick(Duration::from_millis(100)); - pb - }; + let pb = ProgressBar::new_spinner(); + pb.set_style( + ProgressStyle::default_spinner() + .template("{msg}\n{spinner:.green} [{elapsed_precise}] {bytes} ({bytes_per_sec})") + .expect("valid spinner template"), + ); + pb.enable_steady_tick(Duration::from_millis(100)); pb.set_message(message.to_string()); Some(pb) } else { None }; - // Stream to file with progress updates - let mut file = fs::File::create(target_path).await?; - let mut stream = response.bytes_stream(); + // Make the request *and* the body stream a single retried unit, so a + // truncated download (bytes written != advertised Content-Length) triggers + // a re-download instead of surfacing as a corrupt archive later. + let result = (|| async { + let response = client.get(url).send().await?.error_for_status()?; + + // Advertised length, used both for the progress bar and the + // truncation check below. + let total_size = response.content_length(); - while let Some(chunk_result) = stream.next().await { - let chunk = chunk_result?; if let Some(ref pb) = progress { - pb.inc(chunk.len() as u64); + // Reset for this attempt. + pb.set_position(0); + if let Some(size) = total_size { + pb.set_length(size); + pb.set_style( + ProgressStyle::default_bar() + .template( + "{msg}\n{spinner:.green} [{elapsed_precise}] [{bar:40.blue/white}] \ + {bytes}/{total_bytes} ({bytes_per_sec}, {eta})", + ) + .expect("valid progress bar template") + .progress_chars("#>-"), + ); + } } - file.write_all(&chunk).await?; - } - file.flush().await?; + // Stream to file with progress updates. + let mut file = fs::File::create(target_path).await?; + let mut stream = response.bytes_stream(); + let mut bytes_written: u64 = 0; + + while let Some(chunk_result) = stream.next().await { + let chunk = chunk_result?; + bytes_written += chunk.len() as u64; + if let Some(ref pb) = progress { + pb.inc(chunk.len() as u64); + } + file.write_all(&chunk).await?; + } + + file.flush().await?; + + // Detect truncation: a short read against an advertised Content-Length + // is an incomplete download — error out so the retry re-downloads. + if let Some(expected_len) = total_size + && bytes_written != expected_len + { + return Err(Error::Io(std::io::Error::new( + std::io::ErrorKind::UnexpectedEof, + vite_str::format!( + "incomplete download: expected {expected_len} bytes, got {bytes_written}" + ) + .to_string(), + ))); + } + + Ok(()) + }) + .retry( + ExponentialBuilder::default() + .with_jitter() + .with_min_delay(Duration::from_millis(500)) + .with_max_times(3), + ) + .await + .map_err(|e| Error::DownloadFailed { + url: url.into(), + reason: vite_shared::format_error_chain(&e).into(), + }); if let Some(pb) = progress { pb.finish_and_clear(); } + result?; + tracing::debug!("Download completed: {target_path:?}"); Ok(()) diff --git a/crates/vite_js_runtime/src/runtime.rs b/crates/vite_js_runtime/src/runtime.rs index 1c0dbf03b9..53043c3b01 100644 --- a/crates/vite_js_runtime/src/runtime.rs +++ b/crates/vite_js_runtime/src/runtime.rs @@ -1,3 +1,6 @@ +use std::time::Duration; + +use backon::{ExponentialBuilder, Retryable}; use node_semver::{Range, Version}; use tempfile::TempDir; use vite_path::{AbsolutePath, AbsolutePathBuf}; @@ -172,31 +175,49 @@ pub async fn download_runtime_with_provider( let temp_path = AbsolutePathBuf::new(temp_dir.path().to_path_buf()).unwrap(); let archive_path = temp_path.join(&download_info.archive_filename); - // Verify hash if verification method is provided - match &download_info.hash_verification { + // Resolve the expected hash once. The SHASUMS fetch/parse is deterministic + // (parse failures are permanent and the fetch already retries internally), + // so it stays outside the content-integrity retry below. + let expected_hash: Option = match &download_info.hash_verification { HashVerification::ShasumsFile { url } => { let shasums_content = download_text(url).await?; - let expected_hash = - provider.parse_shasums(&shasums_content, &download_info.archive_filename)?; + Some(provider.parse_shasums(&shasums_content, &download_info.archive_filename)?) + } + HashVerification::None => None, + }; - // Download archive - download_file(&download_info.archive_url, &archive_path, &download_message).await?; + let extracted_path = temp_path.join(&download_info.extracted_dir_name); - // Verify hash - verify_file_hash(&archive_path, &expected_hash, &download_info.archive_filename) - .await?; + // Retry the download → verify → extract pipeline on transient corruption. + // `download_file` already retries the network layer (and surfaces an + // exhausted download as `DownloadFailed`, which `is_retryable_runtime_error` + // excludes), so this layer only re-downloads when a complete-but-corrupt + // archive fails hash verification or extraction. Each attempt starts from a + // clean extraction target so a partial prior attempt can't interfere. + (|| async { + download_file(&download_info.archive_url, &archive_path, &download_message).await?; + + if let Some(expected_hash) = &expected_hash { + verify_file_hash(&archive_path, expected_hash, &download_info.archive_filename).await?; } - HashVerification::None => { - // Download archive without verification - download_file(&download_info.archive_url, &archive_path, &download_message).await?; + + if tokio::fs::try_exists(&extracted_path).await.unwrap_or(false) { + tokio::fs::remove_dir_all(&extracted_path).await?; } - } + extract_archive(&archive_path, &temp_path, download_info.archive_format).await?; - // Extract archive - extract_archive(&archive_path, &temp_path, download_info.archive_format).await?; + Ok::<(), Error>(()) + }) + .retry( + ExponentialBuilder::default() + .with_jitter() + .with_min_delay(Duration::from_millis(200)) + .with_max_times(3), + ) + .when(is_retryable_runtime_error) + .await?; // Move extracted directory to cache location - let extracted_path = temp_path.join(&download_info.extracted_dir_name); move_to_cache(&extracted_path, &install_dir, version).await?; tracing::info!("{} {version} installed at {install_dir:?}", provider.name()); @@ -210,6 +231,18 @@ pub async fn download_runtime_with_provider( }) } +/// Predicate for the content-integrity retry in +/// [`download_runtime_with_provider`]. +/// +/// Retries only a complete-but-corrupt download: a hash mismatch, or an +/// extraction failure (`ExtractionFailed` from zip, `Io`/`UnexpectedEof` from +/// tar.gz). Everything else fails fast — `DownloadFailed`/`Reqwest` are already +/// retried inside `download_file`, and permanent errors (version-not-found, +/// SHASUMS parse failures, `JoinError` panics) can never succeed on retry. +fn is_retryable_runtime_error(err: &Error) -> bool { + matches!(err, Error::HashMismatch { .. } | Error::ExtractionFailed { .. } | Error::Io(_)) +} + /// Represents the source from which a Node.js version was read. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum VersionSource { From 0183c8f286732819b51d3bc0678078276724319e Mon Sep 17 00:00:00 2001 From: MK Date: Sat, 30 May 2026 21:41:37 +0800 Subject: [PATCH 2/2] fix(install): collapse download retry to a single layer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cursor Bugbot flagged retry multiplication: a truncated download surfaced as `Error::Io` from `download_file`'s own retry, and the outer `download_and_extract_tgz_with_hash` retry also matched `Error::Io`, so a persistent failure could trigger up to 4×4 = 16 download attempts. Collapse to a single retry layer: `_once` now downloads with a no-retry client (`HttpClient::with_config(0, 0)`), and the pipeline retry owns all resilience. The predicate (`is_retryable_download_error`) retries transient network/HTTP errors (except 404), truncated downloads, corrupt-archive extraction, and hash mismatches; a 404 and permanent config errors fail fast and propagate unchanged so the caller still maps 404 → PackageManagerVersionNotFound (now without wasted retries). --- crates/vite_install/src/request.rs | 55 +++++++++++++++++------------- 1 file changed, 32 insertions(+), 23 deletions(-) diff --git a/crates/vite_install/src/request.rs b/crates/vite_install/src/request.rs index c7b20c3bb1..e60e55961f 100644 --- a/crates/vite_install/src/request.rs +++ b/crates/vite_install/src/request.rs @@ -3,7 +3,7 @@ use std::{path::Path, time::Duration}; use backon::{ExponentialBuilder, Retryable}; use flate2::read::GzDecoder; use futures_util::stream::StreamExt; -use reqwest::Response; +use reqwest::{Response, StatusCode}; use serde::de::DeserializeOwned; use sha1::Sha1; use sha2::{Digest, Sha224, Sha256, Sha512}; @@ -211,20 +211,21 @@ pub async fn download_and_extract_tgz_with_hash( expected_hash ); - // Wrap the download → verify → extract sequence in a content-integrity - // retry. Network errors are already retried inside `download_file`, so the - // predicate below only retries IO / hash-mismatch / extraction failures - // (the symptoms of a transient corrupt or truncated download). A 404 maps - // to a `Reqwest` error that must propagate unchanged so the caller in - // `package_manager.rs` can turn it into `PackageManagerVersionNotFound`. + // This is the single retry layer for the whole download → verify → extract + // pipeline: each attempt does one download (no nested retry — see + // `_once`), so a transient network error, a truncated download, a corrupt + // archive, or a hash mismatch all re-download from scratch exactly once per + // attempt. A 404 (version not found) and permanent config errors fail fast + // and propagate unchanged so the caller in `package_manager.rs` can map a + // 404 to `PackageManagerVersionNotFound`. (|| async { download_and_extract_tgz_with_hash_once(url, &target_dir, expected_hash).await }) .retry( ExponentialBuilder::default() .with_jitter() - .with_min_delay(Duration::from_millis(200)) + .with_min_delay(Duration::from_millis(500)) .with_max_times(3), ) - .when(is_retryable_extract_error) + .when(is_retryable_download_error) .await } @@ -243,9 +244,12 @@ async fn download_and_extract_tgz_with_hash_once( } fs::create_dir_all(target_dir).await?; - // Download the tgz file. `download_file` retries the network layer itself. + // Download the tgz file with a single attempt (no internal retry). The + // pipeline retry in `download_and_extract_tgz_with_hash` owns all retries; + // letting `download_file` retry here too would nest two retry layers and + // multiply attempts (up to N×M downloads) for a persistent failure. let tgz_file = target_dir.join("package.tgz"); - let client = HttpClient::new(); + let client = HttpClient::with_config(0, 0); client.download_file(url, &tgz_file).await?; // Verify hash if provided @@ -267,20 +271,25 @@ async fn download_and_extract_tgz_with_hash_once( Ok(()) } -/// Predicate for the content-integrity retry in +/// Predicate for the single download → verify → extract retry in /// [`download_and_extract_tgz_with_hash`]. /// -/// Retries only transient content-integrity / IO failures that a fresh -/// re-download can fix. Everything else falls through to `false`: -/// - `Reqwest` network errors are deliberately excluded — `download_file` -/// already retries the request + body stream, and not re-retrying them here -/// keeps the caller's 404 → `PackageManagerVersionNotFound` mapping intact. -/// - Permanent config errors (bad hash format, unsupported algorithm, -/// version-not-found) can never succeed on retry. -/// - A `JoinError` (a `spawn_blocking` panic during extraction) is a bug, not a -/// transient fault, so it fails fast rather than re-downloading. -fn is_retryable_extract_error(err: &Error) -> bool { - matches!(err, Error::Io(_) | Error::IoWithPath { .. } | Error::HashMismatch { .. }) +/// Retries transient failures that a fresh re-download can fix; everything else +/// fails fast: +/// - `Reqwest`: retry transient network/HTTP errors, but NOT a 404 — that means +/// the version doesn't exist, so it must propagate unchanged for the caller's +/// `PackageManagerVersionNotFound` mapping (and there's no point retrying it). +/// - `Io` / `IoWithPath`: truncated download or corrupt-archive extraction +/// (e.g. tar `UnexpectedEof`). +/// - `HashMismatch`: integrity failure, most likely a corrupt/truncated download. +/// - Everything else (bad hash format, unsupported algorithm, a `JoinError` +/// from a `spawn_blocking` panic, …) is permanent and is not retried. +fn is_retryable_download_error(err: &Error) -> bool { + match err { + Error::Reqwest(e) => e.status() != Some(StatusCode::NOT_FOUND), + Error::Io(_) | Error::IoWithPath { .. } | Error::HashMismatch { .. } => true, + _ => false, + } } /// Computes the hash of the given content using the specified digest algorithm.