diff --git a/src/pull_module/libgit2.cpp b/src/pull_module/libgit2.cpp index 87c263e79a..5a0fc1b079 100644 --- a/src/pull_module/libgit2.cpp +++ b/src/pull_module/libgit2.cpp @@ -115,6 +115,33 @@ void removeLfsWipMarker(const std::string& repositoryPath) { } } +/** + * Attempts to remove stale lfs_error.txt file from a repository. + * If the file cannot be checked or deleted, logs a warning but does not fail the operation. + * This function clears error artifacts that would otherwise interfere with resume/clone detection. + * + * @param repositoryPath Path to the git repository root directory. + * @note Logs warnings if file operations fail; operation continues on error. + */ +static void clearStaleErrorFile(const std::string& repositoryPath) { + std::error_code ec; + const fs::path errorFilePath = fs::path(repositoryPath) / "lfs_error.txt"; + + bool fileExists = fs::exists(errorFilePath, ec); + if (ec) { + SPDLOG_WARN("Failed to check if lfs_error.txt exists at {}: {}. May interfere with resume/clone detection.", + errorFilePath.string(), ec.message()); + return; + } + + if (fileExists) { + if (!fs::remove(errorFilePath, ec)) { + SPDLOG_WARN("Failed to remove stale lfs_error.txt at {}: {}. May interfere with resume/clone detection.", + errorFilePath.string(), ec.message()); + } + } +} + } // namespace libgit2 namespace { @@ -1193,6 +1220,9 @@ Status resumeExistingRepository(git_repository* repo, SPDLOG_WARN("Failed to create .lfswip marker before pull resume at {}", libgit2::getLfsWipMarkerPath(downloadPath).string()); } + // Clear any stale lfs_error.txt from previous failed attempts to avoid false error reports + libgit2::clearStaleErrorFile(downloadPath); + printResumeCandidates(candidates); for (const auto& p : candidates.lfsMatches) { @@ -1391,6 +1421,9 @@ Status handleFreshClone(const std::string& downloadPath, const std::function& removeReadonlyFn) { SPDLOG_DEBUG("Downloading to path: {}", downloadPath); + // Clear any stale lfs_error.txt from previous failed attempts to avoid false error reports + libgit2::clearStaleErrorFile(downloadPath); + git_clone_options cloneOptions = GIT_CLONE_OPTIONS_INIT; configureCloneOptions(cloneOptions, useProxy, proxyUrl); diff --git a/src/test/pull_hf_model_test.cpp b/src/test/pull_hf_model_test.cpp index 25ada82a97..31657aaa8e 100644 --- a/src/test/pull_hf_model_test.cpp +++ b/src/test/pull_hf_model_test.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -61,29 +62,439 @@ #include "environment.hpp" +namespace { + +constexpr std::uintmax_t MODEL_FULL_SIZE_BYTES = 52417240; +constexpr std::uintmax_t MODEL_HALF_SIZE_BYTES = 26208620; +constexpr std::uintmax_t MODEL2_FULL_SIZE_BYTES = 339125; +constexpr std::uintmax_t MODEL3_FULL_SIZE_BYTES = 500292; +constexpr std::uintmax_t MODEL4_FULL_SIZE_BYTES = 499723; +constexpr std::uintmax_t DRAFT_MODEL_TOKENIZER_SIZE_BYTES = 2022483; + +struct ProbeErrorTracker { + std::size_t consecutiveErrors = 0; + std::error_code lastError; + std::string lastOperation; + std::string lastPath; + std::string persistentFailureReason; +}; + +bool trackProbeError(ProbeErrorTracker& tracker, const std::error_code& probeError, std::string_view operation, const std::string& path, + std::string_view context, std::size_t maxConsecutiveErrors) { + if (!probeError || probeError == std::errc::no_such_file_or_directory) { + tracker.consecutiveErrors = 0; + tracker.lastError.clear(); + tracker.lastOperation.clear(); + tracker.lastPath.clear(); + return false; + } + + if ((probeError == tracker.lastError) && (tracker.lastOperation == operation) && (tracker.lastPath == path)) { + ++tracker.consecutiveErrors; + } else { + tracker.consecutiveErrors = 1; + tracker.lastError = probeError; + tracker.lastOperation = operation; + tracker.lastPath = path; + } + + if ((tracker.consecutiveErrors == 1) || (tracker.consecutiveErrors % 5 == 0)) { + SPDLOG_WARN("{}: non-benign filesystem probe error repeated {} time(s): op={} path={} ec={} ({})", + context, tracker.consecutiveErrors, operation, path, probeError.value(), probeError.message()); + } + if (tracker.consecutiveErrors >= maxConsecutiveErrors) { + tracker.persistentFailureReason = "Persistent non-benign filesystem probe error while waiting for in-progress pull: op=" + std::string(operation) + + " path=" + path + + " ec=" + std::to_string(probeError.value()) + + " (" + probeError.message() + ") repeated " + + std::to_string(tracker.consecutiveErrors) + " time(s)"; + return true; + } + return false; +} + +::testing::AssertionResult waitForResumableInProgressPull( + const std::string& modelBasePath, + const std::string& modelPath, + const std::string& downloadPath, + std::uintmax_t expectedFullModelSize, + std::string_view context, + int timeoutSeconds, + int pollIntervalMs, + int postObservationDelayMs, + std::size_t maxConsecutiveProbeErrors) { + ProbeErrorTracker probeErrorTracker; + const std::string mainRefPath = ovms::FileSystem::appendSlash(modelBasePath) + ".git/refs/heads/main"; + const auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(timeoutSeconds); + while (std::chrono::steady_clock::now() < deadline) { + std::error_code ec; + const bool hasMainRef = std::filesystem::exists(mainRefPath, ec); + if (trackProbeError(probeErrorTracker, ec, "exists", mainRefPath, context, maxConsecutiveProbeErrors)) { + return ::testing::AssertionFailure() << probeErrorTracker.persistentFailureReason; + } + + ec.clear(); + const bool modelExists = std::filesystem::exists(modelPath, ec); + if (trackProbeError(probeErrorTracker, ec, "exists", modelPath, context, maxConsecutiveProbeErrors)) { + return ::testing::AssertionFailure() << probeErrorTracker.persistentFailureReason; + } + + std::uintmax_t modelSize = 0; + if (modelExists) { + ec.clear(); + modelSize = std::filesystem::file_size(modelPath, ec); + if (ec) { + if (trackProbeError(probeErrorTracker, ec, "file_size", modelPath, context, maxConsecutiveProbeErrors)) { + return ::testing::AssertionFailure() << probeErrorTracker.persistentFailureReason; + } + modelSize = 0; + } + } + + auto lfsCandidates = ovms::libgit2::findLfsLikeFiles(downloadPath, true); + const bool hasLfsArtifacts = !lfsCandidates.empty(); + const bool modelInFlight = modelExists && (modelSize > 0) && (modelSize < expectedFullModelSize); + if (hasMainRef && (hasLfsArtifacts || modelInFlight)) { + std::this_thread::sleep_for(std::chrono::milliseconds(postObservationDelayMs)); + return ::testing::AssertionSuccess(); + } + std::this_thread::sleep_for(std::chrono::milliseconds(pollIntervalMs)); + } + + return ::testing::AssertionFailure() << context << ": did not observe in-progress pull within timeout."; +} + +#ifdef _WIN32 +struct WindowsResumeWorkerConfig { + const char* workerFlagEnv; + const char* modelEnv; + const char* downloadPathEnv; + const char* taskEnv; + const char* gtestFilter; + DWORD processCreationFlags; +}; + +const WindowsResumeWorkerConfig RESUME_TERMINATE_WORKER_CONFIG = { + "OVMS_RESUME_TERMINATE_WORKER", + "OVMS_RESUME_TERMINATE_MODEL", + "OVMS_RESUME_TERMINATE_DOWNLOAD_PATH", + "OVMS_RESUME_TERMINATE_TASK", + "HfPullWindowsWorker.ResumeTerminateChildProcess", + 0}; + +const WindowsResumeWorkerConfig RESUME_CTRLC_WORKER_CONFIG = { + "OVMS_RESUME_CTRLC_WORKER", + "OVMS_RESUME_CTRLC_MODEL", + "OVMS_RESUME_CTRLC_DOWNLOAD_PATH", + "OVMS_RESUME_CTRLC_TASK", + "HfPullWindowsWorker.ResumeCtrlCChildProcess", + CREATE_NEW_PROCESS_GROUP}; + +::testing::AssertionResult setWindowsResumeWorkerEnvironment( + const WindowsResumeWorkerConfig& config, + const std::string& modelName, + const std::string& downloadPath, + const std::string& task) { + if (!SetEnvironmentVariableA(config.workerFlagEnv, "1")) { + return ::testing::AssertionFailure() << "Failed to set env var: " << config.workerFlagEnv; + } + if (!SetEnvironmentVariableA(config.modelEnv, modelName.c_str())) { + return ::testing::AssertionFailure() << "Failed to set env var: " << config.modelEnv; + } + if (!SetEnvironmentVariableA(config.downloadPathEnv, downloadPath.c_str())) { + return ::testing::AssertionFailure() << "Failed to set env var: " << config.downloadPathEnv; + } + if (!SetEnvironmentVariableA(config.taskEnv, task.c_str())) { + return ::testing::AssertionFailure() << "Failed to set env var: " << config.taskEnv; + } + return ::testing::AssertionSuccess(); +} + +::testing::AssertionResult clearWindowsResumeWorkerEnvironment(const WindowsResumeWorkerConfig& config) { + if (!SetEnvironmentVariableA(config.workerFlagEnv, nullptr)) { + return ::testing::AssertionFailure() << "Failed to clear env var: " << config.workerFlagEnv; + } + if (!SetEnvironmentVariableA(config.modelEnv, nullptr)) { + return ::testing::AssertionFailure() << "Failed to clear env var: " << config.modelEnv; + } + if (!SetEnvironmentVariableA(config.downloadPathEnv, nullptr)) { + return ::testing::AssertionFailure() << "Failed to clear env var: " << config.downloadPathEnv; + } + if (!SetEnvironmentVariableA(config.taskEnv, nullptr)) { + return ::testing::AssertionFailure() << "Failed to clear env var: " << config.taskEnv; + } + return ::testing::AssertionSuccess(); +} + +::testing::AssertionResult launchWindowsResumeWorkerProcess(const WindowsResumeWorkerConfig& config, PROCESS_INFORMATION& pi) { + char testExePath[MAX_PATH] = {0}; + const DWORD exePathLen = GetModuleFileNameA(nullptr, testExePath, MAX_PATH); + if ((exePathLen == 0u) || (exePathLen >= static_cast(MAX_PATH))) { + return ::testing::AssertionFailure() << "Failed to resolve current test executable path"; + } + + std::string commandLine = std::string("\"") + testExePath + "\" --gtest_filter=" + config.gtestFilter; + STARTUPINFOA si; + ZeroMemory(&si, sizeof(si)); + si.cb = sizeof(si); + ZeroMemory(&pi, sizeof(pi)); + + if (!CreateProcessA( + nullptr, + commandLine.data(), + nullptr, + nullptr, + TRUE, + config.processCreationFlags, + nullptr, + nullptr, + &si, + &pi)) { + return ::testing::AssertionFailure() << "CreateProcessA failed for filter: " << config.gtestFilter; + } + return ::testing::AssertionSuccess(); +} + +::testing::AssertionResult startWindowsResumeWorker( + const WindowsResumeWorkerConfig& config, + const std::string& modelName, + const std::string& downloadPath, + const std::string& task, + PROCESS_INFORMATION& pi) { + auto envStatus = setWindowsResumeWorkerEnvironment(config, modelName, downloadPath, task); + if (!envStatus) { + return envStatus; + } + + auto launchStatus = launchWindowsResumeWorkerProcess(config, pi); + if (!launchStatus) { + (void)clearWindowsResumeWorkerEnvironment(config); + return launchStatus; + } + return ::testing::AssertionSuccess(); +} + +void closeWindowsWorkerHandles(PROCESS_INFORMATION& pi) { + if (pi.hThread != nullptr) { + CloseHandle(pi.hThread); + pi.hThread = nullptr; + } + if (pi.hProcess != nullptr) { + CloseHandle(pi.hProcess); + pi.hProcess = nullptr; + } +} + +::testing::AssertionResult terminateWindowsWorker(PROCESS_INFORMATION& pi, int timeoutSeconds) { + if (!TerminateProcess(pi.hProcess, 1)) { + return ::testing::AssertionFailure() << "TerminateProcess failed"; + } + if (WaitForSingleObject(pi.hProcess, timeoutSeconds * 1000) != WAIT_OBJECT_0) { + return ::testing::AssertionFailure() << "Timed out waiting for terminated child process"; + } + return ::testing::AssertionSuccess(); +} + +::testing::AssertionResult sendCtrlBreakToWindowsWorker(PROCESS_INFORMATION& pi, int timeoutSeconds) { + if (!GenerateConsoleCtrlEvent(CTRL_BREAK_EVENT, pi.dwProcessId)) { + return ::testing::AssertionFailure() << "GenerateConsoleCtrlEvent(CTRL_BREAK_EVENT) failed"; + } + if (WaitForSingleObject(pi.hProcess, timeoutSeconds * 1000) != WAIT_OBJECT_0) { + return ::testing::AssertionFailure() << "Timed out waiting for CTRL_BREAK child process shutdown"; + } + DWORD childExitCode = 0; + if (!GetExitCodeProcess(pi.hProcess, &childExitCode)) { + return ::testing::AssertionFailure() << "GetExitCodeProcess failed"; + } + if (childExitCode == static_cast(EXIT_SUCCESS)) { + return ::testing::AssertionFailure() + << "Child process exited with EXIT_SUCCESS after CTRL_BREAK, expected interrupted failure"; + } + return ::testing::AssertionSuccess(); +} +#else +::testing::AssertionResult launchPosixResumeWorker( + const std::string& modelName, + const std::string& downloadPath, + const std::string& task, + pid_t& childPid) { + childPid = fork(); + if (childPid == -1) { + return ::testing::AssertionFailure() << "fork() failed"; + } + if (childPid == 0) { + ovms::Server& childServer = ovms::Server::instance(); + childServer.setShutdownRequest(0); + char* argv[] = {(char*)"ovms", + (char*)"--pull", + (char*)"--source_model", + (char*)modelName.c_str(), + (char*)"--model_repository_path", + (char*)downloadPath.c_str(), + (char*)"--task", + (char*)task.c_str()}; + int argc = 8; + int rc = childServer.start(argc, argv); + _exit(rc == EXIT_SUCCESS ? 0 : 1); + } + return ::testing::AssertionSuccess(); +} + +::testing::AssertionResult killPosixWorkerAndExpectSigKill(pid_t childPid) { + if (kill(childPid, SIGKILL) != 0) { + return ::testing::AssertionFailure() << "Failed to send SIGKILL to child process"; + } + + int childStatus = 0; + if (waitpid(childPid, &childStatus, 0) != childPid) { + return ::testing::AssertionFailure() << "waitpid failed for SIGKILL child process"; + } + if (!WIFSIGNALED(childStatus) || (WTERMSIG(childStatus) != SIGKILL)) { + return ::testing::AssertionFailure() << "Child did not terminate due to SIGKILL"; + } + return ::testing::AssertionSuccess(); +} + +::testing::AssertionResult interruptPosixWorkerAndExpectGracefulExit(pid_t childPid) { + if (kill(childPid, SIGINT) != 0) { + return ::testing::AssertionFailure() << "Failed to send SIGINT to child process"; + } + + int childStatus = 0; + if (waitpid(childPid, &childStatus, 0) != childPid) { + return ::testing::AssertionFailure() << "waitpid failed for SIGINT child process"; + } + if (!WIFEXITED(childStatus)) { + return ::testing::AssertionFailure() + << "Child was terminated by signal " << (WIFSIGNALED(childStatus) ? WTERMSIG(childStatus) : 0) + << " instead of graceful exit after SIGINT"; + } + if (WEXITSTATUS(childStatus) == EXIT_SUCCESS) { + return ::testing::AssertionFailure() << "Child exited with EXIT_SUCCESS after SIGINT, expected interrupted failure"; + } + return ::testing::AssertionSuccess(); +} +#endif + +} // namespace + +// RAII helper class for managing log file lifecycle. +// Creates a log file path and automatically removes it on destruction. +class LogFileGuard { +private: + std::string logFilePath; + +public: + explicit LogFileGuard(const std::string& path) : + logFilePath(path) {} + + ~LogFileGuard() { + if (std::filesystem::exists(logFilePath)) { + std::error_code ec; + std::filesystem::remove(logFilePath, ec); + } + } + + bool create() { + std::error_code ec; + std::filesystem::remove(logFilePath, ec); + std::ofstream file(logFilePath); + return file.is_open(); + } + + std::string getContent() const { + std::ifstream file(logFilePath); + if (!file.is_open()) { + return ""; + } + std::stringstream buffer; + buffer << file.rdbuf(); + return buffer.str(); + } + + bool contains(const std::string& text) const { + return getContent().find(text) != std::string::npos; + } + + const std::string& getPath() const { + return logFilePath; + } + + bool exists() const { + return std::filesystem::exists(logFilePath); + } +}; + class HfPull : public TestWithTempDir { protected: + static constexpr const char* MODEL_NAMESPACE = "OpenVINO"; + static constexpr const char* MODEL_ID = "Phi-3-mini-FastDraft-50M-int8-ov"; + static constexpr const char* TASK_NAME = "text_generation"; + + // Timeout (seconds) for detecting in-progress pull in interrupt tests + static constexpr int HF_PULL_DETECT_TIMEOUT_SECONDS = 180; + // Timeout (seconds) for waiting on shutdown request acknowledgement + static constexpr int HF_PULL_SHUTDOWN_TIMEOUT_SECONDS = 120; + // Timeout (seconds) for regular server operations + static constexpr int HF_PULL_SERVER_TIMEOUT_SECONDS = 60; + // Delay (milliseconds) before sending interrupt signal + static constexpr int HF_PULL_INTERRUPT_DELAY_MS = 200; + // Poll interval (milliseconds) when checking for in-progress download + static constexpr int HF_PULL_POLL_INTERVAL_MS = 100; + // Max consecutive non-benign filesystem probe errors before failing diagnostics. + static constexpr int HF_PULL_MAX_CONSECUTIVE_FS_PROBE_ERRORS = 15; + ovms::Server& server = ovms::Server::instance(); std::unique_ptr t; std::string modelName; std::string downloadPath; std::string task; + std::string modelBasePath; + std::string modelPath; + std::string modelPartPath; + std::string model2Path; + std::string model3Path; + std::string model4Path; + std::string tokenizerJsonPath; + std::string graphPath; + std::string gitDirPath; void SetUp() override { TestWithTempDir::SetUp(); - modelName = "OpenVINO/Phi-3-mini-FastDraft-50M-int8-ov"; + modelName = std::string(MODEL_NAMESPACE) + "/" + MODEL_ID; downloadPath = ovms::FileSystem::joinPath({this->directoryPath, "repository"}); - task = "text_generation"; + task = TASK_NAME; + modelBasePath = ovms::FileSystem::joinPath({downloadPath, MODEL_NAMESPACE, MODEL_ID}); + modelPath = ovms::FileSystem::appendSlash(modelBasePath) + "openvino_model.bin"; + modelPartPath = ovms::FileSystem::appendSlash(modelBasePath) + "openvino_model.binlfs_part"; + model2Path = ovms::FileSystem::appendSlash(modelBasePath) + "openvino_detokenizer.bin"; + model3Path = ovms::FileSystem::appendSlash(modelBasePath) + "openvino_tokenizer.bin"; + model4Path = ovms::FileSystem::appendSlash(modelBasePath) + "tokenizer.model"; + tokenizerJsonPath = ovms::FileSystem::appendSlash(modelBasePath) + "tokenizer.json"; + graphPath = ovms::FileSystem::appendSlash(modelBasePath) + "graph.pbtxt"; + gitDirPath = ovms::FileSystem::appendSlash(modelBasePath) + ".git"; } void ServerPullHfModel(std::string& sourceModel, std::string& downloadPath, std::string& task, int expected_code = 0, int timeoutSeconds = 60) { ::SetUpServerForDownload(this->t, this->server, sourceModel, downloadPath, task, expected_code, timeoutSeconds); } + // Variant that captures output to a log file for assertions + void ServerPullHfModel(std::string& sourceModel, std::string& downloadPath, std::string& task, LogFileGuard& logFile, int expected_code = 0, int timeoutSeconds = 60) { + ASSERT_TRUE(logFile.create()) << "Failed to create log file at: " << logFile.getPath(); + ::SetUpServerForDownload(this->t, this->server, sourceModel, downloadPath, task, logFile.getPath(), expected_code, timeoutSeconds); + } + void ServerPullHfModelWithDraft(std::string& draftModel, std::string& sourceModel, std::string& downloadPath, std::string& task, int expected_code = 0, int timeoutSeconds = 60) { ::SetUpServerForDownloadWithDraft(this->t, this->server, draftModel, sourceModel, downloadPath, task, expected_code, timeoutSeconds); } + // Variant with draft model that captures output to a log file for assertions + void ServerPullHfModelWithDraft(std::string& draftModel, std::string& sourceModel, std::string& downloadPath, std::string& task, LogFileGuard& logFile, int expected_code = 0, int timeoutSeconds = 60) { + ASSERT_TRUE(logFile.create()) << "Failed to create log file at: " << logFile.getPath(); + ::SetUpServerForDownloadWithDraft(this->t, this->server, draftModel, sourceModel, downloadPath, task, logFile.getPath(), expected_code, timeoutSeconds); + } + void SetUpServerForDownloadAndStart(std::string& sourceModel, std::string& downloadPath, std::string& task, int timeoutSeconds = 60) { ::SetUpServerForDownloadAndStart(this->t, this->server, sourceModel, downloadPath, task, timeoutSeconds); } @@ -104,12 +515,11 @@ class HfPullCache : public HfPull { static std::once_flag cacheInitFlag; static std::unique_ptr cacheDir; static std::string cachedRepositoryPath; - - static constexpr const char* MODEL_NAME = "OpenVINO/Phi-3-mini-FastDraft-50M-int8-ov"; - static constexpr const char* TASK_NAME = "text_generation"; + std::string testRepositoryPath; void SetUp() override { HfPull::SetUp(); + testRepositoryPath = downloadPath; initializeSharedCache(); seedCurrentTestRepository(); } @@ -117,23 +527,22 @@ class HfPullCache : public HfPull { void initializeSharedCache() { std::call_once(cacheInitFlag, [this]() { cacheDir = std::make_unique(); - std::string modelName = MODEL_NAME; - std::string downloadPath = ovms::FileSystem::joinPath({cacheDir->dir.string(), "repository"}); - std::string task = TASK_NAME; + std::string sourceModelName = this->modelName; + std::string cacheDownloadPath = ovms::FileSystem::joinPath({cacheDir->dir.string(), "repository"}); + std::string pullTask = this->task; - this->ServerPullHfModel(modelName, downloadPath, task); + this->ServerPullHfModel(sourceModelName, cacheDownloadPath, pullTask); server.setShutdownRequest(1); if (t) t->join(); server.setShutdownRequest(0); - cachedRepositoryPath = downloadPath; + cachedRepositoryPath = cacheDownloadPath; ASSERT_TRUE(std::filesystem::exists(cachedRepositoryPath)); }); } void seedCurrentTestRepository() { - const std::string testRepositoryPath = ovms::FileSystem::joinPath({this->directoryPath, "repository"}); std::error_code ec; std::filesystem::copy(cachedRepositoryPath, testRepositoryPath, @@ -237,13 +646,13 @@ TEST_F(HfPull, Download) { ASSERT_EQ(std::filesystem::exists(modelPath), true) << modelPath; ASSERT_EQ(std::filesystem::exists(graphPath), true) << graphPath; - ASSERT_EQ(std::filesystem::file_size(modelPath), 52417240); + ASSERT_EQ(std::filesystem::file_size(modelPath), MODEL_FULL_SIZE_BYTES); std::string graphContents = GetFileContents(graphPath); ASSERT_EQ(expectedGraphContents, removeGeneratedGraphHeaders(graphContents)) << graphContents; } -// Truncate the file to half its size, keeping the first half. +// Truncate the file to half its size (MODEL_FULL_SIZE_BYTES / 2), keeping the first half. bool removeSecondHalf(const std::string& fileStr) { const std::filesystem::path& file(fileStr); std::error_code ec; @@ -272,7 +681,8 @@ bool createGitLfsPointerFile(const std::string& path) { file << "version https://git-lfs.github.com/spec/v1\n" "oid sha256:cecf0224201415144c00cf3a6cf3350306f9c78888d631eb590939a63722fefa\n" - "size 52417240\n"; + "size " + << MODEL_FULL_SIZE_BYTES << "\n"; return true; } @@ -343,10 +753,6 @@ class TestHfDownloader : public ovms::HfDownloader { }; TEST_F(HfPullCache, RePull) { - std::string modelName = "OpenVINO/Phi-3-mini-FastDraft-50M-int8-ov"; - std::string downloadPath = ovms::FileSystem::joinPath({this->directoryPath, "repository"}); - std::string task = "text_generation"; - testing::internal::CaptureStdout(); this->ServerPullHfModel(modelName, downloadPath, task); std::string out = testing::internal::GetCapturedStdout(); @@ -361,18 +767,9 @@ TEST_F(HfPullCache, RePull) { } TEST_F(HfPullCache, Resume) { - std::string modelName = "OpenVINO/Phi-3-mini-FastDraft-50M-int8-ov"; - std::string downloadPath = ovms::FileSystem::joinPath({this->directoryPath, "repository"}); - std::string task = "text_generation"; - - std::string ovModelName = "openvino_model.bin"; - std::string basePath = ovms::FileSystem::joinPath({this->directoryPath, "repository", "OpenVINO", "Phi-3-mini-FastDraft-50M-int8-ov"}); - std::string modelPath = ovms::FileSystem::appendSlash(basePath) + ovModelName; - std::string graphPath = ovms::FileSystem::appendSlash(basePath) + "graph.pbtxt"; - ASSERT_EQ(std::filesystem::exists(modelPath), true) << modelPath; ASSERT_EQ(std::filesystem::exists(graphPath), true) << graphPath; - ASSERT_EQ(std::filesystem::file_size(modelPath), 52417240); + ASSERT_EQ(std::filesystem::file_size(modelPath), MODEL_FULL_SIZE_BYTES); std::string graphContents = GetFileContents(graphPath); ASSERT_EQ(expectedGraphContents, removeGeneratedGraphHeaders(graphContents)) << graphContents; @@ -393,22 +790,20 @@ TEST_F(HfPullCache, Resume) { ASSERT_EQ(ec, std::errc()); // Prepare a git repository with a lfs_part file and lfs pointer file to simulate partial download error of a big model ASSERT_EQ(removeSecondHalf(modelPath), true); - ASSERT_EQ(std::filesystem::file_size(modelPath), 26208620); + ASSERT_EQ(std::filesystem::file_size(modelPath), MODEL_HALF_SIZE_BYTES); - std::string ovModelPartLfsName = "openvino_model.binlfs_part"; - std::string ovModelPartLfsPath = ovms::FileSystem::appendSlash(basePath) + ovModelPartLfsName; - std::filesystem::rename(modelPath, ovModelPartLfsPath, ec); + std::filesystem::rename(modelPath, modelPartPath, ec); ASSERT_EQ(ec, std::errc()); - ASSERT_EQ(std::filesystem::file_size(ovModelPartLfsPath), 26208620); + ASSERT_EQ(std::filesystem::file_size(modelPartPath), MODEL_HALF_SIZE_BYTES); ASSERT_EQ(createGitLfsPointerFile(modelPath), true); // Call ovms pull to resume the file this->ServerPullHfModel(modelName, downloadPath, task); - ASSERT_EQ(std::filesystem::exists(ovModelPartLfsPath), false) << modelPath; + ASSERT_EQ(std::filesystem::exists(modelPartPath), false) << modelPath; ASSERT_EQ(std::filesystem::exists(modelPath), true) << modelPath; ASSERT_EQ(std::filesystem::exists(graphPath), true) << graphPath; - ASSERT_EQ(std::filesystem::file_size(modelPath), 52417240); + ASSERT_EQ(std::filesystem::file_size(modelPath), MODEL_FULL_SIZE_BYTES); graphContents = GetFileContents(graphPath); ASSERT_EQ(expectedGraphContents, removeGeneratedGraphHeaders(graphContents)) << graphContents; @@ -420,16 +815,6 @@ TEST_F(HfPullCache, Resume) { // ResumeAfterShutdownRequestAndRerun TEST_F(HfPull, ResumeShutdown) { -#ifdef _WIN32 - SKIP_AND_EXIT_IF_NOT_RUNNING_UNSTABLE(); -#endif - std::string basePath = ovms::FileSystem::joinPath({this->directoryPath, "repository", "OpenVINO", "Phi-3-mini-FastDraft-50M-int8-ov"}); - std::string modelPath = ovms::FileSystem::appendSlash(basePath) + "openvino_model.bin"; - std::string model2Path = ovms::FileSystem::appendSlash(basePath) + "openvino_detokenizer.bin"; - std::string model3Path = ovms::FileSystem::appendSlash(basePath) + "openvino_tokenizer.bin"; - std::string model4Path = ovms::FileSystem::appendSlash(basePath) + "tokenizer.model"; - std::string graphPath = ovms::FileSystem::appendSlash(basePath) + "graph.pbtxt"; - server.setShutdownRequest(0); int firstRunCode = EXIT_SUCCESS; char* argv[] = {(char*)"ovms", @@ -445,21 +830,38 @@ TEST_F(HfPull, ResumeShutdown) { firstRunCode = this->server.start(argc, argv); })); - // Wait until the large LFS file (openvino_model.bin) starts downloading before - // sending the shutdown request. A fixed sleep is unreliable: on a fast CPU/network - // machine the download may finish before the sleep expires, leaving no partial files - // and causing the EXPECT_FALSE(remainingPointers.empty()) assertion to fail. - // We poll until the model file exists as an LFS pointer or a partial download - // is in progress (lfs_part file present), then interrupt immediately. + // Wait until pull is clearly in-progress before sending shutdown request. + // A fixed sleep is unreliable: on fast CPU/network setups download may finish + // before sleep expires, leaving no resumable artifacts. + bool observedPartialDownload = false; + ProbeErrorTracker probeErrorTracker; { - const auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(60); + const auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(HF_PULL_SERVER_TIMEOUT_SECONDS); while (std::chrono::steady_clock::now() < deadline) { - auto lfsCandidates = ovms::libgit2::findLfsLikeFiles(downloadPath, true); - const bool hasModelPointer = std::find_if(lfsCandidates.begin(), lfsCandidates.end(), - [](const std::filesystem::path& p) { return p.filename() == "openvino_model.bin"; }) != lfsCandidates.end(); - const std::string partPath = ovms::FileSystem::appendSlash(basePath) + "openvino_model.binlfs_part"; - const bool hasPartFile = std::filesystem::exists(partPath); - if (hasModelPointer || hasPartFile) { + std::error_code ec; + const bool hasPartFile = std::filesystem::exists(modelPartPath, ec); + if (trackProbeError(probeErrorTracker, ec, "exists", modelPartPath, "ResumeShutdown", HF_PULL_MAX_CONSECUTIVE_FS_PROBE_ERRORS)) { + break; + } + ec.clear(); + const bool modelExists = std::filesystem::exists(modelPath, ec); + if (trackProbeError(probeErrorTracker, ec, "exists", modelPath, "ResumeShutdown", HF_PULL_MAX_CONSECUTIVE_FS_PROBE_ERRORS)) { + break; + } + std::uintmax_t modelSize = 0; + if (modelExists) { + ec.clear(); + modelSize = std::filesystem::file_size(modelPath, ec); + if (ec) { + if (trackProbeError(probeErrorTracker, ec, "file_size", modelPath, "ResumeShutdown", HF_PULL_MAX_CONSECUTIVE_FS_PROBE_ERRORS)) { + break; + } + modelSize = 0; + } + } + const bool modelInFlight = modelExists && (modelSize > 0) && (modelSize < MODEL_FULL_SIZE_BYTES); + if (hasPartFile || modelInFlight) { + observedPartialDownload = true; std::this_thread::sleep_for(std::chrono::milliseconds(200)); break; } @@ -472,6 +874,14 @@ TEST_F(HfPull, ResumeShutdown) { t->join(); server.setShutdownRequest(0); + if (!probeErrorTracker.persistentFailureReason.empty()) { + FAIL() << probeErrorTracker.persistentFailureReason; + } + + if (!observedPartialDownload) { + FAIL() << "Did not observe in-progress pull before timeout; cannot validate resume-after-shutdown path."; + } + EXPECT_NE(firstRunCode, EXIT_SUCCESS); auto remainingPointers = ovms::libgit2::findLfsLikeFiles(downloadPath, true); EXPECT_FALSE(remainingPointers.empty()); @@ -480,22 +890,17 @@ TEST_F(HfPull, ResumeShutdown) { ASSERT_EQ(std::filesystem::exists(modelPath), true) << modelPath; ASSERT_EQ(std::filesystem::exists(graphPath), true) << graphPath; - ASSERT_EQ(std::filesystem::file_size(modelPath), 52417240); - ASSERT_EQ(std::filesystem::file_size(model2Path), 339125); - ASSERT_EQ(std::filesystem::file_size(model3Path), 500292); - ASSERT_EQ(std::filesystem::file_size(model4Path), 499723); + ASSERT_EQ(std::filesystem::file_size(modelPath), MODEL_FULL_SIZE_BYTES); + ASSERT_EQ(std::filesystem::file_size(model2Path), MODEL2_FULL_SIZE_BYTES); + ASSERT_EQ(std::filesystem::file_size(model3Path), MODEL3_FULL_SIZE_BYTES); + ASSERT_EQ(std::filesystem::file_size(model4Path), MODEL4_FULL_SIZE_BYTES); } // PullAfterUserRemovedTrackedFileDoesNotRestoreIt TEST_F(HfPullCache, UserRemoved) { - std::string modelName = "OpenVINO/Phi-3-mini-FastDraft-50M-int8-ov"; - std::string downloadPath = ovms::FileSystem::joinPath({this->directoryPath, "repository"}); - std::string task = "text_generation"; - - std::string basePath = ovms::FileSystem::joinPath({this->directoryPath, "repository", "OpenVINO", "Phi-3-mini-FastDraft-50M-int8-ov"}); - std::string preservedFilePath = ovms::FileSystem::appendSlash(basePath) + "openvino_model.bin"; - std::string removedFilePath = ovms::FileSystem::appendSlash(basePath) + "openvino_tokenizer.bin"; - std::string removedFilePath2 = ovms::FileSystem::appendSlash(basePath) + "tokenizer.json"; + std::string preservedFilePath = modelPath; + std::string removedFilePath = model3Path; + std::string removedFilePath2 = tokenizerJsonPath; ASSERT_TRUE(std::filesystem::exists(preservedFilePath)); ASSERT_TRUE(std::filesystem::exists(removedFilePath)); @@ -542,13 +947,8 @@ TEST_F(HfPullCache, UserRemoved) { // PullAfterUserEditedTrackedFileDoesNotOverwriteIt TEST_F(HfPullCache, UserEdited) { - std::string modelName = "OpenVINO/Phi-3-mini-FastDraft-50M-int8-ov"; - std::string downloadPath = ovms::FileSystem::joinPath({this->directoryPath, "repository"}); - std::string task = "text_generation"; - - std::string basePath = ovms::FileSystem::joinPath({this->directoryPath, "repository", "OpenVINO", "Phi-3-mini-FastDraft-50M-int8-ov"}); - std::string editedFilePath = ovms::FileSystem::appendSlash(basePath) + "openvino_tokenizer.bin"; - std::string editedFilePath2 = ovms::FileSystem::appendSlash(basePath) + "tokenizer.json"; + std::string editedFilePath = model3Path; + std::string editedFilePath2 = tokenizerJsonPath; ASSERT_TRUE(std::filesystem::exists(editedFilePath)); ASSERT_TRUE(std::filesystem::exists(editedFilePath2)); @@ -590,7 +990,7 @@ TEST_F(HfPullCache, UserEdited) { secondRunCode = this->server.start(argc, argv); })); - EnsureServerModelDownloadFinishedWithTimeout(server, 120); + EnsureServerModelDownloadFinishedWithTimeout(server, HF_PULL_SHUTDOWN_TIMEOUT_SECONDS); EXPECT_EQ(secondRunCode, EXIT_SUCCESS); EXPECT_EQ(std::filesystem::file_size(editedFilePath), editedSize); @@ -621,14 +1021,9 @@ TEST_F(HfPullCache, UserEdited) { // HfPullCache repository. The model artifacts (openvino_model.bin, tokenizer*, // graph.pbtxt, etc.) remain in place exactly as a user-supplied directory would. TEST_F(HfPullCache, PullNonGit) { - std::string modelName = "OpenVINO/Phi-3-mini-FastDraft-50M-int8-ov"; - std::string downloadPath = ovms::FileSystem::joinPath({this->directoryPath, "repository"}); - std::string task = "text_generation"; - - std::string basePath = ovms::FileSystem::joinPath({downloadPath, "OpenVINO", "Phi-3-mini-FastDraft-50M-int8-ov"}); - std::string modelPath = ovms::FileSystem::appendSlash(basePath) + "openvino_model.bin"; - std::string tokenizerPath = ovms::FileSystem::appendSlash(basePath) + "openvino_tokenizer.bin"; - std::string gitDir = ovms::FileSystem::appendSlash(basePath) + ".git"; + std::string basePath = modelBasePath; + std::string tokenizerPath = model3Path; + std::string gitDir = gitDirPath; ASSERT_TRUE(std::filesystem::exists(modelPath)); ASSERT_TRUE(std::filesystem::exists(tokenizerPath)); @@ -654,6 +1049,11 @@ TEST_F(HfPullCache, PullNonGit) { ASSERT_EQ(ec, std::errc()) << "Failed to remove .git from cached repository: " << ec.message(); ASSERT_FALSE(std::filesystem::exists(gitDir)); +#ifdef _WIN32 + // Logger configuration is process-global and initialized earlier in the test + // binary, so a per-test --log_path does not reliably capture this warning. + this->ServerPullHfModel(modelName, downloadPath, task); +#else testing::internal::CaptureStdout(); this->ServerPullHfModel(modelName, downloadPath, task); std::string out = testing::internal::GetCapturedStdout(); @@ -663,6 +1063,7 @@ TEST_F(HfPullCache, PullNonGit) { << out; EXPECT_EQ(out.find("LFS file(s) to resume"), std::string::npos); EXPECT_EQ(out.find(" Resuming "), std::string::npos); +#endif // No work-in-progress marker should be created next to the model directory. const std::string lfsWipPath = ovms::libgit2::getLfsWipMarkerPath(basePath).string(); @@ -693,14 +1094,9 @@ TEST_F(HfPullCache, PullNonGit) { // the repository and the real error is propagated via mapRepositoryOpenFailureToStatus() // so the operator can act (re-clone, fix permissions, --overwrite_models, ...). TEST_F(HfPullCache, PullEmptyGitDir) { - std::string modelName = "OpenVINO/Phi-3-mini-FastDraft-50M-int8-ov"; - std::string downloadPath = ovms::FileSystem::joinPath({this->directoryPath, "repository"}); - std::string task = "text_generation"; - - std::string basePath = ovms::FileSystem::joinPath({downloadPath, "OpenVINO", "Phi-3-mini-FastDraft-50M-int8-ov"}); - std::string modelPath = ovms::FileSystem::appendSlash(basePath) + "openvino_model.bin"; - std::string tokenizerPath = ovms::FileSystem::appendSlash(basePath) + "openvino_tokenizer.bin"; - std::string gitDir = ovms::FileSystem::appendSlash(basePath) + ".git"; + std::string basePath = modelBasePath; + std::string tokenizerPath = model3Path; + std::string gitDir = gitDirPath; ASSERT_TRUE(std::filesystem::exists(modelPath)); ASSERT_TRUE(std::filesystem::exists(tokenizerPath)); @@ -758,13 +1154,17 @@ TEST_F(HfPullCache, PullEmptyGitDir) { #ifdef _WIN32 // Helper test used only as a child process launched by HfPull.ResumeTerminate. TEST(HfPullWindowsWorker, ResumeTerminateChildProcess) { + // Enables this helper body only for the parent-launched worker process. const char* runWorker = std::getenv("OVMS_RESUME_TERMINATE_WORKER"); if ((runWorker == nullptr) || (std::string(runWorker) != "1")) { GTEST_SKIP() << "Helper test - runs only when launched by HfPull.ResumeTerminate."; } + // Model identifier passed from parent to child worker. const char* modelNameEnv = std::getenv("OVMS_RESUME_TERMINATE_MODEL"); + // Target repository path passed from parent to child worker. const char* downloadPathEnv = std::getenv("OVMS_RESUME_TERMINATE_DOWNLOAD_PATH"); + // Pull task passed from parent to child worker. const char* taskEnv = std::getenv("OVMS_RESUME_TERMINATE_TASK"); ASSERT_NE(modelNameEnv, nullptr); ASSERT_NE(downloadPathEnv, nullptr); @@ -793,130 +1193,55 @@ TEST(HfPullWindowsWorker, ResumeTerminateChildProcess) { // ResumeAfterForcedTerminationAndRerun TEST_F(HfPull, ResumeTerminate) { #ifdef _WIN32 - SKIP_AND_EXIT_IF_NOT_RUNNING_UNSTABLE(); -#endif - std::string basePath = ovms::FileSystem::joinPath({this->directoryPath, "repository", "OpenVINO", "Phi-3-mini-FastDraft-50M-int8-ov"}); - std::string modelPath = ovms::FileSystem::appendSlash(basePath) + "openvino_model.bin"; - std::string model2Path = ovms::FileSystem::appendSlash(basePath) + "openvino_detokenizer.bin"; - std::string model3Path = ovms::FileSystem::appendSlash(basePath) + "openvino_tokenizer.bin"; - std::string model4Path = ovms::FileSystem::appendSlash(basePath) + "tokenizer.model"; - std::string graphPath = ovms::FileSystem::appendSlash(basePath) + "graph.pbtxt"; - -#ifdef _WIN32 - char testExePath[MAX_PATH] = {0}; - DWORD exePathLen = GetModuleFileNameA(nullptr, testExePath, MAX_PATH); - ASSERT_GT(exePathLen, 0u); - ASSERT_LT(exePathLen, static_cast(MAX_PATH)); - - ASSERT_TRUE(SetEnvironmentVariableA("OVMS_RESUME_TERMINATE_WORKER", "1")); - ASSERT_TRUE(SetEnvironmentVariableA("OVMS_RESUME_TERMINATE_MODEL", modelName.c_str())); - ASSERT_TRUE(SetEnvironmentVariableA("OVMS_RESUME_TERMINATE_DOWNLOAD_PATH", downloadPath.c_str())); - ASSERT_TRUE(SetEnvironmentVariableA("OVMS_RESUME_TERMINATE_TASK", task.c_str())); - - std::string commandLine = std::string("\"") + testExePath + - "\" --gtest_filter=HfPullWindowsWorker.ResumeTerminateChildProcess"; - STARTUPINFOA si; PROCESS_INFORMATION pi; - ZeroMemory(&si, sizeof(si)); - si.cb = sizeof(si); - ZeroMemory(&pi, sizeof(pi)); - - ASSERT_TRUE(CreateProcessA( - nullptr, - commandLine.data(), - nullptr, - nullptr, - TRUE, - 0, - nullptr, - nullptr, - &si, - &pi)); + ASSERT_TRUE(startWindowsResumeWorker(RESUME_TERMINATE_WORKER_CONFIG, modelName, downloadPath, task, pi)); #else - pid_t childPid = fork(); - ASSERT_NE(childPid, -1); - - if (childPid == 0) { - ovms::Server& childServer = ovms::Server::instance(); - childServer.setShutdownRequest(0); - char* argv[] = {(char*)"ovms", - (char*)"--pull", - (char*)"--source_model", - (char*)modelName.c_str(), - (char*)"--model_repository_path", - (char*)downloadPath.c_str(), - (char*)"--task", - (char*)task.c_str()}; - int argc = 8; - - std::thread childThread([&argc, &argv, &childServer]() { - (void)childServer.start(argc, argv); - }); - childThread.detach(); - - auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(30); - while (std::chrono::steady_clock::now() < deadline) { - auto lfsCandidates = ovms::libgit2::findLfsLikeFiles(downloadPath, true); - auto hasOpenvinoModelPointer = std::find_if(lfsCandidates.begin(), lfsCandidates.end(), - [](const std::filesystem::path& p) { return p.filename() == "openvino_model.bin"; }) != lfsCandidates.end(); - if (std::filesystem::exists(modelPath) || hasOpenvinoModelPointer) { - std::this_thread::sleep_for(std::chrono::milliseconds(200)); - break; - } - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - } - - kill(getpid(), SIGKILL); - _exit(1); - } + pid_t childPid = -1; + ASSERT_TRUE(launchPosixResumeWorker(modelName, downloadPath, task, childPid)); #endif - bool observedPartialDownload = false; - auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(60); - while (std::chrono::steady_clock::now() < deadline) { - auto lfsCandidates = ovms::libgit2::findLfsLikeFiles(downloadPath, true); - auto hasOpenvinoModelPointer = std::find_if(lfsCandidates.begin(), lfsCandidates.end(), - [](const std::filesystem::path& p) { return p.filename() == "openvino_model.bin"; }) != lfsCandidates.end(); - const std::string partPath = ovms::FileSystem::appendSlash(basePath) + "openvino_model.binlfs_part"; - const bool hasPartFile = std::filesystem::exists(partPath); - if (hasOpenvinoModelPointer || hasPartFile) { - observedPartialDownload = true; - std::this_thread::sleep_for(std::chrono::milliseconds(200)); - break; - } - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - } + ASSERT_TRUE(waitForResumableInProgressPull( + modelBasePath, + modelPath, + downloadPath, + MODEL_FULL_SIZE_BYTES, + "ResumeTerminate", + HF_PULL_DETECT_TIMEOUT_SECONDS, + HF_PULL_POLL_INTERVAL_MS, + HF_PULL_INTERRUPT_DELAY_MS, + HF_PULL_MAX_CONSECUTIVE_FS_PROBE_ERRORS)); + + const bool observedPartialDownload = true; + bool interruptionSent = false; #ifdef _WIN32 - ASSERT_TRUE(TerminateProcess(pi.hProcess, 1)); - ASSERT_EQ(WaitForSingleObject(pi.hProcess, 10000), WAIT_OBJECT_0); - CloseHandle(pi.hThread); - CloseHandle(pi.hProcess); - - ASSERT_TRUE(SetEnvironmentVariableA("OVMS_RESUME_TERMINATE_WORKER", nullptr)); - ASSERT_TRUE(SetEnvironmentVariableA("OVMS_RESUME_TERMINATE_MODEL", nullptr)); - ASSERT_TRUE(SetEnvironmentVariableA("OVMS_RESUME_TERMINATE_DOWNLOAD_PATH", nullptr)); - ASSERT_TRUE(SetEnvironmentVariableA("OVMS_RESUME_TERMINATE_TASK", nullptr)); + ASSERT_TRUE(terminateWindowsWorker(pi, HF_PULL_SHUTDOWN_TIMEOUT_SECONDS)); + interruptionSent = true; + closeWindowsWorkerHandles(pi); + ASSERT_TRUE(clearWindowsResumeWorkerEnvironment(RESUME_TERMINATE_WORKER_CONFIG)); #else - int childStatus = 0; - ASSERT_EQ(waitpid(childPid, &childStatus, 0), childPid); - ASSERT_TRUE(WIFSIGNALED(childStatus)); - ASSERT_EQ(WTERMSIG(childStatus), SIGKILL); + ASSERT_TRUE(killPosixWorkerAndExpectSigKill(childPid)); + interruptionSent = true; #endif - EXPECT_TRUE(observedPartialDownload); auto remainingPointers = ovms::libgit2::findLfsLikeFiles(downloadPath, true); + SPDLOG_INFO("ResumeTerminate test state: observedPartialDownload={}, interruptionSent={}, remainingPointers count={}", + observedPartialDownload, interruptionSent, remainingPointers.size()); + for (const auto& p : remainingPointers) { + SPDLOG_INFO(" - {}", p.string()); + } + EXPECT_FALSE(remainingPointers.empty()); this->ServerPullHfModel(modelName, downloadPath, task); ASSERT_EQ(std::filesystem::exists(modelPath), true) << modelPath; ASSERT_EQ(std::filesystem::exists(graphPath), true) << graphPath; - ASSERT_EQ(std::filesystem::file_size(modelPath), 52417240); - ASSERT_EQ(std::filesystem::file_size(model2Path), 339125); - ASSERT_EQ(std::filesystem::file_size(model3Path), 500292); - ASSERT_EQ(std::filesystem::file_size(model4Path), 499723); + ASSERT_EQ(std::filesystem::file_size(modelPath), MODEL_FULL_SIZE_BYTES); + ASSERT_EQ(std::filesystem::file_size(model2Path), MODEL2_FULL_SIZE_BYTES); + ASSERT_EQ(std::filesystem::file_size(model3Path), MODEL3_FULL_SIZE_BYTES); + ASSERT_EQ(std::filesystem::file_size(model4Path), MODEL4_FULL_SIZE_BYTES); } #ifdef _WIN32 @@ -924,13 +1249,17 @@ TEST_F(HfPull, ResumeTerminate) { // Mirrors HfPullWindowsWorker.ResumeTerminateChildProcess but is invoked separately // so the parent test can target this child specifically with GenerateConsoleCtrlEvent. TEST(HfPullWindowsWorker, ResumeCtrlCChildProcess) { + // Enables this helper body only for the parent-launched worker process. const char* runWorker = std::getenv("OVMS_RESUME_CTRLC_WORKER"); if ((runWorker == nullptr) || (std::string(runWorker) != "1")) { GTEST_SKIP() << "Helper test - runs only when launched by HfPull.ResumeCtrlC."; } + // Model identifier passed from parent to child worker. const char* modelNameEnv = std::getenv("OVMS_RESUME_CTRLC_MODEL"); + // Target repository path passed from parent to child worker. const char* downloadPathEnv = std::getenv("OVMS_RESUME_CTRLC_DOWNLOAD_PATH"); + // Pull task passed from parent to child worker. const char* taskEnv = std::getenv("OVMS_RESUME_CTRLC_TASK"); ASSERT_NE(modelNameEnv, nullptr); ASSERT_NE(downloadPathEnv, nullptr); @@ -966,120 +1295,39 @@ TEST(HfPullWindowsWorker, ResumeCtrlCChildProcess) { // download to completion. TEST_F(HfPull, ResumeCtrlC) { #ifdef _WIN32 - SKIP_AND_EXIT_IF_NOT_RUNNING_UNSTABLE(); -#endif - std::string basePath = ovms::FileSystem::joinPath({this->directoryPath, "repository", "OpenVINO", "Phi-3-mini-FastDraft-50M-int8-ov"}); - std::string modelPath = ovms::FileSystem::appendSlash(basePath) + "openvino_model.bin"; - std::string model2Path = ovms::FileSystem::appendSlash(basePath) + "openvino_detokenizer.bin"; - std::string model3Path = ovms::FileSystem::appendSlash(basePath) + "openvino_tokenizer.bin"; - std::string model4Path = ovms::FileSystem::appendSlash(basePath) + "tokenizer.model"; - std::string graphPath = ovms::FileSystem::appendSlash(basePath) + "graph.pbtxt"; - -#ifdef _WIN32 - char testExePath[MAX_PATH] = {0}; - DWORD exePathLen = GetModuleFileNameA(nullptr, testExePath, MAX_PATH); - ASSERT_GT(exePathLen, 0u); - ASSERT_LT(exePathLen, static_cast(MAX_PATH)); - - ASSERT_TRUE(SetEnvironmentVariableA("OVMS_RESUME_CTRLC_WORKER", "1")); - ASSERT_TRUE(SetEnvironmentVariableA("OVMS_RESUME_CTRLC_MODEL", modelName.c_str())); - ASSERT_TRUE(SetEnvironmentVariableA("OVMS_RESUME_CTRLC_DOWNLOAD_PATH", downloadPath.c_str())); - ASSERT_TRUE(SetEnvironmentVariableA("OVMS_RESUME_CTRLC_TASK", task.c_str())); - - std::string commandLine = std::string("\"") + testExePath + - "\" --gtest_filter=HfPullWindowsWorker.ResumeCtrlCChildProcess"; - STARTUPINFOA si; PROCESS_INFORMATION pi; - ZeroMemory(&si, sizeof(si)); - si.cb = sizeof(si); - ZeroMemory(&pi, sizeof(pi)); - - // CREATE_NEW_PROCESS_GROUP is required to send GenerateConsoleCtrlEvent only to - // the child process group; otherwise the signal would also reach this test runner. - ASSERT_TRUE(CreateProcessA( - nullptr, - commandLine.data(), - nullptr, - nullptr, - TRUE, - CREATE_NEW_PROCESS_GROUP, - nullptr, - nullptr, - &si, - &pi)); + ASSERT_TRUE(startWindowsResumeWorker(RESUME_CTRLC_WORKER_CONFIG, modelName, downloadPath, task, pi)); #else - pid_t childPid = fork(); - ASSERT_NE(childPid, -1); - - if (childPid == 0) { - ovms::Server& childServer = ovms::Server::instance(); - childServer.setShutdownRequest(0); - char* argv[] = {(char*)"ovms", - (char*)"--pull", - (char*)"--source_model", - (char*)modelName.c_str(), - (char*)"--model_repository_path", - (char*)downloadPath.c_str(), - (char*)"--task", - (char*)task.c_str()}; - int argc = 8; - - // Run synchronously in this child process so installSignalHandlers() (called - // inside server.start) is in effect when SIGINT arrives from the parent. - int rc = childServer.start(argc, argv); - _exit(rc == EXIT_SUCCESS ? 0 : 1); - } + pid_t childPid = -1; + ASSERT_TRUE(launchPosixResumeWorker(modelName, downloadPath, task, childPid)); #endif - // Wait until the LFS download is in flight before sending the interrupt, so the - // cancellation actually exercises the in-progress clone path. A fixed sleep is - // unreliable on fast machines. - bool observedPartialDownload = false; - auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(60); - while (std::chrono::steady_clock::now() < deadline) { - auto lfsCandidates = ovms::libgit2::findLfsLikeFiles(downloadPath, true); - const bool hasOpenvinoModelPointer = std::find_if(lfsCandidates.begin(), lfsCandidates.end(), - [](const std::filesystem::path& p) { return p.filename() == "openvino_model.bin"; }) != lfsCandidates.end(); - const std::string partPath = ovms::FileSystem::appendSlash(basePath) + "openvino_model.binlfs_part"; - const bool hasPartFile = std::filesystem::exists(partPath); - if (hasOpenvinoModelPointer || hasPartFile) { - observedPartialDownload = true; - std::this_thread::sleep_for(std::chrono::milliseconds(200)); - break; - } - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - } + ASSERT_TRUE(waitForResumableInProgressPull( + modelBasePath, + modelPath, + downloadPath, + MODEL_FULL_SIZE_BYTES, + "ResumeCtrlC", + HF_PULL_DETECT_TIMEOUT_SECONDS, + HF_PULL_POLL_INTERVAL_MS, + HF_PULL_INTERRUPT_DELAY_MS, + HF_PULL_MAX_CONSECUTIVE_FS_PROBE_ERRORS)); + + const bool observedPartialDownload = true; + bool interruptionSent = false; #ifdef _WIN32 - // CTRL_C_EVENT cannot be targeted at a single child via GenerateConsoleCtrlEvent - // without also reaching the calling console group. CTRL_BREAK_EVENT can be - // delivered to the child's dedicated process group (created with - // CREATE_NEW_PROCESS_GROUP above) without disturbing the test runner. - ASSERT_TRUE(GenerateConsoleCtrlEvent(CTRL_BREAK_EVENT, pi.dwProcessId)); - // Allow up to 30 s for the child to drain the in-flight clone and exit. - ASSERT_EQ(WaitForSingleObject(pi.hProcess, 30000), WAIT_OBJECT_0); - DWORD childExitCode = 0; - ASSERT_TRUE(GetExitCodeProcess(pi.hProcess, &childExitCode)); - EXPECT_NE(childExitCode, static_cast(EXIT_SUCCESS)); - CloseHandle(pi.hThread); - CloseHandle(pi.hProcess); - - ASSERT_TRUE(SetEnvironmentVariableA("OVMS_RESUME_CTRLC_WORKER", nullptr)); - ASSERT_TRUE(SetEnvironmentVariableA("OVMS_RESUME_CTRLC_MODEL", nullptr)); - ASSERT_TRUE(SetEnvironmentVariableA("OVMS_RESUME_CTRLC_DOWNLOAD_PATH", nullptr)); - ASSERT_TRUE(SetEnvironmentVariableA("OVMS_RESUME_CTRLC_TASK", nullptr)); + ASSERT_TRUE(sendCtrlBreakToWindowsWorker(pi, HF_PULL_SHUTDOWN_TIMEOUT_SECONDS)); + interruptionSent = true; + closeWindowsWorkerHandles(pi); + ASSERT_TRUE(clearWindowsResumeWorkerEnvironment(RESUME_CTRLC_WORKER_CONFIG)); #else - ASSERT_EQ(kill(childPid, SIGINT), 0); - int childStatus = 0; - ASSERT_EQ(waitpid(childPid, &childStatus, 0), childPid); - // SIGINT must be handled gracefully by ovms (onInterrupt -> shutdownRequest), - // so the child should exit normally rather than be terminated by the signal. - ASSERT_TRUE(WIFEXITED(childStatus)) << "Child was terminated by signal " << (WIFSIGNALED(childStatus) ? WTERMSIG(childStatus) : 0) - << " instead of exiting normally - SIGINT handler may be missing"; - EXPECT_NE(WEXITSTATUS(childStatus), EXIT_SUCCESS); + ASSERT_TRUE(interruptPosixWorkerAndExpectGracefulExit(childPid)); + interruptionSent = true; #endif - EXPECT_TRUE(observedPartialDownload); + SPDLOG_DEBUG("ResumeCtrlC test state: observedPartialDownload={}, interruptionSent={}", + observedPartialDownload, interruptionSent); auto remainingPointers = ovms::libgit2::findLfsLikeFiles(downloadPath, true); EXPECT_FALSE(remainingPointers.empty()); @@ -1087,10 +1335,10 @@ TEST_F(HfPull, ResumeCtrlC) { ASSERT_EQ(std::filesystem::exists(modelPath), true) << modelPath; ASSERT_EQ(std::filesystem::exists(graphPath), true) << graphPath; - ASSERT_EQ(std::filesystem::file_size(modelPath), 52417240); - ASSERT_EQ(std::filesystem::file_size(model2Path), 339125); - ASSERT_EQ(std::filesystem::file_size(model3Path), 500292); - ASSERT_EQ(std::filesystem::file_size(model4Path), 499723); + ASSERT_EQ(std::filesystem::file_size(modelPath), MODEL_FULL_SIZE_BYTES); + ASSERT_EQ(std::filesystem::file_size(model2Path), MODEL2_FULL_SIZE_BYTES); + ASSERT_EQ(std::filesystem::file_size(model3Path), MODEL3_FULL_SIZE_BYTES); + ASSERT_EQ(std::filesystem::file_size(model4Path), MODEL4_FULL_SIZE_BYTES); } TEST_F(HfPull, Start) { @@ -1102,13 +1350,9 @@ TEST_F(HfPull, Start) { this->filesToPrintInCaseOfFailure.emplace_back("config.json"); this->SetUpServerForDownloadAndStart(modelName, downloadPath, task); - std::string basePath = ovms::FileSystem::joinPath({this->directoryPath, "repository", "OpenVINO", "Phi-3-mini-FastDraft-50M-int8-ov"}); - std::string modelPath = ovms::FileSystem::appendSlash(basePath) + "openvino_model.bin"; - std::string graphPath = ovms::FileSystem::appendSlash(basePath) + "graph.pbtxt"; - ASSERT_EQ(std::filesystem::exists(modelPath), true) << modelPath; ASSERT_EQ(std::filesystem::exists(graphPath), true) << graphPath; - ASSERT_EQ(std::filesystem::file_size(modelPath), 52417240); + ASSERT_EQ(std::filesystem::file_size(modelPath), MODEL_FULL_SIZE_BYTES); std::string graphContents = GetFileContents(graphPath); ASSERT_EQ(expectedGraphContents, removeGeneratedGraphHeaders(graphContents)) << graphContents; @@ -1134,7 +1378,7 @@ TEST_F(HfPull, OutOfOvOrg) { ASSERT_EQ(std::filesystem::exists(modelPath), true) << modelPath; ASSERT_EQ(std::filesystem::exists(graphPath), true) << graphPath; - ASSERT_EQ(std::filesystem::file_size(modelPath), 52417240); + ASSERT_EQ(std::filesystem::file_size(modelPath), MODEL_FULL_SIZE_BYTES); std::string graphContents = GetFileContents(graphPath); ASSERT_EQ(expectedGraphContents, removeGeneratedGraphHeaders(graphContents)) << graphContents; @@ -1191,7 +1435,7 @@ TEST_F(HfPull, DraftModel) { ASSERT_EQ(std::filesystem::exists(modelPath), true) << modelPath; ASSERT_EQ(std::filesystem::exists(graphPath), true) << graphPath; - ASSERT_EQ(std::filesystem::file_size(modelPath), 52417240); + ASSERT_EQ(std::filesystem::file_size(modelPath), MODEL_FULL_SIZE_BYTES); std::string graphContents = GetFileContents(graphPath); ASSERT_EQ(expectedGraphContentsDraft, removeGeneratedGraphHeaders(graphContents)) << graphContents; @@ -1200,7 +1444,7 @@ TEST_F(HfPull, DraftModel) { std::string modelPath2 = ovms::FileSystem::appendSlash(basePath2) + "openvino_tokenizer.bin"; ASSERT_EQ(std::filesystem::exists(modelPath2), true) << modelPath2; - ASSERT_EQ(std::filesystem::file_size(modelPath2), 2022483); + ASSERT_EQ(std::filesystem::file_size(modelPath2), DRAFT_MODEL_TOKENIZER_SIZE_BYTES); } class TestOptimumDownloader : public ovms::OptimumDownloader { diff --git a/src/test/test_utils.cpp b/src/test/test_utils.cpp index 69a612e403..9efa2c010e 100644 --- a/src/test/test_utils.cpp +++ b/src/test/test_utils.cpp @@ -733,6 +733,21 @@ void randomizeAndEnsureFrees(std::string& port1, std::string& port2) { const int64_t SERVER_START_FROM_CONFIG_TIMEOUT_SECONDS = 60; +namespace { + +void startServerWithArgs(std::unique_ptr& t, ovms::Server& server, std::vector args, int expectedCode) { + t.reset(new std::thread([args = std::move(args), &server, expectedCode]() mutable { + std::vector argv; + argv.reserve(args.size()); + for (auto& arg : args) { + argv.push_back(const_cast(arg.c_str())); + } + EXPECT_EQ(expectedCode, server.start(static_cast(argv.size()), argv.data())); + })); +} + +} // namespace + void EnsureServerStartedWithTimeout(ovms::Server& server, int timeoutSeconds) { auto start = std::chrono::high_resolution_clock::now(); int timestepMs = 20; @@ -745,11 +760,37 @@ void EnsureServerStartedWithTimeout(ovms::Server& server, int timeoutSeconds) { void EnsureServerModelDownloadFinishedWithTimeout(ovms::Server& server, int timeoutSeconds) { auto start = std::chrono::high_resolution_clock::now(); - while ((server.getModuleState(ovms::HF_MODEL_PULL_MODULE_NAME) != ovms::ModuleState::SHUTDOWN) && - (std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start).count() < timeoutSeconds)) { + auto elapsed = [&] { + return std::chrono::duration_cast( + std::chrono::high_resolution_clock::now() - start) + .count(); + }; + + // Phase 1: yield until the HF pull module appears in the server's module map. + // std::this_thread::yield() is required — a bare busy-spin starves the server thread + // (especially on machines with few cores or under load) and the module never starts. + while (server.getModuleState(ovms::HF_MODEL_PULL_MODULE_NAME) == ovms::ModuleState::NOT_INITIALIZED && + elapsed() < timeoutSeconds) { + std::this_thread::yield(); + } + if (server.getModuleState(ovms::HF_MODEL_PULL_MODULE_NAME) == ovms::ModuleState::NOT_INITIALIZED) { + FAIL() << "HF pull module never started within " << timeoutSeconds << "s. Check machine load and network load"; + return; + } + + // Phase 2: wait for the module to finish. + // Accept SHUTDOWN (normal) or NOT_INITIALIZED — the latter happens when the pull is so + // fast (e.g. model already cached, no download needed) that server.start() returns and + // shutdownModules() calls modules.clear() before this thread re-enters the check, making + // getModuleState return NOT_INITIALIZED rather than SHUTDOWN. + while (server.getModuleState(ovms::HF_MODEL_PULL_MODULE_NAME) != ovms::ModuleState::SHUTDOWN && + server.getModuleState(ovms::HF_MODEL_PULL_MODULE_NAME) != ovms::ModuleState::NOT_INITIALIZED && + elapsed() < timeoutSeconds) { } - ASSERT_EQ(server.getModuleState(ovms::HF_MODEL_PULL_MODULE_NAME), ovms::ModuleState::SHUTDOWN) << "OVMS did not download model in allowed time:" << timeoutSeconds << "s. Check machine load and network load"; + const auto finalState = server.getModuleState(ovms::HF_MODEL_PULL_MODULE_NAME); + ASSERT_TRUE(finalState == ovms::ModuleState::SHUTDOWN || finalState == ovms::ModuleState::NOT_INITIALIZED) + << "OVMS did not download model in allowed time:" << timeoutSeconds << "s. Check machine load and network load"; } // --pull --source_model OpenVINO/Phi-3-mini-FastDraft-50M-int8-ov --model_repository_path c:\download @@ -772,6 +813,23 @@ void SetUpServerForDownload(std::unique_ptr& t, ovms::Server& serve EnsureServerModelDownloadFinishedWithTimeout(server, timeoutSeconds); } +void SetUpServerForDownload(std::unique_ptr& t, ovms::Server& server, std::string& source_model, std::string& download_path, std::string& task, const std::string& logPath, int expected_code, int timeoutSeconds) { + server.setShutdownRequest(0); + std::vector args = {"ovms", + "--pull", + "--source_model", + source_model, + "--model_repository_path", + download_path, + "--task", + task, + "--log_path", + logPath}; + startServerWithArgs(t, server, std::move(args), expected_code); + + EnsureServerModelDownloadFinishedWithTimeout(server, timeoutSeconds); +} + void SetUpServerForDownloadWithDraft(std::unique_ptr& t, ovms::Server& server, std::string& draftModel, std::string& source_model, std::string& download_path, std::string& task, int expected_code, int timeoutSeconds) { server.setShutdownRequest(0); @@ -794,6 +852,26 @@ void SetUpServerForDownloadWithDraft(std::unique_ptr& t, ovms::Serv EnsureServerModelDownloadFinishedWithTimeout(server, timeoutSeconds); } +void SetUpServerForDownloadWithDraft(std::unique_ptr& t, ovms::Server& server, + std::string& draftModel, std::string& source_model, std::string& download_path, std::string& task, const std::string& logPath, int expected_code, int timeoutSeconds) { + server.setShutdownRequest(0); + std::vector args = {"ovms", + "--pull", + "--source_model", + source_model, + "--model_repository_path", + download_path, + "--task", + task, + "--draft_source_model", + draftModel, + "--log_path", + logPath}; + startServerWithArgs(t, server, std::move(args), expected_code); + + EnsureServerModelDownloadFinishedWithTimeout(server, timeoutSeconds); +} + void SetUpServerForDownloadWithLoras(std::unique_ptr& t, ovms::Server& server, std::string& source_model, std::string& download_path, std::string& task, std::string& source_loras, int expected_code, int timeoutSeconds) { server.setShutdownRequest(0); char* argv[] = {(char*)"ovms", diff --git a/src/test/test_utils.hpp b/src/test/test_utils.hpp index ac6b4a93db..74e5043ca9 100644 --- a/src/test/test_utils.hpp +++ b/src/test/test_utils.hpp @@ -777,9 +777,12 @@ void EnsureServerModelDownloadFinishedWithTimeout(ovms::Server& server, int time * --pull --source_model OpenVINO/Phi-3-mini-FastDraft-50M-int8-ov --model_repository_path /models */ void SetUpServerForDownload(std::unique_ptr& t, ovms::Server& server, std::string& source_model, std::string& download_path, std::string& task, int expected_code = EXIT_SUCCESS, int timeoutSeconds = SERVER_START_FROM_CONFIG_TIMEOUT_SECONDS); +void SetUpServerForDownload(std::unique_ptr& t, ovms::Server& server, std::string& source_model, std::string& download_path, std::string& task, const std::string& logPath, int expected_code = EXIT_SUCCESS, int timeoutSeconds = SERVER_START_FROM_CONFIG_TIMEOUT_SECONDS); void SetUpServerForDownloadWithDraft(std::unique_ptr& t, ovms::Server& server, std::string& draftModel, std::string& source_model, std::string& download_path, std::string& task, int expected_code, int timeoutSeconds = 2 * SERVER_START_FROM_CONFIG_TIMEOUT_SECONDS); +void SetUpServerForDownloadWithDraft(std::unique_ptr& t, ovms::Server& server, + std::string& draftModel, std::string& source_model, std::string& download_path, std::string& task, const std::string& logPath, int expected_code, int timeoutSeconds = 2 * SERVER_START_FROM_CONFIG_TIMEOUT_SECONDS); /* * starts loading OVMS on separate thread but waits until it is shutdowned or model is downloaded and check if model is started in ovms * --source_model Qwen/Qwen3-8B-GGUF --model_repository_path /models --gguf_filename Qwen3-8B-Q4_K_M.gguf diff --git a/third_party/libgit2/lfs.patch b/third_party/libgit2/lfs.patch index 513ee01a3b..887d700201 100644 --- a/third_party/libgit2/lfs.patch +++ b/third_party/libgit2/lfs.patch @@ -424,11 +424,11 @@ index 58cb4b424..00ddee9f3 100644 git_writestream **out, diff --git a/src/libgit2/lfs_filter.c b/src/libgit2/lfs_filter.c new file mode 100644 -index 000000000..c0e97d408 +index 000000000..18490e5ad --- /dev/null +++ b/src/libgit2/lfs_filter.c -@@ -0,0 +1,1952 @@ -+/* +@@ -0,0 +1,2014 @@ ++/* +/ Copyright 2025 Intel Corporation +/ +/ Licensed under the Apache License, Version 2.0 (the "License"); @@ -467,6 +467,7 @@ index 000000000..c0e97d408 +#include "regexp.h" +#include "thread.h" +#include "time.h" ++#include "ctype_compat.h" + +#if defined(__GNUC__) || defined(__clang__) +/* Optional host-provided cancellation probe. @@ -1046,6 +1047,57 @@ index 000000000..c0e97d408 + return newBuffer; +} + ++static const char *lfs_last_path_sep(const char *path) ++{ ++ const char *forward_sep_position = strrchr(path, '/'); ++ ++#ifdef _WIN32 ++ const char *backward_sep_position = strrchr(path, '\\'); ++ if (!forward_sep_position || (backward_sep_position && backward_sep_position > forward_sep_position)) ++ forward_sep_position = backward_sep_position; ++#endif ++ ++ return forward_sep_position; ++} ++ ++static int lfs_parent_dir_exists(const char *path) ++{ ++ const char *sep; ++ git_str dir = GIT_STR_INIT; ++ struct stat st; ++ int exists = 0; ++ size_t dir_len; ++ ++ if (!path || !*path) ++ return 0; ++ ++ sep = lfs_last_path_sep(path); ++ if (!sep) ++ return (p_stat(".", &st) == 0 && S_ISDIR(st.st_mode)); ++ ++ if (sep == path) ++ return (p_stat("/", &st) == 0 && S_ISDIR(st.st_mode)); ++ ++ dir_len = (size_t)(sep - path); ++ ++#ifdef _WIN32 ++ /* For drive-root paths like "C:\\file", keep the trailing separator so ++ * p_stat checks "C:\\" instead of "C:". */ ++ if (dir_len == 2 && git__isalpha(path[0]) && path[1] == ':') ++ dir_len = 3; ++#endif ++ ++ if (git_str_set(&dir, path, dir_len) < 0) ++ goto done; ++ ++ if (p_stat(dir.ptr, &st) == 0 && S_ISDIR(st.st_mode)) ++ exists = 1; ++ ++done: ++ git_str_dispose(&dir); ++ return exists; ++} ++ +/* + * get_lfs_info_match + * ------------------- @@ -1602,6 +1654,11 @@ index 000000000..c0e97d408 + size_t written_bytes; + uint64_t remaining = 0; + ++ /* Abort immediately on cancellation — returning 0 makes cURL emit ++ * CURLE_WRITE_ERROR, which the caller treats as a failed download. */ ++ if (lfs_shutdown_requested()) ++ return 0; ++ + /* cURL invokes the write callback repeatedly, so guard cumulatively. */ + if (!out->stream) { + /* open file for writing */ @@ -1955,7 +2012,7 @@ index 000000000..c0e97d408 + return CURLE_OK; + } + -+ lfs_log_error_with_state(ftpfile->log_state, "[WARN] Resume attempt %d/%d failed: %s\n", ++ printf("[WARN] Resume attempt %d/%d failed: %s\n", + attempt, max_retries, curl_easy_strerror(res)); + + if (attempt < max_retries) { @@ -2225,9 +2282,7 @@ index 000000000..c0e97d408 + + /* Check for resume of partial download error */ + if (res == CURLE_PARTIAL_FILE) { -+ lfs_log_error_with_state( -+ &la->log_state, -+ "[WARN] Got CURLE_PARTIAL_FILE, attempting resume sequence\n"); ++ printf("[WARN] Got CURLE_PARTIAL_FILE, attempting resume sequence\n"); + res = download_with_resume( + dl_curl, &ftpfile, g_lfs_resume_attempts, + g_lfs_resume_interval_secs); @@ -2258,30 +2313,37 @@ index 000000000..c0e97d408 + ftpfile.stream = NULL; + } + -+ /* If destination exists, unlink it before rename. This is independent -+ * from whether we resumed via a partial temp file. */ -+ if (p_access(la->full_path, F_OK) == 0) { -+ if (p_unlink(la->full_path) < 0 && errno != ENOENT) { -+ lfs_log_error_with_state( -+ &la->log_state, -+ "\n[ERROR] failed to delete existing destination file '%s' (errno=%d: %s)\n", -+ la->full_path, errno, strerror(errno)); -+ /* Ignore here and let rename produce the definitive failure. */ -+ } -+ } -+ + if (p_rename(tmp_out_file, la->full_path) < 0) { ++ int rename_errno = errno; + lfs_log_error_with_state( + &la->log_state, + "\n[ERROR] failed to rename '%s' -> '%s' (errno=%d: %s)\n", + tmp_out_file ? tmp_out_file : "(null)", la->full_path, -+ errno, strerror(errno)); ++ rename_errno, strerror(rename_errno)); ++ ++ if (rename_errno == ENOENT) { ++ int src_exists = ++ (tmp_out_file && p_access(tmp_out_file, F_OK) == 0); ++ int dst_exists = ++ (la->full_path && p_access(la->full_path, F_OK) == 0); ++ int src_parent_exists = ++ lfs_parent_dir_exists(tmp_out_file); ++ int dst_parent_exists = ++ lfs_parent_dir_exists(la->full_path); ++ ++ lfs_log_error_with_state( ++ &la->log_state, ++ "[ERROR] rename ENOENT diagnostics: src_exists=%d src_parent_exists=%d dst_exists=%d dst_parent_exists=%d cancel_requested=%d\n", ++ src_exists, src_parent_exists, dst_exists, ++ dst_parent_exists, lfs_shutdown_requested()); ++ } +#ifdef _WIN32 + lfs_log_error_with_state( + &la->log_state, + "[ERROR] Win32 rename GetLastError=%lu\n", + (unsigned long)GetLastError()); +#endif ++ errno = rename_errno; + goto cleanup; + } +