From 231d8922e66a05d0c740c8f0ebd5ccda63723ef6 Mon Sep 17 00:00:00 2001
From: Carlos
Date: Thu, 25 Dec 2025 09:03:54 +0100
Subject: [PATCH] fix: Retry tests when build is still in progress instead of
 failing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This fixes a regression introduced in the Phase 2 reliability improvements
(PRs #943/#946), where tests were permanently marked as failed when the
GitHub Actions build had not completed yet.

Previous behavior (before Dec 2025):
- If the artifact is not found, just return silently
- The test stays in the "pending" state
- The next cron cycle retries the test
- Eventually the build completes and the test runs

Regression behavior (after Phase 2):
- If the artifact is not found, mark the test as failed immediately
- The test is permanently marked as "canceled" with ERROR status
- No retry - the test never runs, even after the build completes

Fixed behavior (this PR):
- Distinguish between retryable and permanent failures
- Retryable: build in progress, workflow queued, diagnostic error
- Permanent: build failed, artifact expired
- Only mark the test as failed for permanent failures
- Return silently for retryable failures (the next cron cycle retries)

Changes:
- Modified _diagnose_missing_artifact() to return a (message, is_retryable)
  tuple
- Modified start_test() to check is_retryable before marking a test as failed
- Added 8 tests covering the retry behavior

Fixes the issue where Windows tests failed with "Build still in progress"
when the cron job ran before the 40-minute Windows build completed.
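
For reviewers, the new contract reduces to the shape below (an illustrative
sketch with simplified, hypothetical signatures - the real helpers also take
a repository, commit SHA, platform, and logger):

    from typing import Tuple

    def diagnose(status: str, conclusion: str) -> Tuple[str, bool]:
        """Return (message, is_retryable) for a missing artifact."""
        if status != "completed":
            # Build still running or queued: leave the test pending.
            return (f"Build still in progress: {status}", True)
        if conclusion != "success":
            # Build finished but failed: no artifact will ever appear.
            return (f"Build failed: {conclusion}", False)
        # Build succeeded yet the artifact is gone: likely expired.
        return ("Artifact expired or was never uploaded", False)

    def on_missing_artifact(status: str, conclusion: str) -> str:
        message, is_retryable = diagnose(status, conclusion)
        if is_retryable:
            return "pending"  # next cron cycle retries the test
        return "failed"       # stands in for mark_test_failed()

    assert on_missing_artifact("in_progress", "") == "pending"
    assert on_missing_artifact("completed", "failure") == "failed"
    assert on_missing_artifact("completed", "success") == "failed"

The real code also surfaces the message so mark_test_failed() can post it to
GitHub; only the boolean drives the retry decision.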
" - f"Please wait for the build to complete and retry.") + # Build is still running - this is RETRYABLE + # Don't mark as failed, let the next cron cycle retry + message = (f"Build still in progress: '{expected_workflow}' is {workflow_run.status}. " + f"Will retry when build completes.") + return (message, True) # Retryable elif workflow_run.conclusion != "success": - return (f"Build failed: '{expected_workflow}' finished with conclusion '{workflow_run.conclusion}'. " - f"Check the GitHub Actions logs for details.") + # Build failed - this is a PERMANENT failure + message = (f"Build failed: '{expected_workflow}' finished with conclusion " + f"'{workflow_run.conclusion}'. Check the GitHub Actions logs for details.") + return (message, False) # Not retryable else: # Build succeeded but artifact not found - may have expired - return (f"Artifact not found: '{expected_workflow}' completed successfully, " - f"but no artifact was found. The artifact may have expired (GitHub deletes " - f"artifacts after a retention period) or was not uploaded properly.") + # This is a PERMANENT failure + message = (f"Artifact not found: '{expected_workflow}' completed successfully, " + f"but no artifact was found. The artifact may have expired (GitHub deletes " + f"artifacts after a retention period) or was not uploaded properly.") + return (message, False) # Not retryable if not workflow_found: - return (f"No workflow run found: '{expected_workflow}' has not run for commit {commit_sha[:7]}. " - f"This may indicate the workflow was not triggered or is queued.") + # No workflow run found - could be queued or not triggered + # This is RETRYABLE (workflow might be queued or path filters excluded it) + message = (f"No workflow run found: '{expected_workflow}' has not run for commit " + f"{commit_sha[:7]}. The workflow may be queued, or was not triggered " + f"due to path filters. 
diff --git a/tests/test_ci/test_controllers.py b/tests/test_ci/test_controllers.py
index 59db5bc2..0470e219 100644
--- a/tests/test_ci/test_controllers.py
+++ b/tests/test_ci/test_controllers.py
@@ -3248,3 +3248,220 @@ def test_parse_gcp_error_error_not_a_dict(self):
         error_msg = parse_gcp_error(result, log=mock_log)
         self.assertIn("VM creation failed", error_msg)
         mock_log.error.assert_called_once()
+
+
+class TestDiagnoseMissingArtifact(BaseTestCase):
+    """Test the _diagnose_missing_artifact function and retry behavior."""
+
+    def test_build_in_progress_is_retryable(self):
+        """Test that build in progress returns retryable=True."""
+        from mod_ci.controllers import _diagnose_missing_artifact
+
+        repository = MagicMock()
+        log = MagicMock()
+
+        # Mock workflow
+        workflow = MagicMock()
+        workflow.id = 1
+        workflow.name = "Build CCExtractor on Linux"
+        repository.get_workflows.return_value = [workflow]
+
+        # Mock workflow run - in progress
+        workflow_run = MagicMock()
+        workflow_run.workflow_id = 1
+        workflow_run.status = "in_progress"
+        repository.get_workflow_runs.return_value = [workflow_run]
+
+        message, is_retryable = _diagnose_missing_artifact(
+            repository, "abc123", TestPlatform.linux, log
+        )
+
+        self.assertTrue(is_retryable)
+        self.assertIn("Build still in progress", message)
+        self.assertIn("Will retry", message)
+
+    def test_build_queued_is_retryable(self):
+        """Test that queued build returns retryable=True."""
+        from mod_ci.controllers import _diagnose_missing_artifact
+
+        repository = MagicMock()
+        log = MagicMock()
+
+        # Mock workflow
+        workflow = MagicMock()
+        workflow.id = 1
+        workflow.name = "Build CCExtractor on Windows"
+        repository.get_workflows.return_value = [workflow]
+
+        # Mock workflow run - queued
+        workflow_run = MagicMock()
+        workflow_run.workflow_id = 1
+        workflow_run.status = "queued"
+        repository.get_workflow_runs.return_value = [workflow_run]
+
+        message, is_retryable = _diagnose_missing_artifact(
+            repository, "abc123", TestPlatform.windows, log
+        )
+
+        self.assertTrue(is_retryable)
+        self.assertIn("Build still in progress", message)
+
+    def test_build_failed_is_not_retryable(self):
+        """Test that failed build returns retryable=False."""
+        from mod_ci.controllers import _diagnose_missing_artifact
+
+        repository = MagicMock()
+        log = MagicMock()
+
+        # Mock workflow
+        workflow = MagicMock()
+        workflow.id = 1
+        workflow.name = "Build CCExtractor on Linux"
+        repository.get_workflows.return_value = [workflow]
+
+        # Mock workflow run - completed but failed
+        workflow_run = MagicMock()
+        workflow_run.workflow_id = 1
+        workflow_run.status = "completed"
+        workflow_run.conclusion = "failure"
+        repository.get_workflow_runs.return_value = [workflow_run]
+
+        message, is_retryable = _diagnose_missing_artifact(
+            repository, "abc123", TestPlatform.linux, log
+        )
+
+        self.assertFalse(is_retryable)
+        self.assertIn("Build failed", message)
+
+    def test_artifact_expired_is_not_retryable(self):
+        """Test that expired artifact returns retryable=False."""
+        from mod_ci.controllers import _diagnose_missing_artifact
+
+        repository = MagicMock()
+        log = MagicMock()
+
+        # Mock workflow
+        workflow = MagicMock()
+        workflow.id = 1
+        workflow.name = "Build CCExtractor on Linux"
+        repository.get_workflows.return_value = [workflow]
+
+        # Mock workflow run - completed successfully but artifact gone
+        workflow_run = MagicMock()
+        workflow_run.workflow_id = 1
+        workflow_run.status = "completed"
+        workflow_run.conclusion = "success"
+        repository.get_workflow_runs.return_value = [workflow_run]
+
+        message, is_retryable = _diagnose_missing_artifact(
+            repository, "abc123", TestPlatform.linux, log
+        )
+
+        self.assertFalse(is_retryable)
+        self.assertIn("Artifact not found", message)
+        self.assertIn("expired", message)
+
+    def test_no_workflow_run_is_retryable(self):
+        """Test that missing workflow run returns retryable=True."""
+        from mod_ci.controllers import _diagnose_missing_artifact
+
+        repository = MagicMock()
+        log = MagicMock()
+
+        # Mock workflow
+        workflow = MagicMock()
+        workflow.id = 1
+        workflow.name = "Build CCExtractor on Linux"
+        repository.get_workflows.return_value = [workflow]
+
+        # No workflow runs for this commit
+        repository.get_workflow_runs.return_value = []
+
+        message, is_retryable = _diagnose_missing_artifact(
+            repository, "abc123", TestPlatform.linux, log
+        )
+
+        self.assertTrue(is_retryable)
+        self.assertIn("No workflow run found", message)
+        self.assertIn("Will retry", message)
+
+    def test_diagnostic_exception_is_retryable(self):
+        """Test that diagnostic failures default to retryable=True."""
+        from mod_ci.controllers import _diagnose_missing_artifact
+
+        repository = MagicMock()
+        log = MagicMock()
+
+        # Make get_workflows throw an exception
+        repository.get_workflows.side_effect = Exception("API error")
+
+        message, is_retryable = _diagnose_missing_artifact(
+            repository, "abc123", TestPlatform.linux, log
+        )
+
+        self.assertTrue(is_retryable)
+        self.assertIn("diagnostic check failed", message)
+
+
+class TestStartTestRetryBehavior(BaseTestCase):
+    """Test that start_test correctly handles retryable vs permanent failures."""
+
+    @mock.patch('mod_ci.controllers.find_artifact_for_commit')
+    @mock.patch('mod_ci.controllers._diagnose_missing_artifact')
+    @mock.patch('mod_ci.controllers.mark_test_failed')
+    @mock.patch('mod_ci.controllers.GcpInstance')
+    @mock.patch('mod_ci.controllers.TestProgress')
+    @mock.patch('run.log')
+    def test_retryable_failure_does_not_mark_test_failed(
+            self, mock_log, mock_progress, mock_gcp, mock_mark_failed,
+            mock_diagnose, mock_find_artifact):
+        """Test that retryable failures don't mark the test as failed."""
+        from mod_ci.controllers import start_test
+
+        # Setup: artifact not found, but build is in progress (retryable)
+        mock_find_artifact.return_value = None
+        mock_diagnose.return_value = ("Build still in progress", True)
+
+        # Mock locking checks
+        mock_gcp.query.filter.return_value.first.return_value = None
+        mock_progress.query.filter.return_value.first.return_value = None
+
+        test = Test.query.first()
+        repository = MagicMock()
+
+        start_test(MagicMock(), self.app, g.db, repository, test, "token")
+
+        # Should NOT call mark_test_failed for retryable failure
+        mock_mark_failed.assert_not_called()
+        # Should log info, not critical
+        mock_log.info.assert_called()
+
+    @mock.patch('mod_ci.controllers.find_artifact_for_commit')
+    @mock.patch('mod_ci.controllers._diagnose_missing_artifact')
+    @mock.patch('mod_ci.controllers.mark_test_failed')
+    @mock.patch('mod_ci.controllers.GcpInstance')
+    @mock.patch('mod_ci.controllers.TestProgress')
+    @mock.patch('run.log')
+    def test_permanent_failure_marks_test_failed(
+            self, mock_log, mock_progress, mock_gcp, mock_mark_failed,
+            mock_diagnose, mock_find_artifact):
+        """Test that permanent failures mark the test as failed."""
+        from mod_ci.controllers import start_test
+
+        # Setup: artifact not found, build failed (not retryable)
+        mock_find_artifact.return_value = None
+        mock_diagnose.return_value = ("Build failed with conclusion 'failure'", False)
+
+        # Mock locking checks
+        mock_gcp.query.filter.return_value.first.return_value = None
+        mock_progress.query.filter.return_value.first.return_value = None
+
+        test = Test.query.first()
+        repository = MagicMock()
+
+        start_test(MagicMock(), self.app, g.db, repository, test, "token")
+
+        # SHOULD call mark_test_failed for permanent failure
+        mock_mark_failed.assert_called_once()
+        # Should log critical
+        mock_log.critical.assert_called()