From 231d8922e66a05d0c740c8f0ebd5ccda63723ef6 Mon Sep 17 00:00:00 2001
From: Carlos
Date: Thu, 25 Dec 2025 09:03:54 +0100
Subject: [PATCH] fix: Retry tests when build is still in progress instead of
 failing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This fixes a regression introduced in the Phase 2 reliability improvements
(PRs #943/#946), where tests were permanently marked as failed when the
GitHub Actions build had not completed yet.

Previous behavior (before Dec 2025):
- If the artifact is not found, just return silently
- The test stays in the "pending" state
- The next cron cycle retries the test
- Eventually the build completes and the test runs

Regression behavior (after Phase 2):
- If the artifact is not found, mark the test as failed immediately
- The test is permanently marked as "canceled" with ERROR status
- No retry - the test never runs, even after the build completes

Fixed behavior (this PR):
- Distinguish between retryable and permanent failures
- Retryable: build in progress, workflow queued, diagnostic error
- Permanent: build failed, artifact expired
- Only mark the test as failed for permanent failures
- Return silently for retryable failures (the next cron cycle retries)

Changes:
- Modified _diagnose_missing_artifact() to return a (message, is_retryable)
  tuple
- Modified start_test() to check is_retryable before marking a test as failed
- Added 8 tests covering the retry behavior

Fixes the issue where Windows tests failed with "Build still in progress"
when the cron job ran before the 40-minute Windows build completed.
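
For reviewers, the new contract reduces to the shape below (an illustrative
sketch with simplified, hypothetical signatures - the real helpers also take
a repository, commit SHA, platform, and logger):

    from typing import Tuple

    def diagnose(status: str, conclusion: str) -> Tuple[str, bool]:
        """Return (message, is_retryable) for a missing artifact."""
        if status != "completed":
            # Build still running or queued: leave the test pending.
            return (f"Build still in progress: {status}", True)
        if conclusion != "success":
            # Build finished but failed: no artifact will ever appear.
            return (f"Build failed: {conclusion}", False)
        # Build succeeded yet the artifact is gone: likely expired.
        return ("Artifact expired or was never uploaded", False)

    def on_missing_artifact(status: str, conclusion: str) -> str:
        message, is_retryable = diagnose(status, conclusion)
        if is_retryable:
            return "pending"  # next cron cycle retries the test
        return "failed"       # stands in for mark_test_failed()

    assert on_missing_artifact("in_progress", "") == "pending"
    assert on_missing_artifact("completed", "failure") == "failed"
    assert on_missing_artifact("completed", "success") == "failed"

The real code also surfaces the message so mark_test_failed() can post it to
GitHub; only the boolean drives the retry decision.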
" - f"Please wait for the build to complete and retry.") + # Build is still running - this is RETRYABLE + # Don't mark as failed, let the next cron cycle retry + message = (f"Build still in progress: '{expected_workflow}' is {workflow_run.status}. " + f"Will retry when build completes.") + return (message, True) # Retryable elif workflow_run.conclusion != "success": - return (f"Build failed: '{expected_workflow}' finished with conclusion '{workflow_run.conclusion}'. " - f"Check the GitHub Actions logs for details.") + # Build failed - this is a PERMANENT failure + message = (f"Build failed: '{expected_workflow}' finished with conclusion " + f"'{workflow_run.conclusion}'. Check the GitHub Actions logs for details.") + return (message, False) # Not retryable else: # Build succeeded but artifact not found - may have expired - return (f"Artifact not found: '{expected_workflow}' completed successfully, " - f"but no artifact was found. The artifact may have expired (GitHub deletes " - f"artifacts after a retention period) or was not uploaded properly.") + # This is a PERMANENT failure + message = (f"Artifact not found: '{expected_workflow}' completed successfully, " + f"but no artifact was found. The artifact may have expired (GitHub deletes " + f"artifacts after a retention period) or was not uploaded properly.") + return (message, False) # Not retryable if not workflow_found: - return (f"No workflow run found: '{expected_workflow}' has not run for commit {commit_sha[:7]}. " - f"This may indicate the workflow was not triggered or is queued.") + # No workflow run found - could be queued or not triggered + # This is RETRYABLE (workflow might be queued or path filters excluded it) + message = (f"No workflow run found: '{expected_workflow}' has not run for commit " + f"{commit_sha[:7]}. The workflow may be queued, or was not triggered " + f"due to path filters. 
diff --git a/tests/test_ci/test_controllers.py b/tests/test_ci/test_controllers.py
index 59db5bc2..0470e219 100644
--- a/tests/test_ci/test_controllers.py
+++ b/tests/test_ci/test_controllers.py
@@ -3248,3 +3248,220 @@ def test_parse_gcp_error_error_not_a_dict(self):
         error_msg = parse_gcp_error(result, log=mock_log)
         self.assertIn("VM creation failed", error_msg)
         mock_log.error.assert_called_once()
+
+
+class TestDiagnoseMissingArtifact(BaseTestCase):
+    """Test the _diagnose_missing_artifact function and retry behavior."""
+
+    def test_build_in_progress_is_retryable(self):
+        """Test that build in progress returns retryable=True."""
+        from mod_ci.controllers import _diagnose_missing_artifact
+
+        repository = MagicMock()
+        log = MagicMock()
+
+        # Mock workflow
+        workflow = MagicMock()
+        workflow.id = 1
+        workflow.name = "Build CCExtractor on Linux"
+        repository.get_workflows.return_value = [workflow]
+
+        # Mock workflow run - in progress
+        workflow_run = MagicMock()
+        workflow_run.workflow_id = 1
+        workflow_run.status = "in_progress"
+        repository.get_workflow_runs.return_value = [workflow_run]
+
+        message, is_retryable = _diagnose_missing_artifact(
+            repository, "abc123", TestPlatform.linux, log
+        )
+
+        self.assertTrue(is_retryable)
+        self.assertIn("Build still in progress", message)
+        self.assertIn("Will retry", message)
+
+    def test_build_queued_is_retryable(self):
+        """Test that queued build returns retryable=True."""
+        from mod_ci.controllers import _diagnose_missing_artifact
+
+        repository = MagicMock()
+        log = MagicMock()
+
+        # Mock workflow
+        workflow = MagicMock()
+        workflow.id = 1
+        workflow.name = "Build CCExtractor on Windows"
+        repository.get_workflows.return_value = [workflow]
+
+        # Mock workflow run - queued
+        workflow_run = MagicMock()
+        workflow_run.workflow_id = 1
+        workflow_run.status = "queued"
+        repository.get_workflow_runs.return_value = [workflow_run]
+
+        message, is_retryable = _diagnose_missing_artifact(
+            repository, "abc123", TestPlatform.windows, log
+        )
+
+        self.assertTrue(is_retryable)
+        self.assertIn("Build still in progress", message)
+
+    def test_build_failed_is_not_retryable(self):
+        """Test that failed build returns retryable=False."""
+        from mod_ci.controllers import _diagnose_missing_artifact
+
+        repository = MagicMock()
+        log = MagicMock()
+
+        # Mock workflow
+        workflow = MagicMock()
+        workflow.id = 1
+        workflow.name = "Build CCExtractor on Linux"
+        repository.get_workflows.return_value = [workflow]
+
+        # Mock workflow run - completed but failed
+        workflow_run = MagicMock()
+        workflow_run.workflow_id = 1
+        workflow_run.status = "completed"
+        workflow_run.conclusion = "failure"
+        repository.get_workflow_runs.return_value = [workflow_run]
+
+        message, is_retryable = _diagnose_missing_artifact(
+            repository, "abc123", TestPlatform.linux, log
+        )
+
+        self.assertFalse(is_retryable)
+        self.assertIn("Build failed", message)
+
+    def test_artifact_expired_is_not_retryable(self):
+        """Test that expired artifact returns retryable=False."""
+        from mod_ci.controllers import _diagnose_missing_artifact
+
+        repository = MagicMock()
+        log = MagicMock()
+
+        # Mock workflow
+        workflow = MagicMock()
+        workflow.id = 1
+        workflow.name = "Build CCExtractor on Linux"
+        repository.get_workflows.return_value = [workflow]
+
+        # Mock workflow run - completed successfully but artifact gone
+        workflow_run = MagicMock()
+        workflow_run.workflow_id = 1
+        workflow_run.status = "completed"
+        workflow_run.conclusion = "success"
+        repository.get_workflow_runs.return_value = [workflow_run]
+
+        message, is_retryable = _diagnose_missing_artifact(
+            repository, "abc123", TestPlatform.linux, log
+        )
+
+        self.assertFalse(is_retryable)
+        self.assertIn("Artifact not found", message)
+        self.assertIn("expired", message)
+
+    def test_no_workflow_run_is_retryable(self):
+        """Test that missing workflow run returns retryable=True."""
+        from mod_ci.controllers import _diagnose_missing_artifact
+
+        repository = MagicMock()
+        log = MagicMock()
+
+        # Mock workflow
+        workflow = MagicMock()
+        workflow.id = 1
+        workflow.name = "Build CCExtractor on Linux"
+        repository.get_workflows.return_value = [workflow]
+
+        # No workflow runs for this commit
+        repository.get_workflow_runs.return_value = []
+
+        message, is_retryable = _diagnose_missing_artifact(
+            repository, "abc123", TestPlatform.linux, log
+        )
+
+        self.assertTrue(is_retryable)
+        self.assertIn("No workflow run found", message)
+        self.assertIn("Will retry", message)
+
+    def test_diagnostic_exception_is_retryable(self):
+        """Test that diagnostic failures default to retryable=True."""
+        from mod_ci.controllers import _diagnose_missing_artifact
+
+        repository = MagicMock()
+        log = MagicMock()
+
+        # Make get_workflows throw an exception
+        repository.get_workflows.side_effect = Exception("API error")
+
+        message, is_retryable = _diagnose_missing_artifact(
+            repository, "abc123", TestPlatform.linux, log
+        )
+
+        self.assertTrue(is_retryable)
+        self.assertIn("diagnostic check failed", message)
+
+
+class TestStartTestRetryBehavior(BaseTestCase):
+    """Test that start_test correctly handles retryable vs permanent failures."""
+
+    @mock.patch('mod_ci.controllers.find_artifact_for_commit')
+    @mock.patch('mod_ci.controllers._diagnose_missing_artifact')
+    @mock.patch('mod_ci.controllers.mark_test_failed')
+    @mock.patch('mod_ci.controllers.GcpInstance')
+    @mock.patch('mod_ci.controllers.TestProgress')
+    @mock.patch('run.log')
+    def test_retryable_failure_does_not_mark_test_failed(
+            self, mock_log, mock_progress, mock_gcp, mock_mark_failed,
+            mock_diagnose, mock_find_artifact):
+        """Test that retryable failures don't mark the test as failed."""
+        from mod_ci.controllers import start_test
+
+        # Setup: artifact not found, but build is in progress (retryable)
+        mock_find_artifact.return_value = None
+        mock_diagnose.return_value = ("Build still in progress", True)
+
+        # Mock locking checks
+        mock_gcp.query.filter.return_value.first.return_value = None
+        mock_progress.query.filter.return_value.first.return_value = None
+
+        test = Test.query.first()
+        repository = MagicMock()
+
+        start_test(MagicMock(), self.app, g.db, repository, test, "token")
+
+        # Should NOT call mark_test_failed for retryable failure
+        mock_mark_failed.assert_not_called()
+        # Should log info, not critical
+        mock_log.info.assert_called()
+
+    @mock.patch('mod_ci.controllers.find_artifact_for_commit')
+    @mock.patch('mod_ci.controllers._diagnose_missing_artifact')
+    @mock.patch('mod_ci.controllers.mark_test_failed')
+    @mock.patch('mod_ci.controllers.GcpInstance')
+    @mock.patch('mod_ci.controllers.TestProgress')
+    @mock.patch('run.log')
+    def test_permanent_failure_marks_test_failed(
+            self, mock_log, mock_progress, mock_gcp, mock_mark_failed,
+            mock_diagnose, mock_find_artifact):
+        """Test that permanent failures mark the test as failed."""
+        from mod_ci.controllers import start_test
+
+        # Setup: artifact not found, build failed (not retryable)
+        mock_find_artifact.return_value = None
+        mock_diagnose.return_value = ("Build failed with conclusion 'failure'", False)
+
+        # Mock locking checks
+        mock_gcp.query.filter.return_value.first.return_value = None
+        mock_progress.query.filter.return_value.first.return_value = None
+
+        test = Test.query.first()
+        repository = MagicMock()
+
+        start_test(MagicMock(), self.app, g.db, repository, test, "token")
+
+        # SHOULD call mark_test_failed for permanent failure
+        mock_mark_failed.assert_called_once()
+        # Should log critical
+        mock_log.critical.assert_called()