Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 40 additions & 16 deletions mod_ci/controllers.py
Original file line number Diff line number Diff line change
Expand Up @@ -616,17 +616,21 @@ def mark_test_failed(db, test, repository, message: str) -> bool:
return db_success and github_success


def _diagnose_missing_artifact(repository, commit_sha: str, platform, log) -> str:
def _diagnose_missing_artifact(repository, commit_sha: str, platform, log) -> tuple:
"""
Diagnose why an artifact was not found for a commit.

Checks the workflow run status to provide a more helpful error message.
Checks the workflow run status to provide a more helpful error message and
indicate whether this is a retryable situation (build still in progress)
or a permanent failure (build failed, artifact expired).

:param repository: GitHub repository object
:param commit_sha: The commit SHA to check
:param platform: The platform (TestPlatform.linux or TestPlatform.windows)
:param log: Logger instance
:return: A descriptive error message
:return: Tuple of (error_message: str, is_retryable: bool)
is_retryable=True means the test should NOT be marked as failed
and should be left for the next cron cycle to retry
"""
if platform == TestPlatform.linux:
expected_workflow = Workflow_builds.LINUX
Expand All @@ -648,26 +652,38 @@ def _diagnose_missing_artifact(repository, commit_sha: str, platform, log) -> st

workflow_found = True
if workflow_run.status != "completed":
return (f"Build still in progress: '{expected_workflow}' is {workflow_run.status}. "
f"Please wait for the build to complete and retry.")
# Build is still running - this is RETRYABLE
# Don't mark as failed, let the next cron cycle retry
message = (f"Build still in progress: '{expected_workflow}' is {workflow_run.status}. "
f"Will retry when build completes.")
return (message, True) # Retryable
elif workflow_run.conclusion != "success":
return (f"Build failed: '{expected_workflow}' finished with conclusion '{workflow_run.conclusion}'. "
f"Check the GitHub Actions logs for details.")
# Build failed - this is a PERMANENT failure
message = (f"Build failed: '{expected_workflow}' finished with conclusion "
f"'{workflow_run.conclusion}'. Check the GitHub Actions logs for details.")
return (message, False) # Not retryable
else:
# Build succeeded but artifact not found - may have expired
return (f"Artifact not found: '{expected_workflow}' completed successfully, "
f"but no artifact was found. The artifact may have expired (GitHub deletes "
f"artifacts after a retention period) or was not uploaded properly.")
# This is a PERMANENT failure
message = (f"Artifact not found: '{expected_workflow}' completed successfully, "
f"but no artifact was found. The artifact may have expired (GitHub deletes "
f"artifacts after a retention period) or was not uploaded properly.")
return (message, False) # Not retryable

if not workflow_found:
return (f"No workflow run found: '{expected_workflow}' has not run for commit {commit_sha[:7]}. "
f"This may indicate the workflow was not triggered or is queued.")
# No workflow run found - could be queued or not triggered
# This is RETRYABLE (workflow might be queued or path filters excluded it)
message = (f"No workflow run found: '{expected_workflow}' has not run for commit "
f"{commit_sha[:7]}. The workflow may be queued, or was not triggered "
f"due to path filters. Will retry.")
return (message, True) # Retryable

except Exception as e:
log.warning(f"Failed to diagnose missing artifact: {e}")
return f"No build artifact found for this commit (diagnostic check failed: {e})"
# On diagnostic failure, assume retryable to be safe
return (f"No build artifact found for this commit (diagnostic check failed: {e})", True)

return "No build artifact found for this commit"
return ("No build artifact found for this commit", True) # Default to retryable


def start_test(compute, app, db, repository: Repository.Repository, test, bot_token) -> None:
Expand Down Expand Up @@ -806,8 +822,16 @@ def start_test(compute, app, db, repository: Repository.Repository, test, bot_to
artifact = find_artifact_for_commit(repository, test.commit, test.platform, log)

if artifact is None:
# Use diagnostic function to provide detailed error message
error_detail = _diagnose_missing_artifact(repository, test.commit, test.platform, log)
# Use diagnostic function to determine if this is retryable
error_detail, is_retryable = _diagnose_missing_artifact(repository, test.commit, test.platform, log)

if is_retryable:
# Build is still in progress or workflow is queued - don't mark as failed
# Just return and let the next cron cycle retry this test
log.info(f"Test {test.id}: {error_detail}")
return

# Permanent failure - mark the test as failed
log.critical(f"Test {test.id}: Could not find artifact for commit {test.commit[:8]}: {error_detail}")
mark_test_failed(db, test, repository, error_detail)
return
Expand Down
217 changes: 217 additions & 0 deletions tests/test_ci/test_controllers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3248,3 +3248,220 @@ def test_parse_gcp_error_error_not_a_dict(self):
error_msg = parse_gcp_error(result, log=mock_log)
self.assertIn("VM creation failed", error_msg)
mock_log.error.assert_called_once()


class TestDiagnoseMissingArtifact(BaseTestCase):
"""Test the _diagnose_missing_artifact function and retry behavior."""

def test_build_in_progress_is_retryable(self):
"""Test that build in progress returns retryable=True."""
from mod_ci.controllers import _diagnose_missing_artifact

repository = MagicMock()
log = MagicMock()

# Mock workflow
workflow = MagicMock()
workflow.id = 1
workflow.name = "Build CCExtractor on Linux"
repository.get_workflows.return_value = [workflow]

# Mock workflow run - in progress
workflow_run = MagicMock()
workflow_run.workflow_id = 1
workflow_run.status = "in_progress"
repository.get_workflow_runs.return_value = [workflow_run]

message, is_retryable = _diagnose_missing_artifact(
repository, "abc123", TestPlatform.linux, log
)

self.assertTrue(is_retryable)
self.assertIn("Build still in progress", message)
self.assertIn("Will retry", message)

def test_build_queued_is_retryable(self):
"""Test that queued build returns retryable=True."""
from mod_ci.controllers import _diagnose_missing_artifact

repository = MagicMock()
log = MagicMock()

# Mock workflow
workflow = MagicMock()
workflow.id = 1
workflow.name = "Build CCExtractor on Windows"
repository.get_workflows.return_value = [workflow]

# Mock workflow run - queued
workflow_run = MagicMock()
workflow_run.workflow_id = 1
workflow_run.status = "queued"
repository.get_workflow_runs.return_value = [workflow_run]

message, is_retryable = _diagnose_missing_artifact(
repository, "abc123", TestPlatform.windows, log
)

self.assertTrue(is_retryable)
self.assertIn("Build still in progress", message)

def test_build_failed_is_not_retryable(self):
"""Test that failed build returns retryable=False."""
from mod_ci.controllers import _diagnose_missing_artifact

repository = MagicMock()
log = MagicMock()

# Mock workflow
workflow = MagicMock()
workflow.id = 1
workflow.name = "Build CCExtractor on Linux"
repository.get_workflows.return_value = [workflow]

# Mock workflow run - completed but failed
workflow_run = MagicMock()
workflow_run.workflow_id = 1
workflow_run.status = "completed"
workflow_run.conclusion = "failure"
repository.get_workflow_runs.return_value = [workflow_run]

message, is_retryable = _diagnose_missing_artifact(
repository, "abc123", TestPlatform.linux, log
)

self.assertFalse(is_retryable)
self.assertIn("Build failed", message)

def test_artifact_expired_is_not_retryable(self):
"""Test that expired artifact returns retryable=False."""
from mod_ci.controllers import _diagnose_missing_artifact

repository = MagicMock()
log = MagicMock()

# Mock workflow
workflow = MagicMock()
workflow.id = 1
workflow.name = "Build CCExtractor on Linux"
repository.get_workflows.return_value = [workflow]

# Mock workflow run - completed successfully but artifact gone
workflow_run = MagicMock()
workflow_run.workflow_id = 1
workflow_run.status = "completed"
workflow_run.conclusion = "success"
repository.get_workflow_runs.return_value = [workflow_run]

message, is_retryable = _diagnose_missing_artifact(
repository, "abc123", TestPlatform.linux, log
)

self.assertFalse(is_retryable)
self.assertIn("Artifact not found", message)
self.assertIn("expired", message)

def test_no_workflow_run_is_retryable(self):
"""Test that missing workflow run returns retryable=True."""
from mod_ci.controllers import _diagnose_missing_artifact

repository = MagicMock()
log = MagicMock()

# Mock workflow
workflow = MagicMock()
workflow.id = 1
workflow.name = "Build CCExtractor on Linux"
repository.get_workflows.return_value = [workflow]

# No workflow runs for this commit
repository.get_workflow_runs.return_value = []

message, is_retryable = _diagnose_missing_artifact(
repository, "abc123", TestPlatform.linux, log
)

self.assertTrue(is_retryable)
self.assertIn("No workflow run found", message)
self.assertIn("Will retry", message)

def test_diagnostic_exception_is_retryable(self):
"""Test that diagnostic failures default to retryable=True."""
from mod_ci.controllers import _diagnose_missing_artifact

repository = MagicMock()
log = MagicMock()

# Make get_workflows throw an exception
repository.get_workflows.side_effect = Exception("API error")

message, is_retryable = _diagnose_missing_artifact(
repository, "abc123", TestPlatform.linux, log
)

self.assertTrue(is_retryable)
self.assertIn("diagnostic check failed", message)


class TestStartTestRetryBehavior(BaseTestCase):
"""Test that start_test correctly handles retryable vs permanent failures."""

@mock.patch('mod_ci.controllers.find_artifact_for_commit')
@mock.patch('mod_ci.controllers._diagnose_missing_artifact')
@mock.patch('mod_ci.controllers.mark_test_failed')
@mock.patch('mod_ci.controllers.GcpInstance')
@mock.patch('mod_ci.controllers.TestProgress')
@mock.patch('run.log')
def test_retryable_failure_does_not_mark_test_failed(
self, mock_log, mock_progress, mock_gcp, mock_mark_failed,
mock_diagnose, mock_find_artifact):
"""Test that retryable failures don't mark the test as failed."""
from mod_ci.controllers import start_test

# Setup: artifact not found, but build is in progress (retryable)
mock_find_artifact.return_value = None
mock_diagnose.return_value = ("Build still in progress", True)

# Mock locking checks
mock_gcp.query.filter.return_value.first.return_value = None
mock_progress.query.filter.return_value.first.return_value = None

test = Test.query.first()
repository = MagicMock()

start_test(MagicMock(), self.app, g.db, repository, test, "token")

# Should NOT call mark_test_failed for retryable failure
mock_mark_failed.assert_not_called()
# Should log info, not critical
mock_log.info.assert_called()

@mock.patch('mod_ci.controllers.find_artifact_for_commit')
@mock.patch('mod_ci.controllers._diagnose_missing_artifact')
@mock.patch('mod_ci.controllers.mark_test_failed')
@mock.patch('mod_ci.controllers.GcpInstance')
@mock.patch('mod_ci.controllers.TestProgress')
@mock.patch('run.log')
def test_permanent_failure_marks_test_failed(
self, mock_log, mock_progress, mock_gcp, mock_mark_failed,
mock_diagnose, mock_find_artifact):
"""Test that permanent failures mark the test as failed."""
from mod_ci.controllers import start_test

# Setup: artifact not found, build failed (not retryable)
mock_find_artifact.return_value = None
mock_diagnose.return_value = ("Build failed with conclusion 'failure'", False)

# Mock locking checks
mock_gcp.query.filter.return_value.first.return_value = None
mock_progress.query.filter.return_value.first.return_value = None

test = Test.query.first()
repository = MagicMock()

start_test(MagicMock(), self.app, g.db, repository, test, "token")

# SHOULD call mark_test_failed for permanent failure
mock_mark_failed.assert_called_once()
# Should log critical
mock_log.critical.assert_called()