test(eval): add readiness regression fixtures

haasonsaas · haasonsaas · commit 2b7cdb94eaf7 · 2026-03-15T00:04:32.000-07:00
diff --git a/TODO.md b/TODO.md
@@ -140,7 +140,7 @@ This roadmap is derived from deep research into Greptile's public docs, blog, MC
 ## 10. Eval, Benchmarking, and Model Governance
 
 91. [ ] Add eval fixtures for external-context alignment, not just diff-local correctness.
-92. [ ] Add eval fixtures for merge-readiness judgments and unresolved-blocker classification.
+92. [x] Add eval fixtures for merge-readiness judgments and unresolved-blocker classification.
 93. [ ] Add eval fixtures for addressed-vs-stale finding lifecycle inference.
 94. [x] Add eval fixtures for multi-hop graph reasoning across call chains and contract edges.
 95. [ ] Add eval runs that compare single-pass review against agentic loop review.
diff --git a/eval/fixtures/repo_regressions/readiness_current_head_stale.yml b/eval/fixtures/repo_regressions/readiness_current_head_stale.yml
@@ -0,0 +1,36 @@
+name: repo regression - current head staleness ignored
+repo_path: ../../..
+diff: |
+  diff --git a/src/server/pr_readiness.rs b/src/server/pr_readiness.rs
+  index 2222222..deadbeef 100644
+  --- a/src/server/pr_readiness.rs
+  +++ b/src/server/pr_readiness.rs
+  @@ -145,12 +145,12 @@ pub(crate) fn is_review_stale(
+          .github_head_sha
+          .as_ref()
+          .zip(latest_by_source.get(&session.diff_source))
+          .is_some_and(|(current_head, latest)| latest.head_sha != *current_head);
+      let current_head_stale = session
+          .github_head_sha
+          .as_deref()
+          .zip(current_head_sha)
+          .is_some_and(|(reviewed_head, current_head)| reviewed_head != current_head);
+  
+  -    latest_known_head_stale || current_head_stale
+  +    latest_known_head_stale
+   }
+expect:
+  must_find:
+    - file: src/server/pr_readiness.rs
+      contains_any:
+        - stale reviews can be treated as fresh when the current head sha changes
+        - dropping current_head_stale means stale reviews can stay marked ready
+        - readiness will miss a newer pr head when no newer review session exists
+      rule_id: bug.readiness.current-head-staleness
+  must_not_find:
+    - contains: style
+  summary:
+    merge_readiness: NeedsAttention
+    min_open_blockers: 1
+  min_total: 1
+  max_total: 8
diff --git a/eval/fixtures/repo_regressions/readiness_inconclusive_verification.yml b/eval/fixtures/repo_regressions/readiness_inconclusive_verification.yml
@@ -0,0 +1,36 @@
+name: repo regression - inconclusive verification no longer blocks readiness
+repo_path: ../../..
+diff: |
+  diff --git a/src/core/comment/summary.rs b/src/core/comment/summary.rs
+  index 3333333..deadbeef 100644
+  --- a/src/core/comment/summary.rs
+  +++ b/src/core/comment/summary.rs
+  @@ -114,13 +114,7 @@ pub(super) fn apply_review_runtime_state(
+      );
+  
+      let mut reasons = Vec::new();
+  -    if matches!(
+  -        summary.verification.state,
+  -        ReviewVerificationState::Inconclusive
+  -    ) {
+  -        reasons.push("verification was inconclusive or fail-open; rerun this review".to_string());
+  -    }
+      if stale_review {
+          reasons.push("new commits landed after this review".to_string());
+      }
+      summary.readiness_reasons = reasons;
+expect:
+  must_find:
+    - file: src/core/comment/summary.rs
+      contains_any:
+        - inconclusive verification will no longer force needs re-review
+        - fail-open verification warnings must still block merge readiness
+        - verification can be inconclusive but this change lets the summary stay ready or needs attention
+      rule_id: bug.readiness.inconclusive-verification
+  must_not_find:
+    - contains: style
+  summary:
+    merge_readiness: NeedsAttention
+    min_open_blockers: 1
+  min_total: 1
+  max_total: 8
diff --git a/eval/fixtures/repo_regressions/readiness_informational_blocker_classification.yml b/eval/fixtures/repo_regressions/readiness_informational_blocker_classification.yml
@@ -0,0 +1,37 @@
+name: repo regression - informational findings counted as blockers
+repo_path: ../../..
+diff: |
+  diff --git a/src/core/comment/summary.rs b/src/core/comment/summary.rs
+  index 1111111..deadbeef 100644
+  --- a/src/core/comment/summary.rs
+  +++ b/src/core/comment/summary.rs
+  @@ -34,13 +34,13 @@ pub(super) fn generate_summary(comments: &[Comment]) -> ReviewSummary {
+          match comment.status {
+              CommentStatus::Open => {
+                  open_comments += 1;
+                  *open_by_severity
+                      .entry(comment.severity.to_string())
+                      .or_insert(0) += 1;
+  -                if comment.severity.is_blocking() {
+  +                if comment.severity.is_blocking() || comment.severity.is_informational() {
+                      open_blocking_comments += 1;
+                      open_blockers += 1;
+                  }
+                  if comment.severity.is_informational() {
+                      open_informational_comments += 1;
+                  }
+expect:
+  must_find:
+    - file: src/core/comment/summary.rs
+      contains_any:
+        - informational findings will be counted as blockers
+        - info and suggestion comments should not increase open_blockers
+        - merge readiness becomes too strict because informational findings are treated as blocking
+      rule_id: bug.readiness.informational-blocker-classification
+  must_not_find:
+    - contains: style
+  summary:
+    merge_readiness: NeedsAttention
+    min_open_blockers: 1
+  min_total: 1
+  max_total: 8
diff --git a/src/commands/eval/fixtures.rs b/src/commands/eval/fixtures.rs
@@ -224,6 +224,100 @@ expect:
         );
     }
 
+    #[test]
+    fn test_checked_in_readiness_blocker_fixture_loads_summary_expectations() {
+        let fixture_path = std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join(
+            "eval/fixtures/repo_regressions/readiness_informational_blocker_classification.yml",
+        );
+
+        let fixtures = load_eval_fixtures_from_path(&fixture_path).unwrap();
+
+        assert_eq!(fixtures.len(), 1);
+        assert_eq!(
+            fixtures[0].fixture.name.as_deref(),
+            Some("repo regression - informational findings counted as blockers")
+        );
+        assert_eq!(
+            fixtures[0].fixture.expect.must_find[0].rule_id.as_deref(),
+            Some("bug.readiness.informational-blocker-classification")
+        );
+        assert_eq!(
+            fixtures[0]
+                .fixture
+                .expect
+                .summary
+                .merge_readiness
+                .as_deref(),
+            Some("NeedsAttention")
+        );
+        assert_eq!(
+            fixtures[0].fixture.expect.summary.min_open_blockers,
+            Some(1)
+        );
+    }
+
+    #[test]
+    fn test_checked_in_current_head_stale_fixture_loads_summary_expectations() {
+        let fixture_path = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
+            .join("eval/fixtures/repo_regressions/readiness_current_head_stale.yml");
+
+        let fixtures = load_eval_fixtures_from_path(&fixture_path).unwrap();
+
+        assert_eq!(fixtures.len(), 1);
+        assert_eq!(
+            fixtures[0].fixture.name.as_deref(),
+            Some("repo regression - current head staleness ignored")
+        );
+        assert_eq!(
+            fixtures[0].fixture.expect.must_find[0].rule_id.as_deref(),
+            Some("bug.readiness.current-head-staleness")
+        );
+        assert_eq!(
+            fixtures[0]
+                .fixture
+                .expect
+                .summary
+                .merge_readiness
+                .as_deref(),
+            Some("NeedsAttention")
+        );
+        assert_eq!(
+            fixtures[0].fixture.expect.summary.min_open_blockers,
+            Some(1)
+        );
+    }
+
+    #[test]
+    fn test_checked_in_inconclusive_verification_fixture_loads_summary_expectations() {
+        let fixture_path = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
+            .join("eval/fixtures/repo_regressions/readiness_inconclusive_verification.yml");
+
+        let fixtures = load_eval_fixtures_from_path(&fixture_path).unwrap();
+
+        assert_eq!(fixtures.len(), 1);
+        assert_eq!(
+            fixtures[0].fixture.name.as_deref(),
+            Some("repo regression - inconclusive verification no longer blocks readiness")
+        );
+        assert_eq!(
+            fixtures[0].fixture.expect.must_find[0].rule_id.as_deref(),
+            Some("bug.readiness.inconclusive-verification")
+        );
+        assert_eq!(
+            fixtures[0]
+                .fixture
+                .expect
+                .summary
+                .merge_readiness
+                .as_deref(),
+            Some("NeedsAttention")
+        );
+        assert_eq!(
+            fixtures[0].fixture.expect.summary.min_open_blockers,
+            Some(1)
+        );
+    }
+
     #[test]
     fn test_collect_eval_fixtures_expands_pack_entries_in_sorted_order() {
         let dir = tempdir().unwrap();
diff --git a/src/commands/eval/fixtures/packs.rs b/src/commands/eval/fixtures/packs.rs
@@ -56,6 +56,7 @@ pub(super) fn expand_community_fixture_pack(
                         .collect(),
                     min_total: fixture.min_total,
                     max_total: fixture.max_total,
+                    summary: Default::default(),
                 },
             };
             validate_eval_fixture(&eval_fixture)?;
diff --git a/src/commands/eval/fixtures/validation.rs b/src/commands/eval/fixtures/validation.rs
@@ -4,6 +4,7 @@ use regex::Regex;
 use super::super::EvalFixture;
 
 pub(super) fn validate_eval_fixture(fixture: &EvalFixture) -> Result<()> {
+    let fixture_name = fixture.name.as_deref().unwrap_or("<unnamed>");
     for pattern in fixture
         .expect
         .must_find
@@ -16,12 +17,13 @@ pub(super) fn validate_eval_fixture(fixture: &EvalFixture) -> Result<()> {
                     anyhow::anyhow!(
                         "Invalid regex '{}' in fixture '{}': {}",
                         pattern_text,
-                        fixture.name.as_deref().unwrap_or("<unnamed>"),
+                        fixture_name,
                         error
                     )
                 })?;
             }
         }
     }
+    fixture.expect.summary.validate(fixture_name)?;
     Ok(())
 }
diff --git a/src/commands/eval/runner/execute/dag.rs b/src/commands/eval/runner/execute/dag.rs
@@ -20,8 +20,8 @@ use super::artifact::{
 use super::loading::PreparedFixtureExecution;
 use super::repro::maybe_run_reproduction_validation;
 use super::result::{
-    append_total_comment_failures, build_benchmark_metrics, convert_agent_activity,
-    convert_verification_report, FixtureResultDetails,
+    append_review_summary_failures, append_total_comment_failures, build_benchmark_metrics,
+    convert_agent_activity, convert_verification_report, FixtureResultDetails,
 };
 
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
@@ -439,11 +439,13 @@ fn spawn_stage(
             let Some(_) = context.match_summary.as_ref() else {
                 anyhow::bail!("comment count validation requires expectation matches");
             };
+            let review_summary = core::CommentSynthesizer::generate_summary(&context.comments);
             let total_comments = context.total_comments;
             let expectations = context.prepared.fixture.expect.clone();
             let mut failures = context.failures.clone();
             Ok(async move {
                 append_total_comment_failures(&mut failures, total_comments, &expectations);
+                append_review_summary_failures(&mut failures, &review_summary, &expectations);
                 Ok(EvalFixtureStageOutput::CommentCountValidation { failures })
             }
             .boxed())
diff --git a/src/commands/eval/runner/execute/result.rs b/src/commands/eval/runner/execute/result.rs
@@ -29,6 +29,14 @@ pub(super) fn append_total_comment_failures(
     }
 }
 
+pub(super) fn append_review_summary_failures(
+    failures: &mut Vec<String>,
+    summary: &crate::core::comment::ReviewSummary,
+    expectations: &EvalExpectations,
+) {
+    expectations.summary.append_failures(failures, summary);
+}
+
 pub(super) fn build_benchmark_metrics(
     prepared: &PreparedFixtureExecution,
     total_comments: usize,
diff --git a/src/commands/eval/types/pattern.rs b/src/commands/eval/types/pattern.rs

Original file line number	Diff line number	Diff line change
`@@ -4,6 +4,7 @@ use regex::Regex;`
`4`	`4`	`use super::super::EvalFixture;`
`5`	`5`
`6`	`6`	`pub(super) fn validate_eval_fixture(fixture: &EvalFixture) -> Result<()> {`
	`7`	`+ let fixture_name = fixture.name.as_deref().unwrap_or("<unnamed>");`
`7`	`8`	`for pattern in fixture`
`8`	`9`	`.expect`
`9`	`10`	`.must_find`
`@@ -16,12 +17,13 @@ pub(super) fn validate_eval_fixture(fixture: &EvalFixture) -> Result<()> {`
`16`	`17`	`anyhow::anyhow!(`
`17`	`18`	`"Invalid regex '{}' in fixture '{}': {}",`
`18`	`19`	`pattern_text,`
`19`		`- fixture.name.as_deref().unwrap_or("<unnamed>"),`
	`20`	`+ fixture_name,`
`20`	`21`	`error`
`21`	`22`	`)`
`22`	`23`	`})?;`
`23`	`24`	`}`
`24`	`25`	`}`
`25`	`26`	`}`
	`27`	`+ fixture.expect.summary.validate(fixture_name)?;`
`26`	`28`	`Ok(())`
`27`	`29`	`}`
Original file line number	Diff line number	Diff line change
`@@ -29,6 +29,14 @@ pub(super) fn append_total_comment_failures(`
`29`	`29`	`}`
`30`	`30`	`}`
`31`	`31`
	`32`	`+pub(super) fn append_review_summary_failures(`
	`33`	`+ failures: &mut Vec<String>,`
	`34`	`+ summary: &crate::core::comment::ReviewSummary,`
	`35`	`+ expectations: &EvalExpectations,`
	`36`	`+) {`
	`37`	`+ expectations.summary.append_failures(failures, summary);`
	`38`	`+}`
	`39`	`+`
`32`	`40`	`pub(super) fn build_benchmark_metrics(`
`33`	`41`	`prepared: &PreparedFixtureExecution,`
`34`	`42`	`total_comments: usize,`