Skip to content

Commit 2b7cdb9

Browse files
committed
test(eval): add readiness regression fixtures
1 parent 4771e5c commit 2b7cdb9

File tree

10 files changed

+395
-4
lines changed

10 files changed

+395
-4
lines changed

TODO.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@ This roadmap is derived from deep research into Greptile's public docs, blog, MC
140140
## 10. Eval, Benchmarking, and Model Governance
141141

142142
91. [ ] Add eval fixtures for external-context alignment, not just diff-local correctness.
143-
92. [ ] Add eval fixtures for merge-readiness judgments and unresolved-blocker classification.
143+
92. [x] Add eval fixtures for merge-readiness judgments and unresolved-blocker classification.
144144
93. [ ] Add eval fixtures for addressed-vs-stale finding lifecycle inference.
145145
94. [x] Add eval fixtures for multi-hop graph reasoning across call chains and contract edges.
146146
95. [ ] Add eval runs that compare single-pass review against agentic loop review.
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
name: repo regression - current head staleness ignored
2+
repo_path: ../../..
3+
diff: |
4+
diff --git a/src/server/pr_readiness.rs b/src/server/pr_readiness.rs
5+
index 2222222..deadbeef 100644
6+
--- a/src/server/pr_readiness.rs
7+
+++ b/src/server/pr_readiness.rs
8+
@@ -145,11 +145,11 @@ pub(crate) fn is_review_stale(
9+
.github_head_sha
10+
.as_ref()
11+
.zip(latest_by_source.get(&session.diff_source))
12+
.is_some_and(|(current_head, latest)| latest.head_sha != *current_head);
13+
let current_head_stale = session
14+
.github_head_sha
15+
.as_deref()
16+
.zip(current_head_sha)
17+
.is_some_and(|(reviewed_head, current_head)| reviewed_head != current_head);
18+
19+
- latest_known_head_stale || current_head_stale
20+
+ latest_known_head_stale
21+
}
22+
expect:
23+
must_find:
24+
- file: src/server/pr_readiness.rs
25+
contains_any:
26+
- stale reviews can be treated as fresh when the current head sha changes
27+
- dropping current_head_stale means stale reviews can stay marked ready
28+
- readiness will miss a newer pr head when no newer review session exists
29+
rule_id: bug.readiness.current-head-staleness
30+
must_not_find:
31+
- contains: style
32+
summary:
33+
merge_readiness: NeedsAttention
34+
min_open_blockers: 1
35+
min_total: 1
36+
max_total: 8
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
name: repo regression - inconclusive verification no longer blocks readiness
2+
repo_path: ../../..
3+
diff: |
4+
diff --git a/src/core/comment/summary.rs b/src/core/comment/summary.rs
5+
index 3333333..deadbeef 100644
6+
--- a/src/core/comment/summary.rs
7+
+++ b/src/core/comment/summary.rs
8+
@@ -114,13 +114,8 @@ pub(super) fn apply_review_runtime_state(
9+
);
10+
11+
let mut reasons = Vec::new();
12+
- if matches!(
13+
- summary.verification.state,
14+
- ReviewVerificationState::Inconclusive
15+
- ) {
16+
- reasons.push("verification was inconclusive or fail-open; rerun this review".to_string());
17+
- }
18+
if stale_review {
19+
reasons.push("new commits landed after this review".to_string());
20+
}
21+
summary.readiness_reasons = reasons;
22+
expect:
23+
must_find:
24+
- file: src/core/comment/summary.rs
25+
contains_any:
26+
- inconclusive verification will no longer force needs re-review
27+
- fail-open verification warnings must still block merge readiness
28+
- verification can be inconclusive but this change lets the summary stay ready or needs attention
29+
rule_id: bug.readiness.inconclusive-verification
30+
must_not_find:
31+
- contains: style
32+
summary:
33+
merge_readiness: NeedsAttention
34+
min_open_blockers: 1
35+
min_total: 1
36+
max_total: 8
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
name: repo regression - informational findings counted as blockers
2+
repo_path: ../../..
3+
diff: |
4+
diff --git a/src/core/comment/summary.rs b/src/core/comment/summary.rs
5+
index 1111111..deadbeef 100644
6+
--- a/src/core/comment/summary.rs
7+
+++ b/src/core/comment/summary.rs
8+
@@ -34,14 +34,14 @@ pub(super) fn generate_summary(comments: &[Comment]) -> ReviewSummary {
9+
match comment.status {
10+
CommentStatus::Open => {
11+
open_comments += 1;
12+
*open_by_severity
13+
.entry(comment.severity.to_string())
14+
.or_insert(0) += 1;
15+
- if comment.severity.is_blocking() {
16+
+ if comment.severity.is_blocking() || comment.severity.is_informational() {
17+
open_blocking_comments += 1;
18+
open_blockers += 1;
19+
}
20+
if comment.severity.is_informational() {
21+
open_informational_comments += 1;
22+
}
23+
expect:
24+
must_find:
25+
- file: src/core/comment/summary.rs
26+
contains_any:
27+
- informational findings will be counted as blockers
28+
- info and suggestion comments should not increase open_blockers
29+
- merge readiness becomes too strict because informational findings are treated as blocking
30+
rule_id: bug.readiness.informational-blocker-classification
31+
must_not_find:
32+
- contains: style
33+
summary:
34+
merge_readiness: NeedsAttention
35+
min_open_blockers: 1
36+
min_total: 1
37+
max_total: 8

src/commands/eval/fixtures.rs

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,100 @@ expect:
224224
);
225225
}
226226

227+
#[test]
228+
fn test_checked_in_readiness_blocker_fixture_loads_summary_expectations() {
229+
let fixture_path = std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join(
230+
"eval/fixtures/repo_regressions/readiness_informational_blocker_classification.yml",
231+
);
232+
233+
let fixtures = load_eval_fixtures_from_path(&fixture_path).unwrap();
234+
235+
assert_eq!(fixtures.len(), 1);
236+
assert_eq!(
237+
fixtures[0].fixture.name.as_deref(),
238+
Some("repo regression - informational findings counted as blockers")
239+
);
240+
assert_eq!(
241+
fixtures[0].fixture.expect.must_find[0].rule_id.as_deref(),
242+
Some("bug.readiness.informational-blocker-classification")
243+
);
244+
assert_eq!(
245+
fixtures[0]
246+
.fixture
247+
.expect
248+
.summary
249+
.merge_readiness
250+
.as_deref(),
251+
Some("NeedsAttention")
252+
);
253+
assert_eq!(
254+
fixtures[0].fixture.expect.summary.min_open_blockers,
255+
Some(1)
256+
);
257+
}
258+
259+
#[test]
260+
fn test_checked_in_current_head_stale_fixture_loads_summary_expectations() {
261+
let fixture_path = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
262+
.join("eval/fixtures/repo_regressions/readiness_current_head_stale.yml");
263+
264+
let fixtures = load_eval_fixtures_from_path(&fixture_path).unwrap();
265+
266+
assert_eq!(fixtures.len(), 1);
267+
assert_eq!(
268+
fixtures[0].fixture.name.as_deref(),
269+
Some("repo regression - current head staleness ignored")
270+
);
271+
assert_eq!(
272+
fixtures[0].fixture.expect.must_find[0].rule_id.as_deref(),
273+
Some("bug.readiness.current-head-staleness")
274+
);
275+
assert_eq!(
276+
fixtures[0]
277+
.fixture
278+
.expect
279+
.summary
280+
.merge_readiness
281+
.as_deref(),
282+
Some("NeedsAttention")
283+
);
284+
assert_eq!(
285+
fixtures[0].fixture.expect.summary.min_open_blockers,
286+
Some(1)
287+
);
288+
}
289+
290+
#[test]
291+
fn test_checked_in_inconclusive_verification_fixture_loads_summary_expectations() {
292+
let fixture_path = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
293+
.join("eval/fixtures/repo_regressions/readiness_inconclusive_verification.yml");
294+
295+
let fixtures = load_eval_fixtures_from_path(&fixture_path).unwrap();
296+
297+
assert_eq!(fixtures.len(), 1);
298+
assert_eq!(
299+
fixtures[0].fixture.name.as_deref(),
300+
Some("repo regression - inconclusive verification no longer blocks readiness")
301+
);
302+
assert_eq!(
303+
fixtures[0].fixture.expect.must_find[0].rule_id.as_deref(),
304+
Some("bug.readiness.inconclusive-verification")
305+
);
306+
assert_eq!(
307+
fixtures[0]
308+
.fixture
309+
.expect
310+
.summary
311+
.merge_readiness
312+
.as_deref(),
313+
Some("NeedsAttention")
314+
);
315+
assert_eq!(
316+
fixtures[0].fixture.expect.summary.min_open_blockers,
317+
Some(1)
318+
);
319+
}
320+
227321
#[test]
228322
fn test_collect_eval_fixtures_expands_pack_entries_in_sorted_order() {
229323
let dir = tempdir().unwrap();

src/commands/eval/fixtures/packs.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ pub(super) fn expand_community_fixture_pack(
5656
.collect(),
5757
min_total: fixture.min_total,
5858
max_total: fixture.max_total,
59+
summary: Default::default(),
5960
},
6061
};
6162
validate_eval_fixture(&eval_fixture)?;

src/commands/eval/fixtures/validation.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ use regex::Regex;
44
use super::super::EvalFixture;
55

66
pub(super) fn validate_eval_fixture(fixture: &EvalFixture) -> Result<()> {
7+
let fixture_name = fixture.name.as_deref().unwrap_or("<unnamed>");
78
for pattern in fixture
89
.expect
910
.must_find
@@ -16,12 +17,13 @@ pub(super) fn validate_eval_fixture(fixture: &EvalFixture) -> Result<()> {
1617
anyhow::anyhow!(
1718
"Invalid regex '{}' in fixture '{}': {}",
1819
pattern_text,
19-
fixture.name.as_deref().unwrap_or("<unnamed>"),
20+
fixture_name,
2021
error
2122
)
2223
})?;
2324
}
2425
}
2526
}
27+
fixture.expect.summary.validate(fixture_name)?;
2628
Ok(())
2729
}

src/commands/eval/runner/execute/dag.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,8 @@ use super::artifact::{
2020
use super::loading::PreparedFixtureExecution;
2121
use super::repro::maybe_run_reproduction_validation;
2222
use super::result::{
23-
append_total_comment_failures, build_benchmark_metrics, convert_agent_activity,
24-
convert_verification_report, FixtureResultDetails,
23+
append_review_summary_failures, append_total_comment_failures, build_benchmark_metrics,
24+
convert_agent_activity, convert_verification_report, FixtureResultDetails,
2525
};
2626

2727
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
@@ -439,11 +439,13 @@ fn spawn_stage(
439439
let Some(_) = context.match_summary.as_ref() else {
440440
anyhow::bail!("comment count validation requires expectation matches");
441441
};
442+
let review_summary = core::CommentSynthesizer::generate_summary(&context.comments);
442443
let total_comments = context.total_comments;
443444
let expectations = context.prepared.fixture.expect.clone();
444445
let mut failures = context.failures.clone();
445446
Ok(async move {
446447
append_total_comment_failures(&mut failures, total_comments, &expectations);
448+
append_review_summary_failures(&mut failures, &review_summary, &expectations);
447449
Ok(EvalFixtureStageOutput::CommentCountValidation { failures })
448450
}
449451
.boxed())

src/commands/eval/runner/execute/result.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,14 @@ pub(super) fn append_total_comment_failures(
2929
}
3030
}
3131

32+
pub(super) fn append_review_summary_failures(
33+
failures: &mut Vec<String>,
34+
summary: &crate::core::comment::ReviewSummary,
35+
expectations: &EvalExpectations,
36+
) {
37+
expectations.summary.append_failures(failures, summary);
38+
}
39+
3240
pub(super) fn build_benchmark_metrics(
3341
prepared: &PreparedFixtureExecution,
3442
total_comments: usize,

0 commit comments

Comments
 (0)