Skip to content

Commit fb64baf

Browse files
committed
feat(eval): gate feedback coverage and lifecycle accuracy
1 parent 2d2ff2a commit fb64baf

File tree

17 files changed

+313
-3
lines changed

17 files changed

+313
-3
lines changed

TODO.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,7 @@ This roadmap is derived from deep research into Greptile's public docs, blog, MC
146146
95. [ ] Add eval runs that compare single-pass review against agentic loop review.
147147
96. [ ] Add production replay evals using anonymized accepted/rejected review outcomes.
148148
97. [ ] Add leaderboard reporting for reviewer usefulness metrics, not just precision/recall.
149-
98. [ ] Add regression gates for feedback coverage, verifier health, and lifecycle-state accuracy.
149+
98. [x] Add regression gates for feedback coverage, verifier health, and lifecycle-state accuracy.
150150
99. [ ] Add model-routing policies that explicitly separate generation, verification, and auditing roles.
151151
100. [ ] Publish a repeatable "independent auditor" benchmark story in the UI and CLI so DiffScope's differentiation is measurable.
152152

src/commands/eval/command/batch.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -321,6 +321,7 @@ mod tests {
321321
min_micro_f1: None,
322322
min_macro_f1: None,
323323
min_verification_health: None,
324+
min_lifecycle_accuracy: None,
324325
min_rule_f1: vec![],
325326
max_rule_f1_drop: vec![],
326327
matrix_models: vec![],

src/commands/eval/command/fixtures.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,7 @@ mod tests {
219219
min_micro_f1: None,
220220
min_macro_f1: None,
221221
min_verification_health: None,
222+
min_lifecycle_accuracy: None,
222223
min_rule_f1: Vec::new(),
223224
max_rule_f1_drop: Vec::new(),
224225
matrix_models: Vec::new(),
@@ -262,6 +263,7 @@ mod tests {
262263
min_micro_f1: None,
263264
min_macro_f1: None,
264265
min_verification_health: None,
266+
min_lifecycle_accuracy: None,
265267
min_rule_f1: Vec::new(),
266268
max_rule_f1_drop: Vec::new(),
267269
matrix_models: Vec::new(),

src/commands/eval/command/options.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ pub(super) fn prepare_eval_options(
3737
min_micro_f1: options.min_micro_f1,
3838
min_macro_f1: options.min_macro_f1,
3939
min_verification_health: options.min_verification_health,
40+
min_lifecycle_accuracy: options.min_lifecycle_accuracy,
4041
min_rule_f1: min_rule_thresholds,
4142
max_rule_f1_drop: max_rule_drop_thresholds,
4243
},

src/commands/eval/metrics.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
#[path = "metrics/comparisons.rs"]
22
mod comparisons;
3+
#[path = "metrics/lifecycle.rs"]
4+
mod lifecycle;
35
#[path = "metrics/rules.rs"]
46
mod rules;
57
#[path = "metrics/suites.rs"]
@@ -8,6 +10,7 @@ mod suites;
810
pub(super) use comparisons::{
911
build_named_breakdown_comparisons, build_suite_comparisons, build_verification_health,
1012
};
13+
pub(super) use lifecycle::build_lifecycle_accuracy;
1114
pub(super) use rules::{aggregate_rule_metrics, compute_rule_metrics, summarize_rule_metrics};
1215
pub(super) use suites::{
1316
build_benchmark_breakdowns, build_overall_benchmark_summary, build_suite_results,

src/commands/eval/metrics/lifecycle.rs

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
use super::super::EvalFixtureResult;
2+
3+
const LIFECYCLE_RULE_PREFIX: &str = "bug.lifecycle.";
4+
5+
#[derive(Debug, Clone, Copy, PartialEq)]
6+
pub(in super::super) struct EvalPassRate {
7+
pub(in super::super) passed: usize,
8+
pub(in super::super) total: usize,
9+
pub(in super::super) rate: f32,
10+
}
11+
12+
pub(in super::super) fn build_lifecycle_accuracy(
13+
results: &[EvalFixtureResult],
14+
) -> Option<EvalPassRate> {
15+
let total = results
16+
.iter()
17+
.filter(|result| is_lifecycle_fixture(result))
18+
.count();
19+
if total == 0 {
20+
return None;
21+
}
22+
23+
let passed = results
24+
.iter()
25+
.filter(|result| is_lifecycle_fixture(result) && result.passed)
26+
.count();
27+
28+
Some(EvalPassRate {
29+
passed,
30+
total,
31+
rate: passed as f32 / total as f32,
32+
})
33+
}
34+
35+
fn is_lifecycle_fixture(result: &EvalFixtureResult) -> bool {
36+
result
37+
.rule_metrics
38+
.iter()
39+
.any(|metric| metric.expected > 0 && metric.rule_id.starts_with(LIFECYCLE_RULE_PREFIX))
40+
}
41+
42+
#[cfg(test)]
mod tests {
    use super::*;
    use crate::commands::eval::EvalRuleMetrics;

    /// Builds a fixture result carrying a single rule metric for `rule_id`.
    ///
    /// The metric always expects one finding; its hit/miss counters and scores
    /// mirror the `passed` flag.
    fn fixture_result(passed: bool, rule_id: &str) -> EvalFixtureResult {
        let hits = usize::from(passed);
        let misses = usize::from(!passed);

        let metric = EvalRuleMetrics {
            rule_id: rule_id.to_string(),
            expected: 1,
            predicted: 1,
            true_positives: hits,
            false_positives: misses,
            false_negatives: misses,
            precision: if passed { 1.0 } else { 0.0 },
            recall: if passed { 1.0 } else { 0.0 },
            f1: if passed { 1.0 } else { 0.0 },
        };

        EvalFixtureResult {
            passed,
            rule_metrics: vec![metric],
            ..Default::default()
        }
    }

    #[test]
    fn build_lifecycle_accuracy_aggregates_lifecycle_fixture_pass_rate() {
        // Two lifecycle fixtures (one passing, one failing) plus one unrelated
        // fixture that must not be counted.
        let results = [
            fixture_result(true, "bug.lifecycle.context-only-addressed"),
            fixture_result(false, "bug.lifecycle.api-drops-followup-addressed"),
            fixture_result(true, "bug.readiness.current-head-staleness"),
        ];

        let accuracy = build_lifecycle_accuracy(&results).unwrap();

        assert_eq!(accuracy.passed, 1);
        assert_eq!(accuracy.total, 2);
        assert!((accuracy.rate - 0.5).abs() < f32::EPSILON);
    }

    #[test]
    fn build_lifecycle_accuracy_returns_none_without_lifecycle_rules() {
        let results = [fixture_result(true, "bug.readiness.current-head-staleness")];

        assert!(build_lifecycle_accuracy(&results).is_none());
    }
}

src/commands/eval/report/output.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
use anyhow::Result;
22
use std::path::Path;
33

4+
use super::super::metrics::build_lifecycle_accuracy;
45
use super::super::EvalReport;
56

67
pub(in super::super) fn print_eval_report(report: &EvalReport) {
@@ -307,6 +308,15 @@ pub(in super::super) fn print_eval_report(report: &EvalReport) {
307308
);
308309
}
309310

311+
if let Some(accuracy) = build_lifecycle_accuracy(&report.results) {
312+
println!(
313+
"Lifecycle accuracy: {:.0}% ({}/{})",
314+
accuracy.rate * 100.0,
315+
accuracy.passed,
316+
accuracy.total
317+
);
318+
}
319+
310320
for warning in &report.warnings {
311321
println!("Warning: {warning}");
312322
}

src/commands/eval/thresholds.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ pub(super) struct EvalThresholdOptions {
1212
pub(super) min_micro_f1: Option<f32>,
1313
pub(super) min_macro_f1: Option<f32>,
1414
pub(super) min_verification_health: Option<f32>,
15+
pub(super) min_lifecycle_accuracy: Option<f32>,
1516
pub(super) min_rule_f1: Vec<EvalRuleThreshold>,
1617
pub(super) max_rule_f1_drop: Vec<EvalRuleThreshold>,
1718
}

src/commands/eval/thresholds/evaluation/run.rs

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ use super::super::EvalThresholdOptions;
33
use super::drops::check_drop_thresholds;
44
use super::minimums::check_minimum_thresholds;
55
use super::rules::build_rule_f1_map;
6+
use crate::commands::eval::metrics::build_lifecycle_accuracy;
67

78
pub(in super::super::super) fn evaluate_eval_thresholds(
89
current: &EvalReport,
@@ -35,6 +36,16 @@ pub(in super::super::super) fn evaluate_eval_thresholds(
3536
}
3637
}
3738
}
39+
if let Some(threshold) = options.min_lifecycle_accuracy {
40+
if let Some(accuracy) = build_lifecycle_accuracy(&current.results) {
41+
if accuracy.total > 0 && accuracy.rate < threshold {
42+
failures.push(format!(
43+
"lifecycle accuracy {:.3} fell below minimum {:.3} ({}/{})",
44+
accuracy.rate, threshold, accuracy.passed, accuracy.total
45+
));
46+
}
47+
}
48+
}
3849
failures.extend(check_drop_thresholds(
3950
current,
4051
current_micro_f1,
@@ -90,6 +101,7 @@ mod tests {
90101
min_micro_f1: None,
91102
min_macro_f1: None,
92103
min_verification_health: None,
104+
min_lifecycle_accuracy: None,
93105
min_rule_f1: vec![],
94106
max_rule_f1_drop: vec![],
95107
};
@@ -172,6 +184,7 @@ mod tests {
172184
min_micro_f1: None,
173185
min_macro_f1: None,
174186
min_verification_health: None,
187+
min_lifecycle_accuracy: None,
175188
min_rule_f1: vec![],
176189
max_rule_f1_drop: vec![EvalRuleThreshold {
177190
rule_id: "sec.sql.injection".to_string(),
@@ -233,6 +246,7 @@ mod tests {
233246
min_micro_f1: None,
234247
min_macro_f1: None,
235248
min_verification_health: None,
249+
min_lifecycle_accuracy: None,
236250
min_rule_f1: vec![],
237251
max_rule_f1_drop: vec![],
238252
};
@@ -279,6 +293,7 @@ mod tests {
279293
min_micro_f1: None,
280294
min_macro_f1: None,
281295
min_verification_health: Some(0.8),
296+
min_lifecycle_accuracy: None,
282297
min_rule_f1: vec![],
283298
max_rule_f1_drop: vec![],
284299
};
@@ -290,4 +305,76 @@ mod tests {
290305
assert!(failures[0].contains("minimum 0.800"));
291306
assert!(failures[0].contains("7/10"));
292307
}
308+
309+
/// With one of two lifecycle fixtures passing (rate 0.5) and a configured
/// minimum of 0.8, threshold evaluation must report exactly one failure.
#[test]
fn test_evaluate_eval_thresholds_checks_lifecycle_accuracy() {
    // Lifecycle fixture whose single expected finding was produced.
    let passing_fixture = crate::commands::eval::EvalFixtureResult {
        passed: true,
        rule_metrics: vec![EvalRuleMetrics {
            rule_id: "bug.lifecycle.context-only-addressed".to_string(),
            expected: 1,
            predicted: 1,
            true_positives: 1,
            false_positives: 0,
            false_negatives: 0,
            precision: 1.0,
            recall: 1.0,
            f1: 1.0,
        }],
        ..Default::default()
    };

    // Lifecycle fixture whose single expected finding was missed.
    let failing_fixture = crate::commands::eval::EvalFixtureResult {
        passed: false,
        rule_metrics: vec![EvalRuleMetrics {
            rule_id: "bug.lifecycle.api-drops-followup-addressed".to_string(),
            expected: 1,
            predicted: 0,
            true_positives: 0,
            false_positives: 0,
            false_negatives: 1,
            precision: 0.0,
            recall: 0.0,
            f1: 0.0,
        }],
        ..Default::default()
    };

    // Only the lifecycle gate is enabled, so any failure must come from it.
    let options = EvalThresholdOptions {
        max_micro_f1_drop: None,
        max_suite_f1_drop: None,
        max_category_f1_drop: None,
        max_language_f1_drop: None,
        min_micro_f1: None,
        min_macro_f1: None,
        min_verification_health: None,
        min_lifecycle_accuracy: Some(0.8),
        min_rule_f1: vec![],
        max_rule_f1_drop: vec![],
    };

    let current = EvalReport {
        run: Default::default(),
        fixtures_total: 2,
        fixtures_passed: 1,
        fixtures_failed: 1,
        rule_metrics: vec![],
        rule_summary: Some(EvalRuleScoreSummary::default()),
        benchmark_summary: None,
        suite_results: vec![],
        benchmark_by_category: Default::default(),
        benchmark_by_language: Default::default(),
        benchmark_by_difficulty: Default::default(),
        suite_comparisons: vec![],
        category_comparisons: vec![],
        language_comparisons: vec![],
        verification_health: None,
        warnings: vec![],
        threshold_failures: vec![],
        results: vec![passing_fixture, failing_fixture],
    };

    let failures = evaluate_eval_thresholds(&current, None, &options);

    assert_eq!(failures.len(), 1);
    assert!(failures[0].contains("lifecycle accuracy 0.500"));
}
293380
}

src/commands/eval/types/options.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ pub struct EvalRunOptions {
1010
pub min_micro_f1: Option<f32>,
1111
pub min_macro_f1: Option<f32>,
1212
pub min_verification_health: Option<f32>,
13+
pub min_lifecycle_accuracy: Option<f32>,
1314
pub min_rule_f1: Vec<String>,
1415
pub max_rule_f1_drop: Vec<String>,
1516
pub matrix_models: Vec<String>,

0 commit comments

Comments
 (0)