@@ -3,6 +3,7 @@ use super::super::EvalThresholdOptions;
33use super :: drops:: check_drop_thresholds;
44use super :: minimums:: check_minimum_thresholds;
55use super :: rules:: build_rule_f1_map;
6+ use crate :: commands:: eval:: metrics:: build_lifecycle_accuracy;
67
78pub ( in super :: super :: super ) fn evaluate_eval_thresholds (
89 current : & EvalReport ,
@@ -35,6 +36,16 @@ pub(in super::super::super) fn evaluate_eval_thresholds(
3536 }
3637 }
3738 }
39+ if let Some ( threshold) = options. min_lifecycle_accuracy {
40+ if let Some ( accuracy) = build_lifecycle_accuracy ( & current. results ) {
41+ if accuracy. total > 0 && accuracy. rate < threshold {
42+ failures. push ( format ! (
43+ "lifecycle accuracy {:.3} fell below minimum {:.3} ({}/{})" ,
44+ accuracy. rate, threshold, accuracy. passed, accuracy. total
45+ ) ) ;
46+ }
47+ }
48+ }
3849 failures. extend ( check_drop_thresholds (
3950 current,
4051 current_micro_f1,
@@ -90,6 +101,7 @@ mod tests {
90101 min_micro_f1 : None ,
91102 min_macro_f1 : None ,
92103 min_verification_health : None ,
104+ min_lifecycle_accuracy : None ,
93105 min_rule_f1 : vec ! [ ] ,
94106 max_rule_f1_drop : vec ! [ ] ,
95107 } ;
@@ -172,6 +184,7 @@ mod tests {
172184 min_micro_f1 : None ,
173185 min_macro_f1 : None ,
174186 min_verification_health : None ,
187+ min_lifecycle_accuracy : None ,
175188 min_rule_f1 : vec ! [ ] ,
176189 max_rule_f1_drop : vec ! [ EvalRuleThreshold {
177190 rule_id: "sec.sql.injection" . to_string( ) ,
@@ -233,6 +246,7 @@ mod tests {
233246 min_micro_f1 : None ,
234247 min_macro_f1 : None ,
235248 min_verification_health : None ,
249+ min_lifecycle_accuracy : None ,
236250 min_rule_f1 : vec ! [ ] ,
237251 max_rule_f1_drop : vec ! [ ] ,
238252 } ;
@@ -279,6 +293,7 @@ mod tests {
279293 min_micro_f1 : None ,
280294 min_macro_f1 : None ,
281295 min_verification_health : Some ( 0.8 ) ,
296+ min_lifecycle_accuracy : None ,
282297 min_rule_f1 : vec ! [ ] ,
283298 max_rule_f1_drop : vec ! [ ] ,
284299 } ;
@@ -290,4 +305,76 @@ mod tests {
290305 assert ! ( failures[ 0 ] . contains( "minimum 0.800" ) ) ;
291306 assert ! ( failures[ 0 ] . contains( "7/10" ) ) ;
292307 }
308+
/// Checks that `min_lifecycle_accuracy` produces exactly one threshold failure
/// when the lifecycle accuracy of the current report is below the minimum.
///
/// The report holds two fixture results, each with a single `bug.lifecycle.*`
/// rule metric: one fixture passed with a perfect score and one failed with a
/// missed finding. With a minimum of `0.8`, the resulting failure message is
/// expected to report a rate of `0.500` against a minimum of `0.800`.
#[test]
fn test_evaluate_eval_thresholds_checks_lifecycle_accuracy() {
    // Fixture whose lifecycle rule was detected correctly.
    let passing_fixture = crate::commands::eval::EvalFixtureResult {
        passed: true,
        rule_metrics: vec![EvalRuleMetrics {
            rule_id: "bug.lifecycle.context-only-addressed".to_string(),
            expected: 1,
            predicted: 1,
            true_positives: 1,
            false_positives: 0,
            false_negatives: 0,
            precision: 1.0,
            recall: 1.0,
            f1: 1.0,
        }],
        ..Default::default()
    };
    // Fixture whose lifecycle rule was expected but never predicted.
    let failing_fixture = crate::commands::eval::EvalFixtureResult {
        passed: false,
        rule_metrics: vec![EvalRuleMetrics {
            rule_id: "bug.lifecycle.api-drops-followup-addressed".to_string(),
            expected: 1,
            predicted: 0,
            true_positives: 0,
            false_positives: 0,
            false_negatives: 1,
            precision: 0.0,
            recall: 0.0,
            f1: 0.0,
        }],
        ..Default::default()
    };

    let current = EvalReport {
        run: Default::default(),
        fixtures_total: 2,
        fixtures_passed: 1,
        fixtures_failed: 1,
        rule_metrics: vec![],
        rule_summary: Some(EvalRuleScoreSummary::default()),
        benchmark_summary: None,
        suite_results: vec![],
        benchmark_by_category: Default::default(),
        benchmark_by_language: Default::default(),
        benchmark_by_difficulty: Default::default(),
        suite_comparisons: vec![],
        category_comparisons: vec![],
        language_comparisons: vec![],
        verification_health: None,
        warnings: vec![],
        threshold_failures: vec![],
        results: vec![passing_fixture, failing_fixture],
    };

    // Only the lifecycle minimum is enabled, so any failure must come from it.
    let options = EvalThresholdOptions {
        max_micro_f1_drop: None,
        max_suite_f1_drop: None,
        max_category_f1_drop: None,
        max_language_f1_drop: None,
        min_micro_f1: None,
        min_macro_f1: None,
        min_verification_health: None,
        min_lifecycle_accuracy: Some(0.8),
        min_rule_f1: vec![],
        max_rule_f1_drop: vec![],
    };

    // No baseline report is supplied.
    let failures = evaluate_eval_thresholds(&current, None, &options);

    assert_eq!(
        failures.len(),
        1,
        "expected exactly one threshold failure, got: {:?}",
        failures
    );
    // Pin both the measured rate and the configured minimum, mirroring the
    // "minimum 0.800" check in the verification-health test.
    assert!(failures[0].contains("lifecycle accuracy 0.500 fell below minimum 0.800"));
    // NOTE(review): the trailing "(passed/total)" counts are not asserted here
    // because the counting rules of `build_lifecycle_accuracy` are not visible
    // from this file — confirm and add an assertion if they are stable.
}
293380}
0 commit comments