Skip to content

Commit aab9e97

Browse files
author
LoCoBench Bot
committed
Improve checklist verifiers with soft length scaling
1 parent 7c9cf97 commit aab9e97

File tree

36 files changed

+2689
-3276
lines changed

36 files changed

+2689
-3276
lines changed
Lines changed: 82 additions & 97 deletions
Original file line numberDiff line numberDiff line change
@@ -1,74 +1,81 @@
11
#!/bin/bash
# Reward: checklist (0.0-1.0) — weighted pattern matching against ground_truth.json
# Shared checklist verifier with soft length scaling and optional canonical-SHA bypass.
#
# Inputs (overridable via environment):
#   REPORT_PATH       path to the agent's output file
#   GROUND_TRUTH      path to ground_truth.json
#   MIN_REPORT_BYTES  soft length target used for score scaling in the Python stage
#   MIN_ABS_BYTES     hard floor; smaller outputs score 0.0 outright
# Output: a single score in $REWARD_FILE. Always exits 0 so the harness
# reads the reward file instead of treating a missing artifact as a crash.

set -e

REPORT_PATH="${REPORT_PATH:-/logs/agent/investigation.md}"
GROUND_TRUTH="${GROUND_TRUTH:-/tests/ground_truth.json}"
REWARD_FILE="/logs/verifier/reward.txt"
MIN_REPORT_BYTES="${MIN_REPORT_BYTES:-100}"
# Below this, output is treated as effectively missing / unusable.
MIN_ABS_BYTES="${MIN_ABS_BYTES:-24}"

mkdir -p /logs/verifier

if [ ! -f "$GROUND_TRUTH" ]; then
    echo "ERROR: ground_truth.json not found at $GROUND_TRUTH"
    echo "0.0" > "$REWARD_FILE"
    exit 0
fi

if [ ! -f "$REPORT_PATH" ]; then
    echo "No agent output found at $REPORT_PATH"
    echo "0.0" > "$REWARD_FILE"
    exit 0
fi

REPORT_SIZE=$(wc -c < "$REPORT_PATH")
if [ "$REPORT_SIZE" -lt "$MIN_ABS_BYTES" ]; then
    echo "Agent output too small (${REPORT_SIZE} bytes, minimum usable ${MIN_ABS_BYTES})."
    echo "0.0" > "$REWARD_FILE"
    exit 0
fi

echo "Scoring agent output ($REPORT_SIZE bytes)..."
echo "Report: $REPORT_PATH"
echo "Ground truth: $GROUND_TRUTH"
echo ""

# Delegate scoring to Python; quoted 'PYEOF' prevents shell expansion inside
# the heredoc, so all parameters are passed explicitly via the environment.
REPORT_PATH="$REPORT_PATH" GROUND_TRUTH="$GROUND_TRUTH" REWARD_FILE="$REWARD_FILE" REPORT_SIZE="$REPORT_SIZE" MIN_REPORT_BYTES="$MIN_REPORT_BYTES" python3 << 'PYEOF'
41+
import hashlib
import json
import os
import re

# All configuration arrives via the environment set by the wrapping bash
# script; these raise KeyError if the script is run outside that wrapper.
REPORT_PATH = os.environ["REPORT_PATH"]
GT_PATH = os.environ["GROUND_TRUTH"]
REWARD_PATH = os.environ["REWARD_FILE"]
REPORT_SIZE = int(os.environ["REPORT_SIZE"])
MIN_REPORT_BYTES = max(1, int(os.environ["MIN_REPORT_BYTES"]))  # clamp: avoids div-by-zero in length scaling

with open(REPORT_PATH) as f:
    report = f.read()
with open(GT_PATH) as f:
    gt = json.load(f)

# Canonical source (when provided) should always score 1.0 if text is exact.
# The ground truth may carry a sha256 of the canonical document under
# ground_truth_provenance.canonical_source; a byte-exact report bypasses
# checklist scoring entirely.
doc_sha = hashlib.sha256(report.encode("utf-8")).hexdigest().lower()
canonical_sha = ((((gt.get("ground_truth_provenance") or {}).get("canonical_source") or {}).get("sha256") or "").lower())
if canonical_sha and doc_sha == canonical_sha:
    print("Canonical source SHA match: awarding 1.0")
    with open(REWARD_PATH, "w") as f:
        f.write("1.00\n")
    raise SystemExit(0)
65+
66+
5867
def check_any_pattern(patterns, text):
    """Return True if at least one pattern matches *text* (case-insensitive).

    Each entry in *patterns* is tried as a regex first; an invalid regex
    degrades to a case-insensitive literal substring test instead of
    aborting the whole check.
    """
    # Hoisted loop invariant: previously text.lower() was recomputed for
    # every invalid pattern; compute it once up front.
    lowered = text.lower()
    for p in patterns:
        try:
            if re.search(p, text, re.IGNORECASE):
                return True
        except re.error:
            # Fall back to literal substring match if the regex is invalid.
            if p.lower() in lowered:
                return True
    return False
6976
77+
7078
def check_all_patterns(patterns, text):
71-
"""Return True if ALL patterns match (each represents a step in causal chain)."""
7279
for p in patterns:
7380
try:
7481
if not re.search(p, text, re.IGNORECASE):
@@ -78,81 +85,59 @@ def check_all_patterns(patterns, text):
7885
return False
7986
return True
8087
def score_category(items, label, use_all=False, negate=False):
    """Score one checklist category against the module-level *report* text.

    Args:
        items: list of dicts with "weight", "description" and "patterns".
        label: heading printed above the per-item results.
        use_all: when True, every pattern of an item must match (causal
            chains); otherwise any single match suffices.
        negate: when True the item PASSES if nothing matched (negative
            checks guarding against wrong conclusions).

    Returns:
        The weight-normalized pass ratio in [0, 1]. An empty negated
        category scores 1.0 (nothing forbidden was found); an empty
        positive category scores 0.0.
    """
    print(f"=== {label} ===")
    score = 0.0
    total = 0.0
    for item in items:
        w = float(item["weight"])
        total += w
        matched = check_all_patterns(item["patterns"], report) if use_all else check_any_pattern(item["patterns"], report)
        passed = (not matched) if negate else matched
        if passed:
            score += w
            print(f" [x] {item['description']} (weight: {w})")
        else:
            msg = " -- wrong conclusion found" if negate else ""
            prefix = "FAIL: " if negate else ""
            print(f" [ ] {prefix}{item['description']} (weight: {w}){msg}")
    ratio = score / total if total > 0 else (1.0 if negate else 0.0)
    print(f" Score: {score:.2f} / {total:.2f} = {ratio:.2f}")
    print()
    return ratio
109+
# Score each checklist category; missing categories default to empty lists
# (score_category handles the empty case per its negate semantics).
f_ratio = score_category(gt.get("required_findings", []), "Required Findings")
r_ratio = score_category(gt.get("file_references", []), "File References")
c_ratio = score_category(gt.get("causal_chain", []), "Causal Chain", use_all=True)
n_ratio = score_category(gt.get("negative_checks", []), "Negative Checks", negate=True)

# Per-category weights come from ground truth when present; these defaults
# mirror the historical fixed weighting.
weights = gt.get("weights", {
    "required_findings": 0.40,
    "file_references": 0.30,
    "causal_chain": 0.20,
    "negative_checks": 0.10,
})
base = (
    f_ratio * float(weights.get("required_findings", 0.40))
    + r_ratio * float(weights.get("file_references", 0.30))
    + c_ratio * float(weights.get("causal_chain", 0.20))
    + n_ratio * float(weights.get("negative_checks", 0.10))
)

# Soft length scaling: avoid hard 0 for concise but correct outputs.
# Outputs at or above MIN_REPORT_BYTES are unaffected (factor capped at 1.0).
length_factor = min(1.0, REPORT_SIZE / float(MIN_REPORT_BYTES))
final = max(0.0, min(1.0, base * length_factor))

print("=== Final Score ===")
print(f" Base checklist: {base:.3f}")
print(f" Length factor: min(1.0, {REPORT_SIZE}/{MIN_REPORT_BYTES}) = {length_factor:.3f}")
print(f" TOTAL: {final:.2f}")

with open(REWARD_PATH, "w") as f:
    f.write(f"{final:.2f}\n")

print()
print(f"Tests completed - Score: {final:.2f}")
PYEOF

0 commit comments

Comments
 (0)