Skip to content

Commit 1b028c9

Browse files
committed
Merge main and resolve review feedback
2 parents 904fa04 + a00cd1f commit 1b028c9

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

72 files changed

+4718
-381
lines changed

charts/diffscope/values.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ serviceAccount:
1818
diffscope:
1919
host: "0.0.0.0"
2020
port: 3000
21-
model: "claude-sonnet-4-6"
21+
model: "anthropic/claude-opus-4.5"
2222
# Adapter override: openai, anthropic, ollama, openrouter (auto-detected if empty)
2323
adapter: ""
2424
# Base URL for LLM API (auto-set to Ollama service URL when ollama.enabled)

eval/fixtures/README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ Starter fixture set for `diffscope eval`.
55
- `repo_regressions/` contains regression-style diffs based on realistic mistakes in this codebase.
66
- Each fixture can include `rule_id` as a label for rule-level precision/recall metrics.
77
- Set `require_rule_id: true` on a pattern if the rule id must be emitted by the model for a match.
8+
- `deep_review_suite/` now includes core, authz, supply-chain, and async-correctness packs for broader live benchmarking.
89

910
Run:
1011

@@ -81,3 +82,4 @@ Notes:
8182
- Use `--trend-file` with `--label` to append comparable live-run checkpoints into a reusable `QualityTrend` JSON history, including suite/category/language micro-F1 series and verifier-health counters.
8283
- Use `--matrix-model` plus `--repeat` to compare the configured primary model against a small frontier-model matrix and to spot flaky live-run variance.
8384
- Use `--artifact-dir` to persist failed-fixture artifacts and per-run JSON reports for debugging.
85+
- Text expectations now use lighter semantic phrase matching and optional rule-id aliases, so fixtures are less brittle about exact wording.
Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
{
2+
"name": "review-depth-async",
3+
"author": "diffscope",
4+
"version": "1.0.0",
5+
"description": "Async-correctness benchmark pack for deeper live review runs.",
6+
"languages": [
7+
"rust",
8+
"typescript"
9+
],
10+
"categories": [
11+
"bug",
12+
"security"
13+
],
14+
"thresholds": {
15+
"min_precision": 0.45,
16+
"min_recall": 0.35,
17+
"min_f1": 0.4,
18+
"max_false_positive_rate": 0.45,
19+
"min_weighted_score": 0.45
20+
},
21+
"metadata": {
22+
"purpose": "live-regression-eval",
23+
"family": "review-depth"
24+
},
25+
"fixtures": [
26+
{
27+
"name": "rust-fire-and-forget-sync-task",
28+
"category": "bug",
29+
"language": "rust",
30+
"difficulty": "Hard",
31+
"diff_content": "diff --git a/src/sync.rs b/src/sync.rs\nindex 1111111..2222222 100644\n--- a/src/sync.rs\n+++ b/src/sync.rs\n@@ -1,5 +1,6 @@\n pub async fn refresh_user(user_id: String) -> anyhow::Result<()> {\n- sync_user(user_id).await?;\n+ tokio::spawn(async move {\n+ sync_user(user_id).await.unwrap();\n+ });\n Ok(())\n }\n",
32+
"expected_findings": [
33+
{
34+
"description": "Background task errors are detached from the caller and can fail silently.",
35+
"severity": "Warning",
36+
"category": "Bug",
37+
"file_pattern": "src/sync.rs",
38+
"line_hint": 3,
39+
"contains_any": [
40+
"fire and forget",
41+
"spawned task is not awaited",
42+
"detached task error",
43+
"background task may fail silently"
44+
]
45+
}
46+
],
47+
"negative_findings": [
48+
{
49+
"description": "Avoid style-only comments.",
50+
"contains": "style"
51+
}
52+
],
53+
"min_total": 1,
54+
"max_total": 8,
55+
"description": "Async work should not be detached if the caller relies on success or error propagation.",
56+
"source": "deep-review-suite"
57+
},
58+
{
59+
"name": "typescript-missing-await-permission-check",
60+
"category": "security",
61+
"language": "typescript",
62+
"difficulty": "Expert",
63+
"diff_content": "diff --git a/src/permissions.ts b/src/permissions.ts\nindex 1111111..2222222 100644\n--- a/src/permissions.ts\n+++ b/src/permissions.ts\n@@ -1,5 +1,5 @@\n export async function deleteProject(user, projectId) {\n- const allowed = await checkPermission(user, projectId);\n+ const allowed = checkPermission(user, projectId);\n if (!allowed) throw new Error(\"forbidden\");\n return db.project.delete({ where: { id: projectId } });\n }\n",
64+
"expected_findings": [
65+
{
66+
"description": "Missing await makes the permission check always truthy and bypasses authorization.",
67+
"severity": "Error",
68+
"category": "Security",
69+
"file_pattern": "src/permissions.ts",
70+
"line_hint": 2,
71+
"contains_any": [
72+
"missing await",
73+
"promise is always truthy",
74+
"authorization bypass",
75+
"async permission check"
76+
],
77+
"tags_any": [
78+
"authorization",
79+
"async"
80+
]
81+
}
82+
],
83+
"negative_findings": [
84+
{
85+
"description": "Avoid style-only comments.",
86+
"contains": "style"
87+
}
88+
],
89+
"min_total": 1,
90+
"max_total": 8,
91+
"description": "Awaiting async permission checks is required before destructive actions.",
92+
"source": "deep-review-suite"
93+
}
94+
]
95+
}
Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
{
2+
"name": "review-depth-authz",
3+
"author": "diffscope",
4+
"version": "1.0.0",
5+
"description": "Authorization and tenant-isolation benchmark pack for deeper live review runs.",
6+
"languages": [
7+
"typescript",
8+
"python"
9+
],
10+
"categories": [
11+
"security"
12+
],
13+
"thresholds": {
14+
"min_precision": 0.45,
15+
"min_recall": 0.35,
16+
"min_f1": 0.4,
17+
"max_false_positive_rate": 0.45,
18+
"min_weighted_score": 0.45
19+
},
20+
"metadata": {
21+
"purpose": "live-regression-eval",
22+
"family": "review-depth"
23+
},
24+
"fixtures": [
25+
{
26+
"name": "typescript-idor-project-membership",
27+
"category": "security",
28+
"language": "typescript",
29+
"difficulty": "Hard",
30+
"diff_content": "diff --git a/src/routes/projects.ts b/src/routes/projects.ts\nindex 1111111..2222222 100644\n--- a/src/routes/projects.ts\n+++ b/src/routes/projects.ts\n@@ -1,5 +1,5 @@\n export async function getProject(req, res) {\n- const project = await db.project.findFirst({ where: { id: req.params.id, ownerId: req.user.id } });\n+ const project = await db.project.findUnique({ where: { id: req.params.id } });\n return res.json(project);\n }\n",
31+
"expected_findings": [
32+
{
33+
"description": "Project lookup is no longer scoped to the current user or tenant.",
34+
"severity": "Error",
35+
"category": "Security",
36+
"file_pattern": "src/routes/projects.ts",
37+
"line_hint": 2,
38+
"contains_any": [
39+
"idor",
40+
"authorization bypass",
41+
"missing ownership check",
42+
"project access is not scoped"
43+
],
44+
"tags_any": [
45+
"authorization",
46+
"idor"
47+
]
48+
}
49+
],
50+
"negative_findings": [
51+
{
52+
"description": "Avoid style-only comments.",
53+
"contains": "style"
54+
}
55+
],
56+
"min_total": 1,
57+
"max_total": 8,
58+
"description": "Object access should stay constrained to the caller's project membership.",
59+
"source": "deep-review-suite"
60+
},
61+
{
62+
"name": "python-admin-delete-without-role-check",
63+
"category": "security",
64+
"language": "python",
65+
"difficulty": "Hard",
66+
"diff_content": "diff --git a/admin.py b/admin.py\nindex 1111111..2222222 100644\n--- a/admin.py\n+++ b/admin.py\n@@ -1,5 +1,6 @@\n def delete_account(request, store):\n- if not request.user.is_admin:\n+ if request.args.get(\"as_admin\") == \"1\" or request.user.is_admin:\n+ pass\n+ else:\n raise PermissionError(\"admin required\")\n store.delete_user(request.args[\"user_id\"])\n return {\"deleted\": True}\n",
67+
"expected_findings": [
68+
{
69+
"description": "Dangerous admin operation no longer verifies the caller is an admin.",
70+
"severity": "Error",
71+
"category": "Security",
72+
"file_pattern": "admin.py",
73+
"line_hint": 2,
74+
"contains_any": [
75+
"missing authorization",
76+
"authz bypass",
77+
"admin action without role check",
78+
"privilege escalation",
79+
"user-controlled bypass"
80+
],
81+
"tags_any": [
82+
"authorization"
83+
]
84+
}
85+
],
86+
"negative_findings": [
87+
{
88+
"description": "Avoid style-only comments.",
89+
"contains": "style"
90+
}
91+
],
92+
"min_total": 1,
93+
"max_total": 8,
94+
"description": "Privilege-gated destructive operations must keep their role checks.",
95+
"source": "deep-review-suite"
96+
}
97+
]
98+
}
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
{
2+
"name": "review-depth-supply-chain",
3+
"author": "diffscope",
4+
"version": "1.0.0",
5+
"description": "Supply-chain and dependency trust benchmark pack for deeper live review runs.",
6+
"languages": [
7+
"yaml",
8+
"docker"
9+
],
10+
"categories": [
11+
"security"
12+
],
13+
"thresholds": {
14+
"min_precision": 0.45,
15+
"min_recall": 0.35,
16+
"min_f1": 0.4,
17+
"max_false_positive_rate": 0.45,
18+
"min_weighted_score": 0.45
19+
},
20+
"metadata": {
21+
"purpose": "live-regression-eval",
22+
"family": "review-depth"
23+
},
24+
"fixtures": [
25+
{
26+
"name": "yaml-unpinned-github-action",
27+
"category": "security",
28+
"language": "yaml",
29+
"difficulty": "Medium",
30+
"diff_content": "diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml\nindex 1111111..2222222 100644\n--- a/.github/workflows/build.yml\n+++ b/.github/workflows/build.yml\n@@ -6,4 +6,5 @@ jobs:\n build:\n runs-on: ubuntu-latest\n steps:\n- - uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608\n+ - uses: actions/checkout@v4\n+ - uses: docker/setup-buildx-action@v3\n",
31+
"expected_findings": [
32+
{
33+
"description": "GitHub Actions are no longer pinned to immutable commit SHAs.",
34+
"severity": "Warning",
35+
"category": "Security",
36+
"file_pattern": ".github/workflows/build.yml",
37+
"line_hint": 9,
38+
"contains_any": [
39+
"unpinned action",
40+
"pin github action to a commit sha",
41+
"supply chain risk",
42+
"mutable action tag"
43+
],
44+
"tags_any": [
45+
"supply-chain"
46+
]
47+
}
48+
],
49+
"negative_findings": [
50+
{
51+
"description": "Avoid style-only comments.",
52+
"contains": "style"
53+
}
54+
],
55+
"min_total": 1,
56+
"max_total": 8,
57+
"description": "Workflow dependencies should stay pinned to immutable revisions.",
58+
"source": "deep-review-suite"
59+
},
60+
{
61+
"name": "docker-curl-pipe-bash",
62+
"category": "security",
63+
"language": "docker",
64+
"difficulty": "Hard",
65+
"diff_content": "diff --git a/Dockerfile b/Dockerfile\nindex 1111111..2222222 100644\n--- a/Dockerfile\n+++ b/Dockerfile\n@@ -3,3 +3,4 @@ FROM python:3.12-slim\n RUN apt-get update && apt-get install -y curl\n-RUN pip install -r requirements.txt\n+RUN curl -fsSL https://downloads.example.com/install.sh | bash\n+RUN pip install -r requirements.txt\n",
66+
"expected_findings": [
67+
{
68+
"description": "Remote installation script is piped directly into a shell without verification.",
69+
"severity": "Error",
70+
"category": "Security",
71+
"file_pattern": "Dockerfile",
72+
"line_hint": 4,
73+
"contains_any": [
74+
"curl pipe to shell",
75+
"remote script execution",
76+
"verify downloaded script",
77+
"supply chain risk"
78+
],
79+
"tags_any": [
80+
"supply-chain",
81+
"remote-exec"
82+
]
83+
}
84+
],
85+
"negative_findings": [
86+
{
87+
"description": "Avoid style-only comments.",
88+
"contains": "style"
89+
}
90+
],
91+
"min_total": 1,
92+
"max_total": 8,
93+
"description": "Build pipelines should not execute unverified remote scripts.",
94+
"source": "deep-review-suite"
95+
}
96+
]
97+
}

src/adapters/llm.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ pub struct ModelConfig {
2727
impl Default for ModelConfig {
2828
fn default() -> Self {
2929
Self {
30-
model_name: "claude-opus-4-6".to_string(),
30+
model_name: "anthropic/claude-opus-4.5".to_string(),
3131
api_key: None,
3232
base_url: None,
3333
temperature: 0.2,
@@ -505,7 +505,7 @@ mod tests {
505505
#[test]
506506
fn test_model_config_default() {
507507
let config = ModelConfig::default();
508-
assert_eq!(config.model_name, "claude-opus-4-6");
508+
assert_eq!(config.model_name, "anthropic/claude-opus-4.5");
509509
assert!(config.api_key.is_none());
510510
assert!(config.base_url.is_none());
511511
assert!((config.temperature - 0.2).abs() < f32::EPSILON);

0 commit comments

Comments
 (0)