evalops
diff --git a/charts/diffscope/values.yaml b/charts/diffscope/values.yaml
Lines changed: 1 addition & 1 deletion
diff --git a/eval/fixtures/README.md b/eval/fixtures/README.md
Lines changed: 2 additions & 0 deletions
diff --git a/eval/fixtures/deep_review_suite/review_depth_async.json b/eval/fixtures/deep_review_suite/review_depth_async.json
Lines changed: 95 additions & 0 deletions
diff --git a/eval/fixtures/deep_review_suite/review_depth_authz.json b/eval/fixtures/deep_review_suite/review_depth_authz.json
Lines changed: 98 additions & 0 deletions
diff --git a/eval/fixtures/deep_review_suite/review_depth_supply_chain.json b/eval/fixtures/deep_review_suite/review_depth_supply_chain.json
Lines changed: 97 additions & 0 deletions
diff --git a/src/adapters/llm.rs b/src/adapters/llm.rs
Lines changed: 2 additions & 2 deletions
@@ -18,7 +18,7 @@ serviceAccount:
 diffscope:
   host: "0.0.0.0"
   port: 3000
-  model: "claude-sonnet-4-6"
+  model: "anthropic/claude-opus-4.5"
   # Adapter override: openai, anthropic, ollama, openrouter (auto-detected if empty)
   adapter: ""
   # Base URL for LLM API (auto-set to Ollama service URL when ollama.enabled)
 
@@ -5,6 +5,7 @@ Starter fixture set for `diffscope eval`.
 - `repo_regressions/` contains regression-style diffs based on realistic mistakes in this codebase.
 - Each fixture can include `rule_id` as a label for rule-level precision/recall metrics.
 - Set `require_rule_id: true` on a pattern if the rule id must be emitted by the model for a match.
+- `deep_review_suite/` contains the deeper review packs (core, authz, supply-chain, and async-correctness) for broader live benchmarking.
 
 Run:
 
@@ -81,3 +82,4 @@ Notes:
 - Use `--trend-file` with `--label` to append comparable live-run checkpoints into a reusable `QualityTrend` JSON history, including suite/category/language micro-F1 series and verifier-health counters.
 - Use `--matrix-model` plus `--repeat` to compare the configured primary model against a small frontier-model matrix and to spot flaky live-run variance.
 - Use `--artifact-dir` to persist failed-fixture artifacts and per-run JSON reports for debugging.
+- Text expectations now use lighter semantic phrase matching and optional rule-id aliases, so fixtures are less brittle about exact wording.
@@ -0,0 +1,95 @@
+{
+  "name": "review-depth-async",
+  "author": "diffscope",
+  "version": "1.0.0",
+  "description": "Async-correctness benchmark pack for deeper live review runs.",
+  "languages": [
+    "rust",
+    "typescript"
+  ],
+  "categories": [
+    "bug",
+    "security"
+  ],
+  "thresholds": {
+    "min_precision": 0.45,
+    "min_recall": 0.35,
+    "min_f1": 0.4,
+    "max_false_positive_rate": 0.45,
+    "min_weighted_score": 0.45
+  },
+  "metadata": {
+    "purpose": "live-regression-eval",
+    "family": "review-depth"
+  },
+  "fixtures": [
+    {
+      "name": "rust-fire-and-forget-sync-task",
+      "category": "bug",
+      "language": "rust",
+      "difficulty": "Hard",
+      "diff_content": "diff --git a/src/sync.rs b/src/sync.rs\nindex 1111111..2222222 100644\n--- a/src/sync.rs\n+++ b/src/sync.rs\n@@ -1,4 +1,6 @@\n pub async fn refresh_user(user_id: String) -> anyhow::Result<()> {\n-    sync_user(user_id).await?;\n+    tokio::spawn(async move {\n+        sync_user(user_id).await.unwrap();\n+    });\n     Ok(())\n }\n",
+      "expected_findings": [
+        {
+          "description": "Background task errors are detached from the caller and can fail silently.",
+          "severity": "Warning",
+          "category": "Bug",
+          "file_pattern": "src/sync.rs",
+          "line_hint": 3,
+          "contains_any": [
+            "fire and forget",
+            "spawned task is not awaited",
+            "detached task error",
+            "background task may fail silently"
+          ]
+        }
+      ],
+      "negative_findings": [
+        {
+          "description": "Avoid style-only comments.",
+          "contains": "style"
+        }
+      ],
+      "min_total": 1,
+      "max_total": 8,
+      "description": "Async work should not be detached if the caller relies on success or error propagation.",
+      "source": "deep-review-suite"
+    },
+    {
+      "name": "typescript-missing-await-permission-check",
+      "category": "security",
+      "language": "typescript",
+      "difficulty": "Expert",
+      "diff_content": "diff --git a/src/permissions.ts b/src/permissions.ts\nindex 1111111..2222222 100644\n--- a/src/permissions.ts\n+++ b/src/permissions.ts\n@@ -1,5 +1,5 @@\n export async function deleteProject(user, projectId) {\n-  const allowed = await checkPermission(user, projectId);\n+  const allowed = checkPermission(user, projectId);\n   if (!allowed) throw new Error(\"forbidden\");\n   return db.project.delete({ where: { id: projectId } });\n }\n",
+      "expected_findings": [
+        {
+          "description": "Missing await makes the permission check always truthy and bypasses authorization.",
+          "severity": "Error",
+          "category": "Security",
+          "file_pattern": "src/permissions.ts",
+          "line_hint": 2,
+          "contains_any": [
+            "missing await",
+            "promise is always truthy",
+            "authorization bypass",
+            "async permission check"
+          ],
+          "tags_any": [
+            "authorization",
+            "async"
+          ]
+        }
+      ],
+      "negative_findings": [
+        {
+          "description": "Avoid style-only comments.",
+          "contains": "style"
+        }
+      ],
+      "min_total": 1,
+      "max_total": 8,
+      "description": "Awaiting async permission checks is required before destructive actions.",
+      "source": "deep-review-suite"
+    }
+  ]
+}
@@ -0,0 +1,98 @@
+{
+  "name": "review-depth-authz",
+  "author": "diffscope",
+  "version": "1.0.0",
+  "description": "Authorization and tenant-isolation benchmark pack for deeper live review runs.",
+  "languages": [
+    "typescript",
+    "python"
+  ],
+  "categories": [
+    "security"
+  ],
+  "thresholds": {
+    "min_precision": 0.45,
+    "min_recall": 0.35,
+    "min_f1": 0.4,
+    "max_false_positive_rate": 0.45,
+    "min_weighted_score": 0.45
+  },
+  "metadata": {
+    "purpose": "live-regression-eval",
+    "family": "review-depth"
+  },
+  "fixtures": [
+    {
+      "name": "typescript-idor-project-membership",
+      "category": "security",
+      "language": "typescript",
+      "difficulty": "Hard",
+      "diff_content": "diff --git a/src/routes/projects.ts b/src/routes/projects.ts\nindex 1111111..2222222 100644\n--- a/src/routes/projects.ts\n+++ b/src/routes/projects.ts\n@@ -1,4 +1,4 @@\n export async function getProject(req, res) {\n-  const project = await db.project.findFirst({ where: { id: req.params.id, ownerId: req.user.id } });\n+  const project = await db.project.findUnique({ where: { id: req.params.id } });\n   return res.json(project);\n }\n",
+      "expected_findings": [
+        {
+          "description": "Project lookup is no longer scoped to the current user or tenant.",
+          "severity": "Error",
+          "category": "Security",
+          "file_pattern": "src/routes/projects.ts",
+          "line_hint": 2,
+          "contains_any": [
+            "idor",
+            "authorization bypass",
+            "missing ownership check",
+            "project access is not scoped"
+          ],
+          "tags_any": [
+            "authorization",
+            "idor"
+          ]
+        }
+      ],
+      "negative_findings": [
+        {
+          "description": "Avoid style-only comments.",
+          "contains": "style"
+        }
+      ],
+      "min_total": 1,
+      "max_total": 8,
+      "description": "Object access should stay constrained to the caller's project membership.",
+      "source": "deep-review-suite"
+    },
+    {
+      "name": "python-admin-delete-without-role-check",
+      "category": "security",
+      "language": "python",
+      "difficulty": "Hard",
+      "diff_content": "diff --git a/admin.py b/admin.py\nindex 1111111..2222222 100644\n--- a/admin.py\n+++ b/admin.py\n@@ -1,5 +1,7 @@\n def delete_account(request, store):\n-    if not request.user.is_admin:\n+    if request.args.get(\"as_admin\") == \"1\" or request.user.is_admin:\n+        pass\n+    else:\n         raise PermissionError(\"admin required\")\n     store.delete_user(request.args[\"user_id\"])\n     return {\"deleted\": True}\n",
+      "expected_findings": [
+        {
+          "description": "Dangerous admin operation no longer verifies the caller is an admin.",
+          "severity": "Error",
+          "category": "Security",
+          "file_pattern": "admin.py",
+          "line_hint": 2,
+          "contains_any": [
+            "missing authorization",
+            "authz bypass",
+            "admin action without role check",
+            "privilege escalation",
+            "user-controlled bypass"
+          ],
+          "tags_any": [
+            "authorization"
+          ]
+        }
+      ],
+      "negative_findings": [
+        {
+          "description": "Avoid style-only comments.",
+          "contains": "style"
+        }
+      ],
+      "min_total": 1,
+      "max_total": 8,
+      "description": "Privilege-gated destructive operations must keep their role checks.",
+      "source": "deep-review-suite"
+    }
+  ]
+}
@@ -0,0 +1,97 @@
+{
+  "name": "review-depth-supply-chain",
+  "author": "diffscope",
+  "version": "1.0.0",
+  "description": "Supply-chain and dependency trust benchmark pack for deeper live review runs.",
+  "languages": [
+    "yaml",
+    "docker"
+  ],
+  "categories": [
+    "security"
+  ],
+  "thresholds": {
+    "min_precision": 0.45,
+    "min_recall": 0.35,
+    "min_f1": 0.4,
+    "max_false_positive_rate": 0.45,
+    "min_weighted_score": 0.45
+  },
+  "metadata": {
+    "purpose": "live-regression-eval",
+    "family": "review-depth"
+  },
+  "fixtures": [
+    {
+      "name": "yaml-unpinned-github-action",
+      "category": "security",
+      "language": "yaml",
+      "difficulty": "Medium",
+      "diff_content": "diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml\nindex 1111111..2222222 100644\n--- a/.github/workflows/build.yml\n+++ b/.github/workflows/build.yml\n@@ -6,4 +6,5 @@ jobs:\n   build:\n     runs-on: ubuntu-latest\n     steps:\n-      - uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608\n+      - uses: actions/checkout@v4\n+      - uses: docker/setup-buildx-action@v3\n",
+      "expected_findings": [
+        {
+          "description": "GitHub Actions are no longer pinned to immutable commit SHAs.",
+          "severity": "Warning",
+          "category": "Security",
+          "file_pattern": ".github/workflows/build.yml",
+          "line_hint": 9,
+          "contains_any": [
+            "unpinned action",
+            "pin github action to a commit sha",
+            "supply chain risk",
+            "mutable action tag"
+          ],
+          "tags_any": [
+            "supply-chain"
+          ]
+        }
+      ],
+      "negative_findings": [
+        {
+          "description": "Avoid style-only comments.",
+          "contains": "style"
+        }
+      ],
+      "min_total": 1,
+      "max_total": 8,
+      "description": "Workflow dependencies should stay pinned to immutable revisions.",
+      "source": "deep-review-suite"
+    },
+    {
+      "name": "docker-curl-pipe-bash",
+      "category": "security",
+      "language": "docker",
+      "difficulty": "Hard",
+      "diff_content": "diff --git a/Dockerfile b/Dockerfile\nindex 1111111..2222222 100644\n--- a/Dockerfile\n+++ b/Dockerfile\n@@ -3,2 +3,3 @@ FROM python:3.12-slim\n RUN apt-get update && apt-get install -y curl\n-RUN pip install -r requirements.txt\n+RUN curl -fsSL https://downloads.example.com/install.sh | bash\n+RUN pip install -r requirements.txt\n",
+      "expected_findings": [
+        {
+          "description": "Remote installation script is piped directly into a shell without verification.",
+          "severity": "Error",
+          "category": "Security",
+          "file_pattern": "Dockerfile",
+          "line_hint": 4,
+          "contains_any": [
+            "curl pipe to shell",
+            "remote script execution",
+            "verify downloaded script",
+            "supply chain risk"
+          ],
+          "tags_any": [
+            "supply-chain",
+            "remote-exec"
+          ]
+        }
+      ],
+      "negative_findings": [
+        {
+          "description": "Avoid style-only comments.",
+          "contains": "style"
+        }
+      ],
+      "min_total": 1,
+      "max_total": 8,
+      "description": "Build pipelines should not execute unverified remote scripts.",
+      "source": "deep-review-suite"
+    }
+  ]
+}
@@ -27,7 +27,7 @@ pub struct ModelConfig {
 impl Default for ModelConfig {
     fn default() -> Self {
         Self {
-            model_name: "claude-opus-4-6".to_string(),
+            model_name: "anthropic/claude-opus-4.5".to_string(),
             api_key: None,
             base_url: None,
             temperature: 0.2,
@@ -505,7 +505,7 @@ mod tests {
     #[test]
     fn test_model_config_default() {
         let config = ModelConfig::default();
-        assert_eq!(config.model_name, "claude-opus-4-6");
+        assert_eq!(config.model_name, "anthropic/claude-opus-4.5");
         assert!(config.api_key.is_none());
         assert!(config.base_url.is_none());
         assert!((config.temperature - 0.2).abs() < f32::EPSILON);