future-agi · hadarishav · May 25, 2026 · May 22, 2026 · May 25, 2026 · hadarishav
diff --git a/src/components/Sidebar.astro b/src/components/Sidebar.astro
@@ -94,7 +94,7 @@ const isApiTab = activeTab?.tab === 'API';
 
 function inferApiMethod(title: string): { method: string; css: string } | null {
   const t = title.toLowerCase();
-  if (/\b(list|get|retrieve|health|find|export|progress|analytics|agreement|compare|stats|summary|voices|tts)\b/.test(t)) {
+  if (/\b(list|get|retrieve|health|find|export|progress|analytics|agreement|compare|stats|summary|voices|tts|aggregat\w*)\b/.test(t)) {
     return { method: 'GET', css: 'api-method-get' };
   }
   if (/\b(create|add|generate|execute|submit|assign|bulk|complete|skip|release|pause|unpause|check|upload|start|duplicate|fetch|run|rerun|cancel|clone|merge)\b/.test(t)) {

diff --git a/src/lib/api-navigation.ts b/src/lib/api-navigation.ts
@@ -294,7 +294,8 @@ export const apiNavigation: ApiNavGroup[] = [
       { "title": "Delete Eval Task", "href": "/docs/api/eval-tasks/delete-eval-task", "method": "DELETE" },
       { "title": "Bulk Delete Eval Tasks", "href": "/docs/api/eval-tasks/bulk-delete-eval-tasks", "method": "POST" },
       { "title": "Pause Eval Task", "href": "/docs/api/eval-tasks/pause-eval-task", "method": "POST" },
-      { "title": "Unpause Eval Task", "href": "/docs/api/eval-tasks/unpause-eval-task", "method": "POST" }
+      { "title": "Unpause Eval Task", "href": "/docs/api/eval-tasks/unpause-eval-task", "method": "POST" },
+      { "title": "Eval Task Aggregations", "href": "/docs/api/eval-tasks/eval-task-aggregations", "method": "GET" }
     ]
   },
   {

diff --git a/src/lib/navigation.ts b/src/lib/navigation.ts
@@ -1015,6 +1015,7 @@ export const tabNavigation: NavTab[] = [
               { title: 'Bulk Delete Eval Tasks', href: '/docs/api/eval-tasks/bulk-delete-eval-tasks' },
               { title: 'Pause Eval Task', href: '/docs/api/eval-tasks/pause-eval-task' },
               { title: 'Unpause Eval Task', href: '/docs/api/eval-tasks/unpause-eval-task' },
+              { title: 'Eval Task Aggregations', href: '/docs/api/eval-tasks/eval-task-aggregations' },
             ]
           },
           {

diff --git a/src/pages/docs/api/eval-tasks/eval-task-aggregations.mdx b/src/pages/docs/api/eval-tasks/eval-task-aggregations.mdx
@@ -0,0 +1,125 @@
+---
+title: "Eval Task Aggregations"
+description: "Aggregate eval-task results as per-eval rollups, per-span pivots, or both."
+---
+
+<ApiPlayground
+  method="GET"
+  endpoint="/tracer/eval-task/get_usage/"
+  baseUrl="https://api.futureagi.com"
+  parameters={[
+    {"name": "eval_task_id", "in": "query", "required": true, "description": "UUID of the eval task to aggregate.", "type": "string"},
+    {"name": "eval_aggregation", "in": "query", "required": false, "description": "When true, returns the per-eval rollup keyed by eval name.", "type": "boolean"},
+    {"name": "span_aggregation", "in": "query", "required": false, "description": "When true, returns the per-span pivot keyed by span ID.", "type": "boolean"}
+  ]}
+  responseExample={{
+    eval_task_id: "a1b2c3d4-e5f6-7890-abcd-ef1234567890",
+    eval_aggregation: {
+      Faithfulness: {
+        id: "11111111-1111-1111-1111-111111111111",
+        name: "Faithfulness",
+        output_type: "percentage",
+        aggregated_score: 0.7421
+      },
+      "Toxicity Check": {
+        id: "22222222-2222-2222-2222-222222222222",
+        name: "Toxicity Check",
+        output_type: "pass_fail",
+        aggregated_score: 87.5
+      },
+      Sentiment: {
+        id: "33333333-3333-3333-3333-333333333333",
+        name: "Sentiment",
+        output_type: "deterministic",
+        aggregated_score: { positive: 62.5, neutral: 25.0, negative: 12.5 }
+      }
+    },
+    span_aggregation: {
+      "span_abc123": {
+        Faithfulness:   { id: "11111111-1111-1111-1111-111111111111", name: "Faithfulness",   output_type: "percentage",    value: 0.82 },
+        "Toxicity Check": { id: "22222222-2222-2222-2222-222222222222", name: "Toxicity Check", output_type: "pass_fail",     value: true },
+        Sentiment:      { id: "33333333-3333-3333-3333-333333333333", name: "Sentiment",      output_type: "deterministic", value: ["positive"] }
+      },
+      "span_def456": {
+        Faithfulness: { id: "11111111-1111-1111-1111-111111111111", name: "Faithfulness", output_type: "percentage", value: 0.31 }
+      }
+    }
+  }}
+  responseStatus={200}
+  responseStatusText="OK"
+/>
+
+<ApiSection title="Authentication">
+  <ParamField name="X-Api-Key" type="API Key" required>
+    Your Future AGI API key used to authenticate requests. You can find and manage your API keys in the [Dashboard](https://app.futureagi.com) under Settings.
+  </ParamField>
+  <ParamField name="X-Secret-Key" type="Secret Key" required>
+    Your Future AGI secret key, used alongside the API key for request authentication. This is generated when you create an API key in the [Dashboard](https://app.futureagi.com).
+  </ParamField>
+</ApiSection>
+
+<ApiSection title="Query parameters">
+  <ParamField query="eval_task_id" type="UUID" required>
+    The eval task whose runs should be aggregated.
+  </ParamField>
+  <ParamField query="eval_aggregation" type="boolean" optional>
+    When `true`, the response includes the `eval_aggregation` object — one rollup per `CustomEvalConfig` that ran in the task, keyed by eval name. Defaults to `false`. At least one of `eval_aggregation` or `span_aggregation` must be `true`.
+  </ParamField>
+  <ParamField query="span_aggregation" type="boolean" optional>
+    When `true`, the response includes the `span_aggregation` object — one entry per span the task evaluated, keyed by `span_id`, with the raw value of every eval that touched it. Defaults to `false`. At least one of `eval_aggregation` or `span_aggregation` must be `true`.
+  </ParamField>
+</ApiSection>
+
+<ApiSection title="Response" status={200} statusText="OK">
+  <ResponseField name="eval_task_id" type="string">UUID of the eval task that was aggregated. Echoed back from the request.</ResponseField>
+
+  <ResponseField name="eval_aggregation" type="object">
+    Per-eval rollup. Present only when `eval_aggregation=true`. Keys are `CustomEvalConfig` names; values are one rollup object per eval.
+    <ApiCollapsible title="Show eval rollup properties">
+      <ResponseField name="id" type="string">UUID of the eval config.</ResponseField>
+      <ResponseField name="name" type="string">Eval config name (same as the parent key).</ResponseField>
+      <ResponseField name="output_type" type="string">Normalised output type for the eval: `percentage`, `pass_fail`, or `deterministic`. Drives the shape of `aggregated_score`.</ResponseField>
+      <ResponseField name="aggregated_score" type="number | object | null">
+        The eval-level rollup. Shape depends on `output_type`:
+        <br />• **`percentage`** — `number` (average across non-error runs, rounded to 4 decimal places, e.g. `0.7421`).
+        <br />• **`pass_fail`** — `number` (pass rate as a percentage from `0` to `100`, rounded to 2 decimal places, e.g. `87.5`).
+        <br />• **`deterministic`** — `object` mapping each observed choice to its occurrence percentage (`0`–`100`, rounded to 2 decimal places), e.g. `{"positive": 62.5, "neutral": 25.0, "negative": 12.5}`. Only choices that actually appeared in the data are included.
+        <br />`null` when no aggregatable rows exist (all errors / empty).
+      </ResponseField>
+    </ApiCollapsible>
+  </ResponseField>
+
+  <ResponseField name="span_aggregation" type="object">
+    Per-span pivot. Present only when `span_aggregation=true`. Outer keys are `span_id` (one per span the task evaluated); inner keys are eval names; inner values are one entry per eval that touched the span.
+    <ApiCollapsible title="Show span entry properties">
+      <ResponseField name="id" type="string">UUID of the eval config.</ResponseField>
+      <ResponseField name="name" type="string">Eval config name.</ResponseField>
+      <ResponseField name="output_type" type="string">Normalised output type for the eval: `percentage`, `pass_fail`, or `deterministic`. Drives the shape of `value`.</ResponseField>
+      <ResponseField name="value" type="number | boolean | array | null">
+        The raw per-row eval result — **no averaging**. Shape depends on `output_type`:
+        <br />• **`percentage`** — `number` (e.g. `0.82`).
+        <br />• **`pass_fail`** — `boolean`.
+        <br />• **`deterministic`** — `array` of choice strings (e.g. `["positive"]`).
+        <br />When the same `(span, eval)` pair has been evaluated more than once (for example, after a re-run), only the most recent run by `created_at` is returned.
+      </ResponseField>
+    </ApiCollapsible>
+  </ResponseField>
+</ApiSection>
+
+<Note>
+  Soft-deleted eval runs are skipped in both aggregations so the rollups reflect the user's current view of the data.
+
+  `span_aggregation` includes only eval runs that target a span. Eval runs that target a session or a trace (where there is no underlying span) are not included.
+</Note>
+
+<ApiSection title="Errors">
+  <ParamField name="400" type="Bad Request">
+    `eval_task_id` is missing, or no eval task with that ID exists in the caller's organization.
+  </ParamField>
+  <ParamField name="401" type="Unauthorized">
+    Invalid or missing API credentials.
+  </ParamField>
+  <ParamField name="500" type="Internal Server Error">
+    Unexpected server error.
+  </ParamField>
+</ApiSection>