Skip to content

Commit 6201cce

Browse files
committed
Scaffold wave-1 org task expansion
1 parent 25cc997 commit 6201cce

File tree

434 files changed

+48954
-35
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

434 files changed

+48954
-35
lines changed
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
FROM ubuntu:22.04
2+
3+
ENV DEBIAN_FRONTEND=noninteractive
4+
5+
# Base tools.
# Each package is listed exactly once, one per line, in alphabetical order so
# duplicates are easy to spot. The apt package lists are removed in the same
# layer that downloads them so they never end up in the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
    ca-certificates \
    curl \
    git \
    python3 \
    python3-pip \
    && rm -rf /var/lib/apt/lists/*
13+
14+
WORKDIR /workspace
15+
16+
# Clone local checkout repos (baseline config: agent has local access to these)
17+
RUN git clone --depth 1 https://github.com/fastapi/fastapi /workspace/fastapi
18+
RUN git clone --depth 1 https://github.com/psf/requests /workspace/requests
19+
20+
# Initialize git identity for agent commits.
# The settings go into the system-wide git config instead of root's personal
# ~/.gitconfig, so they apply to every user in the container, including the
# non-root claude user that this Dockerfile creates in a later step.
# safe.directory '*' disables git's repository-ownership check for all paths.
RUN git config --system user.email "agent@example.com" && \
    git config --system user.name "Agent" && \
    git config --system safe.directory '*'
24+
25+
# Create log directories
26+
RUN mkdir -p /logs/agent /logs/verifier
27+
28+
# Pre-create claude user and set ownership at build time so Harbor's
29+
# runtime chown is a no-op (avoids 15-30 min delay on large repos).
30+
RUN (adduser --disabled-password --gecos '' claude 2>/dev/null || true) && \
31+
for d in /workspace /app /testbed /logs; do [ -d "$d" ] && chown -R claude:claude "$d"; done || true
32+
33+
ENTRYPOINT []
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
# ccx-incident-297 — artifact_baseline variant
2+
# Baseline with local code + artifact mode (verifier parses answer.json).
3+
4+
FROM ubuntu:22.04
5+
6+
ENV DEBIAN_FRONTEND=noninteractive
7+
8+
# Base tools.
# Each package is listed exactly once, one per line, in alphabetical order so
# duplicates are easy to spot. The apt package lists are removed in the same
# layer that downloads them so they never end up in the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
    ca-certificates \
    curl \
    git \
    python3 \
    python3-pip \
    && rm -rf /var/lib/apt/lists/*
16+
17+
WORKDIR /workspace
18+
19+
# Clone local checkout repos (baseline config: agent has local access to these)
20+
RUN git clone --depth 1 https://github.com/fastapi/fastapi /workspace/fastapi
21+
RUN git clone --depth 1 https://github.com/psf/requests /workspace/requests
22+
23+
# Initialize git identity for agent commits.
# The settings go into the system-wide git config instead of root's personal
# ~/.gitconfig, so they apply to every user in the container, including the
# non-root claude user that this Dockerfile creates in a later step.
# safe.directory '*' disables git's repository-ownership check for all paths.
RUN git config --system user.email "agent@example.com" && \
    git config --system user.name "Agent" && \
    git config --system safe.directory '*'
27+
28+
# Create log directories
29+
RUN mkdir -p /logs/agent /logs/verifier
30+
31+
# Pre-create claude user and set ownership at build time so Harbor's
32+
# runtime chown is a no-op (avoids 15-30 min delay on large repos).
33+
RUN (adduser --disabled-password --gecos '' claude 2>/dev/null || true) && \
34+
for d in /workspace /app /testbed /logs; do [ -d "$d" ] && chown -R claude:claude "$d"; done || true
35+
36+
# Mark artifact-only mode — verifier parses answer.json
37+
RUN touch /tmp/.artifact_only_mode
38+
39+
ENTRYPOINT []
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# ccx-incident-297 — artifact_only variant
2+
# No local repo clone — agent uses Sourcegraph MCP exclusively for code access.
3+
# Agent produces answer.json artifact; verifier scores the artifact.
4+
5+
FROM ubuntu:22.04
6+
7+
ENV DEBIAN_FRONTEND=noninteractive
8+
ENV SOURCEGRAPH_REPOS="fastapi/fastapi,psf/requests"
9+
10+
# Base tools (one package per line, alphabetical). The apt package lists are
# deleted in the same layer that fetched them.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        git \
        python3 \
    && rm -rf /var/lib/apt/lists/*
16+
17+
WORKDIR /workspace
18+
19+
# Empty workspace — agent discovers code via MCP tools only
20+
RUN git init && \
21+
git config user.email "agent@example.com" && \
22+
git config user.name "Agent" && \
23+
git config --global safe.directory '*'
24+
25+
# Create log directories
26+
RUN mkdir -p /logs/agent /logs/verifier
27+
28+
# Mark artifact-only mode — verifiers and eval scripts check this flag
29+
RUN touch /tmp/.artifact_only_mode
30+
31+
ENTRYPOINT []
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# CCX-incident-297 — sg_only variant
2+
# No local repo clone — agent uses Sourcegraph MCP exclusively for code access.
3+
# The verifier clones mirror repos at verification time (no /repo_full/ backup).
4+
5+
FROM ubuntu:22.04
6+
7+
ENV DEBIAN_FRONTEND=noninteractive
8+
ENV SOURCEGRAPH_REPOS="fastapi/fastapi,psf/requests"
9+
10+
# Base tools (one package per line, alphabetical). The apt package lists are
# deleted in the same layer that fetched them.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        git \
        python3 \
    && rm -rf /var/lib/apt/lists/*
16+
17+
WORKDIR /workspace
18+
19+
# Empty workspace — agent discovers code via MCP tools only
20+
RUN git init && \
21+
git config user.email "agent@example.com" && \
22+
git config user.name "Agent" && \
23+
git config --global safe.directory '*'
24+
25+
# Create log directories
26+
RUN mkdir -p /logs/agent /logs/verifier
27+
28+
# Mark sg_only mode — verifiers and eval scripts check this flag
29+
RUN touch /tmp/.sg_only_mode
30+
31+
# Pre-create claude user and set ownership at build time so Harbor's
32+
# runtime chown is a no-op (avoids 15-30 min delay on large repos).
33+
RUN (adduser --disabled-password --gecos '' claude 2>/dev/null || true) && \
34+
for d in /workspace /app /testbed /logs; do [ -d "$d" ] && chown -R claude:claude "$d"; done || true
35+
36+
RUN echo '{"workdir":"/workspace","repos":[{"mirror":"fastapi/fastapi","target_dir":"fastapi"},{"mirror":"psf/requests","target_dir":"requests"}]}' > /tmp/.sg_only_clone_manifest.json
37+
38+
ENTRYPOINT []
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
# FastAPI 422 Validation Error Trace Across Client and Server
2+
3+
## Your Task
4+
5+
A POST request sent with requests returns HTTP 422 from a FastAPI service. Find the Python source files across fastapi/fastapi and psf/requests that (1) serialize and send the request body on the client side, (2) parse request bodies and trigger validation in FastAPI, and (3) construct the validation error response returned to the caller.
6+
7+
## Context
8+
9+
You are working on a codebase task involving repos from the incident domain.
10+
11+
## Available Resources
12+
13+
The local `/workspace/` directory contains: fastapi/fastapi, psf/requests.
14+
15+
## Output Format
16+
17+
Create a file at `/workspace/answer.json` with your findings in the following structure:
18+
19+
```json
20+
{
21+
"files": [
22+
{"repo": "org/repo-name", "path": "relative/path/to/file.py"}
23+
],
24+
"symbols": [
25+
{"repo": "org/repo-name", "path": "relative/path/to/file.py", "symbol": "SymbolName"}
26+
],
27+
"chain": [
28+
{"repo": "org/repo-name", "path": "relative/path/to/file.py", "symbol": "FunctionName"}
29+
],
30+
"text": "Narrative explanation of your findings, citing repos and file paths."
31+
}
32+
```
33+
34+
Include only the fields relevant to this task. Your answer is evaluated against a closed-world oracle — completeness matters.
35+
36+
## Evaluation
37+
38+
Your answer will be scored on:
39+
- **File recall and precision**: Did you find all relevant files?
Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
# IMPORTANT: Source Code Access
2+
3+
**Local source files are not present.** Your workspace does not contain source code. You **MUST** use Sourcegraph MCP tools to discover, read, and understand code before making any changes.
4+
5+
**Target Repositories (version-pinned mirrors):**
6+
7+
- `github.com/fastapi/fastapi` — use `repo:^github.com/fastapi/fastapi$` filter
8+
- `github.com/psf/requests` — use `repo:^github.com/psf/requests$` filter
9+
10+
Scope ALL keyword_search/nls_search queries to these repos.
11+
Use the repo name as the `repo` parameter for read_file/go_to_definition/find_references.
12+
13+
14+
## Required Workflow
15+
16+
1. **Search first** — Use MCP tools to find relevant files and understand existing patterns
17+
2. **Read remotely** — Use `sg_read_file` to read full file contents from Sourcegraph
18+
3. **Edit locally** — Use Edit, Write, and Bash to create or modify files in your working directory
19+
4. **Verify locally** — Run tests with Bash to check your changes
20+
21+
## Tool Selection
22+
23+
| Goal | Tool |
24+
|------|------|
25+
| Exact symbol/string | `sg_keyword_search` |
26+
| Concepts/semantic search | `sg_nls_search` |
27+
| Trace usage/callers | `sg_find_references` |
28+
| See implementation | `sg_go_to_definition` |
29+
| Read full file | `sg_read_file` |
30+
| Browse structure | `sg_list_files` |
31+
| Find repos | `sg_list_repos` |
32+
| Search commits | `sg_commit_search` |
33+
| Track changes | `sg_diff_search` |
34+
| Compare versions | `sg_compare_revisions` |
35+
36+
**Decision logic:**
37+
1. Know the exact symbol? -> `sg_keyword_search`
38+
2. Know the concept, not the name? -> `sg_nls_search`
39+
3. Need definition of a symbol? -> `sg_go_to_definition`
40+
4. Need all callers/references? -> `sg_find_references`
41+
5. Need full file content? -> `sg_read_file`
42+
43+
## Scoping (Always Do This)
44+
45+
```
46+
repo:^github.com/ORG/REPO$ # Exact repo (preferred)
47+
repo:github.com/ORG/ # All repos in org
48+
file:.*\.ts$ # TypeScript only
49+
file:src/api/ # Specific directory
50+
```
51+
52+
Start narrow. Expand only if results are empty.
53+
54+
## Efficiency Rules
55+
56+
- Chain searches logically: search -> read -> references -> definition
57+
- Don't re-search for the same pattern; use results from prior calls
58+
- Prefer `sg_keyword_search` over `sg_nls_search` when you have exact terms
59+
- Read 2-3 related files before synthesising, rather than one at a time
60+
- Don't read 20+ remote files without writing code — once you understand the pattern, start implementing
61+
62+
## If Stuck
63+
64+
If MCP search returns no results:
65+
1. Broaden the search query (synonyms, partial identifiers)
66+
2. Try `sg_nls_search` for semantic matching
67+
3. Use `sg_list_files` to browse the directory structure
68+
4. Use `sg_list_repos` to verify the repository name
69+
70+
---
71+
72+
**Sourcegraph Repositories:** `github.com/fastapi/fastapi`, `github.com/psf/requests`
73+
74+
# FastAPI 422 Validation Error Trace Across Client and Server
75+
76+
## Your Task
77+
78+
A POST request sent with requests returns HTTP 422 from a FastAPI service. Find the Python source files across fastapi/fastapi and psf/requests that (1) serialize and send the request body on the client side, (2) parse request bodies and trigger validation in FastAPI, and (3) construct the validation error response returned to the caller.
79+
80+
## Context
81+
82+
You are working on a codebase task involving repos from the incident domain.
83+
84+
## Available Resources
85+
86+
The repositories fastapi/fastapi and psf/requests are available only through the Sourcegraph MCP tools; the local `/workspace/` directory does not contain their source.
87+
88+
## Output Format
89+
90+
Create a file at `/workspace/answer.json` with your findings in the following structure:
91+
92+
```json
93+
{
94+
"files": [
95+
{"repo": "org/repo-name", "path": "relative/path/to/file.py"}
96+
],
97+
"symbols": [
98+
{"repo": "org/repo-name", "path": "relative/path/to/file.py", "symbol": "SymbolName"}
99+
],
100+
"chain": [
101+
{"repo": "org/repo-name", "path": "relative/path/to/file.py", "symbol": "FunctionName"}
102+
],
103+
"text": "Narrative explanation of your findings, citing repos and file paths."
104+
}
105+
```
106+
107+
Include only the fields relevant to this task. Your answer is evaluated against a closed-world oracle — completeness matters.
108+
109+
## Evaluation
110+
111+
Your answer will be scored on:
112+
- **File recall and precision**: Did you find all relevant files?
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
version = "1.0"
2+
3+
[metadata]
4+
name = "CCX-incident-297"
5+
description = "FastAPI 422 Validation Error Trace Across Client and Server"
6+
license = "Apache-2.0"
7+
8+
[task]
9+
id = "CCX-incident-297"
10+
repo = "fastapi/fastapi"
11+
category = "incident-debug"
12+
language = "python"
13+
difficulty = "hard"
14+
time_limit_sec = 900
15+
mcp_suite = "csb_org_incident"
16+
use_case_id = 297
17+
repo_set_id = "fastapi-requests"
18+
mcp_unique = true
19+
verification_modes = ["artifact"]
20+
21+
[verification]
22+
type = "test"
23+
command = "bash /tests/test.sh"
24+
25+
reward_type = "score"
26+
description = "FastAPI 422 Validation Error Trace Across Client and Server"
27+
28+
[environment]
29+
build_timeout_sec = 600.0
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
#!/bin/bash
2+
# eval.sh — MCP-unique benchmark evaluator for CCX-incident-297
3+
# Exit-code-first (SWE-Factory pattern):
4+
# exit 0 — agent produced useful output (composite score > 0)
5+
# exit 1 — total failure (composite score == 0 or missing answer)
6+
#
7+
# Writes /logs/verifier/reward.txt with the composite score [0.0, 1.0]
8+
9+
set -euo pipefail
10+
11+
TASK_ID="CCX-incident-297"
12+
ANSWER_PATH="/workspace/answer.json"
13+
TASK_SPEC_PATH="/tests/task_spec.json"
14+
ORACLE_CHECKS="/tests/oracle_checks.py"
15+
REWARD_PATH="/logs/verifier/reward.txt"
16+
17+
mkdir -p /logs/verifier
18+
19+
echo "=== CCX-incident-297 evaluator ==="
20+
echo "Task spec: $TASK_SPEC_PATH"
21+
echo "Answer: $ANSWER_PATH"
22+
echo ""
23+
24+
# sg_only mode guard: restore full repo if verifier wrapper exists
25+
if [ -f /tmp/.sg_only_mode ] && [ -f /tests/sgonly_verifier_wrapper.sh ]; then
26+
echo "sg_only mode: sourcing verifier wrapper..."
27+
source /tests/sgonly_verifier_wrapper.sh
28+
fi
29+
30+
# Verify answer file exists
31+
if [ ! -f "$ANSWER_PATH" ]; then
32+
echo "ERROR: answer.json not found at $ANSWER_PATH"
33+
echo "0.0" > "$REWARD_PATH"
34+
exit 1
35+
fi
36+
37+
# Validate answer is valid JSON.
# The path is handed to Python as a command-line argument rather than being
# spliced into the Python source, so quote characters in the path cannot break
# (or alter) the check. Any parse or I/O error makes python3 exit non-zero,
# which is reported as an invalid answer with a reward of 0.0.
if ! python3 -c 'import json, sys; json.load(open(sys.argv[1]))' "$ANSWER_PATH" 2>/dev/null; then
    echo "ERROR: answer.json is not valid JSON"
    echo "0.0" > "$REWARD_PATH"
    exit 1
fi
43+
44+
echo "answer.json found and valid JSON"
45+
46+
# Run oracle checks
47+
if [ ! -f "$ORACLE_CHECKS" ]; then
48+
echo "ERROR: oracle_checks.py not found at $ORACLE_CHECKS"
49+
echo "0.0" > "$REWARD_PATH"
50+
exit 1
51+
fi
52+
53+
echo "Running oracle checks..."
54+
SCORE=$(python3 "$ORACLE_CHECKS" --answer "$ANSWER_PATH" --spec "$TASK_SPEC_PATH" --verbose 2>&1 | tee /dev/stderr | tail -1) || true
55+
56+
# Validate the score: it must parse as a float AND lie inside the documented
# [0.0, 1.0] range. The range comparison also rejects nan and +/-inf, which
# float() alone would accept and which would otherwise be written to the
# reward file. The value is passed as an argument instead of being spliced
# into the Python source.
if ! python3 -c 'import sys; s = float(sys.argv[1]); sys.exit(0 if 0.0 <= s <= 1.0 else 1)' "$SCORE" 2>/dev/null; then
    echo "ERROR: oracle_checks.py did not return a valid score: $SCORE"
    echo "0.0" > "$REWARD_PATH"
    exit 1
fi

echo ""
echo "Composite score: $SCORE"
echo "$SCORE" > "$REWARD_PATH"

# Exit based on score (SWE-Factory exit-code-first pattern):
# exit 0 when the composite score is positive, exit 1 when it is zero.
python3 -c 'import sys; sys.exit(0 if float(sys.argv[1]) > 0 else 1)' "$SCORE"

0 commit comments

Comments
 (0)