Skip to content

Commit 7a46611

Browse files
committed
fix: implement modified swebench harness evaluation
1 parent 4177e08 commit 7a46611

File tree

3 files changed

+309
-88
lines changed

3 files changed

+309
-88
lines changed
Lines changed: 295 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,295 @@
1+
"""
2+
Largely copied from swebench/harness/modal_eval/run_evaluation_modal.py
3+
4+
Points of difference:
5+
- We added CGModalSandboxRuntime class that is used to populate the sandbox with the snapshot.
6+
- We are adding custom post-processing of the TestOutput in run_instances_modal
7+
"""
8+
9+
import json
10+
import time
11+
import traceback
12+
from typing import cast
13+
14+
import modal
15+
from swebench.harness.constants import (
16+
APPLY_PATCH_FAIL,
17+
APPLY_PATCH_PASS,
18+
SWEbenchInstance,
19+
)
20+
from swebench.harness.docker_build import setup_logger
21+
from swebench.harness.grading import get_eval_report
22+
from swebench.harness.modal_eval.run_evaluation_modal import (
23+
LOCAL_SANDBOX_ENTRYPOINT_PATH,
24+
REMOTE_SANDBOX_ENTRYPOINT_PATH,
25+
ModalSandboxRuntime,
26+
TestOutput,
27+
get_log_dir,
28+
)
29+
from swebench.harness.test_spec.test_spec import TestSpec, make_test_spec
30+
from swebench.harness.utils import EvaluationError
31+
32+
from .snapshot_manager import ModalDictSnapshotManager
33+
34+
app = modal.App.from_name("swebench-agent-run", create_if_missing=True)
35+
36+
37+
class CGModalSandboxRuntime(ModalSandboxRuntime):
    """Sandbox runtime that restores from a saved filesystem snapshot.

    Differs from the upstream ``ModalSandboxRuntime`` in that it consults a
    ``ModalDictSnapshotManager``: the first run for an instance builds the
    sandbox via the parent class and snapshots it; subsequent runs restore
    directly from the stored snapshot, skipping the image build.
    """

    def __init__(
        self,
        example: SWEbenchInstance,
        timeout: int | None = None,
        verbose: bool = True,
    ):
        # NOTE: intentionally does not call super().__init__ — the parent's
        # attributes are set up here so that _get_sandbox can use the
        # snapshot manager before the sandbox exists.
        self.example = example
        self.snapshot_manager = ModalDictSnapshotManager()
        self.test_spec = make_test_spec(example)
        self.sandbox = self._get_sandbox(timeout)
        self.verbose = verbose
        self._stream_tasks = []

        # Hack for pylint
        self.write_file("/sys/fs/cgroup/cpu/cpu.shares", "2048")

    @property
    def image(self) -> modal.Image:
        """Image for this instance, built by the upstream helper."""
        return ModalSandboxRuntime.get_instance_image(self.test_spec)

    def _get_sandbox(self, timeout: int | None = None):
        """Return a sandbox for this example, creating a snapshot on first use.

        On a snapshot-cache miss the sandbox is built by the parent class,
        snapshotted, and its snapshot uid persisted; on a hit the sandbox is
        restored directly from the snapshot.
        """
        uid = self.snapshot_manager.get_snapshot_uid(self.example)
        if uid is None:
            sandbox = super()._get_sandbox(timeout)
            snapshot = sandbox._experimental_snapshot()
            self.snapshot_manager.save_snapshot_uid(self.example, snapshot.object_id)
            # BUG FIX: the original fell through here and implicitly returned
            # None on the first (cache-miss) path, leaving self.sandbox unset.
            # The freshly built sandbox must be returned.
            return sandbox
        return modal.Sandbox._experimental_from_snapshot(uid)
70+
71+
@app.function(
    image=modal.Image.debian_slim(python_version="3.13").add_local_file(
        LOCAL_SANDBOX_ENTRYPOINT_PATH,
        REMOTE_SANDBOX_ENTRYPOINT_PATH,
    ),
    timeout=120 * 60,  # Much larger than default timeout to account for image build time
)
def run_instance_modal(
    test_spec: TestSpec,
    pred: dict,
    run_id: str,
    timeout: int | None = None,
) -> TestOutput:
    """
    Run a single instance with the given prediction.

    Applies the predicted patch inside a Modal sandbox, runs the instance's
    eval script, grades the test output, and returns a TestOutput. On any
    failure a TestOutput with errored=True is returned instead of raising,
    except for sandbox-creation errors and timeouts which raise
    EvaluationError.

    Args:
        test_spec (TestSpec): TestSpec instance
        pred (dict): Prediction w/ model_name_or_path, model_patch, instance_id
        run_id (str): Run ID
        timeout (int): Timeout for running tests
    """
    instance_id = test_spec.instance_id
    log_dir = get_log_dir(pred, run_id, instance_id)
    log_dir.mkdir(parents=True, exist_ok=True)

    log_file = log_dir / "run_instance.log"

    logger = setup_logger(instance_id, log_file, add_stdout=True)

    # NOTE(review): this uses the upstream ModalSandboxRuntime, not the
    # snapshot-aware CGModalSandboxRuntime defined in this module (which
    # takes a SWEbenchInstance rather than a TestSpec) — confirm whether
    # the snapshot path was meant to be wired in here.
    try:
        runner = ModalSandboxRuntime(test_spec, timeout)
    except Exception as e:
        print(f"Error creating sandbox: {e}")
        raise EvaluationError(
            instance_id,
            f"Error creating sandbox: {e}",
            logger,
        ) from e

    patch_diff = pred.get("model_patch", "")

    try:
        # Write the model's patch into the sandbox and try to apply it.
        patch_file = "/tmp/patch.diff"
        runner.write_file(patch_file, patch_diff)

        apply_patch_output, returncode = runner.exec(
            "cd /testbed && git apply -v /tmp/patch.diff",
        )

        if returncode != 0:
            # git apply failed — retry with GNU patch, which is more
            # tolerant of fuzz/offset drift.
            logger.info("Failed to apply patch to container, trying again...")

            apply_patch_output, returncode = runner.exec(
                "cd /testbed && patch --batch --fuzz=5 -p1 -i /tmp/patch.diff",
            )

            if returncode != 0:
                logger.info(f"{APPLY_PATCH_FAIL}:\n{apply_patch_output}")
                raise EvaluationError(
                    instance_id,
                    f"{APPLY_PATCH_FAIL}:\n{apply_patch_output}",
                    logger,
                )
            else:
                logger.info(f"{APPLY_PATCH_PASS}:\n{apply_patch_output}")
        else:
            logger.info(f"{APPLY_PATCH_PASS}:\n{apply_patch_output}")

        # Get git diff before running eval script
        git_diff_output_before, returncode = runner.exec(
            "cd /testbed && git diff",
        )
        logger.info(f"Git diff before:\n{git_diff_output_before}")

        eval_file = "/root/eval.sh"
        eval_script = test_spec.eval_script
        # django hack
        eval_script = eval_script.replace("locale-gen", "locale-gen en_US.UTF-8")
        runner.write_file(eval_file, eval_script)

        start_time = time.time()

        # Build the eval command as one shell pipeline so every step shares
        # the /testbed working directory.
        run_command = "cd /testbed"
        # pylint hack
        if "pylint" in test_spec.instance_id:
            run_command += " && PYTHONPATH="
        # increase recursion limit for testing
        run_command += " && python3 -c 'import sys; sys.setrecursionlimit(10000)'"
        # run eval script
        run_command += " && /bin/bash /root/eval.sh"
        test_output, returncode = runner.exec(run_command)

        total_runtime = time.time() - start_time

        # Persist the raw test output for grading and debugging.
        test_output_path = log_dir / "test_output.txt"
        logger.info(f"Test runtime: {total_runtime:_.2f} seconds")
        with open(test_output_path, "w") as f:
            f.write(test_output)
        logger.info(f"Test output for {instance_id} written to {test_output_path}")
        print(f"Test output for {instance_id} written to {test_output_path}")

        # Get git diff after running eval script
        git_diff_output_after, returncode = runner.exec("cd /testbed && git diff")

        # Check if git diff changed after running eval script
        logger.info(f"Git diff after:\n{git_diff_output_after}")
        if git_diff_output_after != git_diff_output_before:
            logger.info("Git diff changed after running eval script")

        # Get report from test output
        logger.info(f"Grading answer for {instance_id}...")
        report = get_eval_report(
            test_spec=test_spec,
            prediction=pred,
            test_log_path=test_output_path,
            include_tests_status=True,
        )
        logger.info(f"report: {report}\nResult for {instance_id}: resolved: {report[instance_id]['resolved']}")

        return TestOutput(
            instance_id=instance_id,
            test_output=test_output,
            report_json_str=json.dumps(report, indent=4),
            run_instance_log=log_file.read_text(),
            patch_diff=patch_diff,
            log_dir=log_dir,
            errored=False,
        )
    except modal.exception.SandboxTimeoutError as e:
        # Sandbox hit its wall-clock limit; surface as an evaluation error.
        raise EvaluationError(
            instance_id,
            f"Test timed out after {timeout} seconds.",
            logger,
        ) from e
    except EvaluationError:
        # Known evaluation failure (e.g. patch did not apply): log the
        # traceback and return an errored TestOutput instead of raising.
        error_msg = traceback.format_exc()
        logger.info(error_msg)
        return TestOutput(
            instance_id=instance_id,
            test_output="",
            report_json_str="",
            run_instance_log=log_file.read_text(),
            patch_diff=patch_diff,
            log_dir=log_dir,
            errored=True,
        )
    except Exception as e:
        # Unexpected failure: record full context and return an errored
        # TestOutput so the batch run can continue.
        error_msg = f"Error in evaluating model for {instance_id}: {e}\n{traceback.format_exc()}\nCheck ({logger.log_file}) for more information."
        logger.error(error_msg)
        return TestOutput(
            instance_id=instance_id,
            test_output="",
            report_json_str="",
            run_instance_log=log_file.read_text(),
            patch_diff=patch_diff,
            log_dir=log_dir,
            errored=True,
        )
230+
231+
232+
def run_instances_modal(
    predictions: dict,
    instances: list,
    full_dataset: list,
    run_id: str,
    timeout: int,
):
    """
    Run all instances for the given predictions on Modal.

    Instances whose log directory already exists are skipped; the rest are
    fanned out to Modal workers via ``run_instance_modal.starmap`` and their
    results drained.

    Args:
        predictions (dict): Predictions dict generated by the model, keyed by instance_id
        instances (list): List of instances to evaluate
        full_dataset (list): Full dataset; currently unused, kept for
            interface compatibility with the upstream swebench harness
        run_id (str): Run ID
        timeout (int): Timeout for running tests
    """
    test_specs = list(map(make_test_spec, instances))

    with modal.enable_output(), app.run():
        # Skip instances that have already been run (log dir exists).
        run_test_specs = [
            test_spec
            for test_spec in test_specs
            if not get_log_dir(predictions[test_spec.instance_id], run_id, test_spec.instance_id).exists()
        ]

        if run_test_specs:
            # Run instances that haven't been run yet
            results = run_instance_modal.starmap(
                [
                    (
                        test_spec,
                        predictions[test_spec.instance_id],
                        run_id,
                        timeout,
                    )
                    for test_spec in run_test_specs
                ],
            )

            # Drain the lazy starmap generator so all remote calls complete.
            # Outputs are currently discarded.
            for result in results:
                result = cast(TestOutput, result)
                # TODO: DO SOMETHING WITH OUTPUTS AND LOGS.
                # TODO: SAVE THINGS TO POSTGRESQL FOR DASHBOARD

codegen-examples/examples/swebench_agent_run/sandbox.py

Lines changed: 0 additions & 74 deletions
This file was deleted.

0 commit comments

Comments
 (0)