Skip to content

Commit 7cf4f81

Browse files
committed
fix: implement modified swebench harness evaluation
1 parent 4177e08 commit 7cf4f81

File tree

3 files changed

+314
-88
lines changed

3 files changed

+314
-88
lines changed
Lines changed: 300 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,300 @@
1+
"""
2+
Largely copied from swebench/harness/modal_eval/run_evaluation_modal.py
3+
4+
Points of difference:
5+
- We added CGModalSandboxRuntime class that is used to populate the sandbox with the snapshot.
6+
- We are adding custom post-processing of the TestOutput in run_instances_modal
7+
"""
8+
9+
import json
10+
import time
11+
import traceback
12+
from typing import cast
13+
14+
import modal
15+
from swebench.harness.constants import (
16+
APPLY_PATCH_FAIL,
17+
APPLY_PATCH_PASS,
18+
SWEbenchInstance,
19+
)
20+
from swebench.harness.docker_build import setup_logger
21+
from swebench.harness.grading import get_eval_report
22+
from swebench.harness.modal_eval.run_evaluation_modal import (
23+
LOCAL_SANDBOX_ENTRYPOINT_PATH,
24+
REMOTE_SANDBOX_ENTRYPOINT_PATH,
25+
ModalSandboxRuntime,
26+
TestOutput,
27+
get_log_dir,
28+
)
29+
from swebench.harness.test_spec.test_spec import TestSpec, make_test_spec
30+
from swebench.harness.utils import EvaluationError
31+
32+
from .snapshot_manager import ModalDictSnapshotManager
33+
34+
app = modal.App.from_name("swebench-agent-run", create_if_missing=True)
35+
36+
37+
class CGModalSandboxRuntime(ModalSandboxRuntime):
    """Sandbox runtime that restores from a saved snapshot when one exists.

    Unlike the upstream ``ModalSandboxRuntime``, this class populates the
    sandbox itself: on first use it builds a sandbox from the instance image,
    snapshots it, and persists the snapshot uid via
    ``ModalDictSnapshotManager``; later runs restore directly from that
    snapshot, skipping the image build.
    """

    def __init__(
        self,
        example: SWEbenchInstance | TestSpec,
        timeout: int | None = None,
        verbose: bool = True,
    ):
        """
        Args:
            example: SWE-bench instance to run, or an already-built TestSpec
                (callers such as ``run_instance_modal`` hold a TestSpec).
            timeout: Sandbox timeout in seconds; ``None`` uses Modal's default.
            verbose: Whether to stream sandbox output.
        """
        self.example = example
        self.snapshot_manager = ModalDictSnapshotManager()
        # Accept a pre-built TestSpec as well as a raw instance; previously a
        # TestSpec argument would have been passed to make_test_spec again.
        # NOTE(review): snapshot_manager keys on ``example`` — confirm it
        # accepts both forms.
        self.test_spec = (
            example if isinstance(example, TestSpec) else make_test_spec(example)
        )
        self.sandbox = self._get_sandbox(timeout)
        self.verbose = verbose
        self._stream_tasks = []

        # Hack for pylint
        self.write_file("/sys/fs/cgroup/cpu/cpu.shares", "2048")

    @property
    def image(self) -> modal.Image:
        """Instance image for this test spec (delegates to the upstream helper)."""
        return ModalSandboxRuntime.get_instance_image(self.test_spec)

    def _get_sandbox(self, timeout: int | None = None):
        """Create or restore the sandbox for this example.

        If no snapshot exists yet, build a fresh sandbox via the parent
        class, snapshot it, and record the snapshot uid for reuse; otherwise
        restore directly from the stored snapshot.
        """
        uid = self.snapshot_manager.get_snapshot_uid(self.example)
        if uid is None:
            sandbox = super()._get_sandbox(timeout)
            snapshot = sandbox._experimental_snapshot()
            self.snapshot_manager.save_snapshot_uid(self.example, snapshot.object_id)
            # BUG FIX: the freshly created sandbox was previously discarded —
            # the function fell through and implicitly returned None.
            return sandbox
        return modal.Sandbox._experimental_from_snapshot(uid)
69+
70+
71+
@app.function(
    image=modal.Image.debian_slim(python_version="3.13").add_local_file(
        LOCAL_SANDBOX_ENTRYPOINT_PATH,
        REMOTE_SANDBOX_ENTRYPOINT_PATH,
    ),
    timeout=120
    * 60,  # Much larger than default timeout to account for image build time
)
def run_instance_modal(
    test_spec: TestSpec,
    pred: dict,
    run_id: str,
    timeout: int | None = None,
) -> TestOutput:
    """
    Run a single instance with the given prediction.

    Creates (or restores) a sandbox, applies the model patch, executes the
    instance's eval script, grades the test output, and returns a TestOutput.
    Sandbox-creation errors and timeouts are raised as EvaluationError; all
    other failures are captured and returned as an errored TestOutput.

    Args:
        test_spec (TestSpec): TestSpec instance
        pred (dict): Prediction w/ model_name_or_path, model_patch, instance_id
        run_id (str): Run ID
        timeout (int): Timeout for running tests
    """
    instance_id = test_spec.instance_id
    log_dir = get_log_dir(pred, run_id, instance_id)
    log_dir.mkdir(parents=True, exist_ok=True)

    log_file = log_dir / "run_instance.log"

    logger = setup_logger(instance_id, log_file, add_stdout=True)

    try:
        # NOTE(review): CGModalSandboxRuntime.__init__ is annotated to take a
        # SWEbenchInstance but receives a TestSpec here — confirm the runtime
        # handles a pre-built TestSpec.
        runner = CGModalSandboxRuntime(test_spec, timeout)
    except Exception as e:
        print(f"Error creating sandbox: {e}")
        # Sandbox creation failures are fatal for this instance: re-raise so
        # the caller sees them rather than an errored TestOutput.
        raise EvaluationError(
            instance_id,
            f"Error creating sandbox: {e}",
            logger,
        ) from e

    patch_diff = pred.get("model_patch", "")

    try:
        # Write the model patch into the sandbox and try to apply it.
        patch_file = "/tmp/patch.diff"
        runner.write_file(patch_file, patch_diff)

        apply_patch_output, returncode = runner.exec(
            "cd /testbed && git apply -v /tmp/patch.diff",
        )

        if returncode != 0:
            # Fall back to GNU patch with fuzz when `git apply` rejects the diff.
            logger.info("Failed to apply patch to container, trying again...")

            apply_patch_output, returncode = runner.exec(
                "cd /testbed && patch --batch --fuzz=5 -p1 -i /tmp/patch.diff",
            )

            if returncode != 0:
                logger.info(f"{APPLY_PATCH_FAIL}:\n{apply_patch_output}")
                raise EvaluationError(
                    instance_id,
                    f"{APPLY_PATCH_FAIL}:\n{apply_patch_output}",
                    logger,
                )
            else:
                logger.info(f"{APPLY_PATCH_PASS}:\n{apply_patch_output}")
        else:
            logger.info(f"{APPLY_PATCH_PASS}:\n{apply_patch_output}")

        # Get git diff before running eval script
        git_diff_output_before, returncode = runner.exec(
            "cd /testbed && git diff",
        )
        logger.info(f"Git diff before:\n{git_diff_output_before}")

        eval_file = "/root/eval.sh"
        eval_script = test_spec.eval_script
        # django hack
        eval_script = eval_script.replace("locale-gen", "locale-gen en_US.UTF-8")
        runner.write_file(eval_file, eval_script)

        start_time = time.time()

        run_command = "cd /testbed"
        # pylint hack
        if "pylint" in test_spec.instance_id:
            run_command += " && PYTHONPATH="
        # increase recursion limit for testing
        # NOTE(review): this runs in its own python3 process, so the raised
        # recursion limit does not carry over into eval.sh — confirm intent.
        run_command += " && python3 -c 'import sys; sys.setrecursionlimit(10000)'"
        # run eval script
        run_command += " && /bin/bash /root/eval.sh"
        test_output, returncode = runner.exec(run_command)

        total_runtime = time.time() - start_time

        test_output_path = log_dir / "test_output.txt"
        logger.info(f"Test runtime: {total_runtime:_.2f} seconds")
        with open(test_output_path, "w") as f:
            f.write(test_output)
        logger.info(f"Test output for {instance_id} written to {test_output_path}")
        print(f"Test output for {instance_id} written to {test_output_path}")

        # Get git diff after running eval script
        git_diff_output_after, returncode = runner.exec("cd /testbed && git diff")

        # Check if git diff changed after running eval script
        logger.info(f"Git diff after:\n{git_diff_output_after}")
        if git_diff_output_after != git_diff_output_before:
            logger.info("Git diff changed after running eval script")

        # Get report from test output
        logger.info(f"Grading answer for {instance_id}...")
        report = get_eval_report(
            test_spec=test_spec,
            prediction=pred,
            test_log_path=test_output_path,
            include_tests_status=True,
        )
        logger.info(
            f"report: {report}\nResult for {instance_id}: resolved: {report[instance_id]['resolved']}"
        )

        return TestOutput(
            instance_id=instance_id,
            test_output=test_output,
            report_json_str=json.dumps(report, indent=4),
            run_instance_log=log_file.read_text(),
            patch_diff=patch_diff,
            log_dir=log_dir,
            errored=False,
        )
    except modal.exception.SandboxTimeoutError as e:
        # Timeouts propagate as EvaluationError so callers can distinguish
        # them from ordinary eval failures.
        raise EvaluationError(
            instance_id,
            f"Test timed out after {timeout} seconds.",
            logger,
        ) from e
    except EvaluationError:
        # Patch-application failure (raised above): record and return an
        # errored TestOutput instead of propagating.
        error_msg = traceback.format_exc()
        logger.info(error_msg)
        return TestOutput(
            instance_id=instance_id,
            test_output="",
            report_json_str="",
            run_instance_log=log_file.read_text(),
            patch_diff=patch_diff,
            log_dir=log_dir,
            errored=True,
        )
    except Exception as e:
        # Catch-all boundary: any other failure also becomes an errored
        # TestOutput so one bad instance does not abort the whole batch.
        error_msg = f"Error in evaluating model for {instance_id}: {e}\n{traceback.format_exc()}\nCheck ({logger.log_file}) for more information."
        logger.error(error_msg)
        return TestOutput(
            instance_id=instance_id,
            test_output="",
            report_json_str="",
            run_instance_log=log_file.read_text(),
            patch_diff=patch_diff,
            log_dir=log_dir,
            errored=True,
        )
233+
234+
235+
def run_instances_modal(
    predictions: dict,
    instances: list,
    full_dataset: list,
    run_id: str,
    timeout: int,
):
    """
    Run all instances for the given predictions on Modal.

    Instances whose log directory already exists are treated as previously
    run and skipped; the rest are dispatched in parallel via starmap.

    Args:
        predictions (dict): Predictions dict generated by the model
        instances (list): List of instances
        run_id (str): Run ID
        timeout (int): Timeout for running tests
    """
    specs = [make_test_spec(instance) for instance in instances]

    with modal.enable_output(), app.run():
        # An existing log dir means the instance was already evaluated.
        pending = [
            spec
            for spec in specs
            if not get_log_dir(
                predictions[spec.instance_id], run_id, spec.instance_id
            ).exists()
        ]

        if pending:
            # Fan out the remaining instances in parallel on Modal.
            call_args = [
                (spec, predictions[spec.instance_id], run_id, timeout)
                for spec in pending
            ]
            outputs = run_instance_modal.starmap(call_args)

            for output in outputs:
                output = cast(TestOutput, output)
                # Persisting output.run_instance_log / test_output /
                # patch_diff / report_json_str into output.log_dir is
                # currently disabled.

        # TODO: DO SOMETHING WITH OUTPUTS AND LOGS.
        # TODO: SAVE THINGS TO POSTGRESQL FOR DASHBOARD

codegen-examples/examples/swebench_agent_run/sandbox.py

Lines changed: 0 additions & 74 deletions
This file was deleted.

0 commit comments

Comments
 (0)