import asyncio
import json
import traceback
import uuid
from collections import defaultdict
from dataclasses import asdict
from datetime import datetime
from pathlib import Path

import click
import modal
from codegen.extensions.swebench.report import generate_report
from codegen.extensions.swebench.utils import (
    SWEBenchDataset,
    SweBenchExample,
    get_swe_bench_example,
    get_swe_bench_examples,
)

from .sandbox import SandboxManager
1121
# Directories that live next to this module: one for prediction output
# (run_eval creates a "results_<run_id>" folder under PREDS_DNAME) and one
# for logs.
PREDS_DNAME = Path(__file__).parent / "predictions"
LOG_DIR = Path(__file__).parent / "logs"
@@ -61,11 +71,26 @@ async def process_batch(examples, batch_size=10):
6171 print ("Traceback:" )
6272 print ("" .join (error_info ["traceback" ]))
6373
64- results .append ({"instance_id" : example .instance_id , "status" : "error" , "error_info" : error_info })
74+ results .append (
75+ {
76+ "instance_id" : example .instance_id ,
77+ "status" : "error" ,
78+ "error_info" : error_info ,
79+ }
80+ )
6581 else :
6682 if result is None :
6783 print (f"Warning: Null result for { example .instance_id } " )
68- results .append ({"instance_id" : example .instance_id , "status" : "error" , "error_info" : {"error_type" : "NullResult" , "error_message" : "Process returned None" }})
84+ results .append (
85+ {
86+ "instance_id" : example .instance_id ,
87+ "status" : "error" ,
88+ "error_info" : {
89+ "error_type" : "NullResult" ,
90+ "error_message" : "Process returned None" ,
91+ },
92+ }
93+ )
6994 else :
7095 results .append (result )
7196
@@ -81,14 +106,24 @@ async def process_batch(examples, batch_size=10):
81106 {
82107 "instance_id" : example .instance_id ,
83108 "status" : "error" ,
84- "error_info" : {"error_type" : type (e ).__name__ , "error_message" : str (e ), "traceback" : traceback .format_exc (), "batch_failure" : True },
109+ "error_info" : {
110+ "error_type" : type (e ).__name__ ,
111+ "error_message" : str (e ),
112+ "traceback" : traceback .format_exc (),
113+ "batch_failure" : True ,
114+ },
85115 }
86116 )
87117
88118 return results
89119
90120
91- async def run_eval (use_existing_preds : str | None , dataset : str , length : int , instance_id : str | None = None ):
121+ async def run_eval (
122+ use_existing_preds : str | None ,
123+ dataset : str ,
124+ length : int ,
125+ instance_id : str | None = None ,
126+ ):
92127 run_id = use_existing_preds or str (uuid .uuid4 ())
93128 predictions_dir = PREDS_DNAME / f"results_{ run_id } "
94129 dataset = SWEBenchDataset (dataset )
@@ -157,13 +192,58 @@ async def run_eval(use_existing_preds: str | None, dataset: str, length: int, in
157192 raise
158193
159194
# One semaphore per (repo, base_commit) key. asyncio.Semaphore() defaults to a
# value of 1, so examples that share a key run one at a time while examples
# with different keys may overlap.
SANDBOX_SEMAPHORES = defaultdict(asyncio.Semaphore)


async def run_example(sandbox_manager: "SandboxManager", example: "SweBenchExample"):
    """Run the agent for a single example inside a sandbox.

    Args:
        sandbox_manager: Object whose ``get_sandbox(example)`` returns an async
            context manager yielding a sandbox with an awaitable ``exec``.
        example: Dataclass instance describing the example; it must expose
            ``repo`` and ``base_commit`` and be JSON-serializable via
            ``dataclasses.asdict``.

    Returns:
        The object returned by ``sandbox.exec`` once it has exited with code 0.

    Raises:
        RuntimeError: If the sandboxed command exits with a non-zero code.
    """
    # Ship the example as a quoted JSON string and decode it with json.loads
    # inside the sandbox. Pasting raw JSON into Python source is not safe:
    # null/true/false are not Python names, and JSON surrogate-pair escapes
    # decode differently when read as a Python string literal.
    payload = json.dumps(asdict(example))
    script = (
        "import json; "
        "from codegen.extensions.swebench.harness import run_agent_from_serialized_entry; "
        f"run_agent_from_serialized_entry(json.loads({payload!r}))"
    )

    async with SANDBOX_SEMAPHORES[(example.repo, example.base_commit)]:
        async with sandbox_manager.get_sandbox(example) as sandbox:
            result = await sandbox.exec("python3", "-c", script)
            exit_code = await result.wait()
            if exit_code != 0:
                raise RuntimeError(f"Sandbox exited with non-zero exit code {exit_code}")
            return result
210+
211+
async def run_on_sandbox(use_existing_preds, dataset, length, instance_id):
    """Run the selected examples concurrently, each through ``run_example``.

    Args:
        use_existing_preds: Accepted so the signature lines up with the CLI
            options; not used by this function.
        dataset: Dataset value accepted by the ``SWEBenchDataset`` constructor.
        length: Number of examples to request when no ``instance_id`` is given.
        instance_id: If truthy, only this example is run and ``length`` is
            ignored.

    Returns:
        A list with one ``run_example`` result per example, in example order.
        The first exception raised by any example propagates to the caller.
    """
    dataset = SWEBenchDataset(dataset)
    if instance_id:
        examples = [get_swe_bench_example(instance_id, dataset=dataset)]
    else:
        examples = get_swe_bench_examples(dataset=dataset, length=length)

    sandbox_manager = SandboxManager()
    # TODO: remote execution should push results to the database. See: codegen-on-oss/outputs/sql_output.py
    return await asyncio.gather(*(run_example(sandbox_manager, example) for example in examples))
222+
223+
@click.command()
@click.option(
    "--use-existing-preds",
    help="The run ID of the existing predictions to use.",
    type=str,
    default=None,
)
@click.option(
    "--dataset",
    help="The dataset to use.",
    type=click.Choice([dataset.value for dataset in SWEBenchDataset]),
    default=SWEBenchDataset.LITE.value,
)
@click.option("--length", help="The number of examples to process.", type=int, default=10)
@click.option(
    "--instance-id",
    help="The instance ID of the example to process.",
    type=str,
    default=None,
)
def run_eval_command(use_existing_preds, dataset, length, instance_id):
    """CLI entry point: run the evaluation through the sandbox path.

    The click options are forwarded unchanged to ``run_on_sandbox``, which is
    driven to completion with ``asyncio.run``.
    """
    # Previous entry point, left disabled while the sandbox path is in use:
    # asyncio.run(run_eval(use_existing_preds, dataset, length, instance_id))
    asyncio.run(run_on_sandbox(use_existing_preds, dataset, length, instance_id))
167247
168248
169249if __name__ == "__main__" :
0 commit comments