wip: first pass at using sandboxes and snapshots

clee-codegen · clee-codegen · commit 8444ca67bbae · 2025-02-25T20:17:28.000-08:00
diff --git a/codegen-examples/examples/swebench_agent_run/entry_point.py b/codegen-examples/examples/swebench_agent_run/entry_point.py
@@ -1,6 +1,6 @@
-from codegen.extensions.swebench.utils import SweBenchExample
-from codegen.extensions.swebench.harness import run_agent_on_entry
 import modal
+from codegen.extensions.swebench.harness import run_agent_on_entry
+from codegen.extensions.swebench.utils import SweBenchExample
 
 image = (
     modal.Image.debian_slim(python_version="3.13")
diff --git a/codegen-examples/examples/swebench_agent_run/run_eval.py b/codegen-examples/examples/swebench_agent_run/run_eval.py
@@ -1,13 +1,23 @@
 import asyncio
 import json
 import traceback
-from pathlib import Path
 import uuid
-import modal
-import click
+from collections import defaultdict
+from dataclasses import asdict
 from datetime import datetime
-from codegen.extensions.swebench.utils import SWEBenchDataset, get_swe_bench_example, get_swe_bench_examples
+from pathlib import Path
+
+import click
+import modal
 from codegen.extensions.swebench.report import generate_report
+from codegen.extensions.swebench.utils import (
+    SWEBenchDataset,
+    SweBenchExample,
+    get_swe_bench_example,
+    get_swe_bench_examples,
+)
+
+from .sandbox import SandboxManager
 
 PREDS_DNAME = Path(__file__).parent / "predictions"
 LOG_DIR = Path(__file__).parent / "logs"
@@ -61,11 +71,26 @@ async def process_batch(examples, batch_size=10):
                     print("Traceback:")
                     print("".join(error_info["traceback"]))
 
-                    results.append({"instance_id": example.instance_id, "status": "error", "error_info": error_info})
+                    results.append(
+                        {
+                            "instance_id": example.instance_id,
+                            "status": "error",
+                            "error_info": error_info,
+                        }
+                    )
                 else:
                     if result is None:
                         print(f"Warning: Null result for {example.instance_id}")
-                        results.append({"instance_id": example.instance_id, "status": "error", "error_info": {"error_type": "NullResult", "error_message": "Process returned None"}})
+                        results.append(
+                            {
+                                "instance_id": example.instance_id,
+                                "status": "error",
+                                "error_info": {
+                                    "error_type": "NullResult",
+                                    "error_message": "Process returned None",
+                                },
+                            }
+                        )
                     else:
                         results.append(result)
 
@@ -81,14 +106,24 @@ async def process_batch(examples, batch_size=10):
                     {
                         "instance_id": example.instance_id,
                         "status": "error",
-                        "error_info": {"error_type": type(e).__name__, "error_message": str(e), "traceback": traceback.format_exc(), "batch_failure": True},
+                        "error_info": {
+                            "error_type": type(e).__name__,
+                            "error_message": str(e),
+                            "traceback": traceback.format_exc(),
+                            "batch_failure": True,
+                        },
                     }
                 )
 
     return results
 
 
-async def run_eval(use_existing_preds: str | None, dataset: str, length: int, instance_id: str | None = None):
+async def run_eval(
+    use_existing_preds: str | None,
+    dataset: str,
+    length: int,
+    instance_id: str | None = None,
+):
     run_id = use_existing_preds or str(uuid.uuid4())
     predictions_dir = PREDS_DNAME / f"results_{run_id}"
     dataset = SWEBenchDataset(dataset)
@@ -157,13 +192,58 @@ async def run_eval(use_existing_preds: str | None, dataset: str, length: int, in
         raise
 
 
+SANDBOX_SEMAPHORES = defaultdict(asyncio.Semaphore)  # one semaphore per (repo, base_commit) key; asyncio.Semaphore() starts at 1, so same-key runs are serialized
+
+
+async def run_example(sandbox_manager: SandboxManager, example: SweBenchExample):
+    """Run the agent on `example` inside its sandbox; raise if the process exits non-zero."""
+    # repr() the JSON so the sandboxed call receives a str literal (json.loads needs one), not a dict expression.
+    payload = repr(json.dumps(asdict(example)))
+    code = f"from codegen.extensions.swebench.harness import run_agent_from_serialized_entry; run_agent_from_serialized_entry({payload})"
+    async with SANDBOX_SEMAPHORES[(example.repo, example.base_commit)]:
+        async with sandbox_manager.get_sandbox(example) as sandbox:
+            result = await sandbox.exec("python3", "-c", code)
+            exit_code = await result.wait()
+            if exit_code != 0:
+                raise Exception(f"Sandbox exited with non-zero exit code {exit_code}")
+            return result
+
+
+async def run_on_sandbox(use_existing_preds, dataset, length, instance_id):
+    """Run the selected example(s) in sandboxes concurrently and return the gathered results; `use_existing_preds` is currently unused."""
+    dataset = SWEBenchDataset(dataset)
+    if instance_id:
+        examples = [get_swe_bench_example(instance_id, dataset=dataset)]
+    else:
+        examples = get_swe_bench_examples(dataset=dataset, length=length)
+    sandbox_manager = SandboxManager()
+    # TODO: remote execution should push results to the database. See: codegen-on-oss/outputs/sql_output.py
+    return await asyncio.gather(*(run_example(sandbox_manager, example) for example in examples))
+
+
 @click.command()
-@click.option("--use-existing-preds", help="The run ID of the existing predictions to use.", type=str, default=None)
-@click.option("--dataset", help="The dataset to use.", type=click.Choice([dataset.value for dataset in SWEBenchDataset]), default=SWEBenchDataset.LITE.value)
+@click.option(
+    "--use-existing-preds",
+    help="The run ID of the existing predictions to use.",
+    type=str,
+    default=None,
+)
+@click.option(
+    "--dataset",
+    help="The dataset to use.",
+    type=click.Choice([dataset.value for dataset in SWEBenchDataset]),
+    default=SWEBenchDataset.LITE.value,
+)
 @click.option("--length", help="The number of examples to process.", type=int, default=10)
-@click.option("--instance-id", help="The instance ID of the example to process.", type=str, default=None)
+@click.option(
+    "--instance-id",
+    help="The instance ID of the example to process.",
+    type=str,
+    default=None,
+)
 def run_eval_command(use_existing_preds, dataset, length, instance_id):
-    asyncio.run(run_eval(use_existing_preds, dataset, length, instance_id))
+    # WIP: old entry point, kept for reference: asyncio.run(run_eval(use_existing_preds, dataset, length, instance_id))
+    asyncio.run(run_on_sandbox(use_existing_preds, dataset, length, instance_id))
 
 
 if __name__ == "__main__":
diff --git a/codegen-examples/examples/swebench_agent_run/sandbox.py b/codegen-examples/examples/swebench_agent_run/sandbox.py
@@ -0,0 +1,90 @@
+import io
+import json
+from collections import defaultdict
+from contextlib import asynccontextmanager
+
+import modal
+from codegen.extensions.swebench.utils import SweBenchExample
+
+BASE_IMAGE: modal.Image = modal.Image.debian_slim(python_version="3.13").apt_install("git").pip_install("fastapi[standard]")
+# Snapshot ids are persisted as JSON ({repo: {base_commit: snapshot_id}}) in this file on the volume.
+SNAPSHOT_META_VOLUME = modal.Volume.from_name("swebench-agent-snapshot-volume", create_if_missing=True)
+SNAPSHOT_META_FILE_PATH: str = "/root/snapshot_meta.json"
+
+
+try:
+    # Use the named secret so every run shares the same credentials; otherwise fall back to a .env file.
+    secret = modal.Secret.from_name("swebench-agent-run-secrets")
+except modal.exception.NotFoundError:  # NOTE(review): from_name may resolve lazily and never raise here — confirm
+    secret = modal.Secret.from_dotenv()
+
+app = modal.App.lookup(name="swebench-agent-run", create_if_missing=True)
+
+
+class SandboxManager:
+    """Creates, reuses, snapshots and terminates Modal sandboxes tagged by repo and base commit."""
+
+    keep_alive: bool  # when True, get_sandbox leaves the sandbox running and takes no snapshot
+
+    def __init__(self, keep_alive: bool = False):
+        self.keep_alive = keep_alive
+
+    async def read_snapshot_meta(self) -> dict[str, dict[str, str]]:
+        """Load the {repo: {base_commit: snapshot_id}} map from the volume; empty if the file is missing."""
+        bytes_io = io.BytesIO()
+        try:
+            await SNAPSHOT_META_VOLUME.read_file_into_fileobj(SNAPSHOT_META_FILE_PATH, bytes_io)
+            snapshot_meta = json.loads(bytes_io.getvalue().decode("utf-8"))
+        except FileNotFoundError:
+            snapshot_meta = {}
+        return defaultdict(lambda: defaultdict(lambda: None), snapshot_meta)
+
+    async def update_snapshot_meta(self, example: SweBenchExample, snapshot_uid: str):
+        """Record `snapshot_uid` for the example's repo/commit and write the map back to the volume."""
+        snapshot_meta = await self.read_snapshot_meta()
+        snapshot_meta[example.repo][example.base_commit] = snapshot_uid
+        payload = io.BytesIO(json.dumps(snapshot_meta).encode("utf-8"))
+        async with SNAPSHOT_META_VOLUME.batch_upload() as upload:
+            await upload.put_file(payload, SNAPSHOT_META_FILE_PATH)
+        await SNAPSHOT_META_VOLUME.commit()
+
+    async def create_sandbox(self, example: SweBenchExample) -> modal.Sandbox:
+        """Restore the sandbox from a saved snapshot if one is recorded, else create a fresh one."""
+        snapshot_meta = await self.read_snapshot_meta()
+        # Only the outer mapping is a defaultdict; per-repo dicts loaded from JSON are plain dicts,
+        # so use .get() to avoid a KeyError for a known repo whose commit has no snapshot yet.
+        existing_snapshot_uid = snapshot_meta[example.repo].get(example.base_commit)
+        if existing_snapshot_uid:
+            return await modal.Sandbox._experimental_from_snapshot(existing_snapshot_uid)
+
+        # TODO: test if this get local version works / add ability to install specific version
+        with modal.enable_output():
+            image = BASE_IMAGE.add_local_python_source("codegen")
+            tags = {"repo": example.repo, "commit": example.base_commit}
+            return await modal.Sandbox.create(app=app, image=image, secrets=[secret], tags=tags)
+
+    @asynccontextmanager
+    async def get_sandbox(self, example: SweBenchExample):
+        """Yield a sandbox for `example`; on exit (unless keep_alive) snapshot it and terminate it."""
+        tags = {"repo": example.repo, "commit": example.base_commit}
+        async for sandbox in modal.Sandbox.list(app_id=app.app_id, tags=tags):
+            break  # reuse the first listed sandbox carrying the same tags
+        else:
+            sandbox = await self.create_sandbox(example)
+
+        try:
+            await sandbox.wait()  # NOTE(review): if wait() blocks until the sandbox exits, this stalls — confirm
+            yield sandbox
+        finally:
+            if not self.keep_alive:
+                # Killing sandbox, so take a snapshot and save it
+                try:
+                    # Stash repo changes and wait for the command to finish before snapshotting.
+                    stash = await sandbox.exec("bash", "-c", f"cd /root/tmp/{example.repo}; git stash")
+                    await stash.wait()
+                    snapshot = await sandbox._experimental_snapshot()  # commit any codegen updates
+                    await self.update_snapshot_meta(example, snapshot.object_id)
+                finally:
+                    # Codebase.from_repo doesn't use git to fetch/checkout the repo.
+                    # We could replace this with our own git commands to control the file state
+                    await sandbox.terminate()  # always release the sandbox, even if snapshotting failed
diff --git a/src/codegen/extensions/swebench/harness.py b/src/codegen/extensions/swebench/harness.py
@@ -48,7 +48,12 @@ def show_problems(dataset):
         print(f"{inst}: {problem}")
 
 
-def run_agent_on_entry(entry: SweBenchExample):
+def run_agent_from_serialized_entry(serialized_entry: str):
+    """Decode a JSON-encoded SweBenchExample and run the agent on it, returning the agent run's result."""
+    return run_agent_on_entry(SweBenchExample(**json.loads(serialized_entry)))
+
+
+def run_agent_on_entry(entry: SweBenchExample, tmp_dir="/root/tmp"):
     """Process one `entry` from SWE Bench using the LLM `models` at the
     given `temperature`.  Set `model_name_or_path` in the result json.
     """
@@ -63,7 +68,12 @@ def run_agent_on_entry(entry: SweBenchExample):
 
     gold_files = files_in_patch(entry.patch)
 
-    codebase = Codebase.from_repo(repo_full_name=entry.repo, commit=base_commit, language="python")  # check out the repo
+    codebase = Codebase.from_repo(
+        repo_full_name=entry.repo,
+        commit=base_commit,
+        language="python",
+        tmp_dir=tmp_dir,
+    )  # check out the repo
 
     agent = CodeAgent(codebase=codebase)