135 changes: 135 additions & 0 deletions sandboxes/autoresearch-spark/Dockerfile
@@ -0,0 +1,135 @@
# syntax=docker/dockerfile:1

# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# autoresearch-spark sandbox image for OpenShell
#
# Autonomous LLM pretraining research on DGX Spark (GB10 / Blackwell).
# Uses CUDA 13.0 devel base for sm_121a ptxas support (Triton kernel
# compilation) and PyTorch cu128 wheels.
#
# Build: docker build -t autoresearch-spark .

# ---------------------------------------------------------------------------
# Stage 1: System base
# ---------------------------------------------------------------------------

# This image uses its own CUDA base (not the community base) because the
# GB10 / Blackwell GPU requires CUDA 13.0 devel for sm_121a ptxas support.
# The ARG is declared so the CI sandbox-build workflow can parse it, but
# it is intentionally unused — the FROM below pins the CUDA image directly.
ARG BASE_IMAGE=nvidia/cuda:13.0.0-cudnn-devel-ubuntu24.04
FROM nvidia/cuda:13.0.0-cudnn-devel-ubuntu24.04 AS system

ENV DEBIAN_FRONTEND=noninteractive \
PYTHONDONTWRITEBYTECODE=1 \
PYTHONUNBUFFERED=1

WORKDIR /sandbox

# Core system dependencies (mirrors sandboxes/base)
RUN apt-get update && apt-get install -y --no-install-recommends \
ca-certificates \
curl \
dnsutils \
iproute2 \
iptables \
iputils-ping \
net-tools \
netcat-openbsd \
openssh-sftp-server \
procps \
traceroute \
&& rm -rf /var/lib/apt/lists/*

# Create supervisor and sandbox users/groups (same as base)
RUN groupadd -r supervisor && useradd -r -g supervisor -s /usr/sbin/nologin supervisor && \
groupadd -r sandbox && useradd -r -g sandbox -d /sandbox -s /bin/bash sandbox

# ---------------------------------------------------------------------------
# Stage 2: Developer tools
# ---------------------------------------------------------------------------
FROM system AS devtools

# Node.js 22 + build toolchain
RUN curl -fsSL https://deb.nodesource.com/setup_22.x | bash - && \
apt-get install -y --no-install-recommends \
build-essential \
git \
nodejs=22.22.1-1nodesource1 \
vim-tiny \
nano \
&& rm -rf /var/lib/apt/lists/* \
&& npm install -g npm@11.11.0

# Global npm packages (same versions as base)
RUN npm install -g \
tar@7.5.11 \
@hono/node-server@1.19.11 \
opencode-ai@1.2.18 \
@openai/codex@0.111.0

# GitHub CLI
RUN curl -fsSL https://cli.github.com/packages/githubcli-archive-keyring.gpg \
-o /usr/share/keyrings/githubcli-archive-keyring.gpg && \
echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/githubcli-archive-keyring.gpg] https://cli.github.com/packages stable main" \
> /etc/apt/sources.list.d/github-cli.list && \
apt-get update && apt-get install -y --no-install-recommends gh && \
rm -rf /var/lib/apt/lists/*

# Claude CLI (via native installer)
RUN curl -fsSL https://claude.ai/install.sh | bash \
&& cp /root/.local/bin/claude /usr/local/bin/claude \
&& chmod 755 /usr/local/bin/claude

# uv (Python package/project manager)
COPY --from=ghcr.io/astral-sh/uv:0.10.8 /uv /usr/local/bin/uv
ENV UV_PYTHON_INSTALL_DIR="/sandbox/.uv/python"
RUN uv python install 3.13 && \
ln -s $(uv python find 3.13) /usr/local/bin/python3 && \
ln -s $(uv python find 3.13) /usr/local/bin/python && \
uv cache clean

# ---------------------------------------------------------------------------
# Stage 3: Final image with autoresearch
# ---------------------------------------------------------------------------
FROM devtools AS final

# GB10 / Blackwell (sm_121a) requires CUDA 13.0's ptxas for Triton kernel
# compilation. PyTorch cu128 wheels work fine on the CUDA 13.0 runtime
# (backward compatible), but Triton needs the newer ptxas to emit sm_121a
# instructions. Without this, Triton falls back to slow generic kernels.
ENV TRITON_PTXAS_PATH="/usr/local/cuda-13.0/bin/ptxas"

# Add venvs to PATH
ENV PATH="/sandbox/.venv/bin:/usr/local/bin:/usr/bin:/bin" \
VIRTUAL_ENV="/sandbox/.venv"

# Sandbox network / filesystem policy
COPY policy.yaml /etc/openshell/policy.yaml

# Create writable venv
RUN mkdir -p /sandbox/.claude && \
uv venv --python 3.13 --seed /sandbox/.venv && \
uv cache clean && \
chown -R sandbox:sandbox /sandbox/.venv

# Copy autoresearch code
COPY autoresearch/ /sandbox/autoresearch/

# Install autoresearch Python dependencies (PyTorch cu128, kernels, etc.)
RUN uv sync --project /sandbox/autoresearch && \
uv cache clean

# Shell init (same pattern as base)
RUN printf 'export PATH="/sandbox/.venv/bin:/usr/local/bin:/usr/bin:/bin"\nexport VIRTUAL_ENV="/sandbox/.venv"\nexport UV_PYTHON_INSTALL_DIR="/sandbox/.uv/python"\nexport TRITON_PTXAS_PATH="/usr/local/cuda-13.0/bin/ptxas"\nexport PS1="\\u@\\h:\\w\\$ "\n' \
> /sandbox/.bashrc && \
printf '[ -f ~/.bashrc ] && . ~/.bashrc\n' > /sandbox/.profile && \
chown sandbox:sandbox /sandbox/.bashrc /sandbox/.profile && \
chown -R sandbox:sandbox /sandbox

USER sandbox
WORKDIR /sandbox/autoresearch

ENTRYPOINT ["/bin/bash"]
47 changes: 47 additions & 0 deletions sandboxes/autoresearch-spark/README.md
@@ -0,0 +1,47 @@
# autoresearch-spark

Autonomous LLM pretraining research on DGX Spark (GB10 / Blackwell).

Based on [karpathy/autoresearch](https://github.com/karpathy/autoresearch),
adapted for the NVIDIA GB10 single-GPU platform.

## Quick start

```sh
openshell sandbox create \
--remote my-spark \
--gpu \
--provider claude \
--provider github \
--from autoresearch-spark \
-- claude
```

This launches an autoresearch sandbox on your DGX Spark with Claude as the
autonomous researcher. The agent sets up the environment, runs experiments,
and iterates on the model -- all while you sleep.

## Why CUDA 13.0 + cu128?

The DGX Spark has an NVIDIA GB10 GPU (Blackwell, compute capability 12.1 /
sm_121a). This creates a unique toolchain situation:

- **CUDA 13.0 devel base image**: provides `ptxas` with sm_121a support,
which Triton needs to compile optimized GPU kernels for the GB10.
- **PyTorch cu128 wheels**: the cu130 wheels are not yet functional on this
platform, but cu128 works correctly on the CUDA 13.0 runtime (backward
compatible).
- **`TRITON_PTXAS_PATH`** is set globally in the container so Triton finds
the CUDA 13.0 ptxas automatically.
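The override behaves like a simple lookup order: an explicit `TRITON_PTXAS_PATH` wins, otherwise Triton falls back to whatever `ptxas` it finds. The sketch below is illustrative logic only, not Triton's actual resolution code:

```python
import shutil

def resolve_ptxas(env):
    """Approximate the lookup order: an explicit TRITON_PTXAS_PATH
    override wins; otherwise fall back to whatever ptxas is on PATH.
    (Illustrative sketch, not Triton's actual resolution code.)"""
    override = env.get("TRITON_PTXAS_PATH")
    if override:
        return override
    return shutil.which("ptxas") or "ptxas"

# With the env baked into this container, the CUDA 13.0 ptxas is used:
print(resolve_ptxas({"TRITON_PTXAS_PATH": "/usr/local/cuda-13.0/bin/ptxas"}))
```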

## Platform notes

See [autoresearch/LEARNINGS.md](autoresearch/LEARNINGS.md) for GB10-specific
findings including hyperparameter tuning, MFU expectations, and best configs
found across ~135 experiments.

## Build

```sh
docker build -t autoresearch-spark .
```
23 changes: 23 additions & 0 deletions sandboxes/autoresearch-spark/autoresearch/.gitignore
@@ -0,0 +1,23 @@
# Python-generated files
__pycache__/
*.py[oc]
build/
dist/
wheels/
*.egg-info

# Virtual environments
.venv
worktrees/
results/
queue/

# Agent prompt files (generated per-session by launchers)
CLAUDE.md
AGENTS.md

# Experimental code/artifacts
dev/

# Results file
results.tsv
1 change: 1 addition & 0 deletions sandboxes/autoresearch-spark/autoresearch/.python-version
@@ -0,0 +1 @@
3.10
75 changes: 75 additions & 0 deletions sandboxes/autoresearch-spark/autoresearch/LEARNINGS.md
@@ -0,0 +1,75 @@
# Learnings

Platform-specific notes and experiment findings accumulated across autoresearch runs.

---

## Platform: NVIDIA GB10 (Blackwell, compute capability 12.1) — March 2026

### Environment quirks

- **`uv` is not in `$PATH`** — must invoke as `/sandbox/.local/bin/uv run train.py`
- `train.py` already sets `TRITON_PTXAS_PATH` to the system CUDA 13.0 `ptxas`, which helps Triton-compiled kernels target the GB10. PyTorch's own CUDA kernels are still limited to sm_120 and below.

### GPU compatibility issue

PyTorch (as of this run) ships optimized kernels for compute capability up to 12.0 (sm_120). The GB10 is 12.1, so PyTorch cannot use its optimized kernels (fast matmul, FlashAttention, etc.) and falls back to slow generic paths.

**Symptoms:**
- MFU ~0.86% (vs 35–45% on a well-supported GPU like H100)
- ~15 seconds per training step (vs ~0.3s on H100)
- ~31 optimizer steps in the 5-minute budget (vs ~950 on H100)
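The mismatch amounts to a simple version gate: a device whose compute capability exceeds the newest architecture the wheels were built for loses the fast paths. The sketch below is illustrative logic, not PyTorch's real dispatch code:

```python
def has_fast_kernels(device_cc, max_built_cc=(12, 0)):
    """True when prebuilt optimized kernels cover this device.
    max_built_cc=(12, 0) reflects the cu128 wheels used here; the GB10
    reports (12, 1), so it falls back to generic kernels.
    (Illustrative gate, not PyTorch's actual dispatch logic.)"""
    return device_cc <= max_built_cc

print(has_fast_kernels((12, 1)))  # GB10 (Blackwell)
print(has_fast_kernels((9, 0)))   # H100 (Hopper)
```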

### Key hyperparameter adaptation

The default `TOTAL_BATCH_SIZE = 2**19` (~524K tokens/step) was tuned for ~950 steps/run. With only 31 steps available, the model barely trains. The fix is to shrink the batch so you get far more gradient updates in the time budget:

| TOTAL_BATCH_SIZE | steps/run | val_bpb | notes |
|------------------|-----------|---------|-------|
| 2^19 (default) | ~31 | 1.8794 | |
| 2^17 | ~89 | 1.6715 | |
| 2^16 | ~164 | 1.4010 | |
| 2^15 | ~360 | 1.3107 | |
| **2^14** | **~689** | **1.3073** | sweet spot |
| 2^13 | ~1242 | 1.3301 | too noisy |

**Also reduce `DEVICE_BATCH_SIZE` proportionally** (e.g. 8 with `TOTAL_BATCH_SIZE=2^14`) to keep `grad_accum_steps=1` — gradient accumulation hurts quality on this platform.
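The trade-off follows from a simple latency model: each step pays a roughly fixed overhead plus a per-token cost, so shrinking the batch buys more steps with diminishing returns. The constants below are rough illustrative fits for intuition, not measurements:

```python
def steps_per_run(total_batch_size, budget_s=300.0,
                  overhead_s=0.09, s_per_token=1.8e-5):
    """Steps that fit in the time budget under a toy cost model:
    step_time = fixed overhead + tokens * per-token cost.
    Constants are rough illustrative fits, not measurements."""
    step_time = overhead_s + total_batch_size * s_per_token
    return int(budget_s / step_time)

for exp in (19, 17, 16, 15, 14, 13):
    print(2 ** exp, steps_per_run(2 ** exp))
```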

### Best config found (mar15 run, val_bpb 1.2265 from baseline 1.8794)

```python
ASPECT_RATIO = 96 # model_dim = depth * ASPECT_RATIO
HEAD_DIM = 96 # 4 attention heads
WINDOW_PATTERN = "L" # all full-context attention (no sliding window)
TOTAL_BATCH_SIZE = 2**14 # ~16K tokens/step
DEVICE_BATCH_SIZE = 8 # grad_accum_steps = 1
EMBEDDING_LR = 0.35
MATRIX_LR = 0.05
WEIGHT_DECAY = 0.0
ADAM_BETAS = (0.9, 0.99)
WARMDOWN_RATIO = 0.6
FINAL_LR_FRAC = 0.05
DEPTH = 4
# Muon: beta2=0.90, momentum warmup 0.85->0.95 over 5000 steps
```
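The Muon momentum warmup noted in the config comment can be sketched as a schedule function. A linear ramp is an assumption here; the run notes only give the endpoints and duration:

```python
def muon_momentum(step, start=0.85, end=0.95, warmup_steps=5000):
    """Linear momentum warmup start -> end over warmup_steps, then flat.
    The linear shape is an assumption; only the endpoints are documented."""
    frac = min(step / warmup_steps, 1.0)
    return start + frac * (end - start)
```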

### Other findings (mar15, ~135 experiments)

**Helped:**
- Full-context attention (`WINDOW_PATTERN="L"`) over sliding window (`"SSSL"`)
- Larger head dim (96–128) over smaller (32–64) at this model size
- Slow Muon momentum warmup (0.85→0.95 over 5000 steps)
- No weight decay (`WEIGHT_DECAY=0.0`)
- `ADAM_BETAS=(0.9, 0.99)` — higher beta2 for small batches
- `FINAL_LR_FRAC=0.05` — don't decay LR fully to zero
- Muon `beta2=0.90` (less smoothing of second moment)

**Did not help / hurt:**
- GQA (grouped-query attention) — speed gain not worth quality loss
- SwiGLU activation — worse than ReLU²
- Gradient clipping — overhead reduces step count
- Wider MLP (4x→8x expansion) — fewer steps, worse result
- Removing value embeddings — significantly worse
- Post-norm — much worse than pre-norm
- Parallel attention+MLP (GPT-J style) — much worse
- Label smoothing — breaks train/val metric alignment