From b04185c95272c0b7e40abfc1f55a9c782024dcc3 Mon Sep 17 00:00:00 2001
From: Tony Narlock <tony@git-pull.com>
Date: Tue, 28 Apr 2026 18:24:49 -0500
Subject: [PATCH 1/2] chore(toolchain[python]) add Python 3.15.0a8 for Tachyon
 profiler access

why: Python 3.15 ships the new profiling.sampling stdlib module
("Tachyon"), a statistical sampling profiler that produces flamegraphs,
heatmaps, and Gecko-format call trees with zero target instrumentation.
It works on WSL where kernel perf is unavailable. We're using it to
profile libtmux's bench-engines path and identify lag sources outside
the engine layer (e.g. tmuxp's _wait_for_pane_ready loop).
what:
- .tool-versions: append 3.15.0a8 to the python line so `mise install`
  picks it up alongside the existing 3.10-3.14 matrix. The first entry
  (3.14) remains the default for project tooling; 3.15 is for ad-hoc
  profiling
---
 .tool-versions | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/.tool-versions b/.tool-versions
index 09479bd71..67a285e7f 100644
--- a/.tool-versions
+++ b/.tool-versions
@@ -1,3 +1,3 @@
 just 1.50
 uv 0.11.8
-python 3.14 3.13 3.12 3.11 3.10
+python 3.14 3.13 3.12 3.11 3.10 3.15.0a8

From b6b40f1adfee4f676dac5105e612ec9a9898d683 Mon Sep 17 00:00:00 2001
From: Tony Narlock <tony@git-pull.com>
Date: Tue, 28 Apr 2026 18:34:18 -0500
Subject: [PATCH 2/2] chore(skill[libtmux-profiler]) add Tachyon profiling
 skill at .claude/skills/
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

why: Profiling investigations against libtmux/tmuxp historically required
ad-hoc cProfile invocations and manual flamegraph wrangling. Python 3.15's
new stdlib `profiling.sampling` module (Tachyon) ships everything needed
in-tree — flamegraphs, heatmaps, line-level attribution, pstats — but the
"how to use it on this codebase" knowledge isn't discoverable from the
module help. This skill is the institutional capital that makes Tachyon
trivially repeatable on libtmux/tmuxp work.
what:
- SKILL.md with workflows for: pstats top-N (terminal-only summary),
  automated pstats diff (before/after with the bundled diff-pstats.py
  helper, replaces the broken --diff-flamegraph alpha), single-call
  microbench (per-call timing of one libtmux API), heatmap (line-level
  attribution), live TUI (top(1)-style real-time profiler), attach to a
  running pytest (diagnose hangs without restart), sampling-mode
  selection guide, output-reading cheat sheet
- scripts/setup-tachyon-venv.sh: idempotent bootstrap that creates
  `.venv-3.15` with libtmux installed editable + the testing dep group
  (skips dev because watchfiles has no Rust wheel for 3.15a8 yet)
- scripts/init-profile-session.sh: builds the per-session output
  directory at /tmp/py-profiling/<TS>/<project>/<branch>/<name>/ and
  writes a README.md with HEAD short-sha so the artifact set stays
  reconstructable after the branch moves
- scripts/bench-libtmux-call.py: tight-loop microbench against a real
  tmux server using a registry-of-callables design (BENCH_TARGETS dict)
  so the script can never execute caller-supplied Python expressions.
  Targets: has_session, list_sessions, list_windows, session_name,
  show_options, list_panes
- scripts/diff-pstats.py: structured pstats arithmetic — loads two
  .pstats files, computes per-function cumtime deltas, prints a
  markdown table sorted by abs(delta) for paste-into-PR/issue use.
  Replaces Python 3.15.0a8's broken `--diff-flamegraph` until the
  upstream UnboundLocalError is fixed
- reference/flamegraph-reading.md: visual-signature cheat sheet for
  reading plain flamegraphs (sleep loops, fixture setup, dispatch
  chains) and diff flamegraphs (when 3.15 fixes the alpha bug)
- .gitignore: override the global `.claude/` ignore so this skill
  ships with the repo
---
 .claude/skills/libtmux-profiler/SKILL.md      | 378 ++++++++++++++++++
 .../reference/flamegraph-reading.md           | 127 ++++++
 .../scripts/bench-libtmux-call.py             |  84 ++++
 .../libtmux-profiler/scripts/diff-pstats.py   |  91 +++++
 .../scripts/init-profile-session.sh           |  77 ++++
 .../scripts/setup-tachyon-venv.sh             |  71 ++++
 .gitignore                                    |   4 +
 7 files changed, 832 insertions(+)
 create mode 100644 .claude/skills/libtmux-profiler/SKILL.md
 create mode 100644 .claude/skills/libtmux-profiler/reference/flamegraph-reading.md
 create mode 100755 .claude/skills/libtmux-profiler/scripts/bench-libtmux-call.py
 create mode 100755 .claude/skills/libtmux-profiler/scripts/diff-pstats.py
 create mode 100755 .claude/skills/libtmux-profiler/scripts/init-profile-session.sh
 create mode 100755 .claude/skills/libtmux-profiler/scripts/setup-tachyon-venv.sh

diff --git a/.claude/skills/libtmux-profiler/SKILL.md b/.claude/skills/libtmux-profiler/SKILL.md
new file mode 100644
index 000000000..c6082cc6e
--- /dev/null
+++ b/.claude/skills/libtmux-profiler/SKILL.md
@@ -0,0 +1,378 @@
+---
+name: libtmux-profiler
+description: Use when the user wants to profile or benchmark libtmux/tmuxp performance — investigating "where is the lag", "why is the test suite slow", flamegraphs, heatmaps, hot-spot analysis, or pstats diffs. TRIGGER when phrases include "profile libtmux", "profile tmuxp", "tachyon", "flamegraph", "heatmap", "where is the bottleneck", "where is the lag", "benchmark libtmux", "benchmark tmuxp", "how fast is", "microbench", "single-call speed", "per-call timing", or "diff pstats". Uses Python 3.15's stdlib profiling.sampling module (Tachyon) plus a pstats-arithmetic diff helper for structured comparisons. SKIP for unrelated profiling questions.
+---
+
+# Profiling libtmux & tmuxp with Tachyon
+
+Tachyon is Python 3.15's `profiling.sampling` stdlib module: a statistical
+sampling profiler that reads stack frames externally with near-zero target
+overhead. It produces interactive flamegraphs, line-level heatmaps,
+Gecko-compatible call trees, and pstats output — no third-party tooling
+required.
+
+## Prerequisites
+
+1. **Python 3.15.0a8** must be installed via mise:
+   ```console
+   $ mise install python@3.15.0a8
+   ```
+   `~/work/python/libtmux/.tool-versions` declares it for this repo.
+
+2. **A `.venv-3.15` venv** in the target repo with libtmux/tmuxp installed
+   editable. If missing, run the bootstrap script:
+   ```console
+   $ bash ~/work/python/libtmux/.claude/skills/libtmux-profiler/scripts/setup-tachyon-venv.sh
+   ```
+   The script is idempotent — safe to re-run; it skips work when the venv
+   already exists.
+
+3. **Verify Tachyon is reachable:**
+   ```console
+   $ ./.venv-3.15/bin/python -m profiling.sampling --help
+   ```
+
+## Output directory convention
+
+Every profiling session goes into a structured path:
+
+```
+/tmp/py-profiling/<YYYY-MM-DD-HH-MM-SS>/<project>/<branch>/<session-name>/
+```
+
+This keeps artifacts organized across many sessions, surfaces the project
+and branch in the path, and timestamps each run for easy chronological
+sorting. The included `init-profile-session.sh` helper builds this path,
+creates a per-session `README.md` with metadata (timestamp, repo root,
+branch, HEAD short-sha, Python version), and prints the absolute path:
+
+```console
+$ PROFILE_DIR=$(bash ~/work/python/libtmux/.claude/skills/libtmux-profiler/scripts/init-profile-session.sh slow-suite)
+$ echo "$PROFILE_DIR"
+/tmp/py-profiling/2026-04-28-11-20-25/libtmux/main/slow-suite/
+```
+
+**Always `cd` into the target repo first** — the script reads
+`git rev-parse --show-toplevel` from `$PWD`, so the project name and branch
+are auto-detected.
+
+## Which recipe should I use?
+
+| if the question is... | use |
+|---|---|
+| "where is wall time spent in this test suite?" | **Workflow — pstats top-N** |
+| "structured comparison of two profiles (before/after a change)" | **Workflow — automated pstats diff** |
+| "which lines are hot in this function?" | **Workflow — heatmap** |
+| "test is hanging, what's it doing?" | **Workflow — live TUI / attach** |
+| "how fast is `server.X()`?" | **Recipe — single-call microbenchmark** |
+
+The Workflows are general-purpose; the single-call microbenchmark
+recipe is for measuring per-call cost of one libtmux API in isolation
+without any test framework overhead.
+
+## Workflow — pstats top-N for terminal-only analysis
+
+When a browser is unavailable or you want a shareable text summary:
+
+```console
+$ PROFILE_DIR=$(bash ~/work/python/libtmux/.claude/skills/libtmux-profiler/scripts/init-profile-session.sh pstats-ad-hoc)
+$ SHELL=/bin/sh ./.venv-3.15/bin/python \
+    -m profiling.sampling run --pstats -r 5khz \
+    -o "$PROFILE_DIR/suite.pstats" \
+    -m pytest tests/test_server.py --no-cov -q -p no:randomly
+$ ./.venv-3.15/bin/python -c "
+import pstats
+pstats.Stats('$PROFILE_DIR/suite.pstats').sort_stats('cumulative').print_stats(30)
+"
+```
+
+This is the single most useful recipe: it surfaces the dominant
+wall-time consumers (e.g. `subprocess.Popen.communicate`,
+`selectors.poll()`, fixture setup) in seconds without needing a
+browser. Use it as the first step before deciding whether deeper
+investigation (flamegraph, heatmap) is worth it.
+
+**Why `SHELL=/bin/sh`:** aligns the runtime env with the test config
+(`default-shell /bin/sh` set in libtmux's pytest plugin) so any
+`os.getenv("SHELL")`-reading code paths agree with what tmux actually
+spawns.
+
+**Why `-p no:randomly`:** ensures repeatable test order so before/after
+profiles compare apples to apples.
+
+## Workflow — automated pstats diff (structured before/after)
+
+For "did my optimization actually help?" or "what regressed since
+master?" comparisons. Use the bundled `diff-pstats.py` helper: it
+loads two `pstats` files, computes per-function `cumtime` deltas, and
+prints a markdown table sorted by `abs(Δ cumtime)` — the biggest
+movers first, regardless of direction.
+
+The recommended workflow is to capture two `.bin` binaries (Tachyon's
+fast capture format), replay each into a `.pstats`, and run
+`diff-pstats.py`:
+
+```console
+$ cd ~/work/python/libtmux
+$ PROFILE_DIR=$(bash ~/work/python/libtmux/.claude/skills/libtmux-profiler/scripts/init-profile-session.sh before-after)
+
+# Capture baseline (e.g. on master).
+$ git checkout master
+$ SHELL=/bin/sh ./.venv-3.15/bin/python \
+    -m profiling.sampling run --binary -r 5khz \
+    -o "$PROFILE_DIR/baseline.bin" \
+    -m pytest tests/test_server.py --no-cov -p no:randomly
+
+# Capture current (e.g. on your branch).
+$ git checkout your-branch
+$ SHELL=/bin/sh ./.venv-3.15/bin/python \
+    -m profiling.sampling run --binary -r 5khz \
+    -o "$PROFILE_DIR/current.bin" \
+    -m pytest tests/test_server.py --no-cov -p no:randomly
+
+# Replay each binary to pstats.
+$ for tag in baseline current; do
+    ./.venv-3.15/bin/python -m profiling.sampling replay \
+      "$PROFILE_DIR/$tag.bin" --pstats -o "$PROFILE_DIR/$tag.pstats"
+  done
+
+# Markdown diff table.
+$ ./.venv-3.15/bin/python \
+    ~/work/python/libtmux/.claude/skills/libtmux-profiler/scripts/diff-pstats.py \
+    "$PROFILE_DIR/baseline.pstats" "$PROFILE_DIR/current.pstats" \
+    --top 30
+```
+
+Output:
+
+```markdown
+# pstats diff (top 30 by |Δ cumtime|)
+
+- baseline: `/tmp/py-profiling/.../baseline.pstats`
+- current:  `/tmp/py-profiling/.../current.pstats`
+
+| function | baseline (s) | current (s) | Δ (s) | Δ% |
+|---|---:|---:|---:|---:|
+| `subprocess.py:1274(Popen.communicate)` | 5.350 | 4.100 | -1.250 | -23.4% |
+| `common.py:320(get_version)` | 1.234 | 0.012 | -1.222 | -99.0% |
+| ... |
+```
+
+The markdown is paste-ready for PRs, issues, or chat. **Functions
+missing from the baseline show "—" in Δ%** (nothing to divide by); ones
+that vanished show -100.0%. Sort order is by `abs(Δ)` so a -1.222s drop
+and a +0.317s increase both surface near the top.
+
+## Recipe — single-call microbenchmark
+
+For "how fast is `server.has_session()`?"-shaped questions. Skips the
+test framework entirely and runs a tight loop against a real tmux
+server using a pre-defined registry of bench targets.
+
+```console
+$ cd ~/work/python/libtmux
+$ PROFILE_DIR=$(bash ~/work/python/libtmux/.claude/skills/libtmux-profiler/scripts/init-profile-session.sh microbench-has-session)
+
+$ SHELL=/bin/sh BENCH_TARGET=has_session BENCH_ITERS=2000 \
+    ./.venv-3.15/bin/python \
+    -m profiling.sampling run --binary -r 10khz \
+    -o "$PROFILE_DIR/run.bin" \
+    ~/work/python/libtmux/.claude/skills/libtmux-profiler/scripts/bench-libtmux-call.py
+
+$ ./.venv-3.15/bin/python -m profiling.sampling replay \
+    "$PROFILE_DIR/run.bin" --pstats -o "$PROFILE_DIR/run.pstats"
+
+$ ./.venv-3.15/bin/python -c "
+import pstats
+pstats.Stats('$PROFILE_DIR/run.pstats').sort_stats('cumulative').print_stats(30)
+"
+```
+
+**Bench target registry** (in `scripts/bench-libtmux-call.py`):
+
+| `BENCH_TARGET=` | call |
+|---|---|
+| `has_session` (default) | `server.has_session("bench")` |
+| `list_sessions` | `server.sessions` |
+| `list_windows` | `session.windows` |
+| `session_name` | `session.session_name` |
+| `show_options` | `session.cmd("show-options", "-g")` |
+| `list_panes` | `session.active_window.panes` |
+
+To add a new target, edit the `BENCH_TARGETS` dict in the script —
+the dict is the entire allowlist, so the script can never execute
+caller-supplied Python expressions. The recipe samples faster
+(`-r 10khz`) because each call is sub-millisecond and the 1 kHz
+default would miss most of them.
+
+The default socket name embeds `os.getpid()` so back-to-back runs in
+a comparison loop don't trip on leftover sessions from a prior run
+whose `kill_server` didn't fully drain.
+
+## Workflow — heatmap for line-level hot spots
+
+Once you've identified a slow function, get exact line-level
+attribution. The heatmap renders one HTML page per source file with
+color intensity per line.
+
+```console
+$ PROFILE_DIR=$(bash ~/work/python/libtmux/.claude/skills/libtmux-profiler/scripts/init-profile-session.sh heatmap-server)
+$ ./.venv-3.15/bin/python -m profiling.sampling run \
+    --heatmap -o "$PROFILE_DIR/heatmap" \
+    -m pytest tests/test_server.py --no-cov -p no:randomly
+```
+
+Open `$PROFILE_DIR/heatmap/index.html` — pick the file you care about.
+Add `--opcodes` to see bytecode-level intensity inside hot lines (when
+the bottleneck is a Python interpreter detail, not a library call).
+
+## Workflow — live TUI for ad-hoc investigation
+
+`--live` runs a top(1)-style real-time profiler. Useful when a test is
+behaving strangely and you want to see the hotspot evolve. Note: live
+mode does not write artifacts to disk, so a `PROFILE_DIR` isn't needed.
+
+```console
+$ ./.venv-3.15/bin/python -m profiling.sampling run --live \
+    -m pytest tests/test_server.py::test_no_server_is_alive -v
+```
+
+Key shortcuts in the TUI:
+- `q` — quit
+- `s` / `S` — cycle sort order forward/back
+- `p` — pause display (sampling continues)
+- `t` — toggle per-thread view
+- `/` — filter functions by substring
+- `+` / `-` — adjust refresh rate (0.05–1.0s)
+
+## Workflow — attach to a running pytest
+
+When a test is hanging and you want to diagnose without restarting it:
+
+```console
+$ ./.venv-3.15/bin/python -m pytest tests/test_server.py::test_some_hang -v &
+$ # note the PID printed by `&`, or get it from `ps`
+$ ./.venv-3.15/bin/python -m profiling.sampling attach --live <PID>
+```
+
+For a recorded capture instead of live:
+
+```console
+$ PROFILE_DIR=$(bash ~/work/python/libtmux/.claude/skills/libtmux-profiler/scripts/init-profile-session.sh attach-debug)
+$ ./.venv-3.15/bin/python -m profiling.sampling attach \
+    --binary -d 30 -o "$PROFILE_DIR/attached.bin" <PID>
+```
+
+If the target uses `pytest-asyncio`, add `--async-aware` so Tachyon
+reconstructs task-level stacks instead of event-loop internals.
+
+## Sampling mode selection — `--mode {wall,cpu,gil,exception}`
+
+| mode | measures | use when |
+|---|---|---|
+| `wall` (default) | all elapsed time, including I/O / sleep / lock waits | general "where is wall time spent" |
+| `cpu` | CPU-active time only | filter out I/O, see compute-bound hotspots |
+| `gil` | time holding the Python GIL | multi-threaded contention diagnosis |
+| `exception` | time inside `except`/`finally` after a `raise` | exception-driven control flow audits |
+
+For libtmux/tmuxp profiling, `wall` is right ~95% of the time. The
+shell-startup wait inside `subprocess.Popen.communicate` and any
+explicit poll loops are *both* I/O-bound and will be invisible under
+`cpu` mode.
+
+## Reading the output
+
+**Plain flamegraph**: width = sample count = time. Wide flat plateaus
+mean a single function dominates (sleep loops, `selectors.poll`,
+fixture setup). Tall narrow spikes mean deep call stacks (often
+recursion or framework dispatch chains like pluggy).
+
+**Heatmap**: brighter = more samples on that line. Source-line
+attribution is exact; perfect for "I know `fetch_objs` is slow but
+which line?" investigations.
+
+See `reference/flamegraph-reading.md` for a fuller cheat sheet on
+visual signatures.
+
+## Known issue — `--diff-flamegraph` is broken in 3.15.0a8
+
+The "natural" one-shot diff workflow (`run --diff-flamegraph baseline.bin`)
+crashes in Python 3.15.0a8:
+
+```
+File "<...>/python3.15/profiling/sampling/stack_collector.py", line 689,
+    in _add_elided_metadata
+    if baseline_self > 0:
+       ^^^^^^^^^^^^^
+UnboundLocalError: cannot access local variable 'baseline_self' where it
+    is not associated with a value
+```
+
+The samples are captured but the diff HTML never gets written. **The
+*automated pstats diff* workflow above sidesteps this** with the
+bundled `diff-pstats.py` helper that operates on `.pstats` files
+instead of `.bin` files. When 3.15.x ships the fix, the one-shot
+workflow becomes:
+
+```console
+$ ... run --diff-flamegraph "$PROFILE_DIR/baseline.bin" \
+    -o "$PROFILE_DIR/diff.html" -m pytest ...   # not yet usable
+```
+
+Color legend (when fixed): red=regression, blue=improvement,
+gray=no change, purple=new code path.
+
+## Pitfalls
+
+- **Short scripts (<1s)** don't collect enough samples for reliable
+  results. Either loop the target or switch to `profiling.tracing`,
+  the deterministic (tracing) profiler that also ships with 3.15.
+- **Subprocess children are not profiled by default.** libtmux shells
+  out to `tmux` via `subprocess.Popen`; the `tmux` child does its own
+  work that Tachyon can't see — what you'll see in the flamegraph is
+  the parent Python time waiting in `Popen.communicate`. To profile
+  children too, add `--subprocesses` (incompatible with `--live`).
+- **Sampling rate trade-off.** Default 1 kHz is balanced. For a 30s test
+  run, `5khz` (used in the recipes above) gives ~150K samples — plenty
+  of resolution. `20khz` adds profiler overhead without meaningful
+  detail gain at this duration.
+- **Statistical noise.** Numbers vary 1-2% between runs. Don't chase
+  small deltas; focus on patterns (which functions dominate? which
+  call paths shifted in the diff?).
+- **The `.venv-3.15` venv is separate** from the project's main
+  `.venv`. Don't try to use `uv run` with the 3.15 venv unless you've
+  set `VIRTUAL_ENV` explicitly — `uv run` uses the project's pinned
+  Python (3.14). Always invoke `./.venv-3.15/bin/python ...` directly.
+- **`watchfiles` won't build for 3.15a8** (Rust compatibility). The
+  bootstrap script installs `--group testing` instead of `dev` to skip
+  the docs deps that pull in `sphinx-autobuild` → `watchfiles`.
+- **`--diff-flamegraph` UnboundLocalError in 3.15.0a8** — see *Known
+  issue* above. Use the `diff-pstats.py` helper until fixed.
+
+## Output directory layout
+
+Every session lives under:
+
+```
+/tmp/py-profiling/
+└── <YYYY-MM-DD-HH-MM-SS>/         e.g. 2026-04-28-11-20-25
+    └── <project>/                 e.g. libtmux
+        └── <branch>/              e.g. main  (slashes → underscores)
+            └── <session-name>/    e.g. before-after
+                ├── README.md      session metadata + HEAD sha
+                ├── baseline.bin
+                ├── baseline.pstats
+                ├── current.bin
+                ├── current.pstats
+                └── heatmap/       (when --heatmap is used)
+                    ├── index.html
+                    └── *.html
+```
+
+The README.md is auto-generated by `init-profile-session.sh` and
+captures the git HEAD short-sha, so you can reconstruct the exact code
+state even after the branch has moved or been deleted.
+
+Clean up `/tmp/py-profiling/` periodically — pstats files are ~350 KB
+each, HTML flamegraphs are 800 KB to a few MB, heatmap directories can
+hit 10+ MB.
diff --git a/.claude/skills/libtmux-profiler/reference/flamegraph-reading.md b/.claude/skills/libtmux-profiler/reference/flamegraph-reading.md
new file mode 100644
index 000000000..31aa7f4d2
--- /dev/null
+++ b/.claude/skills/libtmux-profiler/reference/flamegraph-reading.md
@@ -0,0 +1,127 @@
+# Flamegraph & heatmap reading cheat sheet
+
+A quick reference for interpreting Tachyon output. Read once, refer
+back when staring at a fresh diff.
+
+## Diff flamegraph color legend
+
+Tachyon's `--diff-flamegraph` colors each frame by its delta vs.
+the baseline binary file. Frame width represents the **current** run's
+sample count (i.e. time spent now); the **color** tells you what
+changed.
+
+| color  | meaning                                                    |
+|--------|------------------------------------------------------------|
+| red    | regression — current took longer than baseline              |
+| blue   | improvement — current took less than baseline               |
+| gray   | minimal change — within sampling noise                      |
+| purple | new code path — function appears only in current run        |
+
+Darker red/blue = larger absolute delta. Hover any frame to see
+`baseline_time / current_time / Δ%`.
+
+If your optimization eliminates whole call chains, look for an
+"elided" toggle — switching to elided view shows what disappeared
+(the inverse of "purple"). Useful when verifying that a refactor
+actually removed work rather than just shifted it.
+
+## Frame width semantics
+
+Width is sample count, which proxies time. Two frames at the same
+width contributed equally to wall time at the chosen sampling rate.
+
+A function's own width includes time spent in its callees. To see a
+function's *exclusive* time, look at the difference between its width
+and the sum of its children's widths — narrow leaves under a wide
+parent mean the parent is doing real work itself, not just dispatching.
+
+## Three common visual signatures
+
+### "Wide flat plateau"
+
+A single function spans most of the width with few or no children.
+
+```
+[          some_function          ]
+[                ...               ]
+```
+
+→ The function itself is the bottleneck, or it's blocking on a
+syscall. Common examples:
+- `time.sleep` in a polling loop
+- `selectors.select` waiting for I/O
+- `os.read` on a pipe
+- Whole-function bytecode in a tight Python loop
+
+### "Tall narrow spike"
+
+A deep call stack stacks vertically with each frame ~as wide as its
+parent.
+
+```
+[              caller              ]
+   [           callee              ]
+      [        callee2             ]
+         [     callee3             ]
+            [    callee4           ]
+               [   recursing_fn    ]
+                  [recursing_fn]
+                  ...
+```
+
+→ Often deep dispatch chains (pluggy hook flow, decorator stacks) or
+recursion. The dispatch case is harmless overhead; recursion may
+indicate an algorithmic issue.
+
+### "Many short equal-width bars"
+
+A row of frames each ~the same narrow width, with similar callers
+above.
+
+```
+[           outer_loop            ]
+[a][b][c][d][e][f][g][h][i][j][k]...
+```
+
+→ A loop body, each call costing ~the same. Optimization target: the
+loop itself, or replace it with a vectorized/batched alternative.
+
+## Heatmap intensity
+
+Heatmaps show sample counts per source line, color-graded:
+- bright red / orange = many samples on that line
+- yellow / green = moderate
+- cool / gray = few or none
+
+Source lines with no samples (dispatched but instant) appear cool.
+Don't conflate "executed" with "expensive" — a one-line `return`
+might run 10000× without ever showing up because each execution is
+sub-microsecond.
+
+With `--opcodes` enabled, hot lines expand to show per-bytecode-op
+intensity. Useful for diagnosing surprising tight-loop costs (e.g.,
+"why is this attribute access showing up?" → `LOAD_ATTR` repeated
+across the loop).
+
+## Quick pattern-matching to root cause
+
+| visual signature | likely cause |
+|---|---|
+| wide plateau on `time.sleep` | speculative sleep, replace with `retry_until` |
+| wide plateau on `selectors.select` | blocking I/O wait — look one frame up to find the caller |
+| wide plateau on `subprocess.Popen.communicate` | child-process wait — engine is waiting for tmux, ssh, etc. |
+| wide plateau on `os.fork` / `os.posix_spawn` | per-call process spawn overhead |
+| tall spike through `pluggy._hooks.HookCaller.__call__` | normal pytest dispatch — usually not a bug |
+| tall spike through `_pytest.fixtures.SubRequest.getfixturevalue` | fixture chain — heavy when many autouse fixtures |
+| many narrow bars under `for ... in items` | loop body, optimize per-iteration cost |
+| purple frames in diff against baseline | new code paths — verify they're intentional |
+
+## When the signature is misleading
+
+Sampling profilers undersample very-short-lived calls. A function
+that runs 100,000 times for 5 µs each (500 ms total) may show
+*fewer* samples than a function that runs once for 100 ms — because
+the 5 µs slot rarely overlaps with a sampling tick. If the
+flamegraph and your wall-time intuition disagree, double-check with
+`profiling.tracing` (deterministic) or inline `time.perf_counter()`
+brackets around the suspicious code.
diff --git a/.claude/skills/libtmux-profiler/scripts/bench-libtmux-call.py b/.claude/skills/libtmux-profiler/scripts/bench-libtmux-call.py
new file mode 100755
index 000000000..0efbafa30
--- /dev/null
+++ b/.claude/skills/libtmux-profiler/scripts/bench-libtmux-call.py
@@ -0,0 +1,84 @@
+#!/usr/bin/env python
+"""Tight-loop benchmark for a single libtmux call.
+
+Configurable via env vars:
+
+  BENCH_TARGET    name from BENCH_TARGETS registry (default: has_session)
+  BENCH_ITERS     int (default: 1000)
+  BENCH_SOCKET    socket name (default: ``microbench-<pid>`` for isolation)
+  BENCH_SESSION   session name created for the run (default: bench)
+
+The default socket name embeds the process PID so back-to-back
+invocations (e.g. before/after a change in a comparison loop) get
+isolated tmux servers and don't trip on leftover sessions from a prior
+run whose ``kill_server`` didn't fully drain.
+
+To add a new bench target: edit ``BENCH_TARGETS`` below — keeps the
+script safe by construction (no dynamic code execution from caller
+input). The dict is intentionally short and edited in source rather
+than configured at runtime.
+"""
+
+from __future__ import annotations
+
+import contextlib
+import os
+import sys
+import typing as t
+
+import libtmux
+
+if t.TYPE_CHECKING:
+    from collections.abc import Callable
+
+    from libtmux.server import Server
+    from libtmux.session import Session
+
+
+# Registry of profile-able single calls. Each takes (server, session)
+# and returns whatever the call returns. Add entries here — never accept
+# caller-supplied expressions.
+BENCH_TARGETS: dict[str, Callable[[Server, Session], object]] = {
+    "has_session": lambda server, session: server.has_session(
+        os.environ.get("BENCH_SESSION", "bench"),
+    ),
+    "list_sessions": lambda server, session: server.sessions,
+    "list_windows": lambda server, session: session.windows,
+    "session_name": lambda server, session: session.session_name,
+    "show_options": lambda server, session: session.cmd("show-options", "-g"),
+    "list_panes": lambda server, session: session.active_window.panes,
+}
+
+
+def main() -> int:
+    """Run a fixed-iteration loop of the selected libtmux call."""
+    target_name = os.environ.get("BENCH_TARGET", "has_session")
+    if target_name not in BENCH_TARGETS:
+        valid = ", ".join(sorted(BENCH_TARGETS))
+        sys.stderr.write(
+            f"unknown BENCH_TARGET={target_name!r}; valid: {valid}\n",
+        )
+        return 2
+
+    iters = int(os.environ.get("BENCH_ITERS", "1000"))
+    if iters <= 0:
+        sys.stderr.write(f"BENCH_ITERS must be > 0 (got {iters})\n")
+        return 2
+
+    target = BENCH_TARGETS[target_name]
+    socket_name = os.environ.get("BENCH_SOCKET", f"microbench-{os.getpid()}")
+    session_name = os.environ.get("BENCH_SESSION", "bench")
+
+    server: Server = libtmux.Server(socket_name=socket_name)
+    try:
+        session = server.new_session(session_name=session_name)
+        for _ in range(iters):
+            target(server, session)
+    finally:
+        with contextlib.suppress(Exception):
+            server.kill_server()
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/.claude/skills/libtmux-profiler/scripts/diff-pstats.py b/.claude/skills/libtmux-profiler/scripts/diff-pstats.py
new file mode 100755
index 000000000..3ea1db243
--- /dev/null
+++ b/.claude/skills/libtmux-profiler/scripts/diff-pstats.py
@@ -0,0 +1,91 @@
+#!/usr/bin/env python
+"""Compare two pstats files; print top-N functions sorted by Δ cumtime.
+
+Replaces the broken ``--diff-flamegraph`` workflow in Python 3.15.0a8
+(see SKILL.md "Known issue" section). Output is markdown so it can be
+pasted directly into PRs, issues, or chat.
+
+Usage:
+    python diff-pstats.py BASELINE.pstats CURRENT.pstats [--top 30]
+
+Output format:
+    | function | baseline (s) | current (s) | Δ (s) | Δ% |
+    |---|---:|---:|---:|---:|
+    | `path/to/file.py:NNN(funcname)` | 1.234 | 2.345 | +1.111 | +90.0% |
+    ...
+
+Sorted by absolute Δ cumtime descending — the biggest movers first,
+regardless of direction. Functions with no baseline time show "—" in
+the Δ% column; functions missing from the current profile show -100.0%.
+"""
+
+from __future__ import annotations
+
+import argparse
+import pathlib
+import pstats
+import sys
+
+
+def collect_cumtime(path: str) -> dict[str, float]:
+    """Return a ``{function_label: cumulative_time}`` mapping for one pstats file.
+
+    The ``stats`` attribute on ``pstats.Stats`` is a private dict keyed
+    by ``(filename, lineno, name)`` 3-tuples. We flatten the key to a
+    readable label suitable for printing in a markdown table.
+    """
+    stats = pstats.Stats(path)
+    out: dict[str, float] = {}
+    for func, (_cc, _nc, _tt, ct, _callers) in stats.stats.items():
+        filename, lineno, name = func
+        label = f"{filename}:{lineno}({name})"
+        out[label] = ct
+    return out
+
+
+def main() -> int:
+    """Parse args, build the diff table, print to stdout."""
+    p = argparse.ArgumentParser(description=__doc__)
+    p.add_argument("baseline", help="path to baseline .pstats file")
+    p.add_argument("current", help="path to current .pstats file")
+    p.add_argument(
+        "--top",
+        type=int,
+        default=30,
+        help="max rows to print (default: 30)",
+    )
+    args = p.parse_args()
+
+    for path_str, label in ((args.baseline, "baseline"), (args.current, "current")):
+        if not pathlib.Path(path_str).is_file():
+            sys.stderr.write(f"error: {label} file not found: {path_str}\n")
+            return 2
+
+    base = collect_cumtime(args.baseline)
+    cur = collect_cumtime(args.current)
+
+    keys = set(base) | set(cur)
+    rows: list[tuple[str, float, float, float, float]] = []
+    for k in keys:
+        b, c = base.get(k, 0.0), cur.get(k, 0.0)
+        delta = c - b
+        pct = (delta / b * 100) if b > 0 else float("inf")
+        rows.append((k, b, c, delta, pct))
+
+    rows.sort(key=lambda r: abs(r[3]), reverse=True)
+
+    print(f"# pstats diff (top {args.top} by |Δ cumtime|)")
+    print()
+    print(f"- baseline: `{args.baseline}`")
+    print(f"- current:  `{args.current}`")
+    print()
+    print("| function | baseline (s) | current (s) | Δ (s) | Δ% |")
+    print("|---|---:|---:|---:|---:|")
+    for k, b, c, d, pct in rows[: args.top]:
+        pct_str = "—" if pct == float("inf") else f"{pct:+.1f}%"
+        print(f"| `{k}` | {b:.3f} | {c:.3f} | {d:+.3f} | {pct_str} |")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/.claude/skills/libtmux-profiler/scripts/init-profile-session.sh b/.claude/skills/libtmux-profiler/scripts/init-profile-session.sh
new file mode 100755
index 000000000..7eccafd31
--- /dev/null
+++ b/.claude/skills/libtmux-profiler/scripts/init-profile-session.sh
@@ -0,0 +1,77 @@
+#!/usr/bin/env bash
+# Bootstrap a per-session profile output directory.
+#
+# Usage:
+#   PROFILE_DIR=$(bash ~/work/python/libtmux/.claude/skills/libtmux-profiler/scripts/init-profile-session.sh <session_name>)
+#
+# Creates: /tmp/py-profiling/<YYYY-MM-DD-HH-MM-SS>/<project>/<branch>/<session_name>/
+#   ├── README.md       (auto-generated session metadata)
+#   └── (artifacts go here — .bin, .html, .pstats, etc.)
+#
+# Prints the absolute path to stdout (so the caller can capture it via
+# command substitution). All other output goes to stderr.
+
+set -euo pipefail
+
+session_name="${1:-}"
+if [ -z "${session_name}" ]; then
+    echo "ERROR: session_name required" >&2
+    echo "  usage: $(basename "$0") <session_name>" >&2
+    echo "  example: $(basename "$0") server-cmd-baseline" >&2
+    exit 1
+fi
+
+# Sanitize session name (replace path-unsafe chars).
+session_name="${session_name//\//_}"
+session_name="${session_name// /_}"
+
+# Detect git repo from $PWD. `symbolic-ref` and `rev-parse --verify` print
+# nothing when they fail, so each `||` fallback is the complete value.
+if root="$(git rev-parse --show-toplevel 2>/dev/null)"; then
+    project="$(basename "${root}")"
+    branch="$(git -C "${root}" symbolic-ref --quiet --short HEAD 2>/dev/null || echo HEAD)"
+    branch="${branch//\//_}"  # sanitize / in branch names like 'feature/foo'
+    head_sha="$(git -C "${root}" rev-parse --short --verify --quiet HEAD 2>/dev/null || echo unknown)"
+else
+    root="${PWD}"
+    project="non-git-$(basename "${PWD}")"
+    branch="non-git"
+    head_sha="none"
+fi
+
+ts="$(date +%Y-%m-%d-%H-%M-%S)"
+dir="/tmp/py-profiling/${ts}/${project}/${branch}/${session_name}"
+mkdir -p "${dir}"
+
+# Per-session README; python's stderr is dropped so a miss reads "unknown".
+cat > "${dir}/README.md" <<EOF
+# Profile session: \`${session_name}\`
+
+| field | value |
+|---|---|
+| timestamp | \`${ts}\` |
+| project | \`${project}\` |
+| repo root | \`${root}\` |
+| branch | \`${branch}\` |
+| HEAD | \`${head_sha}\` |
+| python | \`$(python --version 2>/dev/null || echo unknown)\` |
+| invoked from | \`${PWD}\` |
+| user shell | \`${SHELL:-unset}\` |
+
+## Files in this session
+
+(populated as artifacts are written)
+
+## How to read
+
+- \`*.bin\` — Tachyon binary captures, replay-able via
+  \`python -m profiling.sampling replay <file>.bin --flamegraph -o out.html\`
+- \`*.html\` — interactive flamegraphs (open in browser; Ctrl+F to search)
+- \`*.pstats\` — Python pstats binaries:
+  \`python -c "import pstats; pstats.Stats('<file>.pstats').sort_stats('cumulative').print_stats(30)"\`
+- \`heatmap-*/\` — per-source-file HTML heatmaps (start at \`index.html\`)
+
+EOF
+
+echo "==> profile session ready: ${dir}" >&2
+echo "${dir}"
diff --git a/.claude/skills/libtmux-profiler/scripts/setup-tachyon-venv.sh b/.claude/skills/libtmux-profiler/scripts/setup-tachyon-venv.sh
new file mode 100755
index 000000000..df40916dc
--- /dev/null
+++ b/.claude/skills/libtmux-profiler/scripts/setup-tachyon-venv.sh
@@ -0,0 +1,71 @@
+#!/usr/bin/env bash
+# Bootstrap a Python 3.15-backed venv for Tachyon profiling.
+#
+# Idempotent: safe to re-run. Skips work when .venv-3.15 already exists
+# and can import Tachyon; a venv that cannot is deleted and rebuilt.
+#
+# Usage:
+#   bash ~/work/python/libtmux/.claude/skills/libtmux-profiler/scripts/setup-tachyon-venv.sh
+#
+# Run from anywhere inside a libtmux or tmuxp checkout: the script walks
+# up from $PWD to the nearest directory containing pyproject.toml.
+
+set -euo pipefail
+
+# Resolve via mise (its data dir is configurable); else the old fixed path.
+PYTHON_315="$(mise where python@3.15.0a8 2>/dev/null || echo "${HOME}/.config/mise/installs/python/3.15.0a8")/bin/python3.15"
+if [ ! -x "${PYTHON_315}" ]; then
+    echo "ERROR: Python 3.15.0a8 not installed via mise." >&2
+    echo "  fix: cd into a repo with .tool-versions declaring 3.15.0a8 and run \`mise install\`" >&2
+    echo "  or:  mise install python@3.15.0a8" >&2
+    exit 1
+fi
+
+# Find the repo root from $PWD by walking up looking for pyproject.toml.
+repo_root="${PWD}"
+while [ "${repo_root}" != "/" ] && [ ! -f "${repo_root}/pyproject.toml" ]; do
+    repo_root="$(dirname "${repo_root}")"
+done
+
+if [ ! -f "${repo_root}/pyproject.toml" ]; then  # walk reached / with no match
+    echo "ERROR: no pyproject.toml found in ${PWD} or any parent." >&2
+    echo "  fix: cd into a libtmux or tmuxp repo first." >&2
+    exit 1
+fi
+
+cd "${repo_root}"
+echo "==> repo: ${repo_root}"
+
+venv_dir=".venv-3.15"
+venv_python="${venv_dir}/bin/python"
+
+# A venv whose interpreter can already import Tachyon needs no work.
+if [[ -d "${venv_dir}" ]] && "${venv_python}" -c "import profiling.sampling" 2>/dev/null; then
+    printf '%s\n' "==> ${venv_dir} already exists with Tachyon importable — skipping setup."
+    printf '%s\n' "==> verify: ${venv_python} -m profiling.sampling --help"
+    exit 0
+fi
+# Any directory still present at this point failed the import check.
+if [[ -d "${venv_dir}" ]]; then
+    printf '%s\n' "==> ${venv_dir} exists but Tachyon import failed; rebuilding."
+    rm -rf "${venv_dir}"
+fi
+
+printf '%s\n' "==> creating ${venv_dir} with Python 3.15"
+uv venv --python "${PYTHON_315}" "${venv_dir}" >/dev/null
+
+# Only the testing group is installed: `dev` pulls in sphinx-autobuild →
+# watchfiles, which has no Rust wheel for 3.15a8 yet and fails to build.
+printf '%s\n' "==> installing project (editable) + testing group"
+VIRTUAL_ENV="${repo_root}/${venv_dir}" uv pip install --quiet --editable . --group testing
+
+# The repo's pytest addopts pass `--no-cov`, an option that exists only with
+# pytest-cov installed; the `coverage` group would add nothing else useful.
+printf '%s\n' "==> installing pytest-cov (required by repo's pytest addopts)"
+VIRTUAL_ENV="${repo_root}/${venv_dir}" uv pip install --quiet pytest-cov
+
+printf '%s\n' "==> verifying Tachyon import"
+"${venv_python}" -c "import profiling.sampling; print('  Tachyon at:', profiling.sampling.__file__)"
+
+printf '\n%s\n' "==> done. Try:"
+printf '%s\n' "    ${venv_python} -m profiling.sampling run --help"
diff --git a/.gitignore b/.gitignore
index 25123a749..789292196 100644
--- a/.gitignore
+++ b/.gitignore
@@ -83,6 +83,10 @@ docs/_static/fonts/
 docs/_static/css/fonts.css
 
 # Claude code
+# Override the global ~/.gitignore_global `.claude/` exclusion so this
+# repo's project-scoped skills check in cleanly. settings.local.json
+# and personal CLAUDE notes stay ignored via the explicit lines below.
+!.claude/
 **/CLAUDE.local.md
 **/CLAUDE.*.md
 **/.claude/settings.local.json