From 9c7eb27e5fb605e95a929246a1af305494d120f5 Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Thu, 2 Apr 2026 21:20:45 +0200 Subject: [PATCH 1/2] Scale H5 pipeline to 50 workers at 1 CPU each MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The H5 build work is single-threaded numpy. 4 CPUs per worker was wasted. 8 workers meant each processed ~60 items serially. - Increase default workers from 8 to 50 (~10 items per worker) - Drop worker CPU from 4 to 1 (saves 75% CPU cost) - Add max_containers=50 as safety cap - Wall-clock time drops from ~60min to ~12min - Total CPU cost drops: 8×60min×4CPU → 50×12min×1CPU Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/pipeline.yaml | 2 +- modal_app/local_area.py | 7 ++++--- modal_app/pipeline.py | 4 ++-- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/.github/workflows/pipeline.yaml b/.github/workflows/pipeline.yaml index d15284d5e..d51af16e7 100644 --- a/.github/workflows/pipeline.yaml +++ b/.github/workflows/pipeline.yaml @@ -19,7 +19,7 @@ on: type: string num_workers: description: "Number of parallel H5 workers" - default: "8" + default: "50" type: string skip_national: description: "Skip national calibration/H5" diff --git a/modal_app/local_area.py b/modal_app/local_area.py index 4d5d847bf..5fee7e73f 100644 --- a/modal_app/local_area.py +++ b/modal_app/local_area.py @@ -293,8 +293,9 @@ def run_phase( "/pipeline": pipeline_volume, }, memory=16384, - cpu=4.0, + cpu=1.0, timeout=28800, + max_containers=50, nonpreemptible=True, ) def build_areas_worker( @@ -618,7 +619,7 @@ def promote_publish(branch: str = "main", version: str = "", run_id: str = "") - ) def coordinate_publish( branch: str = "main", - num_workers: int = 8, + num_workers: int = 50, skip_upload: bool = False, n_clones: int = 430, validate: bool = True, @@ -877,7 +878,7 @@ def coordinate_publish( @app.local_entrypoint() def main( branch: str = "main", - num_workers: int = 8, + num_workers: int = 50, skip_upload: bool = False, n_clones: int = 430, run_id: str = "", diff --git a/modal_app/pipeline.py b/modal_app/pipeline.py index 95d293d81..62ca398e4 100644 --- a/modal_app/pipeline.py +++ b/modal_app/pipeline.py @@ -606,7 +606,7 @@ def run_pipeline( epochs: int = 1000, national_gpu: str = "T4", national_epochs: int = 4000, - num_workers: int = 8, + num_workers: int = 50, n_clones: int = 430, skip_national: bool = False, resume_run_id: str = None, @@ -1268,7 +1268,7 @@ def main( epochs: int = 1000, national_gpu: str = "T4", national_epochs: int = 4000, - num_workers: int = 8, + num_workers: int = 50, n_clones: int = 430, skip_national: bool = False, clear_checkpoints: bool = False, From 3a35d0209fec8928cb81560277f239778f548b61 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sun, 5 Apr 2026 20:21:13 -0400 Subject: [PATCH 2/2] Fix pipeline push worker default --- .github/workflows/pipeline.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pipeline.yaml b/.github/workflows/pipeline.yaml index d51af16e7..fed58023e 100644 --- a/.github/workflows/pipeline.yaml +++ b/.github/workflows/pipeline.yaml @@ -49,7 +49,7 @@ jobs: GPU="${{ inputs.gpu || 'T4' }}" EPOCHS="${{ inputs.epochs || '1000' }}" NATIONAL_EPOCHS="${{ inputs.national_epochs || '4000' }}" - NUM_WORKERS="${{ inputs.num_workers || '8' }}" + NUM_WORKERS="${{ inputs.num_workers || '50' }}" SKIP_NATIONAL="${{ inputs.skip_national || 'false' }}" python -c "