7 changes: 5 additions & 2 deletions containers/jupiter.def
@@ -14,7 +14,7 @@ From: nvcr.io/nvidia/pytorch:25.06-py3
uv --version

uv pip install --system --break-system-packages lm-eval \
"transformers<=4.53.0" "datasets<4.0.0" wandb sentencepiece tiktoken accelerate
wandb sentencepiece tiktoken accelerate
Comment on lines +16 to +17
Collaborator


Isn't it missing datasets and some other dependencies?

uv pip install --system --break-system-packages \
    lm-eval \
    transformers \
    "datasets<4.0.0" \
    wandb \
    sentencepiece \
    tiktoken \
    accelerate \
    nltk

If so, let's merge this PR to add DCLM support and update the jupiter container in a follow-up, since the PR has been open for a long time.

Collaborator Author

@harshraj172 Mar 6, 2026


@geoalgo I found that lm-eval installs these already. Also, nltk isn't required now, as it was only used with lighteval earlier.
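
For reference, one quick way to sanity-check this claim (a sketch, assuming the container's system Python and that the distribution is named `lm_eval`) is to print the dependencies the lm-eval package declares:

```bash
# Sketch: list lm-eval's declared dependencies to confirm transformers/datasets
# are already pulled in transitively.
python - <<'PY'
from importlib.metadata import requires
print("\n".join(requires("lm_eval") or []))
PY
```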


# lighteval as isolated tool (avoids dependency conflicts)
export UV_TOOL_DIR=/opt/uv-tools
@@ -36,12 +36,15 @@ nltk.download('punkt', download_dir='/opt/nltk_data')
nltk.download('punkt_tab', download_dir='/opt/nltk_data')
PY

# SSL cert fix for --contain mode
update-ca-certificates 2>/dev/null || true

%environment
export PATH=/usr/local/bin:$PATH
export UV_TOOL_BIN_DIR=/usr/local/bin
export UV_TOOL_DIR=/opt/uv-tools
export NLTK_DATA=/opt/nltk_data

export SSL_CERT_FILE=/etc/ssl/certs/ca-certificates.crt

%runscript
exec bash "$@"
17 changes: 17 additions & 0 deletions docs/VENV.md
@@ -36,3 +36,20 @@ oellm schedule-eval \
## Why Two Install Steps?

lm-eval requires `datasets<4.0.0` while lighteval requires `datasets>=4.0.0`. Installing lighteval as an isolated uv tool (like the containers do) avoids this conflict.
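
A minimal sketch of the two-step pattern (the venv name and the bare `uv tool install lighteval` are illustrative assumptions, not necessarily the repo's exact commands):

```bash
# Step 1: lm-eval and the datasets<4.0.0 pin live in the project venv.
uv venv --python 3.12 eval-venv
uv pip install --python eval-venv/bin/python lm-eval "datasets<4.0.0"

# Step 2: lighteval goes into its own isolated uv tool environment,
# so its datasets>=4.0.0 requirement never meets the pin above.
uv tool install lighteval
```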

## DCLM-core-22

`dclm-core-22` needs `lm-eval==0.4.9.2` (v0.4.10+ breaks `agieval_lsat_ar` in few-shot). Use `requirements-venv-dclm.txt` instead of the default requirements:

```bash
uv venv --python 3.12 dclm-core-venv
uv pip install --python dclm-core-venv/bin/python -r requirements-venv-dclm.txt
```

```bash
oellm schedule-eval \
--models Qwen/Qwen3-0.6B-Base \
--task_groups dclm-core-22 \
--venv_path dclm-core-venv \
--skip_checks true
```
2 changes: 1 addition & 1 deletion oellm/resources/clusters.yaml
@@ -31,7 +31,7 @@ jupiter:
ACCOUNT: "jureap59"
QUEUE_LIMIT: 250
EVAL_CONTAINER_IMAGE: "eval_env-jupiter.sif"
SINGULARITY_ARGS: "--nv --contain --env PYTHONNOUSERSITE=1"
SINGULARITY_ARGS: "--nv --contain --env PYTHONNOUSERSITE=1 --env SSL_CERT_FILE=/etc/ssl/certs/ca-certificates.crt"
Collaborator


You'll probably also have to integrate this into the container downloading logic in `oellm/utils.py::_ensure_singularity_image`, otherwise it'll only download the lm-eval container.


lumi:
hostname_pattern: "uan*"
92 changes: 92 additions & 0 deletions oellm/resources/task-groups.yaml
@@ -10,6 +10,17 @@ task_metrics:
commonsense_qa: acc
hellaswag: acc_norm
piqa: acc_norm
social_iqa: acc
agieval_lsat_ar: acc
wsc273: acc
bigbench_language_identification_multiple_choice: acc
squadv2: f1
coqa: f1
bigbench_qa_wikidata_generate_until: exact_match
bigbench_dyck_languages_generate_until: exact_match
bigbench_operators_generate_until: exact_match
bigbench_repeat_copy_logic_generate_until: exact_match
bigbench_cs_algorithms_generate_until: exact_match

task_groups:
open-sci-0.01:
@@ -293,6 +304,87 @@ task_groups:
- task: include_base_44_ukrainian
subset: Ukrainian

dclm-core-22:
Collaborator


Could you add the dataset field to either the task group or the individual tasks? Otherwise dataset pre-downloading (before job submission) won't be available, and the jobs will fail unless you have internet access on the compute nodes. You can check global-mmlu-eu or generic-multilingual for an example of how this looks.
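
For illustration, pre-downloading here just means warming the Hugging Face cache before submission; with the dataset fields set, oellm presumably handles this itself, but a manual equivalent would look roughly like:

```bash
# Sketch: warm the HF cache for one of the listed datasets on a login node
# (compute nodes are assumed to have no internet access in --contain mode).
python -c "from datasets import load_dataset; load_dataset('hails/agieval-lsat-ar')"
```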

description: "DCLM core 22 evaluation tasks (lm-eval-harness, matching LLM Foundry task types)"
suite: lm-eval-harness
tasks:
- task: agieval_lsat_ar
n_shots: [3]
dataset: hails/agieval-lsat-ar
- task: arc_easy
n_shots: [10]
dataset: allenai/ai2_arc
subset: ARC-Easy
- task: arc_challenge
n_shots: [10]
dataset: allenai/ai2_arc
subset: ARC-Challenge
- task: boolq
n_shots: [10]
dataset: aps/super_glue
subset: boolq
- task: commonsense_qa
n_shots: [10]
dataset: tau/commonsense_qa
- task: copa
n_shots: [0]
dataset: aps/super_glue
subset: copa
- task: hellaswag
n_shots: [0, 10]
dataset: Rowan/hellaswag
- task: openbookqa
n_shots: [0]
dataset: allenai/openbookqa
subset: main
- task: piqa
n_shots: [10]
dataset: baber/piqa
- task: bigbench_language_identification_multiple_choice
n_shots: [10]
dataset: hails/bigbench
subset: language_identification_zero_shot
- task: winogrande
n_shots: [0]
dataset: allenai/winogrande
subset: winogrande_xl
- task: wsc273
n_shots: [0]
dataset: winograd_wsc
- task: lambada_openai
n_shots: [0]
dataset: EleutherAI/lambada_openai
- task: bigbench_qa_wikidata_generate_until
n_shots: [10]
dataset: hails/bigbench
subset: qa_wikidata_zero_shot
- task: bigbench_dyck_languages_generate_until
n_shots: [10]
dataset: hails/bigbench
subset: dyck_languages_zero_shot
- task: bigbench_operators_generate_until
n_shots: [10]
dataset: hails/bigbench
subset: operators_zero_shot
- task: bigbench_repeat_copy_logic_generate_until
n_shots: [10]
dataset: hails/bigbench
subset: repeat_copy_logic_zero_shot
- task: bigbench_cs_algorithms_generate_until
n_shots: [10]
dataset: hails/bigbench
subset: cs_algorithms_zero_shot
- task: coqa
n_shots: [0]
dataset: EleutherAI/coqa
- task: squadv2
n_shots: [10]
dataset: rajpurkar/squad_v2
# TODO: jeopardy is not available in lm-eval-harness.
# - task: jeopardy
# n_shots: [10]
# dataset: openaccess-ai-collective/jeopardy

super_groups:
oellm-multilingual:
description: "Combined Belebele EU set plus multilingual benchmarks"
16 changes: 14 additions & 2 deletions oellm/resources/template.sbatch
@@ -91,6 +91,16 @@ do
fi
fi

# When using --contain, provide a writable home for prediction caches
SINGULARITY_HOME_ARG=""
if [[ "$SINGULARITY_ARGS" == *"--contain"* ]]; then
JOB_HOME="$EVAL_BASE_DIR/$USER/container_home/$SLURM_JOB_ID"
mkdir -p "$JOB_HOME"
SINGULARITY_HOME_ARG="--home $JOB_HOME"
fi

GPU_DEVICES=$(seq -s, 0 $(($GPUS_PER_NODE - 1)))

suite_normalized=$(echo "$eval_suite" | tr '[:upper:]' '[:lower:]')

# Helper function to run Python commands in the appropriate environment
@@ -100,8 +110,10 @@ do
python "$@"
else
singularity exec $SINGULARITY_ARGS \
$SINGULARITY_HOME_ARG \
--bind $BIND_PATHS \
$EVAL_SIF_PATH \
env CUDA_VISIBLE_DEVICES=$GPU_DEVICES \
python "$@"
fi
}}
@@ -142,17 +154,17 @@ do
lighteval accelerate \
"model_name=$model_path,trust_remote_code=True" \
"$LIGHT_TASK_ARG" \
--load-tasks-multilingual \
--output-dir "$RESULTS_SUBDIR" \
${{LIMIT:+--max-samples $LIMIT}}
else
singularity exec $SINGULARITY_ARGS \
$SINGULARITY_HOME_ARG \
--bind $BIND_PATHS \
$EVAL_SIF_PATH \
env CUDA_VISIBLE_DEVICES=$GPU_DEVICES \
lighteval accelerate \
"model_name=$model_path,trust_remote_code=True" \
"$LIGHT_TASK_ARG" \
--load-tasks-multilingual \
--output-dir "$RESULTS_SUBDIR" \
${{LIMIT:+--max-samples $LIMIT}}
fi
10 changes: 10 additions & 0 deletions requirements-venv-dclm.txt
@@ -0,0 +1,10 @@
# Dependencies for DCLM-core-22 evaluation (install in venv)
# Install with: uv pip install -r requirements-venv-dclm.txt
lm-eval==0.4.9.2
torch
transformers>=4.43.2,<5.0.0
accelerate
datasets<4.0.0
wandb
sentencepiece
tiktoken