From e2f2a81c7259e839be857f40f6cf1f4d71e2c3d1 Mon Sep 17 00:00:00 2001
From: Rahul Anand Sharma <rahulasharma@google.com>
Date: Wed, 28 Jan 2026 23:18:26 -0800
Subject: [PATCH 01/88] fix gemm timing logic (#92)

---
 Ironwood/src/benchmark_gemm.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/Ironwood/src/benchmark_gemm.py b/Ironwood/src/benchmark_gemm.py
index 99564490..b802ddc0 100644
--- a/Ironwood/src/benchmark_gemm.py
+++ b/Ironwood/src/benchmark_gemm.py
@@ -213,6 +213,8 @@ def data_generator():
         return (lhs_device, rhs_device)
 
     # Run the benchmark
+    num_runs = 1 
+    ## Need to fix gemm timing logic to handle num_runs > 1
 
     time_ms_list = iteration_timeit(
         jit_sharded_f,
@@ -300,6 +302,9 @@ def data_generator():
 
         return (lhs_device, rhs_device)
 
+    num_runs = 1 
+    ## Need to fix gemm timing logic to handle num_runs > 1
+
     # Run the benchmark
     time_ms_list = iteration_timeit(
         jit_sharded_f,
@@ -402,6 +407,9 @@ def data_generator():
 
         return (lhs_device, rhs_device, sf0_device, sf1_device)
 
+    num_runs = 1 
+    ## Need to fix gemm timing logic to handle num_runs > 1
+
     time_ms_list = iteration_timeit(
         jit_sharded_f,
         data_generator,
@@ -513,6 +521,10 @@ def data_generator():
 
         return (out_buffer_device, lhs_device, rhs_device, sf0_device, sf1_device)
 
+
+    num_runs = 1 
+    ## Need to fix gemm timing logic to handle num_runs > 1
+
     time_ms_list = iteration_timeit(
         jit_sharded_f,
         data_generator,

From 20286e50af3b7f3b0edbe0750011a6a6dbf1e057 Mon Sep 17 00:00:00 2001
From: "Yu-Hsuan (Amy) Lin" <amylin@google.com>
Date: Tue, 27 Jan 2026 05:33:27 +0000
Subject: [PATCH 02/88] Add `gcs-bucket-csv-dir` to support GCS upload

Let run_benchmark.py accept a `--gcs-bucket-csv-dir` argument that
configures the GCS directory to which the csv/tsv results are written.
---
 Ironwood/src/run_benchmark.py | 56 ++++++++++++++++++++++++++++++-----
 requirements.txt              |  1 +
 2 files changed, 49 insertions(+), 8 deletions(-)

diff --git a/Ironwood/src/run_benchmark.py b/Ironwood/src/run_benchmark.py
index b44aab75..5b3c3b7f 100644
--- a/Ironwood/src/run_benchmark.py
+++ b/Ironwood/src/run_benchmark.py
@@ -316,7 +316,30 @@ def convert_dict_to_df(target_dict: Dict) -> pd.DataFrame:
     print(f"Metrics written to CSV at {csv_path}.")
 
 
-def run_single_benchmark(benchmark_config: Dict[str, Any], output_path: str):
+def write_metrics_to_gcs(
+    gcs_bucket_csv_dir: str,
+    config_path: str,
+    test_name: str,
+    calculate_metrics_results: List[Dict[str, Any]],
+):
+    """Writes metrics to GCS bucket defined by gcs_bucket_csv_dir."""
+    if not gcs_bucket_csv_dir:
+        return
+
+    config_dir = os.path.dirname(config_path)
+    if not config_dir:
+        print("No config directory found, the config path is: ", config_path)
+        config_category = "root"
+    else:
+        config_category = os.path.basename(config_dir)
+
+    config_stem = os.path.splitext(os.path.basename(config_path))[0]
+
+    gcs_path = os.path.join(gcs_bucket_csv_dir, config_category, config_stem)
+    write_to_csv(f"{gcs_path}/{test_name}.tsv", calculate_metrics_results)
+
+
+def run_single_benchmark(benchmark_config: Dict[str, Any], output_path: str, gcs_bucket_csv_dir: str = None, config_path: str = None):
     """Run a single benchmark with one or more configurations."""
     # Extract benchmark details
     benchmark_name = benchmark_config.get("benchmark_name")
@@ -413,14 +436,19 @@ def run_single_benchmark(benchmark_config: Dict[str, Any], output_path: str):
             "metrics": metrics
         })
 
-    # Dump metrics to file.
+    test_name = f"t_{benchmark_name}_" + "".join(
+        random.choices(string.ascii_uppercase + string.digits, k=10)
+    )
+
     if csv_path:
         os.makedirs(csv_path, exist_ok=True)
-        test_name = f"t_{benchmark_name}_" + "".join(
-            random.choices(string.ascii_uppercase + string.digits, k=10)
-        )
         write_to_csv(f"{csv_path}/{test_name}.tsv", calculate_metrics_results)
 
+    if gcs_bucket_csv_dir:
+        write_metrics_to_gcs(
+            gcs_bucket_csv_dir, config_path, test_name, calculate_metrics_results
+        )
+
 
 def main(args):
     """Main function."""
@@ -428,6 +456,7 @@ def main(args):
     config_path = args.config
     multithreaded = args.multithreaded
     output_path = args.output_path
+    gcs_bucket_csv_dir = args.gcs_bucket_csv_dir
     config = get_benchmark_config(config_path)
     benchmarks = config.get("benchmarks")
     if not benchmarks or not isinstance(benchmarks, list):
@@ -459,14 +488,14 @@ def main(args):
         # print("Num hosts detected: %d", num_hosts)
 
         for benchmark_config in benchmarks:
-            run_benchmark_multithreaded(benchmark_config, output_path)
+            run_benchmark_multithreaded(benchmark_config, output_path, gcs_bucket_csv_dir, config_path)
 
     else:
         for benchmark_config in benchmarks:
-            run_single_benchmark(benchmark_config, output_path)
+            run_single_benchmark(benchmark_config, output_path, gcs_bucket_csv_dir, config_path)
 
 
-def run_benchmark_multithreaded(benchmark_config, output_path):
+def run_benchmark_multithreaded(benchmark_config, output_path, gcs_bucket_csv_dir=None, config_path=None):
     # Extract benchmark details
     benchmark_name = benchmark_config.get("benchmark_name")
     benchmark_params = benchmark_config.get("benchmark_params", [])
@@ -543,6 +572,11 @@ def run_benchmark_multithreaded(benchmark_config, output_path):
         os.makedirs(csv_path, exist_ok=True)
         write_to_csv(f"{csv_path}/{test_name}.tsv", calculate_metrics_results)
 
+    if gcs_bucket_csv_dir:
+        write_metrics_to_gcs(
+            gcs_bucket_csv_dir, config_path, test_name, calculate_metrics_results
+        )
+
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
@@ -560,6 +594,12 @@ def run_benchmark_multithreaded(benchmark_config, output_path):
         default="",
         help="Path to output.",
     )
+    parser.add_argument(
+        "--gcs-bucket-csv-dir",
+        type=str,
+        default=None,
+        help="GCS bucket directory to write CSVs to.",
+    )
     parser.add_argument(
         "--multithreaded",
         type=bool,
diff --git a/requirements.txt b/requirements.txt
index 3ae246a8..ff76e478 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -14,3 +14,4 @@ qwix@git+https://github.com/google/qwix.git
 tokamax
 tune-jax
 immutabledict
+gcsfs

From 8dedb4c4704bef5767e89ee361188a9a39c80259 Mon Sep 17 00:00:00 2001
From: Hong-Yi Lin <horacehylin@google.com>
Date: Tue, 27 Jan 2026 06:54:07 +0000
Subject: [PATCH 03/88] Add automation script and an HBM yaml example.

---
 .../guides/automation/automation_launch.sh    | 21 +++++++
 Ironwood/guides/automation/tpu7x-16-hbm.yaml  | 57 +++++++++++++++++++
 2 files changed, 78 insertions(+)
 create mode 100644 Ironwood/guides/automation/automation_launch.sh
 create mode 100644 Ironwood/guides/automation/tpu7x-16-hbm.yaml

diff --git a/Ironwood/guides/automation/automation_launch.sh b/Ironwood/guides/automation/automation_launch.sh
new file mode 100644
index 00000000..5b72bcb3
--- /dev/null
+++ b/Ironwood/guides/automation/automation_launch.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+
+TIMESTAMP=$(date +"%Y-%m-%d_%H-%M-%S")
+
+yaml_names=("tpu7x-16-hbm.yaml")
+job_names=("tpu7x-16-hbm")
+
+# Fill the target GCS bucket path.
+export GCS_BUCKET_ROOT_DIR=""
+export GCS_PATH="${GCS_BUCKET_ROOT_DIR}/${TIMESTAMP}"
+
+for yaml_file in "${yaml_names[@]}"; do
+    echo "Launch job: ${yaml_file}"
+    envsubst '${GCS_PATH}' < ${yaml_file} | kubectl apply -f -
+done
+
+for job_name in "${job_names[@]}"; do
+    kubectl wait --for=condition=complete job/${job_name} --timeout=1800s
+    kubectl delete job ${job_name}
+done
+kubectl apply -f aggregator.yaml
\ No newline at end of file
diff --git a/Ironwood/guides/automation/tpu7x-16-hbm.yaml b/Ironwood/guides/automation/tpu7x-16-hbm.yaml
new file mode 100644
index 00000000..206d4b79
--- /dev/null
+++ b/Ironwood/guides/automation/tpu7x-16-hbm.yaml
@@ -0,0 +1,57 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: headless-svc
+spec:
+  clusterIP: None
+  selector:
+    job-name: tpu7x-16-hbm
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: tpu7x-16-hbm
+spec:
+  completionMode: Indexed
+  parallelism: 2
+  completions: 2
+  backoffLimit: 0
+  template:
+    spec:
+      subdomain: headless-svc
+      restartPolicy: Never
+      nodeSelector:
+        cloud.google.com/gke-tpu-accelerator: tpu7x
+        cloud.google.com/gke-tpu-topology: 2x2x2
+      containers:
+      - name: jax-tpu
+        image: python:3.12
+        securityContext:
+          privileged: false
+        env:
+        - name: JAX_PLATFORMS
+          value: "tpu,cpu"
+        - name: TPU_VMODULE
+          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
+        - name: XLA_IR_DEBUG
+          value: "1"
+        - name: XLA_HLO_DEBUG
+          value: "1"
+        command:
+        - bash
+        - -c
+        - |
+          set -ex
+
+          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
+          cd accelerator-microbenchmarks
+          pip install -r requirements.txt
+
+          GCS_BUCKET_DIR=${GCS_PATH}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/hbm/hbm.yaml"
+          echo "Result will be written to ${GCS_BUCKET_DIR}"
+        resources:
+          requests:
+            google.com/tpu: 4
+          limits:
+            google.com/tpu: 4

From 2bf6a941d5391188a4671b557d43f41a92a4afc1 Mon Sep 17 00:00:00 2001
From: Hong-Yi Lin <horacehylin@google.com>
Date: Tue, 27 Jan 2026 06:56:25 +0000
Subject: [PATCH 04/88] Add aggregator yaml file.

---
 Ironwood/guides/automation/aggregator.yaml | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)
 create mode 100644 Ironwood/guides/automation/aggregator.yaml

diff --git a/Ironwood/guides/automation/aggregator.yaml b/Ironwood/guides/automation/aggregator.yaml
new file mode 100644
index 00000000..2c0ecbd0
--- /dev/null
+++ b/Ironwood/guides/automation/aggregator.yaml
@@ -0,0 +1,16 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: job-waiter
+spec:
+  template:
+    spec:
+      containers:
+      - name: main-app
+        image: ubuntu
+        command: ["/bin/sh", "-c"]
+        args:
+        - |
+          echo "Collectives job is finally done."
+          echo "Everything is all fine!"
+      restartPolicy: Never
\ No newline at end of file

From 00fa7b2432eee4a1effabb919f374a260c6e311e Mon Sep 17 00:00:00 2001
From: "Yu-Hsuan (Amy) Lin" <amylin@google.com>
Date: Tue, 27 Jan 2026 07:41:02 +0000
Subject: [PATCH 05/88] [Automation] Add readme and node-pools topology check

---
 Ironwood/guides/automation/README.md          | 40 +++++++++++++++++++
 .../guides/automation/automation_launch.sh    | 26 ++++++++++--
 .../automation/check_node_pool_setup.sh       | 33 +++++++++++++++
 3 files changed, 96 insertions(+), 3 deletions(-)
 create mode 100644 Ironwood/guides/automation/README.md
 create mode 100644 Ironwood/guides/automation/check_node_pool_setup.sh

diff --git a/Ironwood/guides/automation/README.md b/Ironwood/guides/automation/README.md
new file mode 100644
index 00000000..05e2e2d1
--- /dev/null
+++ b/Ironwood/guides/automation/README.md
@@ -0,0 +1,40 @@
+# Ironwood Automation Tool
+
+This directory contains the automation scripts for running TPU microbenchmarks. The tool simplifies the process of launching multiple benchmark jobs, waiting for their completion, and aggregating the results into a unified format.
+
+## Prerequisites
+
+Before running the automation script, ensure the following requirements are met:
+
+1.  **Node Pool Topology**: The script expects specific TPU node pools to be available in your cluster.
+    *   The `check_node_pool_setup.sh` script validates this.
+2.  **GCS Bucket**: You must have a Google Cloud Storage (GCS) bucket for the intermediate and final results.
+    *   You can create one with `gcloud storage buckets create gs://my-unique-bucket-name --location=us-central1`.
+3.  **Kubectl**: Ensure `kubectl` is configured and connected to your GKE cluster.
+
+## User Journey
+
+1.  **Clone & Checkout Branch**.
+    ```bash
+    git clone https://github.com/google/accelerator-microbenchmarks.git
+    cd accelerator-microbenchmarks
+    git checkout tpu7x-auto
+    ```
+
+2.  **Setup Environment**: Ensure your node pools are set up and you have prepared a GCS bucket.
+
+3.  **Run Automation Script**:
+    The main script is `automation_launch.sh`. You need to set the `GCS_BUCKET_ROOT_DIR` environment variable before running it.
+
+    ```bash
+    # Replace with your actual bucket path (must start with gs://)
+    export GCS_BUCKET_ROOT_DIR="gs://your-bucket-name/automation_results"
+    
+    # Run the launch script
+    bash Ironwood/guides/automation/automation_launch.sh
+    ```
+
+4.  **Retrieve Results**:
+    After the script completes, the final aggregated TSV files will be available in your GCS bucket. The script generates a timestamped directory for each run.
+    *   Check the script output for the exact path: `The intermediate result will be written to gs://...`
+    *   Look for the `final` directory under that path (e.g., `gs://your-bucket/automation_results/<timestamp>/final`).
diff --git a/Ironwood/guides/automation/automation_launch.sh b/Ironwood/guides/automation/automation_launch.sh
index 5b72bcb3..b069340f 100644
--- a/Ironwood/guides/automation/automation_launch.sh
+++ b/Ironwood/guides/automation/automation_launch.sh
@@ -1,13 +1,33 @@
 #!/usr/bin/env bash
 
-TIMESTAMP=$(date +"%Y-%m-%d_%H-%M-%S")
+######################################################################
+#                            USER INPUT
+######################################################################
+export GCS_BUCKET_ROOT_DIR=""
 
 yaml_names=("tpu7x-16-hbm.yaml")
 job_names=("tpu7x-16-hbm")
 
-# Fill the target GCS bucket path.
-export GCS_BUCKET_ROOT_DIR=""
+######################################################################
+#                        VALIDATION & SETUP
+######################################################################
+
+if [[ -z "${GCS_BUCKET_ROOT_DIR}" || "${GCS_BUCKET_ROOT_DIR}" != "gs://"* ]]; then
+  echo "Error: GCS_BUCKET_ROOT_DIR must be set and start with gs://"
+  exit 1
+fi
+TIMESTAMP=$(date +"%Y-%m-%d_%H-%M-%S")
 export GCS_PATH="${GCS_BUCKET_ROOT_DIR}/${TIMESTAMP}"
+echo "The intermediate result will be written to ${GCS_PATH}"
+
+SCRIPT_DIR="$(dirname "$(realpath "$0")")"
+if ! bash "${SCRIPT_DIR}/check_node_pool_setup.sh"; then
+  exit 1
+fi
+
+######################################################################
+#                 LAUNCH JOBS & WAIT FOR COMPLETION
+######################################################################
 
 for yaml_file in "${yaml_names[@]}"; do
     echo "Launch job: ${yaml_file}"
diff --git a/Ironwood/guides/automation/check_node_pool_setup.sh b/Ironwood/guides/automation/check_node_pool_setup.sh
new file mode 100644
index 00000000..5dd8737a
--- /dev/null
+++ b/Ironwood/guides/automation/check_node_pool_setup.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+required_chip="tpu7x"
+required_topologies=("2x2x1")
+
+echo "Checking for required GKE TPU configurations..."
+echo "Required TPU Type: ${required_chip}"
+echo "-----------------------------------------------------------------"
+
+all_found=true
+
+for topology in "${required_topologies[@]}"; do
+  echo -n "Checking for TPU topology '${topology}' with type '${required_chip}': "
+
+  matching_nodes=$(kubectl get nodes -l cloud.google.com/gke-tpu-topology=${topology},cloud.google.com/gke-tpu-accelerator=${required_chip} -o custom-columns=NAME:.metadata.name --no-headers 2>/dev/null)
+
+  if [[ -n "${matching_nodes}" ]]; then
+    echo "FOUND"
+  else
+    echo "MISSING"
+    all_found=false
+  fi
+done
+
+echo "-----------------------------------------------------------------"
+
+if [[ "${all_found}" = true ]]; then
+  echo "SUCCESS: All required TPU configurations (topology + type) are present in the cluster."
+  exit 0
+else
+  echo "FAILURE: One or more required TPU configurations are missing."
+  exit 1
+fi

From 99fa6b1b1f872c30c79b664dcd5b2d9f276e972a Mon Sep 17 00:00:00 2001
From: Hong-Yi Lin <horacehylin@google.com>
Date: Tue, 27 Jan 2026 09:49:40 +0000
Subject: [PATCH 06/88] Update automation script and yaml files for different
 topologies.

---
 .../guides/automation/automation_launch.sh    | 38 ++++++++-----
 .../guides/automation/tpu7x-2x2x1-hbm.yaml    | 57 +++++++++++++++++++
 .../guides/automation/tpu7x-2x2x2-hbm.yaml    | 57 +++++++++++++++++++
 .../guides/automation/tpu7x-2x2x4-hbm.yaml    | 57 +++++++++++++++++++
 .../guides/automation/tpu7x-2x4x4-hbm.yaml    | 57 +++++++++++++++++++
 .../guides/automation/tpu7x-4x4x4-hbm.yaml    | 57 +++++++++++++++++++
 .../guides/automation/tpu7x-4x4x8-hbm.yaml    | 57 +++++++++++++++++++
 7 files changed, 367 insertions(+), 13 deletions(-)
 create mode 100644 Ironwood/guides/automation/tpu7x-2x2x1-hbm.yaml
 create mode 100644 Ironwood/guides/automation/tpu7x-2x2x2-hbm.yaml
 create mode 100644 Ironwood/guides/automation/tpu7x-2x2x4-hbm.yaml
 create mode 100644 Ironwood/guides/automation/tpu7x-2x4x4-hbm.yaml
 create mode 100644 Ironwood/guides/automation/tpu7x-4x4x4-hbm.yaml
 create mode 100644 Ironwood/guides/automation/tpu7x-4x4x8-hbm.yaml

diff --git a/Ironwood/guides/automation/automation_launch.sh b/Ironwood/guides/automation/automation_launch.sh
index b069340f..a904e14d 100644
--- a/Ironwood/guides/automation/automation_launch.sh
+++ b/Ironwood/guides/automation/automation_launch.sh
@@ -3,21 +3,20 @@
 ######################################################################
 #                            USER INPUT
 ######################################################################
-export GCS_BUCKET_ROOT_DIR=""
+export GCS_PATH=""
 
-yaml_names=("tpu7x-16-hbm.yaml")
-job_names=("tpu7x-16-hbm")
+yaml_names=("tpu7x-2x2x1-hbm.yaml" "tpu7x-2x2x2-hbm.yaml")
+job_names=("tpu7x-8-hbm" "tpu7x-16-hbm")
 
 ######################################################################
 #                        VALIDATION & SETUP
 ######################################################################
 
-if [[ -z "${GCS_BUCKET_ROOT_DIR}" || "${GCS_BUCKET_ROOT_DIR}" != "gs://"* ]]; then
-  echo "Error: GCS_BUCKET_ROOT_DIR must be set and start with gs://"
+if [[ -z "${GCS_PATH}" || "${GCS_PATH}" != "gs://"* ]]; then
+  echo "Error: GCS_PATH must be set and start with gs://"
   exit 1
 fi
-TIMESTAMP=$(date +"%Y-%m-%d_%H-%M-%S")
-export GCS_PATH="${GCS_BUCKET_ROOT_DIR}/${TIMESTAMP}"
+
 echo "The intermediate result will be written to ${GCS_PATH}"
 
 SCRIPT_DIR="$(dirname "$(realpath "$0")")"
@@ -29,13 +28,26 @@ fi
 #                 LAUNCH JOBS & WAIT FOR COMPLETION
 ######################################################################
 
-for yaml_file in "${yaml_names[@]}"; do
+length=${#yaml_names[@]}
+for (( i=0; i<length; i++ )); do
+    yaml_file=${yaml_names[${i}]}
+    export JOB_NAME=${job_names[${i}]}
     echo "Launch job: ${yaml_file}"
-    envsubst '${GCS_PATH}' < ${yaml_file} | kubectl apply -f -
+    envsubst '${JOB_NAME} ${GCS_PATH}' < ${yaml_file} | kubectl apply -f -
+    (
+        echo "Job launched successfully!"
+        if kubectl wait --for=condition=complete job/${JOB_NAME} --timeout=1800s &> /dev/null; then
+            echo "Job from ${yaml_file} is completed!"
+        else
+            echo "Job from ${yaml_file} failed!"
+            
+        fi
+        envsubst '${JOB_NAME} ${GCS_PATH}' < ${yaml_file} | kubectl delete -f -
+    ) &
 done
 
-for job_name in "${job_names[@]}"; do
-    kubectl wait --for=condition=complete job/${job_name} --timeout=1800s
-    kubectl delete job ${job_name}
-done
+echo "All jobs dispatched. Waiting for results..."
+wait
+echo "All processing done."
+
 kubectl apply -f aggregator.yaml
\ No newline at end of file
diff --git a/Ironwood/guides/automation/tpu7x-2x2x1-hbm.yaml b/Ironwood/guides/automation/tpu7x-2x2x1-hbm.yaml
new file mode 100644
index 00000000..7455961d
--- /dev/null
+++ b/Ironwood/guides/automation/tpu7x-2x2x1-hbm.yaml
@@ -0,0 +1,57 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: headless-svc-${JOB_NAME}
+spec:
+  clusterIP: None
+  selector:
+    job-name: ${JOB_NAME}
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: ${JOB_NAME}
+spec:
+  completionMode: Indexed
+  parallelism: 1
+  completions: 1
+  backoffLimit: 0
+  template:
+    spec:
+      subdomain: headless-svc-${JOB_NAME}
+      restartPolicy: Never
+      nodeSelector:
+        cloud.google.com/gke-tpu-accelerator: tpu7x
+        cloud.google.com/gke-tpu-topology: 2x2x1
+      containers:
+      - name: jax-tpu
+        image: python:3.12
+        securityContext:
+          privileged: false
+        env:
+        - name: JAX_PLATFORMS
+          value: "tpu,cpu"
+        - name: TPU_VMODULE
+          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
+        - name: XLA_IR_DEBUG
+          value: "1"
+        - name: XLA_HLO_DEBUG
+          value: "1"
+        command:
+        - bash
+        - -c
+        - |
+          set -ex
+
+          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
+          cd accelerator-microbenchmarks
+          git checkout tpu7x-auto
+          pip install -r requirements.txt
+
+          GCS_BUCKET_DIR=${GCS_PATH}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/hbm/hbm.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+        resources:
+          requests:
+            google.com/tpu: 4
+          limits:
+            google.com/tpu: 4
\ No newline at end of file
diff --git a/Ironwood/guides/automation/tpu7x-2x2x2-hbm.yaml b/Ironwood/guides/automation/tpu7x-2x2x2-hbm.yaml
new file mode 100644
index 00000000..3f3f7929
--- /dev/null
+++ b/Ironwood/guides/automation/tpu7x-2x2x2-hbm.yaml
@@ -0,0 +1,57 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: headless-svc-${JOB_NAME}
+spec:
+  clusterIP: None
+  selector:
+    job-name: ${JOB_NAME}
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: ${JOB_NAME}
+spec:
+  completionMode: Indexed
+  parallelism: 2
+  completions: 2
+  backoffLimit: 0
+  template:
+    spec:
+      subdomain: headless-svc-${JOB_NAME}
+      restartPolicy: Never
+      nodeSelector:
+        cloud.google.com/gke-tpu-accelerator: tpu7x
+        cloud.google.com/gke-tpu-topology: 2x2x2
+      containers:
+      - name: jax-tpu
+        image: python:3.12
+        securityContext:
+          privileged: false
+        env:
+        - name: JAX_PLATFORMS
+          value: "tpu,cpu"
+        - name: TPU_VMODULE
+          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
+        - name: XLA_IR_DEBUG
+          value: "1"
+        - name: XLA_HLO_DEBUG
+          value: "1"
+        command:
+        - bash
+        - -c
+        - |
+          set -ex
+
+          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
+          cd accelerator-microbenchmarks
+          git checkout tpu7x-auto
+          pip install -r requirements.txt
+
+          GCS_BUCKET_DIR=${GCS_PATH}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/hbm/hbm.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+        resources:
+          requests:
+            google.com/tpu: 4
+          limits:
+            google.com/tpu: 4
diff --git a/Ironwood/guides/automation/tpu7x-2x2x4-hbm.yaml b/Ironwood/guides/automation/tpu7x-2x2x4-hbm.yaml
new file mode 100644
index 00000000..ed87aaf8
--- /dev/null
+++ b/Ironwood/guides/automation/tpu7x-2x2x4-hbm.yaml
@@ -0,0 +1,57 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: headless-svc-${JOB_NAME}
+spec:
+  clusterIP: None
+  selector:
+    job-name: ${JOB_NAME}
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: ${JOB_NAME}
+spec:
+  completionMode: Indexed
+  parallelism: 4
+  completions: 4
+  backoffLimit: 0
+  template:
+    spec:
+      subdomain: headless-svc-${JOB_NAME}
+      restartPolicy: Never
+      nodeSelector:
+        cloud.google.com/gke-tpu-accelerator: tpu7x
+        cloud.google.com/gke-tpu-topology: 2x2x4
+      containers:
+      - name: jax-tpu
+        image: python:3.12
+        securityContext:
+          privileged: false
+        env:
+        - name: JAX_PLATFORMS
+          value: "tpu,cpu"
+        - name: TPU_VMODULE
+          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
+        - name: XLA_IR_DEBUG
+          value: "1"
+        - name: XLA_HLO_DEBUG
+          value: "1"
+        command:
+        - bash
+        - -c
+        - |
+          set -ex
+
+          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
+          cd accelerator-microbenchmarks
+          git checkout tpu7x-auto
+          pip install -r requirements.txt
+
+          GCS_BUCKET_DIR=${GCS_PATH}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/hbm/hbm.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+        resources:
+          requests:
+            google.com/tpu: 4
+          limits:
+            google.com/tpu: 4
\ No newline at end of file
diff --git a/Ironwood/guides/automation/tpu7x-2x4x4-hbm.yaml b/Ironwood/guides/automation/tpu7x-2x4x4-hbm.yaml
new file mode 100644
index 00000000..57182026
--- /dev/null
+++ b/Ironwood/guides/automation/tpu7x-2x4x4-hbm.yaml
@@ -0,0 +1,57 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: headless-svc-${JOB_NAME}
+spec:
+  clusterIP: None
+  selector:
+    job-name: ${JOB_NAME}
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: ${JOB_NAME}
+spec:
+  completionMode: Indexed
+  parallelism: 8
+  completions: 8
+  backoffLimit: 0
+  template:
+    spec:
+      subdomain: headless-svc-${JOB_NAME}
+      restartPolicy: Never
+      nodeSelector:
+        cloud.google.com/gke-tpu-accelerator: tpu7x
+        cloud.google.com/gke-tpu-topology: 2x4x4
+      containers:
+      - name: jax-tpu
+        image: python:3.12
+        securityContext:
+          privileged: false
+        env:
+        - name: JAX_PLATFORMS
+          value: "tpu,cpu"
+        - name: TPU_VMODULE
+          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
+        - name: XLA_IR_DEBUG
+          value: "1"
+        - name: XLA_HLO_DEBUG
+          value: "1"
+        command:
+        - bash
+        - -c
+        - |
+          set -ex
+
+          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
+          cd accelerator-microbenchmarks
+          git checkout tpu7x-auto
+          pip install -r requirements.txt
+
+          GCS_BUCKET_DIR=${GCS_PATH}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/hbm/hbm.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+        resources:
+          requests:
+            google.com/tpu: 4
+          limits:
+            google.com/tpu: 4
\ No newline at end of file
diff --git a/Ironwood/guides/automation/tpu7x-4x4x4-hbm.yaml b/Ironwood/guides/automation/tpu7x-4x4x4-hbm.yaml
new file mode 100644
index 00000000..ccce9dc0
--- /dev/null
+++ b/Ironwood/guides/automation/tpu7x-4x4x4-hbm.yaml
@@ -0,0 +1,57 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: headless-svc-${JOB_NAME}
+spec:
+  clusterIP: None
+  selector:
+    job-name: ${JOB_NAME}
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: ${JOB_NAME}
+spec:
+  completionMode: Indexed
+  parallelism: 16
+  completions: 16
+  backoffLimit: 0
+  template:
+    spec:
+      subdomain: headless-svc-${JOB_NAME}
+      restartPolicy: Never
+      nodeSelector:
+        cloud.google.com/gke-tpu-accelerator: tpu7x
+        cloud.google.com/gke-tpu-topology: 4x4x4
+      containers:
+      - name: jax-tpu
+        image: python:3.12
+        securityContext:
+          privileged: false
+        env:
+        - name: JAX_PLATFORMS
+          value: "tpu,cpu"
+        - name: TPU_VMODULE
+          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
+        - name: XLA_IR_DEBUG
+          value: "1"
+        - name: XLA_HLO_DEBUG
+          value: "1"
+        command:
+        - bash
+        - -c
+        - |
+          set -ex
+
+          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
+          cd accelerator-microbenchmarks
+          git checkout tpu7x-auto
+          pip install -r requirements.txt
+
+          GCS_BUCKET_DIR=${GCS_PATH}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/hbm/hbm.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+        resources:
+          requests:
+            google.com/tpu: 4
+          limits:
+            google.com/tpu: 4
\ No newline at end of file
diff --git a/Ironwood/guides/automation/tpu7x-4x4x8-hbm.yaml b/Ironwood/guides/automation/tpu7x-4x4x8-hbm.yaml
new file mode 100644
index 00000000..8d8e960e
--- /dev/null
+++ b/Ironwood/guides/automation/tpu7x-4x4x8-hbm.yaml
@@ -0,0 +1,57 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: headless-svc-${JOB_NAME}
+spec:
+  clusterIP: None
+  selector:
+    job-name: ${JOB_NAME}
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: ${JOB_NAME}
+spec:
+  completionMode: Indexed
+  parallelism: 32
+  completions: 32
+  backoffLimit: 0
+  template:
+    spec:
+      subdomain: headless-svc-${JOB_NAME}
+      restartPolicy: Never
+      nodeSelector:
+        cloud.google.com/gke-tpu-accelerator: tpu7x
+        cloud.google.com/gke-tpu-topology: 4x4x8
+      containers:
+      - name: jax-tpu
+        image: python:3.12
+        securityContext:
+          privileged: false
+        env:
+        - name: JAX_PLATFORMS
+          value: "tpu,cpu"
+        - name: TPU_VMODULE
+          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
+        - name: XLA_IR_DEBUG
+          value: "1"
+        - name: XLA_HLO_DEBUG
+          value: "1"
+        command:
+        - bash
+        - -c
+        - |
+          set -ex
+
+          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
+          cd accelerator-microbenchmarks
+          git checkout tpu7x-auto
+          pip install -r requirements.txt
+
+          GCS_BUCKET_DIR=${GCS_PATH}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/hbm/hbm.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+        resources:
+          requests:
+            google.com/tpu: 4
+          limits:
+            google.com/tpu: 4
\ No newline at end of file

From 91d4638aaed01ceff8f14febabcce2a2150f2380 Mon Sep 17 00:00:00 2001
From: "Yu-Hsuan (Amy) Lin" <amylin@google.com>
Date: Tue, 27 Jan 2026 15:33:55 +0000
Subject: [PATCH 07/88] [Automation] Error catch and failure retry

* Fix the issue where `kubectl wait` could only wait for one condition.
  Use a polling loop to check the job status instead.

* Record the failed jobs and retry them up to 3 times.

TEST=Use dummy `must-fail` and `must-succeed` jobs that exit 1/0
directly. Make sure the script retries the failed job 3 times and
eventually prints out the command to retry it.
---
 .../guides/automation/automation_launch.sh    | 140 +++++++++++++++---
 1 file changed, 120 insertions(+), 20 deletions(-)

diff --git a/Ironwood/guides/automation/automation_launch.sh b/Ironwood/guides/automation/automation_launch.sh
index a904e14d..9815780f 100644
--- a/Ironwood/guides/automation/automation_launch.sh
+++ b/Ironwood/guides/automation/automation_launch.sh
@@ -6,7 +6,6 @@
 export GCS_PATH=""
 
 yaml_names=("tpu7x-2x2x1-hbm.yaml" "tpu7x-2x2x2-hbm.yaml")
-job_names=("tpu7x-8-hbm" "tpu7x-16-hbm")
 
 ######################################################################
 #                        VALIDATION & SETUP
@@ -28,26 +27,127 @@ fi
 #                 LAUNCH JOBS & WAIT FOR COMPLETION
 ######################################################################
 
-length=${#yaml_names[@]}
-for (( i=0; i<length; i++ )); do
-    yaml_file=${yaml_names[${i}]}
-    export JOB_NAME=${job_names[${i}]}
-    echo "Launch job: ${yaml_file}"
-    envsubst '${JOB_NAME} ${GCS_PATH}' < ${yaml_file} | kubectl apply -f -
-    (
-        echo "Job launched successfully!"
-        if kubectl wait --for=condition=complete job/${JOB_NAME} --timeout=1800s &> /dev/null; then
-            echo "Job from ${yaml_file} is completed!"
-        else
-            echo "Job from ${yaml_file} failed!"
-            
+
+# Function to wait for a job to complete or fail
+wait_for_job_completion() {
+    local job_name="$1"
+    local timeout="$2"
+    local start_time=$(date +%s)
+    local end_time=$((start_time + timeout))
+
+    while true; do
+        current_time=$(date +%s)
+        if [[ $current_time -gt $end_time ]]; then
+            echo "Timeout waiting for job ${job_name}"
+            return 2
+        fi
+
+        # Check for Complete condition
+        if kubectl get job "${job_name}" -o jsonpath='{.status.conditions[?(@.type=="Complete")].status}' 2>/dev/null | grep -q "True"; then
+            echo "Job ${job_name} completed successfully!"
+            return 0
+        fi
+
+        # Check for Failed condition
+        if kubectl get job "${job_name}" -o jsonpath='{.status.conditions[?(@.type=="Failed")].status}' 2>/dev/null | grep -q "True"; then
+            echo "Job ${job_name} FAILED!"
+            return 1
+        fi
+
+        sleep 5
+    done
+}
+
+# Function to apply jobs and wait for them to complete
+# Returns a list of failed yaml files in the variable FAILED_JOBS
+apply_and_wait() {
+    local yaml_files=("$@")
+    local pids=()
+    local job_names_in_batch=()
+    FAILED_JOBS=()
+
+    echo "Processing batch of ${#yaml_files[@]} jobs..."
+
+    # Launch all jobs
+    for yaml_file in "${yaml_files[@]}"; do
+        local filepath="${SCRIPT_DIR}/${yaml_file}"
+        # Derive job name: remove .yaml, lowercase, replace _ with -
+        local job_name=$(basename "${yaml_file}" .yaml | tr '[:upper:]' '[:lower:]' | tr '_' '-')
+        export JOB_NAME="${job_name}"
+        
+        echo "Launching job: ${filepath} (name: ${JOB_NAME})"
+        envsubst '${JOB_NAME} ${GCS_PATH}' < "${filepath}" | kubectl apply -f -
+        job_names_in_batch+=("${JOB_NAME}")
+    done
+
+    # Wait for completion in background
+    for i in "${!yaml_files[@]}"; do
+        local yaml_file="${yaml_files[$i]}"
+        local filepath="${SCRIPT_DIR}/${yaml_file}"
+        local job_name="${job_names_in_batch[$i]}"
+
+        (
+            wait_for_job_completion "${job_name}" 1800
+            wait_status=$?
+
+            export JOB_NAME="${job_name}"
+            envsubst '${JOB_NAME} ${GCS_PATH}' < "${filepath}" | kubectl delete -f - &> /dev/null
+            exit $wait_status
+        ) &
+        pids+=($!)
+    done
+
+    # Collect results
+    for i in "${!pids[@]}"; do
+        wait "${pids[$i]}"
+        if [[ $? -ne 0 ]]; then
+            FAILED_JOBS+=("${yaml_files[$i]}")
         fi
-        envsubst '${JOB_NAME} ${GCS_PATH}' < ${yaml_file} | kubectl delete -f -
-    ) &
+    done
+}
+
+# Retry loop
+current_batch=("${yaml_names[@]}")
+MAX_RETRIES=3
+
+for (( retry=1; retry<=MAX_RETRIES; retry++ )); do
+    apply_and_wait "${current_batch[@]}"
+    
+    if [[ ${#FAILED_JOBS[@]} -eq 0 ]]; then
+        echo "All jobs completed successfully in Round ${retry}!"
+        break
+    fi
+    
+    echo "Round ${retry} finished. ${#FAILED_JOBS[@]} jobs failed."
+    current_batch=("${FAILED_JOBS[@]}")
+    
+    if [[ ${retry} -lt ${MAX_RETRIES} ]]; then
+        echo "Retrying failed jobs..."
+        echo "========================================"
+        echo "$((retry + 1)) / ${MAX_RETRIES}" max retries
+        echo "========================================"
+    else
+        echo "Max retries reached. ¯\_(ツ)_/¯"
+    fi
 done
 
-echo "All jobs dispatched. Waiting for results..."
-wait
-echo "All processing done."
+echo ""
+echo "Jobs completed. Aggregating results..."
+echo ""
+
+kubectl apply -f aggregator.yaml
+
+# Print the failed jobs at the end for better visibility.
+
+if [[ ${#FAILED_JOBS[@]} -gt 0 ]]; then
+    echo "The following jobs finally failed after ${MAX_RETRIES} rounds:"
+    printf '%s\n' "${FAILED_JOBS[@]}"
 
-kubectl apply -f aggregator.yaml
\ No newline at end of file
+    echo -e "\nTo retry manually, run:"
+    for yaml_file in "${FAILED_JOBS[@]}"; do
+        job_name=$(basename "${yaml_file}" .yaml | tr '[:upper:]' '[:lower:]' | tr '_' '-')
+        echo "JOB_NAME=\"${job_name}\" GCS_PATH=\"${GCS_PATH}\" envsubst '\${JOB_NAME} \${GCS_PATH}' < \"${SCRIPT_DIR}/${yaml_file}\" | kubectl apply -f -"
+    done
+else
+    echo "Success! All jobs finished."
+fi
\ No newline at end of file

From 62a04612eb2a20c4047820ce5e3989bd453370a4 Mon Sep 17 00:00:00 2001
From: Hong-Yi Lin <horacehylin@google.com>
Date: Wed, 28 Jan 2026 03:47:34 +0000
Subject: [PATCH 08/88] [Automation] Track and report missing topologies in
 check_node_pool_setup.sh

---
 .../automation/check_node_pool_setup.sh       |  6 +-
 Ironwood/guides/automation/tpu7x-16-hbm.yaml  | 57 -------------------
 2 files changed, 5 insertions(+), 58 deletions(-)
 delete mode 100644 Ironwood/guides/automation/tpu7x-16-hbm.yaml

diff --git a/Ironwood/guides/automation/check_node_pool_setup.sh b/Ironwood/guides/automation/check_node_pool_setup.sh
index 5dd8737a..b318823e 100644
--- a/Ironwood/guides/automation/check_node_pool_setup.sh
+++ b/Ironwood/guides/automation/check_node_pool_setup.sh
@@ -1,13 +1,15 @@
 #!/bin/bash
 
 required_chip="tpu7x"
-required_topologies=("2x2x1")
+required_topologies=("2x2x1" "2x2x2" "2x2x4" "2x4x4" "4x4x4")
 
 echo "Checking for required GKE TPU configurations..."
 echo "Required TPU Type: ${required_chip}"
 echo "-----------------------------------------------------------------"
 
 all_found=true
+missing_topologies=()
+
 
 for topology in "${required_topologies[@]}"; do
   echo -n "Checking for TPU topology '${topology}' with type '${required_chip}': "
@@ -18,6 +20,7 @@ for topology in "${required_topologies[@]}"; do
     echo "FOUND"
   else
     echo "MISSING"
+    missing_topologies+=("${topology}")
     all_found=false
   fi
 done
@@ -29,5 +32,6 @@ if [[ "${all_found}" = true ]]; then
   exit 0
 else
   echo "FAILURE: One or more required TPU configurations are missing."
+  echo "Missing topologies: ${missing_topologies[@]}"
   exit 1
 fi
diff --git a/Ironwood/guides/automation/tpu7x-16-hbm.yaml b/Ironwood/guides/automation/tpu7x-16-hbm.yaml
deleted file mode 100644
index 206d4b79..00000000
--- a/Ironwood/guides/automation/tpu7x-16-hbm.yaml
+++ /dev/null
@@ -1,57 +0,0 @@
-apiVersion: v1
-kind: Service
-metadata:
-  name: headless-svc
-spec:
-  clusterIP: None
-  selector:
-    job-name: tpu7x-16-hbm
----
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: tpu7x-16-hbm
-spec:
-  completionMode: Indexed
-  parallelism: 2
-  completions: 2
-  backoffLimit: 0
-  template:
-    spec:
-      subdomain: headless-svc
-      restartPolicy: Never
-      nodeSelector:
-        cloud.google.com/gke-tpu-accelerator: tpu7x
-        cloud.google.com/gke-tpu-topology: 2x2x2
-      containers:
-      - name: jax-tpu
-        image: python:3.12
-        securityContext:
-          privileged: false
-        env:
-        - name: JAX_PLATFORMS
-          value: "tpu,cpu"
-        - name: TPU_VMODULE
-          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
-        - name: XLA_IR_DEBUG
-          value: "1"
-        - name: XLA_HLO_DEBUG
-          value: "1"
-        command:
-        - bash
-        - -c
-        - |
-          set -ex
-
-          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
-          cd accelerator-microbenchmarks
-          pip install -r requirements.txt
-
-          GCS_BUCKET_DIR=${GCS_PATH}
-          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/hbm/hbm.yaml"
-          echo "Result will be written to ${GCS_BUCKET_DIR}"
-        resources:
-          requests:
-            google.com/tpu: 4
-          limits:
-            google.com/tpu: 4

From 991cb2f9c49ed3504a8113f36d2f4bbee5543503 Mon Sep 17 00:00:00 2001
From: Hong-Yi Lin <horacehylin@google.com>
Date: Wed, 28 Jan 2026 04:08:10 +0000
Subject: [PATCH 09/88] [Automation] Add topology-aware node pool validation.

---
 Ironwood/guides/automation/automation_launch.sh     | 4 +++-
 Ironwood/guides/automation/check_node_pool_setup.sh | 6 +++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/Ironwood/guides/automation/automation_launch.sh b/Ironwood/guides/automation/automation_launch.sh
index 9815780f..7ff69a19 100644
--- a/Ironwood/guides/automation/automation_launch.sh
+++ b/Ironwood/guides/automation/automation_launch.sh
@@ -18,8 +18,10 @@ fi
 
 echo "The intermediate result will be written to ${GCS_PATH}"
 
+required_topologies=($(printf "%s\n" "${yaml_names[@]}" | grep -oE '[0-9]+x[0-9]+x[0-9]+' | sort -u))
+
 SCRIPT_DIR="$(dirname "$(realpath "$0")")"
-if ! bash "${SCRIPT_DIR}/check_node_pool_setup.sh"; then
+if ! bash "${SCRIPT_DIR}/check_node_pool_setup.sh" "${required_topologies[@]}"; then
   exit 1
 fi
 
diff --git a/Ironwood/guides/automation/check_node_pool_setup.sh b/Ironwood/guides/automation/check_node_pool_setup.sh
index b318823e..a77138b4 100644
--- a/Ironwood/guides/automation/check_node_pool_setup.sh
+++ b/Ironwood/guides/automation/check_node_pool_setup.sh
@@ -1,7 +1,11 @@
 #!/bin/bash
 
 required_chip="tpu7x"
-required_topologies=("2x2x1" "2x2x2" "2x2x4" "2x4x4" "4x4x4")
+if [[ $# -gt 0 ]]; then
+  required_topologies=("$@")
+else
+  required_topologies=("2x2x1" "2x2x2" "2x2x4" "2x4x4" "4x4x4")
+fi
 
 echo "Checking for required GKE TPU configurations..."
 echo "Required TPU Type: ${required_chip}"

From 6b5c66bb96cde8749a97b8bdd56e5e40a7db4920 Mon Sep 17 00:00:00 2001
From: Hong-Yi Lin <horacehylin@google.com>
Date: Thu, 29 Jan 2026 05:52:16 +0000
Subject: [PATCH 10/88] [Automation] Update configurations for GEMM, H2D and
 Collectives

---
 .../automation/tpu7x-2x2x1-collectives.yaml   | 59 +++++++++++++++++++
 .../guides/automation/tpu7x-2x2x1-gemm.yaml   | 57 ++++++++++++++++++
 .../guides/automation/tpu7x-2x2x1-h2d.yaml    | 57 ++++++++++++++++++
 .../automation/tpu7x-2x2x2-collectives.yaml   | 59 +++++++++++++++++++
 .../guides/automation/tpu7x-2x2x2-gemm.yaml   | 57 ++++++++++++++++++
 .../guides/automation/tpu7x-2x2x2-h2d.yaml    | 57 ++++++++++++++++++
 .../automation/tpu7x-2x2x4-collectives.yaml   | 59 +++++++++++++++++++
 .../guides/automation/tpu7x-2x2x4-gemm.yaml   | 57 ++++++++++++++++++
 .../guides/automation/tpu7x-2x2x4-h2d.yaml    | 57 ++++++++++++++++++
 .../automation/tpu7x-2x4x4-collectives.yaml   | 59 +++++++++++++++++++
 .../guides/automation/tpu7x-2x4x4-gemm.yaml   | 57 ++++++++++++++++++
 .../guides/automation/tpu7x-2x4x4-h2d.yaml    | 57 ++++++++++++++++++
 .../automation/tpu7x-4x4x4-collectives.yaml   | 59 +++++++++++++++++++
 .../guides/automation/tpu7x-4x4x4-gemm.yaml   | 57 ++++++++++++++++++
 .../guides/automation/tpu7x-4x4x4-h2d.yaml    | 57 ++++++++++++++++++
 .../automation/tpu7x-4x4x8-collectives.yaml   | 59 +++++++++++++++++++
 .../guides/automation/tpu7x-4x4x8-gemm.yaml   | 57 ++++++++++++++++++
 .../guides/automation/tpu7x-4x4x8-h2d.yaml    | 57 ++++++++++++++++++
 18 files changed, 1038 insertions(+)
 create mode 100644 Ironwood/guides/automation/tpu7x-2x2x1-collectives.yaml
 create mode 100644 Ironwood/guides/automation/tpu7x-2x2x1-gemm.yaml
 create mode 100644 Ironwood/guides/automation/tpu7x-2x2x1-h2d.yaml
 create mode 100644 Ironwood/guides/automation/tpu7x-2x2x2-collectives.yaml
 create mode 100644 Ironwood/guides/automation/tpu7x-2x2x2-gemm.yaml
 create mode 100644 Ironwood/guides/automation/tpu7x-2x2x2-h2d.yaml
 create mode 100644 Ironwood/guides/automation/tpu7x-2x2x4-collectives.yaml
 create mode 100644 Ironwood/guides/automation/tpu7x-2x2x4-gemm.yaml
 create mode 100644 Ironwood/guides/automation/tpu7x-2x2x4-h2d.yaml
 create mode 100644 Ironwood/guides/automation/tpu7x-2x4x4-collectives.yaml
 create mode 100644 Ironwood/guides/automation/tpu7x-2x4x4-gemm.yaml
 create mode 100644 Ironwood/guides/automation/tpu7x-2x4x4-h2d.yaml
 create mode 100644 Ironwood/guides/automation/tpu7x-4x4x4-collectives.yaml
 create mode 100644 Ironwood/guides/automation/tpu7x-4x4x4-gemm.yaml
 create mode 100644 Ironwood/guides/automation/tpu7x-4x4x4-h2d.yaml
 create mode 100644 Ironwood/guides/automation/tpu7x-4x4x8-collectives.yaml
 create mode 100644 Ironwood/guides/automation/tpu7x-4x4x8-gemm.yaml
 create mode 100644 Ironwood/guides/automation/tpu7x-4x4x8-h2d.yaml

diff --git a/Ironwood/guides/automation/tpu7x-2x2x1-collectives.yaml b/Ironwood/guides/automation/tpu7x-2x2x1-collectives.yaml
new file mode 100644
index 00000000..1671f2ec
--- /dev/null
+++ b/Ironwood/guides/automation/tpu7x-2x2x1-collectives.yaml
@@ -0,0 +1,59 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: headless-svc-${JOB_NAME}
+spec:
+  clusterIP: None
+  selector:
+    job-name: ${JOB_NAME}
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: ${JOB_NAME}
+spec:
+  completionMode: Indexed
+  parallelism: 1
+  completions: 1
+  backoffLimit: 0
+  template:
+    spec:
+      subdomain: headless-svc-${JOB_NAME}
+      restartPolicy: Never
+      nodeSelector:
+        cloud.google.com/gke-tpu-accelerator: tpu7x
+        cloud.google.com/gke-tpu-topology: 2x2x1
+      containers:
+      - name: jax-tpu
+        image: python:3.12
+        securityContext:
+          privileged: false
+        env:
+        - name: JAX_PLATFORMS
+          value: "tpu,cpu"
+        - name: TPU_VMODULE
+          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
+        - name: XLA_IR_DEBUG
+          value: "1"
+        - name: XLA_HLO_DEBUG
+          value: "1"
+        command:
+        - bash
+        - -c
+        - |
+          set -ex
+
+          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
+          cd accelerator-microbenchmarks
+          git checkout tpu7x-auto
+          pip install -r requirements.txt
+
+          GCS_BUCKET_DIR=${GCS_PATH}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+        resources:
+          requests:
+            google.com/tpu: 4
+          limits:
+            google.com/tpu: 4
\ No newline at end of file
diff --git a/Ironwood/guides/automation/tpu7x-2x2x1-gemm.yaml b/Ironwood/guides/automation/tpu7x-2x2x1-gemm.yaml
new file mode 100644
index 00000000..1929d886
--- /dev/null
+++ b/Ironwood/guides/automation/tpu7x-2x2x1-gemm.yaml
@@ -0,0 +1,57 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: headless-svc-${JOB_NAME}
+spec:
+  clusterIP: None
+  selector:
+    job-name: ${JOB_NAME}
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: ${JOB_NAME}
+spec:
+  completionMode: Indexed
+  parallelism: 1
+  completions: 1
+  backoffLimit: 0
+  template:
+    spec:
+      subdomain: headless-svc-${JOB_NAME}
+      restartPolicy: Never
+      nodeSelector:
+        cloud.google.com/gke-tpu-accelerator: tpu7x
+        cloud.google.com/gke-tpu-topology: 2x2x1
+      containers:
+      - name: jax-tpu
+        image: python:3.12
+        securityContext:
+          privileged: false
+        env:
+        - name: JAX_PLATFORMS
+          value: "tpu,cpu"
+        - name: TPU_VMODULE
+          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
+        - name: XLA_IR_DEBUG
+          value: "1"
+        - name: XLA_HLO_DEBUG
+          value: "1"
+        command:
+        - bash
+        - -c
+        - |
+          set -ex
+
+          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
+          cd accelerator-microbenchmarks
+          git checkout tpu7x-auto
+          pip install -r requirements.txt
+
+          GCS_BUCKET_DIR=${GCS_PATH}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/training/gemm_multiple_run_more.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+        resources:
+          requests:
+            google.com/tpu: 4
+          limits:
+            google.com/tpu: 4
\ No newline at end of file
diff --git a/Ironwood/guides/automation/tpu7x-2x2x1-h2d.yaml b/Ironwood/guides/automation/tpu7x-2x2x1-h2d.yaml
new file mode 100644
index 00000000..3f36f54e
--- /dev/null
+++ b/Ironwood/guides/automation/tpu7x-2x2x1-h2d.yaml
@@ -0,0 +1,57 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: headless-svc-${JOB_NAME}
+spec:
+  clusterIP: None
+  selector:
+    job-name: ${JOB_NAME}
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: ${JOB_NAME}
+spec:
+  completionMode: Indexed
+  parallelism: 1
+  completions: 1
+  backoffLimit: 0
+  template:
+    spec:
+      subdomain: headless-svc-${JOB_NAME}
+      restartPolicy: Never
+      nodeSelector:
+        cloud.google.com/gke-tpu-accelerator: tpu7x
+        cloud.google.com/gke-tpu-topology: 2x2x1
+      containers:
+      - name: jax-tpu
+        image: python:3.12
+        securityContext:
+          privileged: false
+        env:
+        - name: JAX_PLATFORMS
+          value: "tpu,cpu"
+        - name: TPU_VMODULE
+          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
+        - name: XLA_IR_DEBUG
+          value: "1"
+        - name: XLA_HLO_DEBUG
+          value: "1"
+        command:
+        - bash
+        - -c
+        - |
+          set -ex
+
+          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
+          cd accelerator-microbenchmarks
+          git checkout tpu7x-auto
+          pip install -r requirements.txt
+
+          GCS_BUCKET_DIR=${GCS_PATH}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/host_device/host_device.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+        resources:
+          requests:
+            google.com/tpu: 4
+          limits:
+            google.com/tpu: 4
\ No newline at end of file
diff --git a/Ironwood/guides/automation/tpu7x-2x2x2-collectives.yaml b/Ironwood/guides/automation/tpu7x-2x2x2-collectives.yaml
new file mode 100644
index 00000000..7758e449
--- /dev/null
+++ b/Ironwood/guides/automation/tpu7x-2x2x2-collectives.yaml
@@ -0,0 +1,59 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: headless-svc-${JOB_NAME}
+spec:
+  clusterIP: None
+  selector:
+    job-name: ${JOB_NAME}
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: ${JOB_NAME}
+spec:
+  completionMode: Indexed
+  parallelism: 2
+  completions: 2
+  backoffLimit: 0
+  template:
+    spec:
+      subdomain: headless-svc-${JOB_NAME}
+      restartPolicy: Never
+      nodeSelector:
+        cloud.google.com/gke-tpu-accelerator: tpu7x
+        cloud.google.com/gke-tpu-topology: 2x2x2
+      containers:
+      - name: jax-tpu
+        image: python:3.12
+        securityContext:
+          privileged: false
+        env:
+        - name: JAX_PLATFORMS
+          value: "tpu,cpu"
+        - name: TPU_VMODULE
+          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
+        - name: XLA_IR_DEBUG
+          value: "1"
+        - name: XLA_HLO_DEBUG
+          value: "1"
+        command:
+        - bash
+        - -c
+        - |
+          set -ex
+
+          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
+          cd accelerator-microbenchmarks
+          git checkout tpu7x-auto
+          pip install -r requirements.txt
+
+          GCS_BUCKET_DIR=${GCS_PATH}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+        resources:
+          requests:
+            google.com/tpu: 4
+          limits:
+            google.com/tpu: 4
\ No newline at end of file
diff --git a/Ironwood/guides/automation/tpu7x-2x2x2-gemm.yaml b/Ironwood/guides/automation/tpu7x-2x2x2-gemm.yaml
new file mode 100644
index 00000000..c869fbdb
--- /dev/null
+++ b/Ironwood/guides/automation/tpu7x-2x2x2-gemm.yaml
@@ -0,0 +1,57 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: headless-svc-${JOB_NAME}
+spec:
+  clusterIP: None
+  selector:
+    job-name: ${JOB_NAME}
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: ${JOB_NAME}
+spec:
+  completionMode: Indexed
+  parallelism: 2
+  completions: 2
+  backoffLimit: 0
+  template:
+    spec:
+      subdomain: headless-svc-${JOB_NAME}
+      restartPolicy: Never
+      nodeSelector:
+        cloud.google.com/gke-tpu-accelerator: tpu7x
+        cloud.google.com/gke-tpu-topology: 2x2x2
+      containers:
+      - name: jax-tpu
+        image: python:3.12
+        securityContext:
+          privileged: false
+        env:
+        - name: JAX_PLATFORMS
+          value: "tpu,cpu"
+        - name: TPU_VMODULE
+          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
+        - name: XLA_IR_DEBUG
+          value: "1"
+        - name: XLA_HLO_DEBUG
+          value: "1"
+        command:
+        - bash
+        - -c
+        - |
+          set -ex
+
+          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
+          cd accelerator-microbenchmarks
+          git checkout tpu7x-auto
+          pip install -r requirements.txt
+
+          GCS_BUCKET_DIR=${GCS_PATH}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/training/gemm_multiple_run_more.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+        resources:
+          requests:
+            google.com/tpu: 4
+          limits:
+            google.com/tpu: 4
\ No newline at end of file
diff --git a/Ironwood/guides/automation/tpu7x-2x2x2-h2d.yaml b/Ironwood/guides/automation/tpu7x-2x2x2-h2d.yaml
new file mode 100644
index 00000000..34d19110
--- /dev/null
+++ b/Ironwood/guides/automation/tpu7x-2x2x2-h2d.yaml
@@ -0,0 +1,57 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: headless-svc-${JOB_NAME}
+spec:
+  clusterIP: None
+  selector:
+    job-name: ${JOB_NAME}
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: ${JOB_NAME}
+spec:
+  completionMode: Indexed
+  parallelism: 2
+  completions: 2
+  backoffLimit: 0
+  template:
+    spec:
+      subdomain: headless-svc-${JOB_NAME}
+      restartPolicy: Never
+      nodeSelector:
+        cloud.google.com/gke-tpu-accelerator: tpu7x
+        cloud.google.com/gke-tpu-topology: 2x2x2
+      containers:
+      - name: jax-tpu
+        image: python:3.12
+        securityContext:
+          privileged: false
+        env:
+        - name: JAX_PLATFORMS
+          value: "tpu,cpu"
+        - name: TPU_VMODULE
+          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
+        - name: XLA_IR_DEBUG
+          value: "1"
+        - name: XLA_HLO_DEBUG
+          value: "1"
+        command:
+        - bash
+        - -c
+        - |
+          set -ex
+
+          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
+          cd accelerator-microbenchmarks
+          git checkout tpu7x-auto
+          pip install -r requirements.txt
+
+          GCS_BUCKET_DIR=${GCS_PATH}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/host_device/host_device.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+        resources:
+          requests:
+            google.com/tpu: 4
+          limits:
+            google.com/tpu: 4
\ No newline at end of file
diff --git a/Ironwood/guides/automation/tpu7x-2x2x4-collectives.yaml b/Ironwood/guides/automation/tpu7x-2x2x4-collectives.yaml
new file mode 100644
index 00000000..dce47004
--- /dev/null
+++ b/Ironwood/guides/automation/tpu7x-2x2x4-collectives.yaml
@@ -0,0 +1,59 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: headless-svc-${JOB_NAME}
+spec:
+  clusterIP: None
+  selector:
+    job-name: ${JOB_NAME}
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: ${JOB_NAME}
+spec:
+  completionMode: Indexed
+  parallelism: 4
+  completions: 4
+  backoffLimit: 0
+  template:
+    spec:
+      subdomain: headless-svc-${JOB_NAME}
+      restartPolicy: Never
+      nodeSelector:
+        cloud.google.com/gke-tpu-accelerator: tpu7x
+        cloud.google.com/gke-tpu-topology: 2x2x4
+      containers:
+      - name: jax-tpu
+        image: python:3.12
+        securityContext:
+          privileged: false
+        env:
+        - name: JAX_PLATFORMS
+          value: "tpu,cpu"
+        - name: TPU_VMODULE
+          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
+        - name: XLA_IR_DEBUG
+          value: "1"
+        - name: XLA_HLO_DEBUG
+          value: "1"
+        command:
+        - bash
+        - -c
+        - |
+          set -ex
+
+          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
+          cd accelerator-microbenchmarks
+          git checkout tpu7x-auto
+          pip install -r requirements.txt
+
+          GCS_BUCKET_DIR=${GCS_PATH}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+        resources:
+          requests:
+            google.com/tpu: 4
+          limits:
+            google.com/tpu: 4
\ No newline at end of file
diff --git a/Ironwood/guides/automation/tpu7x-2x2x4-gemm.yaml b/Ironwood/guides/automation/tpu7x-2x2x4-gemm.yaml
new file mode 100644
index 00000000..e514f1f6
--- /dev/null
+++ b/Ironwood/guides/automation/tpu7x-2x2x4-gemm.yaml
@@ -0,0 +1,57 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: headless-svc-${JOB_NAME}
+spec:
+  clusterIP: None
+  selector:
+    job-name: ${JOB_NAME}
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: ${JOB_NAME}
+spec:
+  completionMode: Indexed
+  parallelism: 4
+  completions: 4
+  backoffLimit: 0
+  template:
+    spec:
+      subdomain: headless-svc-${JOB_NAME}
+      restartPolicy: Never
+      nodeSelector:
+        cloud.google.com/gke-tpu-accelerator: tpu7x
+        cloud.google.com/gke-tpu-topology: 2x2x4
+      containers:
+      - name: jax-tpu
+        image: python:3.12
+        securityContext:
+          privileged: false
+        env:
+        - name: JAX_PLATFORMS
+          value: "tpu,cpu"
+        - name: TPU_VMODULE
+          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
+        - name: XLA_IR_DEBUG
+          value: "1"
+        - name: XLA_HLO_DEBUG
+          value: "1"
+        command:
+        - bash
+        - -c
+        - |
+          set -ex
+
+          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
+          cd accelerator-microbenchmarks
+          git checkout tpu7x-auto
+          pip install -r requirements.txt
+
+          GCS_BUCKET_DIR=${GCS_PATH}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/training/gemm_multiple_run_more.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+        resources:
+          requests:
+            google.com/tpu: 4
+          limits:
+            google.com/tpu: 4
\ No newline at end of file
diff --git a/Ironwood/guides/automation/tpu7x-2x2x4-h2d.yaml b/Ironwood/guides/automation/tpu7x-2x2x4-h2d.yaml
new file mode 100644
index 00000000..f48d047f
--- /dev/null
+++ b/Ironwood/guides/automation/tpu7x-2x2x4-h2d.yaml
@@ -0,0 +1,57 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: headless-svc-${JOB_NAME}
+spec:
+  clusterIP: None
+  selector:
+    job-name: ${JOB_NAME}
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: ${JOB_NAME}
+spec:
+  completionMode: Indexed
+  parallelism: 4
+  completions: 4
+  backoffLimit: 0
+  template:
+    spec:
+      subdomain: headless-svc-${JOB_NAME}
+      restartPolicy: Never
+      nodeSelector:
+        cloud.google.com/gke-tpu-accelerator: tpu7x
+        cloud.google.com/gke-tpu-topology: 2x2x4
+      containers:
+      - name: jax-tpu
+        image: python:3.12
+        securityContext:
+          privileged: false
+        env:
+        - name: JAX_PLATFORMS
+          value: "tpu,cpu"
+        - name: TPU_VMODULE
+          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
+        - name: XLA_IR_DEBUG
+          value: "1"
+        - name: XLA_HLO_DEBUG
+          value: "1"
+        command:
+        - bash
+        - -c
+        - |
+          set -ex
+
+          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
+          cd accelerator-microbenchmarks
+          git checkout tpu7x-auto
+          pip install -r requirements.txt
+
+          GCS_BUCKET_DIR=${GCS_PATH}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/host_device/host_device.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+        resources:
+          requests:
+            google.com/tpu: 4
+          limits:
+            google.com/tpu: 4
\ No newline at end of file
diff --git a/Ironwood/guides/automation/tpu7x-2x4x4-collectives.yaml b/Ironwood/guides/automation/tpu7x-2x4x4-collectives.yaml
new file mode 100644
index 00000000..4b0617c5
--- /dev/null
+++ b/Ironwood/guides/automation/tpu7x-2x4x4-collectives.yaml
@@ -0,0 +1,59 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: headless-svc-${JOB_NAME}
+spec:
+  clusterIP: None
+  selector:
+    job-name: ${JOB_NAME}
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: ${JOB_NAME}
+spec:
+  completionMode: Indexed
+  parallelism: 8
+  completions: 8
+  backoffLimit: 0
+  template:
+    spec:
+      subdomain: headless-svc-${JOB_NAME}
+      restartPolicy: Never
+      nodeSelector:
+        cloud.google.com/gke-tpu-accelerator: tpu7x
+        cloud.google.com/gke-tpu-topology: 2x4x4
+      containers:
+      - name: jax-tpu
+        image: python:3.12
+        securityContext:
+          privileged: false
+        env:
+        - name: JAX_PLATFORMS
+          value: "tpu,cpu"
+        - name: TPU_VMODULE
+          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
+        - name: XLA_IR_DEBUG
+          value: "1"
+        - name: XLA_HLO_DEBUG
+          value: "1"
+        command:
+        - bash
+        - -c
+        - |
+          set -ex
+
+          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
+          cd accelerator-microbenchmarks
+          git checkout tpu7x-auto
+          pip install -r requirements.txt
+
+          GCS_BUCKET_DIR=${GCS_PATH}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+        resources:
+          requests:
+            google.com/tpu: 4
+          limits:
+            google.com/tpu: 4
\ No newline at end of file
diff --git a/Ironwood/guides/automation/tpu7x-2x4x4-gemm.yaml b/Ironwood/guides/automation/tpu7x-2x4x4-gemm.yaml
new file mode 100644
index 00000000..e07a1072
--- /dev/null
+++ b/Ironwood/guides/automation/tpu7x-2x4x4-gemm.yaml
@@ -0,0 +1,57 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: headless-svc-${JOB_NAME}
+spec:
+  clusterIP: None
+  selector:
+    job-name: ${JOB_NAME}
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: ${JOB_NAME}
+spec:
+  completionMode: Indexed
+  parallelism: 8
+  completions: 8
+  backoffLimit: 0
+  template:
+    spec:
+      subdomain: headless-svc-${JOB_NAME}
+      restartPolicy: Never
+      nodeSelector:
+        cloud.google.com/gke-tpu-accelerator: tpu7x
+        cloud.google.com/gke-tpu-topology: 2x4x4
+      containers:
+      - name: jax-tpu
+        image: python:3.12
+        securityContext:
+          privileged: false
+        env:
+        - name: JAX_PLATFORMS
+          value: "tpu,cpu"
+        - name: TPU_VMODULE
+          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
+        - name: XLA_IR_DEBUG
+          value: "1"
+        - name: XLA_HLO_DEBUG
+          value: "1"
+        command:
+        - bash
+        - -c
+        - |
+          set -ex
+
+          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
+          cd accelerator-microbenchmarks
+          git checkout tpu7x-auto
+          pip install -r requirements.txt
+
+          GCS_BUCKET_DIR=${GCS_PATH}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/training/gemm_multiple_run_more.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+        resources:
+          requests:
+            google.com/tpu: 4
+          limits:
+            google.com/tpu: 4
\ No newline at end of file
diff --git a/Ironwood/guides/automation/tpu7x-2x4x4-h2d.yaml b/Ironwood/guides/automation/tpu7x-2x4x4-h2d.yaml
new file mode 100644
index 00000000..258aa8ee
--- /dev/null
+++ b/Ironwood/guides/automation/tpu7x-2x4x4-h2d.yaml
@@ -0,0 +1,57 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: headless-svc-${JOB_NAME}
+spec:
+  clusterIP: None
+  selector:
+    job-name: ${JOB_NAME}
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: ${JOB_NAME}
+spec:
+  completionMode: Indexed
+  parallelism: 8
+  completions: 8
+  backoffLimit: 0
+  template:
+    spec:
+      subdomain: headless-svc-${JOB_NAME}
+      restartPolicy: Never
+      nodeSelector:
+        cloud.google.com/gke-tpu-accelerator: tpu7x
+        cloud.google.com/gke-tpu-topology: 2x4x4
+      containers:
+      - name: jax-tpu
+        image: python:3.12
+        securityContext:
+          privileged: false
+        env:
+        - name: JAX_PLATFORMS
+          value: "tpu,cpu"
+        - name: TPU_VMODULE
+          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
+        - name: XLA_IR_DEBUG
+          value: "1"
+        - name: XLA_HLO_DEBUG
+          value: "1"
+        command:
+        - bash
+        - -c
+        - |
+          set -ex
+
+          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
+          cd accelerator-microbenchmarks
+          git checkout tpu7x-auto
+          pip install -r requirements.txt
+
+          GCS_BUCKET_DIR=${GCS_PATH}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/host_device/host_device.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+        resources:
+          requests:
+            google.com/tpu: 4
+          limits:
+            google.com/tpu: 4
\ No newline at end of file
diff --git a/Ironwood/guides/automation/tpu7x-4x4x4-collectives.yaml b/Ironwood/guides/automation/tpu7x-4x4x4-collectives.yaml
new file mode 100644
index 00000000..036bbfeb
--- /dev/null
+++ b/Ironwood/guides/automation/tpu7x-4x4x4-collectives.yaml
@@ -0,0 +1,59 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: headless-svc-${JOB_NAME}
+spec:
+  clusterIP: None
+  selector:
+    job-name: ${JOB_NAME}
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: ${JOB_NAME}
+spec:
+  completionMode: Indexed
+  parallelism: 16
+  completions: 16
+  backoffLimit: 0
+  template:
+    spec:
+      subdomain: headless-svc-${JOB_NAME}
+      restartPolicy: Never
+      nodeSelector:
+        cloud.google.com/gke-tpu-accelerator: tpu7x
+        cloud.google.com/gke-tpu-topology: 4x4x4
+      containers:
+      - name: jax-tpu
+        image: python:3.12
+        securityContext:
+          privileged: false
+        env:
+        - name: JAX_PLATFORMS
+          value: "tpu,cpu"
+        - name: TPU_VMODULE
+          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
+        - name: XLA_IR_DEBUG
+          value: "1"
+        - name: XLA_HLO_DEBUG
+          value: "1"
+        command:
+        - bash
+        - -c
+        - |
+          set -ex
+
+          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
+          cd accelerator-microbenchmarks
+          git checkout tpu7x-auto
+          pip install -r requirements.txt
+
+          GCS_BUCKET_DIR=${GCS_PATH}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+        resources:
+          requests:
+            google.com/tpu: 4
+          limits:
+            google.com/tpu: 4
\ No newline at end of file
diff --git a/Ironwood/guides/automation/tpu7x-4x4x4-gemm.yaml b/Ironwood/guides/automation/tpu7x-4x4x4-gemm.yaml
new file mode 100644
index 00000000..00f09803
--- /dev/null
+++ b/Ironwood/guides/automation/tpu7x-4x4x4-gemm.yaml
@@ -0,0 +1,57 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: headless-svc-${JOB_NAME}
+spec:
+  clusterIP: None
+  selector:
+    job-name: ${JOB_NAME}
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: ${JOB_NAME}
+spec:
+  completionMode: Indexed
+  parallelism: 16
+  completions: 16
+  backoffLimit: 0
+  template:
+    spec:
+      subdomain: headless-svc-${JOB_NAME}
+      restartPolicy: Never
+      nodeSelector:
+        cloud.google.com/gke-tpu-accelerator: tpu7x
+        cloud.google.com/gke-tpu-topology: 4x4x4
+      containers:
+      - name: jax-tpu
+        image: python:3.12
+        securityContext:
+          privileged: false
+        env:
+        - name: JAX_PLATFORMS
+          value: "tpu,cpu"
+        - name: TPU_VMODULE
+          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
+        - name: XLA_IR_DEBUG
+          value: "1"
+        - name: XLA_HLO_DEBUG
+          value: "1"
+        command:
+        - bash
+        - -c
+        - |
+          set -ex
+
+          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
+          cd accelerator-microbenchmarks
+          git checkout tpu7x-auto
+          pip install -r requirements.txt
+
+          GCS_BUCKET_DIR=${GCS_PATH}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/training/gemm_multiple_run_more.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+        resources:
+          requests:
+            google.com/tpu: 4
+          limits:
+            google.com/tpu: 4
\ No newline at end of file
diff --git a/Ironwood/guides/automation/tpu7x-4x4x4-h2d.yaml b/Ironwood/guides/automation/tpu7x-4x4x4-h2d.yaml
new file mode 100644
index 00000000..21ddef16
--- /dev/null
+++ b/Ironwood/guides/automation/tpu7x-4x4x4-h2d.yaml
@@ -0,0 +1,57 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: headless-svc-${JOB_NAME}
+spec:
+  clusterIP: None
+  selector:
+    job-name: ${JOB_NAME}
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: ${JOB_NAME}
+spec:
+  completionMode: Indexed
+  parallelism: 16
+  completions: 16
+  backoffLimit: 0
+  template:
+    spec:
+      subdomain: headless-svc-${JOB_NAME}
+      restartPolicy: Never
+      nodeSelector:
+        cloud.google.com/gke-tpu-accelerator: tpu7x
+        cloud.google.com/gke-tpu-topology: 4x4x4
+      containers:
+      - name: jax-tpu
+        image: python:3.12
+        securityContext:
+          privileged: false
+        env:
+        - name: JAX_PLATFORMS
+          value: "tpu,cpu"
+        - name: TPU_VMODULE
+          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
+        - name: XLA_IR_DEBUG
+          value: "1"
+        - name: XLA_HLO_DEBUG
+          value: "1"
+        command:
+        - bash
+        - -c
+        - |
+          set -ex
+
+          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
+          cd accelerator-microbenchmarks
+          git checkout tpu7x-auto
+          pip install -r requirements.txt
+
+          GCS_BUCKET_DIR=${GCS_PATH}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/host_device/host_device.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+        resources:
+          requests:
+            google.com/tpu: 4
+          limits:
+            google.com/tpu: 4
\ No newline at end of file
diff --git a/Ironwood/guides/automation/tpu7x-4x4x8-collectives.yaml b/Ironwood/guides/automation/tpu7x-4x4x8-collectives.yaml
new file mode 100644
index 00000000..97adb283
--- /dev/null
+++ b/Ironwood/guides/automation/tpu7x-4x4x8-collectives.yaml
@@ -0,0 +1,59 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: headless-svc-${JOB_NAME}
+spec:
+  clusterIP: None
+  selector:
+    job-name: ${JOB_NAME}
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: ${JOB_NAME}
+spec:
+  completionMode: Indexed
+  parallelism: 32
+  completions: 32
+  backoffLimit: 0
+  template:
+    spec:
+      subdomain: headless-svc-${JOB_NAME}
+      restartPolicy: Never
+      nodeSelector:
+        cloud.google.com/gke-tpu-accelerator: tpu7x
+        cloud.google.com/gke-tpu-topology: 4x4x8
+      containers:
+      - name: jax-tpu
+        image: python:3.12
+        securityContext:
+          privileged: false
+        env:
+        - name: JAX_PLATFORMS
+          value: "tpu,cpu"
+        - name: TPU_VMODULE
+          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
+        - name: XLA_IR_DEBUG
+          value: "1"
+        - name: XLA_HLO_DEBUG
+          value: "1"
+        command:
+        - bash
+        - -c
+        - |
+          set -ex
+
+          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
+          cd accelerator-microbenchmarks
+          git checkout tpu7x-auto
+          pip install -r requirements.txt
+
+          GCS_BUCKET_DIR=${GCS_PATH}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+        resources:
+          requests:
+            google.com/tpu: 4
+          limits:
+            google.com/tpu: 4
\ No newline at end of file
diff --git a/Ironwood/guides/automation/tpu7x-4x4x8-gemm.yaml b/Ironwood/guides/automation/tpu7x-4x4x8-gemm.yaml
new file mode 100644
index 00000000..e186c51c
--- /dev/null
+++ b/Ironwood/guides/automation/tpu7x-4x4x8-gemm.yaml
@@ -0,0 +1,57 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: headless-svc-${JOB_NAME}
+spec:
+  clusterIP: None
+  selector:
+    job-name: ${JOB_NAME}
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: ${JOB_NAME}
+spec:
+  completionMode: Indexed
+  parallelism: 32
+  completions: 32
+  backoffLimit: 0
+  template:
+    spec:
+      subdomain: headless-svc-${JOB_NAME}
+      restartPolicy: Never
+      nodeSelector:
+        cloud.google.com/gke-tpu-accelerator: tpu7x
+        cloud.google.com/gke-tpu-topology: 4x4x8
+      containers:
+      - name: jax-tpu
+        image: python:3.12
+        securityContext:
+          privileged: false
+        env:
+        - name: JAX_PLATFORMS
+          value: "tpu,cpu"
+        - name: TPU_VMODULE
+          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
+        - name: XLA_IR_DEBUG
+          value: "1"
+        - name: XLA_HLO_DEBUG
+          value: "1"
+        command:
+        - bash
+        - -c
+        - |
+          set -ex
+
+          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
+          cd accelerator-microbenchmarks
+          git checkout tpu7x-auto
+          pip install -r requirements.txt
+
+          GCS_BUCKET_DIR=${GCS_PATH}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/training/gemm_multiple_run_more.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+        resources:
+          requests:
+            google.com/tpu: 4
+          limits:
+            google.com/tpu: 4
\ No newline at end of file
diff --git a/Ironwood/guides/automation/tpu7x-4x4x8-h2d.yaml b/Ironwood/guides/automation/tpu7x-4x4x8-h2d.yaml
new file mode 100644
index 00000000..712d94b7
--- /dev/null
+++ b/Ironwood/guides/automation/tpu7x-4x4x8-h2d.yaml
@@ -0,0 +1,57 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: headless-svc-${JOB_NAME}
+spec:
+  clusterIP: None
+  selector:
+    job-name: ${JOB_NAME}
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: ${JOB_NAME}
+spec:
+  completionMode: Indexed
+  parallelism: 32
+  completions: 32
+  backoffLimit: 0
+  template:
+    spec:
+      subdomain: headless-svc-${JOB_NAME}
+      restartPolicy: Never
+      nodeSelector:
+        cloud.google.com/gke-tpu-accelerator: tpu7x
+        cloud.google.com/gke-tpu-topology: 4x4x8
+      containers:
+      - name: jax-tpu
+        image: python:3.12
+        securityContext:
+          privileged: false
+        env:
+        - name: JAX_PLATFORMS
+          value: "tpu,cpu"
+        - name: TPU_VMODULE
+          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
+        - name: XLA_IR_DEBUG
+          value: "1"
+        - name: XLA_HLO_DEBUG
+          value: "1"
+        command:
+        - bash
+        - -c
+        - |
+          set -ex
+
+          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
+          cd accelerator-microbenchmarks
+          git checkout tpu7x-auto
+          pip install -r requirements.txt
+
+          GCS_BUCKET_DIR=${GCS_PATH}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/host_device/host_device.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+        resources:
+          requests:
+            google.com/tpu: 4
+          limits:
+            google.com/tpu: 4
\ No newline at end of file

From 926a7c02949497169ce16e9b30abba74ff79cf3b Mon Sep 17 00:00:00 2001
From: Hong-Yi Lin <horacehylin@google.com>
Date: Thu, 29 Jan 2026 08:34:47 +0000
Subject: [PATCH 11/88] [Automation] Update `automation_launch.sh`

---
 .../guides/automation/automation_launch.sh    | 26 +++++++++++++------
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/Ironwood/guides/automation/automation_launch.sh b/Ironwood/guides/automation/automation_launch.sh
index 7ff69a19..678285cf 100644
--- a/Ironwood/guides/automation/automation_launch.sh
+++ b/Ironwood/guides/automation/automation_launch.sh
@@ -3,20 +3,29 @@
 ######################################################################
 #                            USER INPUT
 ######################################################################
-export GCS_PATH=""
+TIMESTAMP=$(date +%Y-%m-%d_%H-%M-%S)
+export GCS_BUCKET_ROOT_DIR=""
 
-yaml_names=("tpu7x-2x2x1-hbm.yaml" "tpu7x-2x2x2-hbm.yaml")
+MAX_RETRIES=3
+TIMEOUT_SECOND=3600
+
+yaml_names=(
+    "tpu7x-2x2x1-hbm.yaml" "tpu7x-2x2x1-h2d.yaml" "tpu7x-2x2x1-gemm.yaml" "tpu7x-2x2x1-collectives.yaml"
+    "tpu7x-2x2x2-hbm.yaml" "tpu7x-2x2x2-h2d.yaml" "tpu7x-2x2x2-gemm.yaml" "tpu7x-2x2x2-collectives.yaml"
+    "tpu7x-2x2x4-hbm.yaml" "tpu7x-2x2x4-h2d.yaml" "tpu7x-2x2x4-gemm.yaml" "tpu7x-2x2x4-collectives.yaml"
+    "tpu7x-2x4x4-hbm.yaml" "tpu7x-2x4x4-h2d.yaml" "tpu7x-2x4x4-gemm.yaml" "tpu7x-2x4x4-collectives.yaml"
+)
 
 ######################################################################
 #                        VALIDATION & SETUP
 ######################################################################
 
-if [[ -z "${GCS_PATH}" || "${GCS_PATH}" != "gs://"* ]]; then
-  echo "Error: GCS_PATH must be set and start with gs://"
+if [[ -z "${GCS_BUCKET_ROOT_DIR}" || "${GCS_BUCKET_ROOT_DIR}" != "gs://"* ]]; then
+  echo "Error: GCS_BUCKET_ROOT_DIR must be set and start with gs://"
   exit 1
 fi
 
-echo "The intermediate result will be written to ${GCS_PATH}"
+echo "The intermediate result will be written to ${GCS_BUCKET_ROOT_DIR}"
 
 required_topologies=($(printf "%s\n" "${yaml_names[@]}" | grep -oE '[0-9]+x[0-9]+x[0-9]+' | sort -u))
 
@@ -76,6 +85,7 @@ apply_and_wait() {
         # Derive job name: remove .yaml, lowercase, replace _ with -
         local job_name=$(basename "${yaml_file}" .yaml | tr '[:upper:]' '[:lower:]' | tr '_' '-')
         export JOB_NAME="${job_name}"
+        local GCS_PATH="${GCS_BUCKET_ROOT_DIR}/${job_name}"
         
         echo "Launching job: ${filepath} (name: ${JOB_NAME})"
         envsubst '${JOB_NAME} ${GCS_PATH}' < "${filepath}" | kubectl apply -f -
@@ -87,9 +97,9 @@ apply_and_wait() {
         local yaml_file="${yaml_files[$i]}"
         local filepath="${SCRIPT_DIR}/${yaml_file}"
         local job_name="${job_names_in_batch[$i]}"
-
+        local GCS_PATH="${GCS_BUCKET_ROOT_DIR}/${job_name}"
         (
-            wait_for_job_completion "${job_name}" 1800
+            wait_for_job_completion "${job_name}" ${TIMEOUT_SECOND}
             wait_status=$?
 
             export JOB_NAME="${job_name}"
@@ -110,7 +120,6 @@ apply_and_wait() {
 
 # Retry loop
 current_batch=("${yaml_names[@]}")
-MAX_RETRIES=3
 
 for (( retry=1; retry<=MAX_RETRIES; retry++ )); do
     apply_and_wait "${current_batch[@]}"
@@ -148,6 +157,7 @@ if [[ ${#FAILED_JOBS[@]} -gt 0 ]]; then
     echo -e "\nTo retry manually, run:"
     for yaml_file in "${FAILED_JOBS[@]}"; do
         job_name=$(basename "${yaml_file}" .yaml | tr '[:upper:]' '[:lower:]' | tr '_' '-')
+        GCS_PATH="${GCS_BUCKET_ROOT_DIR}/${job_name}"
         echo "JOB_NAME=\"${job_name}\" GCS_PATH=\"${GCS_PATH}\" envsubst '\${JOB_NAME} \${GCS_PATH}' < \"${SCRIPT_DIR}/${yaml_file}\" | kubectl apply -f -"
     done
 else

From a86ab39f289faf1a7959a9bffd27da617ff8369d Mon Sep 17 00:00:00 2001
From: Hong-Yi Lin <horacehylin@google.com>
Date: Thu, 29 Jan 2026 12:08:15 +0000
Subject: [PATCH 12/88] [Automation] Enable kueue to prevent deadlock from race
 condition

---
 .../guides/automation/automation_launch.sh    | 10 ++++-
 Ironwood/guides/automation/job-queue.yaml     | 41 +++++++++++++++++++
 .../automation/tpu7x-2x2x1-collectives.yaml   |  3 ++
 .../guides/automation/tpu7x-2x2x1-gemm.yaml   |  3 ++
 .../guides/automation/tpu7x-2x2x1-h2d.yaml    |  3 ++
 .../guides/automation/tpu7x-2x2x1-hbm.yaml    |  3 ++
 .../automation/tpu7x-2x2x2-collectives.yaml   |  3 ++
 .../guides/automation/tpu7x-2x2x2-gemm.yaml   |  3 ++
 .../guides/automation/tpu7x-2x2x2-h2d.yaml    |  3 ++
 .../guides/automation/tpu7x-2x2x2-hbm.yaml    |  3 ++
 .../automation/tpu7x-2x2x4-collectives.yaml   |  3 ++
 .../guides/automation/tpu7x-2x2x4-gemm.yaml   |  3 ++
 .../guides/automation/tpu7x-2x2x4-h2d.yaml    |  3 ++
 .../guides/automation/tpu7x-2x2x4-hbm.yaml    |  3 ++
 .../automation/tpu7x-2x4x4-collectives.yaml   |  3 ++
 .../guides/automation/tpu7x-2x4x4-gemm.yaml   |  3 ++
 .../guides/automation/tpu7x-2x4x4-h2d.yaml    |  3 ++
 .../guides/automation/tpu7x-2x4x4-hbm.yaml    |  3 ++
 .../automation/tpu7x-4x4x4-collectives.yaml   |  3 ++
 .../guides/automation/tpu7x-4x4x4-gemm.yaml   |  3 ++
 .../guides/automation/tpu7x-4x4x4-h2d.yaml    |  3 ++
 .../guides/automation/tpu7x-4x4x4-hbm.yaml    |  3 ++
 .../automation/tpu7x-4x4x8-collectives.yaml   |  3 ++
 .../guides/automation/tpu7x-4x4x8-gemm.yaml   |  3 ++
 .../guides/automation/tpu7x-4x4x8-h2d.yaml    |  3 ++
 .../guides/automation/tpu7x-4x4x8-hbm.yaml    |  3 ++
 26 files changed, 121 insertions(+), 2 deletions(-)
 create mode 100644 Ironwood/guides/automation/job-queue.yaml

diff --git a/Ironwood/guides/automation/automation_launch.sh b/Ironwood/guides/automation/automation_launch.sh
index 678285cf..3079da66 100644
--- a/Ironwood/guides/automation/automation_launch.sh
+++ b/Ironwood/guides/automation/automation_launch.sh
@@ -34,6 +34,12 @@ if ! bash "${SCRIPT_DIR}/check_node_pool_setup.sh" "${required_topologies[@]}";
   exit 1
 fi
 
+for topology in "${required_topologies[@]}"; do
+    export TOPOLOGY="${topology}"
+    export TPUS=$(echo "${TOPOLOGY}" | sed 's/x/*/g' | bc)
+    envsubst '${TOPOLOGY} ${TPUS}' < ${SCRIPT_DIR}/job-queue.yaml | kubectl apply -f -
+done
+
 ######################################################################
 #                 LAUNCH JOBS & WAIT FOR COMPLETION
 ######################################################################
@@ -85,7 +91,7 @@ apply_and_wait() {
         # Derive job name: remove .yaml, lowercase, replace _ with -
         local job_name=$(basename "${yaml_file}" .yaml | tr '[:upper:]' '[:lower:]' | tr '_' '-')
         export JOB_NAME="${job_name}"
-        local GCS_PATH="${GCS_BUCKET_ROOT_DIR}/${job_name}"
+        export GCS_PATH="${GCS_BUCKET_ROOT_DIR}/${job_name}"
         
         echo "Launching job: ${filepath} (name: ${JOB_NAME})"
         envsubst '${JOB_NAME} ${GCS_PATH}' < "${filepath}" | kubectl apply -f -
@@ -97,7 +103,7 @@ apply_and_wait() {
         local yaml_file="${yaml_files[$i]}"
         local filepath="${SCRIPT_DIR}/${yaml_file}"
         local job_name="${job_names_in_batch[$i]}"
-        local GCS_PATH="${GCS_BUCKET_ROOT_DIR}/${job_name}"
+        export GCS_PATH="${GCS_BUCKET_ROOT_DIR}/${job_name}"
         (
             wait_for_job_completion "${job_name}" ${TIMEOUT_SECOND}
             wait_status=$?
diff --git a/Ironwood/guides/automation/job-queue.yaml b/Ironwood/guides/automation/job-queue.yaml
new file mode 100644
index 00000000..b13f5573
--- /dev/null
+++ b/Ironwood/guides/automation/job-queue.yaml
@@ -0,0 +1,41 @@
+apiVersion: kueue.x-k8s.io/v1beta2
+kind: ResourceFlavor
+metadata:
+  name: "flavor-${TOPOLOGY}"
+spec:
+  nodeLabels:
+    cloud.google.com/gke-tpu-accelerator: tpu7x
+    cloud.google.com/gke-tpu-topology: ${TOPOLOGY}
+---
+apiVersion: kueue.x-k8s.io/v1beta2
+kind: ClusterQueue
+metadata:
+  name: cluster-queue-${TOPOLOGY}
+spec:
+  flavorFungibility:
+    whenCanBorrow: MayStopSearch
+    whenCanPreempt: TryNextFlavor
+  namespaceSelector: {}
+  preemption:
+    borrowWithinCohort:
+      policy: Never
+    reclaimWithinCohort: Never
+    withinClusterQueue: LowerPriority
+  queueingStrategy: BestEffortFIFO
+  resourceGroups:
+  - coveredResources:
+    - google.com/tpu
+    flavors:
+    - name: flavor-${TOPOLOGY}
+      resources:
+      - name: google.com/tpu
+        nominalQuota: ${TPUS}
+  stopPolicy: None
+---
+apiVersion: kueue.x-k8s.io/v1beta1
+kind: LocalQueue
+metadata:
+  namespace: default
+  name: "user-queue-${TOPOLOGY}"
+spec:
+  clusterQueue: "cluster-queue-${TOPOLOGY}"
\ No newline at end of file
diff --git a/Ironwood/guides/automation/tpu7x-2x2x1-collectives.yaml b/Ironwood/guides/automation/tpu7x-2x2x1-collectives.yaml
index 1671f2ec..fc878ea0 100644
--- a/Ironwood/guides/automation/tpu7x-2x2x1-collectives.yaml
+++ b/Ironwood/guides/automation/tpu7x-2x2x1-collectives.yaml
@@ -11,8 +11,11 @@ apiVersion: batch/v1
 kind: Job
 metadata:
   name: ${JOB_NAME}
+  labels:
+    kueue.x-k8s.io/queue-name: user-queue-2x2x1
 spec:
   completionMode: Indexed
+  suspend: true
   parallelism: 1
   completions: 1
   backoffLimit: 0
diff --git a/Ironwood/guides/automation/tpu7x-2x2x1-gemm.yaml b/Ironwood/guides/automation/tpu7x-2x2x1-gemm.yaml
index 1929d886..1c9fa143 100644
--- a/Ironwood/guides/automation/tpu7x-2x2x1-gemm.yaml
+++ b/Ironwood/guides/automation/tpu7x-2x2x1-gemm.yaml
@@ -11,8 +11,11 @@ apiVersion: batch/v1
 kind: Job
 metadata:
   name: ${JOB_NAME}
+  labels:
+    kueue.x-k8s.io/queue-name: user-queue-2x2x1
 spec:
   completionMode: Indexed
+  suspend: true
   parallelism: 1
   completions: 1
   backoffLimit: 0
diff --git a/Ironwood/guides/automation/tpu7x-2x2x1-h2d.yaml b/Ironwood/guides/automation/tpu7x-2x2x1-h2d.yaml
index 3f36f54e..3f662be5 100644
--- a/Ironwood/guides/automation/tpu7x-2x2x1-h2d.yaml
+++ b/Ironwood/guides/automation/tpu7x-2x2x1-h2d.yaml
@@ -11,8 +11,11 @@ apiVersion: batch/v1
 kind: Job
 metadata:
   name: ${JOB_NAME}
+  labels:
+    kueue.x-k8s.io/queue-name: user-queue-2x2x1
 spec:
   completionMode: Indexed
+  suspend: true
   parallelism: 1
   completions: 1
   backoffLimit: 0
diff --git a/Ironwood/guides/automation/tpu7x-2x2x1-hbm.yaml b/Ironwood/guides/automation/tpu7x-2x2x1-hbm.yaml
index 7455961d..e84e13ea 100644
--- a/Ironwood/guides/automation/tpu7x-2x2x1-hbm.yaml
+++ b/Ironwood/guides/automation/tpu7x-2x2x1-hbm.yaml
@@ -11,8 +11,11 @@ apiVersion: batch/v1
 kind: Job
 metadata:
   name: ${JOB_NAME}
+  labels:
+    kueue.x-k8s.io/queue-name: user-queue-2x2x1
 spec:
   completionMode: Indexed
+  suspend: true
   parallelism: 1
   completions: 1
   backoffLimit: 0
diff --git a/Ironwood/guides/automation/tpu7x-2x2x2-collectives.yaml b/Ironwood/guides/automation/tpu7x-2x2x2-collectives.yaml
index 7758e449..c2efba03 100644
--- a/Ironwood/guides/automation/tpu7x-2x2x2-collectives.yaml
+++ b/Ironwood/guides/automation/tpu7x-2x2x2-collectives.yaml
@@ -11,8 +11,11 @@ apiVersion: batch/v1
 kind: Job
 metadata:
   name: ${JOB_NAME}
+  labels:
+    kueue.x-k8s.io/queue-name: user-queue-2x2x2
 spec:
   completionMode: Indexed
+  suspend: true
   parallelism: 2
   completions: 2
   backoffLimit: 0
diff --git a/Ironwood/guides/automation/tpu7x-2x2x2-gemm.yaml b/Ironwood/guides/automation/tpu7x-2x2x2-gemm.yaml
index c869fbdb..3f6004bd 100644
--- a/Ironwood/guides/automation/tpu7x-2x2x2-gemm.yaml
+++ b/Ironwood/guides/automation/tpu7x-2x2x2-gemm.yaml
@@ -11,8 +11,11 @@ apiVersion: batch/v1
 kind: Job
 metadata:
   name: ${JOB_NAME}
+  labels:
+    kueue.x-k8s.io/queue-name: user-queue-2x2x2
 spec:
   completionMode: Indexed
+  suspend: true
   parallelism: 2
   completions: 2
   backoffLimit: 0
diff --git a/Ironwood/guides/automation/tpu7x-2x2x2-h2d.yaml b/Ironwood/guides/automation/tpu7x-2x2x2-h2d.yaml
index 34d19110..fbcd556d 100644
--- a/Ironwood/guides/automation/tpu7x-2x2x2-h2d.yaml
+++ b/Ironwood/guides/automation/tpu7x-2x2x2-h2d.yaml
@@ -11,8 +11,11 @@ apiVersion: batch/v1
 kind: Job
 metadata:
   name: ${JOB_NAME}
+  labels:
+    kueue.x-k8s.io/queue-name: user-queue-2x2x2
 spec:
   completionMode: Indexed
+  suspend: true
   parallelism: 2
   completions: 2
   backoffLimit: 0
diff --git a/Ironwood/guides/automation/tpu7x-2x2x2-hbm.yaml b/Ironwood/guides/automation/tpu7x-2x2x2-hbm.yaml
index 3f3f7929..71793e3f 100644
--- a/Ironwood/guides/automation/tpu7x-2x2x2-hbm.yaml
+++ b/Ironwood/guides/automation/tpu7x-2x2x2-hbm.yaml
@@ -11,8 +11,11 @@ apiVersion: batch/v1
 kind: Job
 metadata:
   name: ${JOB_NAME}
+  labels:
+    kueue.x-k8s.io/queue-name: user-queue-2x2x2
 spec:
   completionMode: Indexed
+  suspend: true
   parallelism: 2
   completions: 2
   backoffLimit: 0
diff --git a/Ironwood/guides/automation/tpu7x-2x2x4-collectives.yaml b/Ironwood/guides/automation/tpu7x-2x2x4-collectives.yaml
index dce47004..a42b04ae 100644
--- a/Ironwood/guides/automation/tpu7x-2x2x4-collectives.yaml
+++ b/Ironwood/guides/automation/tpu7x-2x2x4-collectives.yaml
@@ -11,8 +11,11 @@ apiVersion: batch/v1
 kind: Job
 metadata:
   name: ${JOB_NAME}
+  labels:
+    kueue.x-k8s.io/queue-name: user-queue-2x2x4
 spec:
   completionMode: Indexed
+  suspend: true
   parallelism: 4
   completions: 4
   backoffLimit: 0
diff --git a/Ironwood/guides/automation/tpu7x-2x2x4-gemm.yaml b/Ironwood/guides/automation/tpu7x-2x2x4-gemm.yaml
index e514f1f6..fee014e9 100644
--- a/Ironwood/guides/automation/tpu7x-2x2x4-gemm.yaml
+++ b/Ironwood/guides/automation/tpu7x-2x2x4-gemm.yaml
@@ -11,8 +11,11 @@ apiVersion: batch/v1
 kind: Job
 metadata:
   name: ${JOB_NAME}
+  labels:
+    kueue.x-k8s.io/queue-name: user-queue-2x2x4
 spec:
   completionMode: Indexed
+  suspend: true
   parallelism: 4
   completions: 4
   backoffLimit: 0
diff --git a/Ironwood/guides/automation/tpu7x-2x2x4-h2d.yaml b/Ironwood/guides/automation/tpu7x-2x2x4-h2d.yaml
index f48d047f..799c5b35 100644
--- a/Ironwood/guides/automation/tpu7x-2x2x4-h2d.yaml
+++ b/Ironwood/guides/automation/tpu7x-2x2x4-h2d.yaml
@@ -11,8 +11,11 @@ apiVersion: batch/v1
 kind: Job
 metadata:
   name: ${JOB_NAME}
+  labels:
+    kueue.x-k8s.io/queue-name: user-queue-2x2x4
 spec:
   completionMode: Indexed
+  suspend: true
   parallelism: 4
   completions: 4
   backoffLimit: 0
diff --git a/Ironwood/guides/automation/tpu7x-2x2x4-hbm.yaml b/Ironwood/guides/automation/tpu7x-2x2x4-hbm.yaml
index ed87aaf8..100e1f7f 100644
--- a/Ironwood/guides/automation/tpu7x-2x2x4-hbm.yaml
+++ b/Ironwood/guides/automation/tpu7x-2x2x4-hbm.yaml
@@ -11,8 +11,11 @@ apiVersion: batch/v1
 kind: Job
 metadata:
   name: ${JOB_NAME}
+  labels:
+    kueue.x-k8s.io/queue-name: user-queue-2x2x4
 spec:
   completionMode: Indexed
+  suspend: true
   parallelism: 4
   completions: 4
   backoffLimit: 0
diff --git a/Ironwood/guides/automation/tpu7x-2x4x4-collectives.yaml b/Ironwood/guides/automation/tpu7x-2x4x4-collectives.yaml
index 4b0617c5..de6f7106 100644
--- a/Ironwood/guides/automation/tpu7x-2x4x4-collectives.yaml
+++ b/Ironwood/guides/automation/tpu7x-2x4x4-collectives.yaml
@@ -11,8 +11,11 @@ apiVersion: batch/v1
 kind: Job
 metadata:
   name: ${JOB_NAME}
+  labels:
+    kueue.x-k8s.io/queue-name: user-queue-2x4x4
 spec:
   completionMode: Indexed
+  suspend: true
   parallelism: 8
   completions: 8
   backoffLimit: 0
diff --git a/Ironwood/guides/automation/tpu7x-2x4x4-gemm.yaml b/Ironwood/guides/automation/tpu7x-2x4x4-gemm.yaml
index e07a1072..d51c9c31 100644
--- a/Ironwood/guides/automation/tpu7x-2x4x4-gemm.yaml
+++ b/Ironwood/guides/automation/tpu7x-2x4x4-gemm.yaml
@@ -11,8 +11,11 @@ apiVersion: batch/v1
 kind: Job
 metadata:
   name: ${JOB_NAME}
+  labels:
+    kueue.x-k8s.io/queue-name: user-queue-2x4x4
 spec:
   completionMode: Indexed
+  suspend: true
   parallelism: 8
   completions: 8
   backoffLimit: 0
diff --git a/Ironwood/guides/automation/tpu7x-2x4x4-h2d.yaml b/Ironwood/guides/automation/tpu7x-2x4x4-h2d.yaml
index 258aa8ee..30a25c4c 100644
--- a/Ironwood/guides/automation/tpu7x-2x4x4-h2d.yaml
+++ b/Ironwood/guides/automation/tpu7x-2x4x4-h2d.yaml
@@ -11,8 +11,11 @@ apiVersion: batch/v1
 kind: Job
 metadata:
   name: ${JOB_NAME}
+  labels:
+    kueue.x-k8s.io/queue-name: user-queue-2x4x4
 spec:
   completionMode: Indexed
+  suspend: true
   parallelism: 8
   completions: 8
   backoffLimit: 0
diff --git a/Ironwood/guides/automation/tpu7x-2x4x4-hbm.yaml b/Ironwood/guides/automation/tpu7x-2x4x4-hbm.yaml
index 57182026..a438b93d 100644
--- a/Ironwood/guides/automation/tpu7x-2x4x4-hbm.yaml
+++ b/Ironwood/guides/automation/tpu7x-2x4x4-hbm.yaml
@@ -11,8 +11,11 @@ apiVersion: batch/v1
 kind: Job
 metadata:
   name: ${JOB_NAME}
+  labels:
+    kueue.x-k8s.io/queue-name: user-queue-2x4x4
 spec:
   completionMode: Indexed
+  suspend: true
   parallelism: 8
   completions: 8
   backoffLimit: 0
diff --git a/Ironwood/guides/automation/tpu7x-4x4x4-collectives.yaml b/Ironwood/guides/automation/tpu7x-4x4x4-collectives.yaml
index 036bbfeb..828d1352 100644
--- a/Ironwood/guides/automation/tpu7x-4x4x4-collectives.yaml
+++ b/Ironwood/guides/automation/tpu7x-4x4x4-collectives.yaml
@@ -11,8 +11,11 @@ apiVersion: batch/v1
 kind: Job
 metadata:
   name: ${JOB_NAME}
+  labels:
+    kueue.x-k8s.io/queue-name: user-queue-4x4x4
 spec:
   completionMode: Indexed
+  suspend: true
   parallelism: 16
   completions: 16
   backoffLimit: 0
diff --git a/Ironwood/guides/automation/tpu7x-4x4x4-gemm.yaml b/Ironwood/guides/automation/tpu7x-4x4x4-gemm.yaml
index 00f09803..ef9220f6 100644
--- a/Ironwood/guides/automation/tpu7x-4x4x4-gemm.yaml
+++ b/Ironwood/guides/automation/tpu7x-4x4x4-gemm.yaml
@@ -11,8 +11,11 @@ apiVersion: batch/v1
 kind: Job
 metadata:
   name: ${JOB_NAME}
+  labels:
+    kueue.x-k8s.io/queue-name: user-queue-4x4x4
 spec:
   completionMode: Indexed
+  suspend: true
   parallelism: 16
   completions: 16
   backoffLimit: 0
diff --git a/Ironwood/guides/automation/tpu7x-4x4x4-h2d.yaml b/Ironwood/guides/automation/tpu7x-4x4x4-h2d.yaml
index 21ddef16..da7a4e7c 100644
--- a/Ironwood/guides/automation/tpu7x-4x4x4-h2d.yaml
+++ b/Ironwood/guides/automation/tpu7x-4x4x4-h2d.yaml
@@ -11,8 +11,11 @@ apiVersion: batch/v1
 kind: Job
 metadata:
   name: ${JOB_NAME}
+  labels:
+    kueue.x-k8s.io/queue-name: user-queue-4x4x4
 spec:
   completionMode: Indexed
+  suspend: true
   parallelism: 16
   completions: 16
   backoffLimit: 0
diff --git a/Ironwood/guides/automation/tpu7x-4x4x4-hbm.yaml b/Ironwood/guides/automation/tpu7x-4x4x4-hbm.yaml
index ccce9dc0..0819edad 100644
--- a/Ironwood/guides/automation/tpu7x-4x4x4-hbm.yaml
+++ b/Ironwood/guides/automation/tpu7x-4x4x4-hbm.yaml
@@ -11,8 +11,11 @@ apiVersion: batch/v1
 kind: Job
 metadata:
   name: ${JOB_NAME}
+  labels:
+    kueue.x-k8s.io/queue-name: user-queue-4x4x4
 spec:
   completionMode: Indexed
+  suspend: true
   parallelism: 16
   completions: 16
   backoffLimit: 0
diff --git a/Ironwood/guides/automation/tpu7x-4x4x8-collectives.yaml b/Ironwood/guides/automation/tpu7x-4x4x8-collectives.yaml
index 97adb283..f3bf721d 100644
--- a/Ironwood/guides/automation/tpu7x-4x4x8-collectives.yaml
+++ b/Ironwood/guides/automation/tpu7x-4x4x8-collectives.yaml
@@ -11,8 +11,11 @@ apiVersion: batch/v1
 kind: Job
 metadata:
   name: ${JOB_NAME}
+  labels:
+    kueue.x-k8s.io/queue-name: user-queue-4x4x8
 spec:
   completionMode: Indexed
+  suspend: true
   parallelism: 32
   completions: 32
   backoffLimit: 0
diff --git a/Ironwood/guides/automation/tpu7x-4x4x8-gemm.yaml b/Ironwood/guides/automation/tpu7x-4x4x8-gemm.yaml
index e186c51c..65f93467 100644
--- a/Ironwood/guides/automation/tpu7x-4x4x8-gemm.yaml
+++ b/Ironwood/guides/automation/tpu7x-4x4x8-gemm.yaml
@@ -11,8 +11,11 @@ apiVersion: batch/v1
 kind: Job
 metadata:
   name: ${JOB_NAME}
+  labels:
+    kueue.x-k8s.io/queue-name: user-queue-4x4x8
 spec:
   completionMode: Indexed
+  suspend: true
   parallelism: 32
   completions: 32
   backoffLimit: 0
diff --git a/Ironwood/guides/automation/tpu7x-4x4x8-h2d.yaml b/Ironwood/guides/automation/tpu7x-4x4x8-h2d.yaml
index 712d94b7..09d2a5ab 100644
--- a/Ironwood/guides/automation/tpu7x-4x4x8-h2d.yaml
+++ b/Ironwood/guides/automation/tpu7x-4x4x8-h2d.yaml
@@ -11,8 +11,11 @@ apiVersion: batch/v1
 kind: Job
 metadata:
   name: ${JOB_NAME}
+  labels:
+    kueue.x-k8s.io/queue-name: user-queue-4x4x8
 spec:
   completionMode: Indexed
+  suspend: true
   parallelism: 32
   completions: 32
   backoffLimit: 0
diff --git a/Ironwood/guides/automation/tpu7x-4x4x8-hbm.yaml b/Ironwood/guides/automation/tpu7x-4x4x8-hbm.yaml
index 8d8e960e..e6445b54 100644
--- a/Ironwood/guides/automation/tpu7x-4x4x8-hbm.yaml
+++ b/Ironwood/guides/automation/tpu7x-4x4x8-hbm.yaml
@@ -11,8 +11,11 @@ apiVersion: batch/v1
 kind: Job
 metadata:
   name: ${JOB_NAME}
+  labels:
+    kueue.x-k8s.io/queue-name: user-queue-4x4x8
 spec:
   completionMode: Indexed
+  suspend: true
   parallelism: 32
   completions: 32
   backoffLimit: 0

From f53bf5e336d99ef8c1fb59da06adaeb3daba42cd Mon Sep 17 00:00:00 2001
From: Hong-Yi Lin <horacehylin@google.com>
Date: Thu, 29 Jan 2026 16:07:16 +0000
Subject: [PATCH 13/88] [Automation] Update aggregator

---
 Ironwood/guides/automation/aggregator.py      | 24 +++++++++++++++++++
 Ironwood/guides/automation/aggregator.yaml    | 19 ++++++++++-----
 .../guides/automation/automation_launch.sh    |  2 +-
 3 files changed, 38 insertions(+), 7 deletions(-)
 create mode 100644 Ironwood/guides/automation/aggregator.py

diff --git a/Ironwood/guides/automation/aggregator.py b/Ironwood/guides/automation/aggregator.py
new file mode 100644
index 00000000..44889890
--- /dev/null
+++ b/Ironwood/guides/automation/aggregator.py
@@ -0,0 +1,24 @@
+import argparse
+import os
+import glob
+import pandas as pd
+import gcsfs
+
+def download_from_gcs(bucket_path: str, local_dir: str):
+    """
+    Downloads the content of the GCS bucket path to a local directory.
+    """
+    fs = gcsfs.GCSFileSystem()
+    gcs_path = bucket_path.replace("gs://", "")
+    
+    print(f"Downloading from gs://{gcs_path} to {local_dir}...")
+    os.makedirs(local_dir, exist_ok=True)
+    fs.get(gcs_path, local_dir, recursive=True)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Download from GCS and aggregate results locally.")
+    parser.add_argument("--bucket_path", type=str, required=True, help="The GCS bucket path (gs://...)")
+    parser.add_argument("--local_dir", type=str, default="./results", help="Local directory to download and aggregate results.")
+    args = parser.parse_args()
+    
+    download_from_gcs(args.bucket_path, args.local_dir)
diff --git a/Ironwood/guides/automation/aggregator.yaml b/Ironwood/guides/automation/aggregator.yaml
index 2c0ecbd0..04cef00e 100644
--- a/Ironwood/guides/automation/aggregator.yaml
+++ b/Ironwood/guides/automation/aggregator.yaml
@@ -7,10 +7,17 @@ spec:
     spec:
       containers:
       - name: main-app
-        image: ubuntu
-        command: ["/bin/sh", "-c"]
-        args:
+        image: python:3.12
+        command:
+        - bash
+        - -c
         - |
-          echo "Collectives job is finally done."
-          echo "Everything is all fine!"
-      restartPolicy: Never
\ No newline at end of file
+          set -ex
+          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
+          cd accelerator-microbenchmarks
+          git checkout tpu7x-auto
+
+          GCS_BUCKET_DIR=${GCS_PATH}
+          python Ironwood/guides/automation/aggregator.py --bucket_path=${GCS_BUCKET_DIR}
+          sleep infinity
+      restartPolicy: Never
diff --git a/Ironwood/guides/automation/automation_launch.sh b/Ironwood/guides/automation/automation_launch.sh
index 3079da66..c832f8ac 100644
--- a/Ironwood/guides/automation/automation_launch.sh
+++ b/Ironwood/guides/automation/automation_launch.sh
@@ -152,7 +152,7 @@ echo ""
 echo "Jobs completed. Aggregating results..."
 echo ""
 
-kubectl apply -f aggregator.yaml
+envsubst '${GCS_BUCKET_ROOT_DIR}' < ${SCRIPT_DIR}/aggregator.yaml | kubectl apply -f -
 
 # Print the failed jobs at the end for better visibility.
 

From a06944d11580a2196b8848631bee145c59065e5f Mon Sep 17 00:00:00 2001
From: Hong-Yi Lin <horacehylin@google.com>
Date: Thu, 29 Jan 2026 17:13:01 +0000
Subject: [PATCH 14/88] [Automation] Update aggregator and rename h2d yaml
 files to host_device

---
 Ironwood/guides/automation/aggregator.py                 | 9 +++++++++
 Ironwood/guides/automation/aggregator.yaml               | 5 +++--
 Ironwood/guides/automation/automation_launch.sh          | 8 ++++----
 ...tpu7x-2x2x1-h2d.yaml => tpu7x-2x2x1-host_device.yaml} | 0
 ...tpu7x-2x2x2-h2d.yaml => tpu7x-2x2x2-host_device.yaml} | 0
 ...tpu7x-2x2x4-h2d.yaml => tpu7x-2x2x4-host_device.yaml} | 0
 ...tpu7x-2x4x4-h2d.yaml => tpu7x-2x4x4-host_device.yaml} | 0
 ...tpu7x-4x4x4-h2d.yaml => tpu7x-4x4x4-host_device.yaml} | 0
 ...tpu7x-4x4x8-h2d.yaml => tpu7x-4x4x8-host_device.yaml} | 0
 9 files changed, 16 insertions(+), 6 deletions(-)
 rename Ironwood/guides/automation/{tpu7x-2x2x1-h2d.yaml => tpu7x-2x2x1-host_device.yaml} (100%)
 rename Ironwood/guides/automation/{tpu7x-2x2x2-h2d.yaml => tpu7x-2x2x2-host_device.yaml} (100%)
 rename Ironwood/guides/automation/{tpu7x-2x2x4-h2d.yaml => tpu7x-2x2x4-host_device.yaml} (100%)
 rename Ironwood/guides/automation/{tpu7x-2x4x4-h2d.yaml => tpu7x-2x4x4-host_device.yaml} (100%)
 rename Ironwood/guides/automation/{tpu7x-4x4x4-h2d.yaml => tpu7x-4x4x4-host_device.yaml} (100%)
 rename Ironwood/guides/automation/{tpu7x-4x4x8-h2d.yaml => tpu7x-4x4x8-host_device.yaml} (100%)

diff --git a/Ironwood/guides/automation/aggregator.py b/Ironwood/guides/automation/aggregator.py
index 44889890..1bd18973 100644
--- a/Ironwood/guides/automation/aggregator.py
+++ b/Ironwood/guides/automation/aggregator.py
@@ -15,6 +15,14 @@ def download_from_gcs(bucket_path: str, local_dir: str):
     os.makedirs(local_dir, exist_ok=True)
     fs.get(gcs_path, local_dir, recursive=True)
 
+def aggregate_results(local_dir: str):
+    categories = ["collectives", "hbm", "host_device"]
+    directories = {}
+    for category in categories:
+        directories[category] = glob.glob(f"{local_dir}/*/{category}/*", recursive=True)
+    
+    
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Download from GCS and aggregate results locally.")
     parser.add_argument("--bucket_path", type=str, required=True, help="The GCS bucket path (gs://...)")
@@ -22,3 +30,4 @@ def download_from_gcs(bucket_path: str, local_dir: str):
     args = parser.parse_args()
     
     download_from_gcs(args.bucket_path, args.local_dir)
+    aggregate_results(args.local_dir)
diff --git a/Ironwood/guides/automation/aggregator.yaml b/Ironwood/guides/automation/aggregator.yaml
index 04cef00e..bc81bed6 100644
--- a/Ironwood/guides/automation/aggregator.yaml
+++ b/Ironwood/guides/automation/aggregator.yaml
@@ -1,7 +1,7 @@
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: job-waiter
+  name: aggregator
 spec:
   template:
     spec:
@@ -16,8 +16,9 @@ spec:
           git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
           cd accelerator-microbenchmarks
           git checkout tpu7x-auto
+          pip install -r requirements.txt
 
-          GCS_BUCKET_DIR=${GCS_PATH}
+          GCS_BUCKET_DIR=${GCS_BUCKET_ROOT_DIR}
           python Ironwood/guides/automation/aggregator.py --bucket_path=${GCS_BUCKET_DIR}
           sleep infinity
       restartPolicy: Never
diff --git a/Ironwood/guides/automation/automation_launch.sh b/Ironwood/guides/automation/automation_launch.sh
index c832f8ac..198ad91c 100644
--- a/Ironwood/guides/automation/automation_launch.sh
+++ b/Ironwood/guides/automation/automation_launch.sh
@@ -10,10 +10,10 @@ MAX_RETRIES=3
 TIMEOUT_SECOND=3600
 
 yaml_names=(
-    "tpu7x-2x2x1-hbm.yaml" "tpu7x-2x2x1-h2d.yaml" "tpu7x-2x2x1-gemm.yaml" "tpu7x-2x2x1-collectives.yaml"
-    "tpu7x-2x2x2-hbm.yaml" "tpu7x-2x2x2-h2d.yaml" "tpu7x-2x2x2-gemm.yaml" "tpu7x-2x2x2-collectives.yaml"
-    "tpu7x-2x2x4-hbm.yaml" "tpu7x-2x2x4-h2d.yaml" "tpu7x-2x2x4-gemm.yaml" "tpu7x-2x2x4-collectives.yaml"
-    "tpu7x-2x4x4-hbm.yaml" "tpu7x-2x4x4-h2d.yaml" "tpu7x-2x4x4-gemm.yaml" "tpu7x-2x4x4-collectives.yaml"
+    "tpu7x-2x2x1-hbm.yaml" "tpu7x-2x2x1-host_device.yaml" "tpu7x-2x2x1-gemm.yaml" "tpu7x-2x2x1-collectives.yaml"
+    "tpu7x-2x2x2-hbm.yaml" "tpu7x-2x2x2-host_device.yaml" "tpu7x-2x2x2-gemm.yaml" "tpu7x-2x2x2-collectives.yaml"
+    "tpu7x-2x2x4-hbm.yaml" "tpu7x-2x2x4-host_device.yaml" "tpu7x-2x2x4-gemm.yaml" "tpu7x-2x2x4-collectives.yaml"
+    "tpu7x-2x4x4-hbm.yaml" "tpu7x-2x4x4-host_device.yaml" "tpu7x-2x4x4-gemm.yaml" "tpu7x-2x4x4-collectives.yaml"
 )
 
 ######################################################################
diff --git a/Ironwood/guides/automation/tpu7x-2x2x1-h2d.yaml b/Ironwood/guides/automation/tpu7x-2x2x1-host_device.yaml
similarity index 100%
rename from Ironwood/guides/automation/tpu7x-2x2x1-h2d.yaml
rename to Ironwood/guides/automation/tpu7x-2x2x1-host_device.yaml
diff --git a/Ironwood/guides/automation/tpu7x-2x2x2-h2d.yaml b/Ironwood/guides/automation/tpu7x-2x2x2-host_device.yaml
similarity index 100%
rename from Ironwood/guides/automation/tpu7x-2x2x2-h2d.yaml
rename to Ironwood/guides/automation/tpu7x-2x2x2-host_device.yaml
diff --git a/Ironwood/guides/automation/tpu7x-2x2x4-h2d.yaml b/Ironwood/guides/automation/tpu7x-2x2x4-host_device.yaml
similarity index 100%
rename from Ironwood/guides/automation/tpu7x-2x2x4-h2d.yaml
rename to Ironwood/guides/automation/tpu7x-2x2x4-host_device.yaml
diff --git a/Ironwood/guides/automation/tpu7x-2x4x4-h2d.yaml b/Ironwood/guides/automation/tpu7x-2x4x4-host_device.yaml
similarity index 100%
rename from Ironwood/guides/automation/tpu7x-2x4x4-h2d.yaml
rename to Ironwood/guides/automation/tpu7x-2x4x4-host_device.yaml
diff --git a/Ironwood/guides/automation/tpu7x-4x4x4-h2d.yaml b/Ironwood/guides/automation/tpu7x-4x4x4-host_device.yaml
similarity index 100%
rename from Ironwood/guides/automation/tpu7x-4x4x4-h2d.yaml
rename to Ironwood/guides/automation/tpu7x-4x4x4-host_device.yaml
diff --git a/Ironwood/guides/automation/tpu7x-4x4x8-h2d.yaml b/Ironwood/guides/automation/tpu7x-4x4x8-host_device.yaml
similarity index 100%
rename from Ironwood/guides/automation/tpu7x-4x4x8-h2d.yaml
rename to Ironwood/guides/automation/tpu7x-4x4x8-host_device.yaml

From ddd34730c432c9c9a366eb2566f61a1fd16bb14e Mon Sep 17 00:00:00 2001
From: Hong-Yi Lin <horacehylin@google.com>
Date: Thu, 29 Jan 2026 18:44:45 +0000
Subject: [PATCH 15/88] [Automation] Delete unused yaml files and update
 aggregator file

---
 Ironwood/guides/automation/aggregator.py      | 102 ++++++++++++++++--
 Ironwood/guides/automation/aggregator.yaml    |   3 +-
 .../guides/automation/automation_launch.sh    |   4 +-
 .../guides/automation/tpu7x-2x2x2-gemm.yaml   |  60 -----------
 .../guides/automation/tpu7x-2x2x2-hbm.yaml    |  60 -----------
 .../automation/tpu7x-2x2x2-host_device.yaml   |  60 -----------
 .../guides/automation/tpu7x-2x2x4-gemm.yaml   |  60 -----------
 .../guides/automation/tpu7x-2x2x4-hbm.yaml    |  60 -----------
 .../automation/tpu7x-2x2x4-host_device.yaml   |  60 -----------
 .../guides/automation/tpu7x-2x4x4-gemm.yaml   |  60 -----------
 .../guides/automation/tpu7x-2x4x4-hbm.yaml    |  60 -----------
 .../automation/tpu7x-2x4x4-host_device.yaml   |  60 -----------
 .../guides/automation/tpu7x-4x4x4-gemm.yaml   |  60 -----------
 .../guides/automation/tpu7x-4x4x4-hbm.yaml    |  60 -----------
 .../automation/tpu7x-4x4x4-host_device.yaml   |  60 -----------
 .../guides/automation/tpu7x-4x4x8-gemm.yaml   |  60 -----------
 .../guides/automation/tpu7x-4x4x8-hbm.yaml    |  60 -----------
 .../automation/tpu7x-4x4x8-host_device.yaml   |  60 -----------
 18 files changed, 96 insertions(+), 913 deletions(-)
 delete mode 100644 Ironwood/guides/automation/tpu7x-2x2x2-gemm.yaml
 delete mode 100644 Ironwood/guides/automation/tpu7x-2x2x2-hbm.yaml
 delete mode 100644 Ironwood/guides/automation/tpu7x-2x2x2-host_device.yaml
 delete mode 100644 Ironwood/guides/automation/tpu7x-2x2x4-gemm.yaml
 delete mode 100644 Ironwood/guides/automation/tpu7x-2x2x4-hbm.yaml
 delete mode 100644 Ironwood/guides/automation/tpu7x-2x2x4-host_device.yaml
 delete mode 100644 Ironwood/guides/automation/tpu7x-2x4x4-gemm.yaml
 delete mode 100644 Ironwood/guides/automation/tpu7x-2x4x4-hbm.yaml
 delete mode 100644 Ironwood/guides/automation/tpu7x-2x4x4-host_device.yaml
 delete mode 100644 Ironwood/guides/automation/tpu7x-4x4x4-gemm.yaml
 delete mode 100644 Ironwood/guides/automation/tpu7x-4x4x4-hbm.yaml
 delete mode 100644 Ironwood/guides/automation/tpu7x-4x4x4-host_device.yaml
 delete mode 100644 Ironwood/guides/automation/tpu7x-4x4x8-gemm.yaml
 delete mode 100644 Ironwood/guides/automation/tpu7x-4x4x8-hbm.yaml
 delete mode 100644 Ironwood/guides/automation/tpu7x-4x4x8-host_device.yaml

diff --git a/Ironwood/guides/automation/aggregator.py b/Ironwood/guides/automation/aggregator.py
index 1bd18973..471a4ca8 100644
--- a/Ironwood/guides/automation/aggregator.py
+++ b/Ironwood/guides/automation/aggregator.py
@@ -4,24 +4,110 @@
 import pandas as pd
 import gcsfs
 
+columns_mapping = {
+    "collectives": [
+        "topology", "input_num_elements", "transferred_data (GB)", "dtype_bytes", "hlo_input_shape", "hlo_output_shape", "test_name",
+        'step_time_ms_p50', 'step_time_ms_p90', 'step_time_ms_p95', 'step_time_ms_p99', 'step_time_ms_avg', 'step_time_ms_max', 'step_time_ms_num_runs', 'step_time_ms_min',
+        'achieved_bw (GB/s)_p50', 'achieved_bw (GB/s)_p90', 'achieved_bw (GB/s)_p95', 'achieved_bw (GB/s)_p99', 'achieved_bw (GB/s)_avg', 'achieved_bw (GB/s)_max', 'achieved_bw (GB/s)_num_runs', 'achieved_bw (GB/s)_min',
+    ],
+    "hbm": [
+        'num_elements', 'dtype', 'tensor_size_gbytes', 'test_name',
+        'time_ms_p50', 'time_ms_p90', 'time_ms_p95', 'time_ms_p99', 'time_ms_avg', 'time_ms_max', 'time_ms_num_runs', 'time_ms_min',
+        'bw_gbyte_sec_p50', 'bw_gbyte_sec_p90', 'bw_gbyte_sec_p95', 'bw_gbyte_sec_p99', 'bw_gbyte_sec_avg', 'bw_gbyte_sec_max', 'bw_gbyte_sec_num_runs', 'bw_gbyte_sec_min',
+    ],
+    "host_device": [
+        'data_size_mib', 'H2D_bw (GiB/s)_p50', 'H2D_bw (GiB/s)_p90',
+       'H2D_bw (GiB/s)_p95', 'H2D_bw (GiB/s)_p99', 'H2D_bw (GiB/s)_avg',
+       'H2D_bw (GiB/s)_max', 'H2D_bw (GiB/s)_num_runs', 'H2D_bw (GiB/s)_min',
+       'D2H_bw (GiB/s)_p50', 'D2H_bw (GiB/s)_p90', 'D2H_bw (GiB/s)_p95',
+       'D2H_bw (GiB/s)_p99', 'D2H_bw (GiB/s)_avg', 'D2H_bw (GiB/s)_max',
+       'D2H_bw (GiB/s)_num_runs', 'D2H_bw (GiB/s)_min'],
+    "training": ['m', 'n', 'k', 'dtype', 'StepTime(median,ms)',
+       'Throughput(median,TFLOP/s/device)', 'TotalThroughput(median,TFLOP/s)',
+       'MFU', 'total_flops', 'test_name', 'step_time_ms_p50', 'step_time_ms_p90',
+       'step_time_ms_p95', 'step_time_ms_p99', 'step_time_ms_avg',
+       'step_time_ms_max', 'step_time_ms_num_runs', 'step_time_ms_min',
+       'tflops_per_sec_pre_device_p50', 'tflops_per_sec_pre_device_p90',
+       'tflops_per_sec_pre_device_p95', 'tflops_per_sec_pre_device_p99',
+       'tflops_per_sec_pre_device_avg', 'tflops_per_sec_pre_device_max',
+       'tflops_per_sec_pre_device_num_runs', 'tflops_per_sec_pre_device_min',
+       'tflops_per_sec_p50', 'tflops_per_sec_p90', 'tflops_per_sec_p95',
+       'tflops_per_sec_p99', 'tflops_per_sec_avg', 'tflops_per_sec_max',
+       'tflops_per_sec_num_runs', 'tflops_per_sec_min', 'MFU_p50', 'MFU_p90',
+       'MFU_p95', 'MFU_p99', 'MFU_avg', 'MFU_max', 'MFU_num_runs', 'MFU_min']
+}
+
 def download_from_gcs(bucket_path: str, local_dir: str):
     """
     Downloads the content of the GCS bucket path to a local directory.
     """
     fs = gcsfs.GCSFileSystem()
-    gcs_path = bucket_path.replace("gs://", "")
-    
+    gcs_path = bucket_path.replace("gs://", "").rstrip("/") + "/"
+
     print(f"Downloading from gs://{gcs_path} to {local_dir}...")
     os.makedirs(local_dir, exist_ok=True)
     fs.get(gcs_path, local_dir, recursive=True)
 
-def aggregate_results(local_dir: str):
-    categories = ["collectives", "hbm", "host_device"]
+def aggregate_collectives(directories: list[str], picked_columns: list[str]) -> pd.DataFrame:
+    if len(directories) == 0:
+        return None
+    aggregated_df = pd.DataFrame()
+    for directory in directories:
+        file = glob.glob(f"{directory}/*.tsv")[0]
+        df = pd.read_csv(file, sep='\t')
+        df["topology"] = [file.split('/')[-4].split('-')[1] for _ in range(df.shape[0])]
+        aggregated_df = pd.concat([aggregated_df, df[picked_columns]], ignore_index=True)
+    return aggregated_df
+
+def aggregate_hbm(directories: list[str], picked_columns: list[str]) -> pd.DataFrame:
+    if len(directories) == 0:
+        return None
+    aggregated_df = pd.DataFrame()
+    for directory in directories:
+        file = glob.glob(f"{directory}/*.tsv")[0]
+        df = pd.read_csv(file, sep='\t')
+        aggregated_df = pd.concat([aggregated_df, df[picked_columns]], ignore_index=True)
+    return aggregated_df
+
+def aggregate_host_device(directories: list[str], picked_columns: list[str]) -> pd.DataFrame:
+    if len(directories) == 0:
+        return None
+    aggregated_df = pd.DataFrame()
+    for directory in directories:
+        file = glob.glob(f"{directory}/*.tsv")[0]
+        df = pd.read_csv(file, sep='\t')
+        aggregated_df = pd.concat([aggregated_df, df[picked_columns]], ignore_index=True)
+    return aggregated_df
+
+def aggregate_training(directories: list[str], picked_columns: list[str]) -> pd.DataFrame:
+    if len(directories) == 0:
+        return None
+    aggregated_df = pd.DataFrame()
+    for directory in directories:
+        files = glob.glob(f"{directory}/*.tsv")
+        for file in files:
+            df = pd.read_csv(file, sep='\t')
+            if "topology" in picked_columns:
+                df["topology"] = [file.split('/')[-4].split('-')[1] for _ in range(df.shape[0])]
+            aggregated_df = pd.concat([aggregated_df, df[picked_columns]], ignore_index=True)
+    return aggregated_df
+
+aggregate_function = {
+    "collectives": aggregate_collectives,
+    "hbm": aggregate_hbm,
+    "host_device": aggregate_host_device,
+    "training": aggregate_training
+}
+
+def aggregate_results(bucket_path: str, local_dir: str):
+    categories = ["collectives", "hbm", "host_device", "training"]
     directories = {}
+    results = {}
     for category in categories:
-        directories[category] = glob.glob(f"{local_dir}/*/{category}/*", recursive=True)
-    
-    
+        directories[category] = sorted(glob.glob(f"{local_dir}/*/{category}/*", recursive=True))
+        results[category] = aggregate_function[category](directories[category], columns_mapping[category])
+        if results[category] is not None:
+            results[category].to_csv(f"{bucket_path}/aggregated_results/{category}.csv", index=False)
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Download from GCS and aggregate results locally.")
@@ -30,4 +116,4 @@ def aggregate_results(local_dir: str):
     args = parser.parse_args()
     
     download_from_gcs(args.bucket_path, args.local_dir)
-    aggregate_results(args.local_dir)
+    aggregate_results(args.bucket_path, args.local_dir)
diff --git a/Ironwood/guides/automation/aggregator.yaml b/Ironwood/guides/automation/aggregator.yaml
index bc81bed6..12a0832f 100644
--- a/Ironwood/guides/automation/aggregator.yaml
+++ b/Ironwood/guides/automation/aggregator.yaml
@@ -1,7 +1,7 @@
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: aggregator
+  name: wait
 spec:
   template:
     spec:
@@ -20,5 +20,4 @@ spec:
 
           GCS_BUCKET_DIR=${GCS_BUCKET_ROOT_DIR}
           python Ironwood/guides/automation/aggregator.py --bucket_path=${GCS_BUCKET_DIR}
-          sleep infinity
       restartPolicy: Never
diff --git a/Ironwood/guides/automation/automation_launch.sh b/Ironwood/guides/automation/automation_launch.sh
index 198ad91c..262e06f2 100644
--- a/Ironwood/guides/automation/automation_launch.sh
+++ b/Ironwood/guides/automation/automation_launch.sh
@@ -11,9 +11,7 @@ TIMEOUT_SECOND=3600
 
 yaml_names=(
     "tpu7x-2x2x1-hbm.yaml" "tpu7x-2x2x1-host_device.yaml" "tpu7x-2x2x1-gemm.yaml" "tpu7x-2x2x1-collectives.yaml"
-    "tpu7x-2x2x2-hbm.yaml" "tpu7x-2x2x2-host_device.yaml" "tpu7x-2x2x2-gemm.yaml" "tpu7x-2x2x2-collectives.yaml"
-    "tpu7x-2x2x4-hbm.yaml" "tpu7x-2x2x4-host_device.yaml" "tpu7x-2x2x4-gemm.yaml" "tpu7x-2x2x4-collectives.yaml"
-    "tpu7x-2x4x4-hbm.yaml" "tpu7x-2x4x4-host_device.yaml" "tpu7x-2x4x4-gemm.yaml" "tpu7x-2x4x4-collectives.yaml"
+    "tpu7x-2x2x2-collectives.yaml" "tpu7x-2x2x4-collectives.yaml" "tpu7x-2x4x4-collectives.yaml" "tpu7x-4x4x4-collectives.yaml"
 )
 
 ######################################################################
diff --git a/Ironwood/guides/automation/tpu7x-2x2x2-gemm.yaml b/Ironwood/guides/automation/tpu7x-2x2x2-gemm.yaml
deleted file mode 100644
index 3f6004bd..00000000
--- a/Ironwood/guides/automation/tpu7x-2x2x2-gemm.yaml
+++ /dev/null
@@ -1,60 +0,0 @@
-apiVersion: v1
-kind: Service
-metadata:
-  name: headless-svc-${JOB_NAME}
-spec:
-  clusterIP: None
-  selector:
-    job-name: ${JOB_NAME}
----
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: ${JOB_NAME}
-  labels:
-    kueue.x-k8s.io/queue-name: user-queue-2x2x2
-spec:
-  completionMode: Indexed
-  suspend: true
-  parallelism: 2
-  completions: 2
-  backoffLimit: 0
-  template:
-    spec:
-      subdomain: headless-svc-${JOB_NAME}
-      restartPolicy: Never
-      nodeSelector:
-        cloud.google.com/gke-tpu-accelerator: tpu7x
-        cloud.google.com/gke-tpu-topology: 2x2x2
-      containers:
-      - name: jax-tpu
-        image: python:3.12
-        securityContext:
-          privileged: false
-        env:
-        - name: JAX_PLATFORMS
-          value: "tpu,cpu"
-        - name: TPU_VMODULE
-          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
-        - name: XLA_IR_DEBUG
-          value: "1"
-        - name: XLA_HLO_DEBUG
-          value: "1"
-        command:
-        - bash
-        - -c
-        - |
-          set -ex
-
-          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
-          cd accelerator-microbenchmarks
-          git checkout tpu7x-auto
-          pip install -r requirements.txt
-
-          GCS_BUCKET_DIR=${GCS_PATH}
-          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/training/gemm_multiple_run_more.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
-        resources:
-          requests:
-            google.com/tpu: 4
-          limits:
-            google.com/tpu: 4
\ No newline at end of file
diff --git a/Ironwood/guides/automation/tpu7x-2x2x2-hbm.yaml b/Ironwood/guides/automation/tpu7x-2x2x2-hbm.yaml
deleted file mode 100644
index 71793e3f..00000000
--- a/Ironwood/guides/automation/tpu7x-2x2x2-hbm.yaml
+++ /dev/null
@@ -1,60 +0,0 @@
-apiVersion: v1
-kind: Service
-metadata:
-  name: headless-svc-${JOB_NAME}
-spec:
-  clusterIP: None
-  selector:
-    job-name: ${JOB_NAME}
----
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: ${JOB_NAME}
-  labels:
-    kueue.x-k8s.io/queue-name: user-queue-2x2x2
-spec:
-  completionMode: Indexed
-  suspend: true
-  parallelism: 2
-  completions: 2
-  backoffLimit: 0
-  template:
-    spec:
-      subdomain: headless-svc-${JOB_NAME}
-      restartPolicy: Never
-      nodeSelector:
-        cloud.google.com/gke-tpu-accelerator: tpu7x
-        cloud.google.com/gke-tpu-topology: 2x2x2
-      containers:
-      - name: jax-tpu
-        image: python:3.12
-        securityContext:
-          privileged: false
-        env:
-        - name: JAX_PLATFORMS
-          value: "tpu,cpu"
-        - name: TPU_VMODULE
-          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
-        - name: XLA_IR_DEBUG
-          value: "1"
-        - name: XLA_HLO_DEBUG
-          value: "1"
-        command:
-        - bash
-        - -c
-        - |
-          set -ex
-
-          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
-          cd accelerator-microbenchmarks
-          git checkout tpu7x-auto
-          pip install -r requirements.txt
-
-          GCS_BUCKET_DIR=${GCS_PATH}
-          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/hbm/hbm.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
-        resources:
-          requests:
-            google.com/tpu: 4
-          limits:
-            google.com/tpu: 4
diff --git a/Ironwood/guides/automation/tpu7x-2x2x2-host_device.yaml b/Ironwood/guides/automation/tpu7x-2x2x2-host_device.yaml
deleted file mode 100644
index fbcd556d..00000000
--- a/Ironwood/guides/automation/tpu7x-2x2x2-host_device.yaml
+++ /dev/null
@@ -1,60 +0,0 @@
-apiVersion: v1
-kind: Service
-metadata:
-  name: headless-svc-${JOB_NAME}
-spec:
-  clusterIP: None
-  selector:
-    job-name: ${JOB_NAME}
----
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: ${JOB_NAME}
-  labels:
-    kueue.x-k8s.io/queue-name: user-queue-2x2x2
-spec:
-  completionMode: Indexed
-  suspend: true
-  parallelism: 2
-  completions: 2
-  backoffLimit: 0
-  template:
-    spec:
-      subdomain: headless-svc-${JOB_NAME}
-      restartPolicy: Never
-      nodeSelector:
-        cloud.google.com/gke-tpu-accelerator: tpu7x
-        cloud.google.com/gke-tpu-topology: 2x2x2
-      containers:
-      - name: jax-tpu
-        image: python:3.12
-        securityContext:
-          privileged: false
-        env:
-        - name: JAX_PLATFORMS
-          value: "tpu,cpu"
-        - name: TPU_VMODULE
-          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
-        - name: XLA_IR_DEBUG
-          value: "1"
-        - name: XLA_HLO_DEBUG
-          value: "1"
-        command:
-        - bash
-        - -c
-        - |
-          set -ex
-
-          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
-          cd accelerator-microbenchmarks
-          git checkout tpu7x-auto
-          pip install -r requirements.txt
-
-          GCS_BUCKET_DIR=${GCS_PATH}
-          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/host_device/host_device.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
-        resources:
-          requests:
-            google.com/tpu: 4
-          limits:
-            google.com/tpu: 4
\ No newline at end of file
diff --git a/Ironwood/guides/automation/tpu7x-2x2x4-gemm.yaml b/Ironwood/guides/automation/tpu7x-2x2x4-gemm.yaml
deleted file mode 100644
index fee014e9..00000000
--- a/Ironwood/guides/automation/tpu7x-2x2x4-gemm.yaml
+++ /dev/null
@@ -1,60 +0,0 @@
-apiVersion: v1
-kind: Service
-metadata:
-  name: headless-svc-${JOB_NAME}
-spec:
-  clusterIP: None
-  selector:
-    job-name: ${JOB_NAME}
----
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: ${JOB_NAME}
-  labels:
-    kueue.x-k8s.io/queue-name: user-queue-2x2x4
-spec:
-  completionMode: Indexed
-  suspend: true
-  parallelism: 4
-  completions: 4
-  backoffLimit: 0
-  template:
-    spec:
-      subdomain: headless-svc-${JOB_NAME}
-      restartPolicy: Never
-      nodeSelector:
-        cloud.google.com/gke-tpu-accelerator: tpu7x
-        cloud.google.com/gke-tpu-topology: 2x2x4
-      containers:
-      - name: jax-tpu
-        image: python:3.12
-        securityContext:
-          privileged: false
-        env:
-        - name: JAX_PLATFORMS
-          value: "tpu,cpu"
-        - name: TPU_VMODULE
-          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
-        - name: XLA_IR_DEBUG
-          value: "1"
-        - name: XLA_HLO_DEBUG
-          value: "1"
-        command:
-        - bash
-        - -c
-        - |
-          set -ex
-
-          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
-          cd accelerator-microbenchmarks
-          git checkout tpu7x-auto
-          pip install -r requirements.txt
-
-          GCS_BUCKET_DIR=${GCS_PATH}
-          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/training/gemm_multiple_run_more.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
-        resources:
-          requests:
-            google.com/tpu: 4
-          limits:
-            google.com/tpu: 4
\ No newline at end of file
diff --git a/Ironwood/guides/automation/tpu7x-2x2x4-hbm.yaml b/Ironwood/guides/automation/tpu7x-2x2x4-hbm.yaml
deleted file mode 100644
index 100e1f7f..00000000
--- a/Ironwood/guides/automation/tpu7x-2x2x4-hbm.yaml
+++ /dev/null
@@ -1,60 +0,0 @@
-apiVersion: v1
-kind: Service
-metadata:
-  name: headless-svc-${JOB_NAME}
-spec:
-  clusterIP: None
-  selector:
-    job-name: ${JOB_NAME}
----
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: ${JOB_NAME}
-  labels:
-    kueue.x-k8s.io/queue-name: user-queue-2x2x4
-spec:
-  completionMode: Indexed
-  suspend: true
-  parallelism: 4
-  completions: 4
-  backoffLimit: 0
-  template:
-    spec:
-      subdomain: headless-svc-${JOB_NAME}
-      restartPolicy: Never
-      nodeSelector:
-        cloud.google.com/gke-tpu-accelerator: tpu7x
-        cloud.google.com/gke-tpu-topology: 2x2x4
-      containers:
-      - name: jax-tpu
-        image: python:3.12
-        securityContext:
-          privileged: false
-        env:
-        - name: JAX_PLATFORMS
-          value: "tpu,cpu"
-        - name: TPU_VMODULE
-          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
-        - name: XLA_IR_DEBUG
-          value: "1"
-        - name: XLA_HLO_DEBUG
-          value: "1"
-        command:
-        - bash
-        - -c
-        - |
-          set -ex
-
-          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
-          cd accelerator-microbenchmarks
-          git checkout tpu7x-auto
-          pip install -r requirements.txt
-
-          GCS_BUCKET_DIR=${GCS_PATH}
-          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/hbm/hbm.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
-        resources:
-          requests:
-            google.com/tpu: 4
-          limits:
-            google.com/tpu: 4
\ No newline at end of file
diff --git a/Ironwood/guides/automation/tpu7x-2x2x4-host_device.yaml b/Ironwood/guides/automation/tpu7x-2x2x4-host_device.yaml
deleted file mode 100644
index 799c5b35..00000000
--- a/Ironwood/guides/automation/tpu7x-2x2x4-host_device.yaml
+++ /dev/null
@@ -1,60 +0,0 @@
-apiVersion: v1
-kind: Service
-metadata:
-  name: headless-svc-${JOB_NAME}
-spec:
-  clusterIP: None
-  selector:
-    job-name: ${JOB_NAME}
----
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: ${JOB_NAME}
-  labels:
-    kueue.x-k8s.io/queue-name: user-queue-2x2x4
-spec:
-  completionMode: Indexed
-  suspend: true
-  parallelism: 4
-  completions: 4
-  backoffLimit: 0
-  template:
-    spec:
-      subdomain: headless-svc-${JOB_NAME}
-      restartPolicy: Never
-      nodeSelector:
-        cloud.google.com/gke-tpu-accelerator: tpu7x
-        cloud.google.com/gke-tpu-topology: 2x2x4
-      containers:
-      - name: jax-tpu
-        image: python:3.12
-        securityContext:
-          privileged: false
-        env:
-        - name: JAX_PLATFORMS
-          value: "tpu,cpu"
-        - name: TPU_VMODULE
-          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
-        - name: XLA_IR_DEBUG
-          value: "1"
-        - name: XLA_HLO_DEBUG
-          value: "1"
-        command:
-        - bash
-        - -c
-        - |
-          set -ex
-
-          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
-          cd accelerator-microbenchmarks
-          git checkout tpu7x-auto
-          pip install -r requirements.txt
-
-          GCS_BUCKET_DIR=${GCS_PATH}
-          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/host_device/host_device.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
-        resources:
-          requests:
-            google.com/tpu: 4
-          limits:
-            google.com/tpu: 4
\ No newline at end of file
diff --git a/Ironwood/guides/automation/tpu7x-2x4x4-gemm.yaml b/Ironwood/guides/automation/tpu7x-2x4x4-gemm.yaml
deleted file mode 100644
index d51c9c31..00000000
--- a/Ironwood/guides/automation/tpu7x-2x4x4-gemm.yaml
+++ /dev/null
@@ -1,60 +0,0 @@
-apiVersion: v1
-kind: Service
-metadata:
-  name: headless-svc-${JOB_NAME}
-spec:
-  clusterIP: None
-  selector:
-    job-name: ${JOB_NAME}
----
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: ${JOB_NAME}
-  labels:
-    kueue.x-k8s.io/queue-name: user-queue-2x4x4
-spec:
-  completionMode: Indexed
-  suspend: true
-  parallelism: 8
-  completions: 8
-  backoffLimit: 0
-  template:
-    spec:
-      subdomain: headless-svc-${JOB_NAME}
-      restartPolicy: Never
-      nodeSelector:
-        cloud.google.com/gke-tpu-accelerator: tpu7x
-        cloud.google.com/gke-tpu-topology: 2x4x4
-      containers:
-      - name: jax-tpu
-        image: python:3.12
-        securityContext:
-          privileged: false
-        env:
-        - name: JAX_PLATFORMS
-          value: "tpu,cpu"
-        - name: TPU_VMODULE
-          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
-        - name: XLA_IR_DEBUG
-          value: "1"
-        - name: XLA_HLO_DEBUG
-          value: "1"
-        command:
-        - bash
-        - -c
-        - |
-          set -ex
-
-          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
-          cd accelerator-microbenchmarks
-          git checkout tpu7x-auto
-          pip install -r requirements.txt
-
-          GCS_BUCKET_DIR=${GCS_PATH}
-          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/training/gemm_multiple_run_more.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
-        resources:
-          requests:
-            google.com/tpu: 4
-          limits:
-            google.com/tpu: 4
\ No newline at end of file
diff --git a/Ironwood/guides/automation/tpu7x-2x4x4-hbm.yaml b/Ironwood/guides/automation/tpu7x-2x4x4-hbm.yaml
deleted file mode 100644
index a438b93d..00000000
--- a/Ironwood/guides/automation/tpu7x-2x4x4-hbm.yaml
+++ /dev/null
@@ -1,60 +0,0 @@
-apiVersion: v1
-kind: Service
-metadata:
-  name: headless-svc-${JOB_NAME}
-spec:
-  clusterIP: None
-  selector:
-    job-name: ${JOB_NAME}
----
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: ${JOB_NAME}
-  labels:
-    kueue.x-k8s.io/queue-name: user-queue-2x4x4
-spec:
-  completionMode: Indexed
-  suspend: true
-  parallelism: 8
-  completions: 8
-  backoffLimit: 0
-  template:
-    spec:
-      subdomain: headless-svc-${JOB_NAME}
-      restartPolicy: Never
-      nodeSelector:
-        cloud.google.com/gke-tpu-accelerator: tpu7x
-        cloud.google.com/gke-tpu-topology: 2x4x4
-      containers:
-      - name: jax-tpu
-        image: python:3.12
-        securityContext:
-          privileged: false
-        env:
-        - name: JAX_PLATFORMS
-          value: "tpu,cpu"
-        - name: TPU_VMODULE
-          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
-        - name: XLA_IR_DEBUG
-          value: "1"
-        - name: XLA_HLO_DEBUG
-          value: "1"
-        command:
-        - bash
-        - -c
-        - |
-          set -ex
-
-          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
-          cd accelerator-microbenchmarks
-          git checkout tpu7x-auto
-          pip install -r requirements.txt
-
-          GCS_BUCKET_DIR=${GCS_PATH}
-          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/hbm/hbm.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
-        resources:
-          requests:
-            google.com/tpu: 4
-          limits:
-            google.com/tpu: 4
\ No newline at end of file
diff --git a/Ironwood/guides/automation/tpu7x-2x4x4-host_device.yaml b/Ironwood/guides/automation/tpu7x-2x4x4-host_device.yaml
deleted file mode 100644
index 30a25c4c..00000000
--- a/Ironwood/guides/automation/tpu7x-2x4x4-host_device.yaml
+++ /dev/null
@@ -1,60 +0,0 @@
-apiVersion: v1
-kind: Service
-metadata:
-  name: headless-svc-${JOB_NAME}
-spec:
-  clusterIP: None
-  selector:
-    job-name: ${JOB_NAME}
----
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: ${JOB_NAME}
-  labels:
-    kueue.x-k8s.io/queue-name: user-queue-2x4x4
-spec:
-  completionMode: Indexed
-  suspend: true
-  parallelism: 8
-  completions: 8
-  backoffLimit: 0
-  template:
-    spec:
-      subdomain: headless-svc-${JOB_NAME}
-      restartPolicy: Never
-      nodeSelector:
-        cloud.google.com/gke-tpu-accelerator: tpu7x
-        cloud.google.com/gke-tpu-topology: 2x4x4
-      containers:
-      - name: jax-tpu
-        image: python:3.12
-        securityContext:
-          privileged: false
-        env:
-        - name: JAX_PLATFORMS
-          value: "tpu,cpu"
-        - name: TPU_VMODULE
-          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
-        - name: XLA_IR_DEBUG
-          value: "1"
-        - name: XLA_HLO_DEBUG
-          value: "1"
-        command:
-        - bash
-        - -c
-        - |
-          set -ex
-
-          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
-          cd accelerator-microbenchmarks
-          git checkout tpu7x-auto
-          pip install -r requirements.txt
-
-          GCS_BUCKET_DIR=${GCS_PATH}
-          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/host_device/host_device.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
-        resources:
-          requests:
-            google.com/tpu: 4
-          limits:
-            google.com/tpu: 4
\ No newline at end of file
diff --git a/Ironwood/guides/automation/tpu7x-4x4x4-gemm.yaml b/Ironwood/guides/automation/tpu7x-4x4x4-gemm.yaml
deleted file mode 100644
index ef9220f6..00000000
--- a/Ironwood/guides/automation/tpu7x-4x4x4-gemm.yaml
+++ /dev/null
@@ -1,60 +0,0 @@
-apiVersion: v1
-kind: Service
-metadata:
-  name: headless-svc-${JOB_NAME}
-spec:
-  clusterIP: None
-  selector:
-    job-name: ${JOB_NAME}
----
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: ${JOB_NAME}
-  labels:
-    kueue.x-k8s.io/queue-name: user-queue-4x4x4
-spec:
-  completionMode: Indexed
-  suspend: true
-  parallelism: 16
-  completions: 16
-  backoffLimit: 0
-  template:
-    spec:
-      subdomain: headless-svc-${JOB_NAME}
-      restartPolicy: Never
-      nodeSelector:
-        cloud.google.com/gke-tpu-accelerator: tpu7x
-        cloud.google.com/gke-tpu-topology: 4x4x4
-      containers:
-      - name: jax-tpu
-        image: python:3.12
-        securityContext:
-          privileged: false
-        env:
-        - name: JAX_PLATFORMS
-          value: "tpu,cpu"
-        - name: TPU_VMODULE
-          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
-        - name: XLA_IR_DEBUG
-          value: "1"
-        - name: XLA_HLO_DEBUG
-          value: "1"
-        command:
-        - bash
-        - -c
-        - |
-          set -ex
-
-          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
-          cd accelerator-microbenchmarks
-          git checkout tpu7x-auto
-          pip install -r requirements.txt
-
-          GCS_BUCKET_DIR=${GCS_PATH}
-          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/training/gemm_multiple_run_more.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
-        resources:
-          requests:
-            google.com/tpu: 4
-          limits:
-            google.com/tpu: 4
\ No newline at end of file
diff --git a/Ironwood/guides/automation/tpu7x-4x4x4-hbm.yaml b/Ironwood/guides/automation/tpu7x-4x4x4-hbm.yaml
deleted file mode 100644
index 0819edad..00000000
--- a/Ironwood/guides/automation/tpu7x-4x4x4-hbm.yaml
+++ /dev/null
@@ -1,60 +0,0 @@
-apiVersion: v1
-kind: Service
-metadata:
-  name: headless-svc-${JOB_NAME}
-spec:
-  clusterIP: None
-  selector:
-    job-name: ${JOB_NAME}
----
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: ${JOB_NAME}
-  labels:
-    kueue.x-k8s.io/queue-name: user-queue-4x4x4
-spec:
-  completionMode: Indexed
-  suspend: true
-  parallelism: 16
-  completions: 16
-  backoffLimit: 0
-  template:
-    spec:
-      subdomain: headless-svc-${JOB_NAME}
-      restartPolicy: Never
-      nodeSelector:
-        cloud.google.com/gke-tpu-accelerator: tpu7x
-        cloud.google.com/gke-tpu-topology: 4x4x4
-      containers:
-      - name: jax-tpu
-        image: python:3.12
-        securityContext:
-          privileged: false
-        env:
-        - name: JAX_PLATFORMS
-          value: "tpu,cpu"
-        - name: TPU_VMODULE
-          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
-        - name: XLA_IR_DEBUG
-          value: "1"
-        - name: XLA_HLO_DEBUG
-          value: "1"
-        command:
-        - bash
-        - -c
-        - |
-          set -ex
-
-          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
-          cd accelerator-microbenchmarks
-          git checkout tpu7x-auto
-          pip install -r requirements.txt
-
-          GCS_BUCKET_DIR=${GCS_PATH}
-          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/hbm/hbm.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
-        resources:
-          requests:
-            google.com/tpu: 4
-          limits:
-            google.com/tpu: 4
\ No newline at end of file
diff --git a/Ironwood/guides/automation/tpu7x-4x4x4-host_device.yaml b/Ironwood/guides/automation/tpu7x-4x4x4-host_device.yaml
deleted file mode 100644
index da7a4e7c..00000000
--- a/Ironwood/guides/automation/tpu7x-4x4x4-host_device.yaml
+++ /dev/null
@@ -1,60 +0,0 @@
-apiVersion: v1
-kind: Service
-metadata:
-  name: headless-svc-${JOB_NAME}
-spec:
-  clusterIP: None
-  selector:
-    job-name: ${JOB_NAME}
----
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: ${JOB_NAME}
-  labels:
-    kueue.x-k8s.io/queue-name: user-queue-4x4x4
-spec:
-  completionMode: Indexed
-  suspend: true
-  parallelism: 16
-  completions: 16
-  backoffLimit: 0
-  template:
-    spec:
-      subdomain: headless-svc-${JOB_NAME}
-      restartPolicy: Never
-      nodeSelector:
-        cloud.google.com/gke-tpu-accelerator: tpu7x
-        cloud.google.com/gke-tpu-topology: 4x4x4
-      containers:
-      - name: jax-tpu
-        image: python:3.12
-        securityContext:
-          privileged: false
-        env:
-        - name: JAX_PLATFORMS
-          value: "tpu,cpu"
-        - name: TPU_VMODULE
-          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
-        - name: XLA_IR_DEBUG
-          value: "1"
-        - name: XLA_HLO_DEBUG
-          value: "1"
-        command:
-        - bash
-        - -c
-        - |
-          set -ex
-
-          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
-          cd accelerator-microbenchmarks
-          git checkout tpu7x-auto
-          pip install -r requirements.txt
-
-          GCS_BUCKET_DIR=${GCS_PATH}
-          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/host_device/host_device.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
-        resources:
-          requests:
-            google.com/tpu: 4
-          limits:
-            google.com/tpu: 4
\ No newline at end of file
diff --git a/Ironwood/guides/automation/tpu7x-4x4x8-gemm.yaml b/Ironwood/guides/automation/tpu7x-4x4x8-gemm.yaml
deleted file mode 100644
index 65f93467..00000000
--- a/Ironwood/guides/automation/tpu7x-4x4x8-gemm.yaml
+++ /dev/null
@@ -1,60 +0,0 @@
-apiVersion: v1
-kind: Service
-metadata:
-  name: headless-svc-${JOB_NAME}
-spec:
-  clusterIP: None
-  selector:
-    job-name: ${JOB_NAME}
----
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: ${JOB_NAME}
-  labels:
-    kueue.x-k8s.io/queue-name: user-queue-4x4x8
-spec:
-  completionMode: Indexed
-  suspend: true
-  parallelism: 32
-  completions: 32
-  backoffLimit: 0
-  template:
-    spec:
-      subdomain: headless-svc-${JOB_NAME}
-      restartPolicy: Never
-      nodeSelector:
-        cloud.google.com/gke-tpu-accelerator: tpu7x
-        cloud.google.com/gke-tpu-topology: 4x4x8
-      containers:
-      - name: jax-tpu
-        image: python:3.12
-        securityContext:
-          privileged: false
-        env:
-        - name: JAX_PLATFORMS
-          value: "tpu,cpu"
-        - name: TPU_VMODULE
-          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
-        - name: XLA_IR_DEBUG
-          value: "1"
-        - name: XLA_HLO_DEBUG
-          value: "1"
-        command:
-        - bash
-        - -c
-        - |
-          set -ex
-
-          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
-          cd accelerator-microbenchmarks
-          git checkout tpu7x-auto
-          pip install -r requirements.txt
-
-          GCS_BUCKET_DIR=${GCS_PATH}
-          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/training/gemm_multiple_run_more.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
-        resources:
-          requests:
-            google.com/tpu: 4
-          limits:
-            google.com/tpu: 4
\ No newline at end of file
diff --git a/Ironwood/guides/automation/tpu7x-4x4x8-hbm.yaml b/Ironwood/guides/automation/tpu7x-4x4x8-hbm.yaml
deleted file mode 100644
index e6445b54..00000000
--- a/Ironwood/guides/automation/tpu7x-4x4x8-hbm.yaml
+++ /dev/null
@@ -1,60 +0,0 @@
-apiVersion: v1
-kind: Service
-metadata:
-  name: headless-svc-${JOB_NAME}
-spec:
-  clusterIP: None
-  selector:
-    job-name: ${JOB_NAME}
----
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: ${JOB_NAME}
-  labels:
-    kueue.x-k8s.io/queue-name: user-queue-4x4x8
-spec:
-  completionMode: Indexed
-  suspend: true
-  parallelism: 32
-  completions: 32
-  backoffLimit: 0
-  template:
-    spec:
-      subdomain: headless-svc-${JOB_NAME}
-      restartPolicy: Never
-      nodeSelector:
-        cloud.google.com/gke-tpu-accelerator: tpu7x
-        cloud.google.com/gke-tpu-topology: 4x4x8
-      containers:
-      - name: jax-tpu
-        image: python:3.12
-        securityContext:
-          privileged: false
-        env:
-        - name: JAX_PLATFORMS
-          value: "tpu,cpu"
-        - name: TPU_VMODULE
-          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
-        - name: XLA_IR_DEBUG
-          value: "1"
-        - name: XLA_HLO_DEBUG
-          value: "1"
-        command:
-        - bash
-        - -c
-        - |
-          set -ex
-
-          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
-          cd accelerator-microbenchmarks
-          git checkout tpu7x-auto
-          pip install -r requirements.txt
-
-          GCS_BUCKET_DIR=${GCS_PATH}
-          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/hbm/hbm.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
-        resources:
-          requests:
-            google.com/tpu: 4
-          limits:
-            google.com/tpu: 4
\ No newline at end of file
diff --git a/Ironwood/guides/automation/tpu7x-4x4x8-host_device.yaml b/Ironwood/guides/automation/tpu7x-4x4x8-host_device.yaml
deleted file mode 100644
index 09d2a5ab..00000000
--- a/Ironwood/guides/automation/tpu7x-4x4x8-host_device.yaml
+++ /dev/null
@@ -1,60 +0,0 @@
-apiVersion: v1
-kind: Service
-metadata:
-  name: headless-svc-${JOB_NAME}
-spec:
-  clusterIP: None
-  selector:
-    job-name: ${JOB_NAME}
----
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: ${JOB_NAME}
-  labels:
-    kueue.x-k8s.io/queue-name: user-queue-4x4x8
-spec:
-  completionMode: Indexed
-  suspend: true
-  parallelism: 32
-  completions: 32
-  backoffLimit: 0
-  template:
-    spec:
-      subdomain: headless-svc-${JOB_NAME}
-      restartPolicy: Never
-      nodeSelector:
-        cloud.google.com/gke-tpu-accelerator: tpu7x
-        cloud.google.com/gke-tpu-topology: 4x4x8
-      containers:
-      - name: jax-tpu
-        image: python:3.12
-        securityContext:
-          privileged: false
-        env:
-        - name: JAX_PLATFORMS
-          value: "tpu,cpu"
-        - name: TPU_VMODULE
-          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
-        - name: XLA_IR_DEBUG
-          value: "1"
-        - name: XLA_HLO_DEBUG
-          value: "1"
-        command:
-        - bash
-        - -c
-        - |
-          set -ex
-
-          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
-          cd accelerator-microbenchmarks
-          git checkout tpu7x-auto
-          pip install -r requirements.txt
-
-          GCS_BUCKET_DIR=${GCS_PATH}
-          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/host_device/host_device.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
-        resources:
-          requests:
-            google.com/tpu: 4
-          limits:
-            google.com/tpu: 4
\ No newline at end of file

From 9ea7110fb14e7dd2aa52b68a30b97eb420c9b029 Mon Sep 17 00:00:00 2001
From: Hong-Yi Lin <horacehylin@google.com>
Date: Thu, 29 Jan 2026 19:02:00 +0000
Subject: [PATCH 16/88] [Automation] Update aggregator

---
 Ironwood/guides/automation/aggregator.py   | 55 ++++++++++++----------
 Ironwood/guides/automation/aggregator.yaml |  4 +-
 2 files changed, 31 insertions(+), 28 deletions(-)

diff --git a/Ironwood/guides/automation/aggregator.py b/Ironwood/guides/automation/aggregator.py
index 471a4ca8..655debe6 100644
--- a/Ironwood/guides/automation/aggregator.py
+++ b/Ironwood/guides/automation/aggregator.py
@@ -6,35 +6,38 @@
 
 columns_mapping = {
     "collectives": [
-        "topology", "input_num_elements", "transferred_data (GB)", "dtype_bytes", "hlo_input_shape", "hlo_output_shape", "test_name",
-        'step_time_ms_p50', 'step_time_ms_p90', 'step_time_ms_p95', 'step_time_ms_p99', 'step_time_ms_avg', 'step_time_ms_max', 'step_time_ms_num_runs', 'step_time_ms_min',
-        'achieved_bw (GB/s)_p50', 'achieved_bw (GB/s)_p90', 'achieved_bw (GB/s)_p95', 'achieved_bw (GB/s)_p99', 'achieved_bw (GB/s)_avg', 'achieved_bw (GB/s)_max', 'achieved_bw (GB/s)_num_runs', 'achieved_bw (GB/s)_min',
+        "topology", "op_type", "input_num_elements", "transferred_data (GB)", "dtype_bytes", "hlo_input_shape", "hlo_output_shape",
+        "step_time_ms_p50", "step_time_ms_p90", "step_time_ms_p95", "step_time_ms_p99", "step_time_ms_avg", "step_time_ms_max", "step_time_ms_num_runs", "step_time_ms_min",
+        "achieved_bw (GB/s)_p50", "achieved_bw (GB/s)_p90", "achieved_bw (GB/s)_p95", "achieved_bw (GB/s)_p99", "achieved_bw (GB/s)_avg", "achieved_bw (GB/s)_max", "achieved_bw (GB/s)_num_runs", "achieved_bw (GB/s)_min",
     ],
     "hbm": [
-        'num_elements', 'dtype', 'tensor_size_gbytes', 'test_name',
-        'time_ms_p50', 'time_ms_p90', 'time_ms_p95', 'time_ms_p99', 'time_ms_avg', 'time_ms_max', 'time_ms_num_runs', 'time_ms_min',
-        'bw_gbyte_sec_p50', 'bw_gbyte_sec_p90', 'bw_gbyte_sec_p95', 'bw_gbyte_sec_p99', 'bw_gbyte_sec_avg', 'bw_gbyte_sec_max', 'bw_gbyte_sec_num_runs', 'bw_gbyte_sec_min',
+        "num_elements", "dtype", "tensor_size_gbytes",
+        "time_ms_p50", "time_ms_p90", "time_ms_p95", "time_ms_p99", "time_ms_avg", "time_ms_max", "time_ms_num_runs", "time_ms_min",
+        "bw_gbyte_sec_p50", "bw_gbyte_sec_p90", "bw_gbyte_sec_p95", "bw_gbyte_sec_p99", "bw_gbyte_sec_avg", "bw_gbyte_sec_max", "bw_gbyte_sec_num_runs", "bw_gbyte_sec_min",
     ],
     "host_device": [
-        'data_size_mib', 'H2D_bw (GiB/s)_p50', 'H2D_bw (GiB/s)_p90',
-       'H2D_bw (GiB/s)_p95', 'H2D_bw (GiB/s)_p99', 'H2D_bw (GiB/s)_avg',
-       'H2D_bw (GiB/s)_max', 'H2D_bw (GiB/s)_num_runs', 'H2D_bw (GiB/s)_min',
-       'D2H_bw (GiB/s)_p50', 'D2H_bw (GiB/s)_p90', 'D2H_bw (GiB/s)_p95',
-       'D2H_bw (GiB/s)_p99', 'D2H_bw (GiB/s)_avg', 'D2H_bw (GiB/s)_max',
-       'D2H_bw (GiB/s)_num_runs', 'D2H_bw (GiB/s)_min'],
-    "training": ['m', 'n', 'k', 'dtype', 'StepTime(median,ms)',
-       'Throughput(median,TFLOP/s/device)', 'TotalThroughput(median,TFLOP/s)',
-       'MFU', 'total_flops', 'test_name', 'step_time_ms_p50', 'step_time_ms_p90',
-       'step_time_ms_p95', 'step_time_ms_p99', 'step_time_ms_avg',
-       'step_time_ms_max', 'step_time_ms_num_runs', 'step_time_ms_min',
-       'tflops_per_sec_pre_device_p50', 'tflops_per_sec_pre_device_p90',
-       'tflops_per_sec_pre_device_p95', 'tflops_per_sec_pre_device_p99',
-       'tflops_per_sec_pre_device_avg', 'tflops_per_sec_pre_device_max',
-       'tflops_per_sec_pre_device_num_runs', 'tflops_per_sec_pre_device_min',
-       'tflops_per_sec_p50', 'tflops_per_sec_p90', 'tflops_per_sec_p95',
-       'tflops_per_sec_p99', 'tflops_per_sec_avg', 'tflops_per_sec_max',
-       'tflops_per_sec_num_runs', 'tflops_per_sec_min', 'MFU_p50', 'MFU_p90',
-       'MFU_p95', 'MFU_p99', 'MFU_avg', 'MFU_max', 'MFU_num_runs', 'MFU_min']
+        "data_size_mib", "H2D_bw (GiB/s)_p50", "H2D_bw (GiB/s)_p90",
+        "H2D_bw (GiB/s)_p95", "H2D_bw (GiB/s)_p99", "H2D_bw (GiB/s)_avg",
+        "H2D_bw (GiB/s)_max", "H2D_bw (GiB/s)_num_runs", "H2D_bw (GiB/s)_min",
+        "D2H_bw (GiB/s)_p50", "D2H_bw (GiB/s)_p90", "D2H_bw (GiB/s)_p95",
+        "D2H_bw (GiB/s)_p99", "D2H_bw (GiB/s)_avg", "D2H_bw (GiB/s)_max",
+        "D2H_bw (GiB/s)_num_runs", "D2H_bw (GiB/s)_min"
+    ],
+    "training": [
+        "m", "n", "k", "dtype", "StepTime(median,ms)",
+        "Throughput(median,TFLOP/s/device)", "TotalThroughput(median,TFLOP/s)",
+        "MFU", "total_flops", "step_time_ms_p50", "step_time_ms_p90",
+        "step_time_ms_p95", "step_time_ms_p99", "step_time_ms_avg",
+        "step_time_ms_max", "step_time_ms_num_runs", "step_time_ms_min",
+        "tflops_per_sec_pre_device_p50", "tflops_per_sec_pre_device_p90",
+        "tflops_per_sec_pre_device_p95", "tflops_per_sec_pre_device_p99",
+        "tflops_per_sec_pre_device_avg", "tflops_per_sec_pre_device_max",
+        "tflops_per_sec_pre_device_num_runs", "tflops_per_sec_pre_device_min",
+        "tflops_per_sec_p50", "tflops_per_sec_p90", "tflops_per_sec_p95",
+        "tflops_per_sec_p99", "tflops_per_sec_avg", "tflops_per_sec_max",
+        "tflops_per_sec_num_runs", "tflops_per_sec_min", "MFU_p50", "MFU_p90",
+        "MFU_p95", "MFU_p99", "MFU_avg", "MFU_max", "MFU_num_runs", "MFU_min"
+    ],
 }
 
 def download_from_gcs(bucket_path: str, local_dir: str):
@@ -107,7 +110,7 @@ def aggregate_results(bucket_path: str, local_dir: str):
         directories[category] = sorted(glob.glob(f"{local_dir}/*/{category}/*", recursive=True))
         results[category] = aggregate_function[category](directories[category], columns_mapping[category])
         if results[category] is not None:
-            results[category].to_csv(f"{bucket_path}/aggregated_results/{category}.csv", index=False)
+            results[category].to_csv(f"{bucket_path}/aggregated_results/{category}.tsv", index=False, sep='\t')
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Download from GCS and aggregate results locally.")
diff --git a/Ironwood/guides/automation/aggregator.yaml b/Ironwood/guides/automation/aggregator.yaml
index 12a0832f..5a2c89a3 100644
--- a/Ironwood/guides/automation/aggregator.yaml
+++ b/Ironwood/guides/automation/aggregator.yaml
@@ -1,7 +1,7 @@
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: wait
+  name: aggregator
 spec:
   template:
     spec:
@@ -16,7 +16,7 @@ spec:
           git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
           cd accelerator-microbenchmarks
           git checkout tpu7x-auto
-          pip install -r requirements.txt
+          pip install gcsfs pandas
 
           GCS_BUCKET_DIR=${GCS_BUCKET_ROOT_DIR}
           python Ironwood/guides/automation/aggregator.py --bucket_path=${GCS_BUCKET_DIR}

From bb5fc2f455f2b1edf7ba291727b3c0352ce4d0cf Mon Sep 17 00:00:00 2001
From: "Yu-Hsuan (Amy) Lin" <amylin@google.com>
Date: Fri, 30 Jan 2026 06:46:58 +0000
Subject: [PATCH 17/88] Add dtype to H2D/D2H

---
 Ironwood/guides/automation/automation_launch.sh | 10 ++++++++--
 Ironwood/src/benchmark_host_device.py           |  1 +
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/Ironwood/guides/automation/automation_launch.sh b/Ironwood/guides/automation/automation_launch.sh
index 262e06f2..386370de 100644
--- a/Ironwood/guides/automation/automation_launch.sh
+++ b/Ironwood/guides/automation/automation_launch.sh
@@ -10,8 +10,14 @@ MAX_RETRIES=3
 TIMEOUT_SECOND=3600
 
 yaml_names=(
-    "tpu7x-2x2x1-hbm.yaml" "tpu7x-2x2x1-host_device.yaml" "tpu7x-2x2x1-gemm.yaml" "tpu7x-2x2x1-collectives.yaml"
-    "tpu7x-2x2x2-collectives.yaml" "tpu7x-2x2x4-collectives.yaml" "tpu7x-2x4x4-collectives.yaml" "tpu7x-4x4x4-collectives.yaml"
+    "tpu7x-2x2x1-hbm.yaml"
+    "tpu7x-2x2x1-host_device.yaml"
+    "tpu7x-2x2x1-gemm.yaml"
+    "tpu7x-2x2x1-collectives.yaml"
+    "tpu7x-2x2x2-collectives.yaml"
+    "tpu7x-2x2x4-collectives.yaml"
+    "tpu7x-2x4x4-collectives.yaml"
+    "tpu7x-4x4x4-collectives.yaml"
 )
 
 ######################################################################
diff --git a/Ironwood/src/benchmark_host_device.py b/Ironwood/src/benchmark_host_device.py
index f745eb48..16352e2a 100644
--- a/Ironwood/src/benchmark_host_device.py
+++ b/Ironwood/src/benchmark_host_device.py
@@ -107,6 +107,7 @@ def benchmark_host_device_calculate_metrics(
         "data_size_mib", 
     }
     metadata = {k: v for k, v in params if k in metadata_keys}
+    metadata["dtype"] = "float32"
     
     metrics = {}
     

From 5315132d526743e61a021530dad8c4edc203fbf3 Mon Sep 17 00:00:00 2001
From: Hong-Yi Lin <horacehylin@google.com>
Date: Fri, 30 Jan 2026 07:34:01 +0000
Subject: [PATCH 18/88] [Automation] Automatically delete aggregator after
 completion

---
 Ironwood/guides/automation/automation_launch.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Ironwood/guides/automation/automation_launch.sh b/Ironwood/guides/automation/automation_launch.sh
index 386370de..1f544499 100644
--- a/Ironwood/guides/automation/automation_launch.sh
+++ b/Ironwood/guides/automation/automation_launch.sh
@@ -157,6 +157,8 @@ echo "Jobs completed. Aggregating results..."
 echo ""
 
 envsubst '${GCS_BUCKET_ROOT_DIR}' < ${SCRIPT_DIR}/aggregator.yaml | kubectl apply -f -
+wait_for_job_completion "aggregator" ${TIMEOUT_SECOND}
+envsubst '${GCS_BUCKET_ROOT_DIR}' < ${SCRIPT_DIR}/aggregator.yaml | kubectl delete -f -
 
 # Print the failed jobs at the end for better visibility.
 

From 4bdf77fb4345e8198057dda30d0b5470e7dd6df8 Mon Sep 17 00:00:00 2001
From: "Yu-Hsuan (Amy) Lin" <amylin@google.com>
Date: Fri, 30 Jan 2026 07:34:45 +0000
Subject: [PATCH 19/88] Update README with kueue and reformat

---
 Ironwood/guides/automation/README.md | 109 +++++++++++++++++++++------
 1 file changed, 88 insertions(+), 21 deletions(-)

diff --git a/Ironwood/guides/automation/README.md b/Ironwood/guides/automation/README.md
index 05e2e2d1..3c463c67 100644
--- a/Ironwood/guides/automation/README.md
+++ b/Ironwood/guides/automation/README.md
@@ -1,40 +1,107 @@
-# Ironwood Automation Tool
+# Ironwood Benchmark Automation
 
-This directory contains the automation scripts for running TPU microbenchmarks. The tool simplifies the process of launching multiple benchmark jobs, waiting for their completion, and aggregating the results into a unified format.
+This directory contains the automation framework for running TPU microbenchmarks (HBM, Host-Device, Collectives, etc.) on GKE clusters. The tool simplifies the workflow of launching multiple benchmark jobs via [Kueue](https://kueue.sigs.k8s.io/), monitoring their status, handling retries, and aggregating the final results into a unified format.
+
+## Overview
+
+The automation workflow consists of three main stages:
+1.  **Launch**: Submits Kubernetes Jobs for various benchmark configurations (e.g., different topologies like 2x2x1, 2x2x2) using Kueue for queue management.
+2.  **Monitor & Retry**: Watches the jobs until completion. If any job fails, it is automatically retried (up to 3 times by default).
+3.  **Aggregate**: Once all jobs succeed, an aggregator job is launched to collect all intermediate results from GCS and consolidate them into summary TSV files.
 
 ## Prerequisites
 
 Before running the automation script, ensure the following requirements are met:
 
-1.  **Node Pool Topology**: The script expects specific TPU node pools to be available in your cluster.
-    *   The `check_node_pool_setup.sh` script validates this.
-2.  **GCS Bucket**: You must have a Google Cloud Storage (GCS) bucket for the intermediate and final results.
-    *   This can be setup by `gcloud storage buckets create gs://my-unique-bucket-name --location=us-central1`
-3.  **Kubectl**: Ensure `kubectl` is configured and connected to your GKE cluster.
+### 1. Environment Setup
+*   **GKE Cluster**: You must have a GKE cluster with TPU node pools configured.
+*   **Kubectl**: Ensure `kubectl` is installed and authenticated to your cluster.
+*   **GCS Bucket**: A Google Cloud Storage bucket is required to store intermediate and final aggregated results.
+    ```bash
+    gcloud storage buckets create gs://my-unique-bucket-name --location=us-central1
+    ```
+
+### 2. Install Kueue
+The automation relies on Kueue for job queuing. Check if it's already installed:
+
+```bash
+kubectl get namespace kueue-system
+```
+
+If you see `Error from server (NotFound)`, install it with:
+
+```bash
+kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.16.0/manifests.yaml
+```
+
+### 3. Verify Node Pool Topology
+The script expects specific TPU node pools (e.g., `tpu7x-2x2x1`, `tpu7x-2x2x2`) to be available. The `check_node_pool_setup.sh` utility will automatically validate this before launching jobs.
+
+## Directory Structure
+
+*   `automation_launch.sh`: The main entry point script. Manages the full lifecycle of the benchmark run.
+*   `check_node_pool_setup.sh`: Validation script to ensure required node pools exist in the cluster.
+*   `aggregator.py`: Python script that downloads results from GCS and produces summary tables.
+*   `aggregator.yaml`: Kubernetes Job definition for running the aggregator.
+*   `job-queue.yaml`: Kueue resource definitions (ClusterQueue, LocalQueue).
+*   `*.yaml`: Benchmark job configurations (e.g., `tpu7x-2x2x1-hbm.yaml`).
+
+## Configuration
+
+You can configure the behavior using the following environment variable:
 
-## User Journey
+| Variable | Description | Required | Default |
+| :--- | :--- | :--- | :--- |
+| `GCS_BUCKET_ROOT_DIR` | The root GCS path where results will be stored. Must start with `gs://`. | **Yes** | `gs://amylin-microbenchmark` (Change this!) |
 
-1.  **Clone & Checkout Branch**.
+## Usage Guide
+
+1.  **Clone the Repository**:
     ```bash
     git clone https://github.com/google/accelerator-microbenchmarks.git
     cd accelerator-microbenchmarks
+    # Switch to the correct branch if necessary
     git checkout tpu7x-auto
     ```
 
-2.  **Setup Environment**: Ensure your node pools are set up and you have prepared a GCS bucket.
-
-3.  **Run Automation Script**:
-    The main script is `automation_launch.sh`. You need to set the `GCS_BUCKET_ROOT_DIR` environment variable before running it.
+2.  **Set the GCS Bucket**:
+    Export the path to your GCS bucket. This is where all results will be saved.
+    ```bash
+    export GCS_BUCKET_ROOT_DIR="gs://your-unique-bucket-name/benchmark_runs/$(date +%Y%m%d_%H%M%S)"
+    ```
 
+3.  **Run the Automation Script**:
+    Execute the launch script from the root of the repository.
     ```bash
-    # Replace with your actual bucket path (must start with gs://)
-    export GCS_BUCKET_ROOT_DIR="gs://your-bucket-name/automation_results"
-    
-    # Run the launch script
     bash Ironwood/guides/automation/automation_launch.sh
     ```
 
-4.  **Retrieve Results**:
-    After the script completes, the final aggregated TSV files will be available in your GCS bucket. The script generates a timestamped directory for each run.
-    *   Check the script output for the exact path: `The intermediate result will be written to gs://...`
-    *   Look for the `final` directory under that path (e.g., `gs://your-bucket/automation_results/<timestamp>/final`).
+    **What happens next?**
+    *   The script validates your node pools.
+    *   It applies the Kueue job queues.
+    *   It submits the benchmark jobs defined in the script (e.g., HBM tests).
+    *   It waits for jobs to finish, retrying any failures up to 3 times.
+    *   Finally, it launches the `aggregator` job.
+
+## Output
+
+After the automation completes, check your GCS bucket (`GCS_BUCKET_ROOT_DIR`). You will find:
+
+*   **`aggregated_results/`**: Contains the final summary CSV/TSV files (e.g., `hbm.tsv`, `collectives.tsv`).
+*   **`<job-name>/`**: Directories for each individual job containing intermediate results.
+
+## Troubleshooting
+
+### Job Failures
+If jobs fail even after retries:
+1.  Check the script output to see which specific jobs failed.
+2.  Inspect the logs of a failed job using `kubectl logs job/<job-name>`.
+3.  Manually retry a specific job if needed using the command printed by the script at the end of the run.
+
+### Missing Results
+If the `aggregated_results` folder is empty:
+1.  Check the logs of the aggregator job while it still exists (the launch script deletes the job after waiting for it to complete):
+    ```bash
+    kubectl logs job/aggregator
+    ```
+2.  Ensure that `GCS_BUCKET_ROOT_DIR` is accessible to the pods (check Workload Identity or service account permissions if running in a restricted project).

From 2de55f4dc44e35830dfa04a7bf7a29e247574331 Mon Sep 17 00:00:00 2001
From: "Yu-Hsuan (Amy) Lin" <amylin@google.com>
Date: Fri, 30 Jan 2026 07:45:52 +0000
Subject: [PATCH 20/88] Add dtype to aggregator H2D method

---
 Ironwood/guides/automation/aggregator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Ironwood/guides/automation/aggregator.py b/Ironwood/guides/automation/aggregator.py
index 655debe6..c4e3cc20 100644
--- a/Ironwood/guides/automation/aggregator.py
+++ b/Ironwood/guides/automation/aggregator.py
@@ -16,7 +16,7 @@
         "bw_gbyte_sec_p50", "bw_gbyte_sec_p90", "bw_gbyte_sec_p95", "bw_gbyte_sec_p99", "bw_gbyte_sec_avg", "bw_gbyte_sec_max", "bw_gbyte_sec_num_runs", "bw_gbyte_sec_min",
     ],
     "host_device": [
-        "data_size_mib", "H2D_bw (GiB/s)_p50", "H2D_bw (GiB/s)_p90",
+        "data_size_mib", "dtype", "H2D_bw (GiB/s)_p50", "H2D_bw (GiB/s)_p90",
         "H2D_bw (GiB/s)_p95", "H2D_bw (GiB/s)_p99", "H2D_bw (GiB/s)_avg",
         "H2D_bw (GiB/s)_max", "H2D_bw (GiB/s)_num_runs", "H2D_bw (GiB/s)_min",
         "D2H_bw (GiB/s)_p50", "D2H_bw (GiB/s)_p90", "D2H_bw (GiB/s)_p95",

From ed9f6ef017896f6aa14fc759d18fc15386263900 Mon Sep 17 00:00:00 2001
From: Hong-Yi Lin <horacehylin@google.com>
Date: Fri, 30 Jan 2026 07:51:59 +0000
Subject: [PATCH 21/88] Remove unnecessary columns when aggregating and fix a
 typo of per_device

---
 Ironwood/guides/automation/aggregator.py | 53 ++++++++++--------------
 Ironwood/src/benchmark_utils.py          |  2 +-
 2 files changed, 23 insertions(+), 32 deletions(-)

diff --git a/Ironwood/guides/automation/aggregator.py b/Ironwood/guides/automation/aggregator.py
index c4e3cc20..d0d5f8a3 100644
--- a/Ironwood/guides/automation/aggregator.py
+++ b/Ironwood/guides/automation/aggregator.py
@@ -6,37 +6,28 @@
 
 columns_mapping = {
     "collectives": [
-        "topology", "op_type", "input_num_elements", "transferred_data (GB)", "dtype_bytes", "hlo_input_shape", "hlo_output_shape",
-        "step_time_ms_p50", "step_time_ms_p90", "step_time_ms_p95", "step_time_ms_p99", "step_time_ms_avg", "step_time_ms_max", "step_time_ms_num_runs", "step_time_ms_min",
-        "achieved_bw (GB/s)_p50", "achieved_bw (GB/s)_p90", "achieved_bw (GB/s)_p95", "achieved_bw (GB/s)_p99", "achieved_bw (GB/s)_avg", "achieved_bw (GB/s)_max", "achieved_bw (GB/s)_num_runs", "achieved_bw (GB/s)_min",
+        "topology", "op_type", "input_num_elements", "transferred_data (GB)", "dtype_bytes", "step_time_ms_num_runs",
+        "achieved_bw (GB/s)_p50", "achieved_bw (GB/s)_p90", "achieved_bw (GB/s)_p95", "achieved_bw (GB/s)_p99", "achieved_bw (GB/s)_avg", "achieved_bw (GB/s)_min", "achieved_bw (GB/s)_max",
+        "step_time_ms_p50", "step_time_ms_p90", "step_time_ms_p95", "step_time_ms_p99", "step_time_ms_avg", "step_time_ms_min", "step_time_ms_max",
     ],
     "hbm": [
-        "num_elements", "dtype", "tensor_size_gbytes",
-        "time_ms_p50", "time_ms_p90", "time_ms_p95", "time_ms_p99", "time_ms_avg", "time_ms_max", "time_ms_num_runs", "time_ms_min",
-        "bw_gbyte_sec_p50", "bw_gbyte_sec_p90", "bw_gbyte_sec_p95", "bw_gbyte_sec_p99", "bw_gbyte_sec_avg", "bw_gbyte_sec_max", "bw_gbyte_sec_num_runs", "bw_gbyte_sec_min",
+        "dtype", "tensor_size_gbytes", "time_ms_num_runs",
+        "bw_gbyte_sec_p50", "bw_gbyte_sec_p90", "bw_gbyte_sec_p95", "bw_gbyte_sec_p99", "bw_gbyte_sec_avg", "bw_gbyte_sec_min", "bw_gbyte_sec_max",
+        "time_ms_p50", "time_ms_p90", "time_ms_p95", "time_ms_p99", "time_ms_avg", "time_ms_min", "time_ms_max",
     ],
     "host_device": [
-        "data_size_mib", "dtype", "H2D_bw (GiB/s)_p50", "H2D_bw (GiB/s)_p90",
-        "H2D_bw (GiB/s)_p95", "H2D_bw (GiB/s)_p99", "H2D_bw (GiB/s)_avg",
-        "H2D_bw (GiB/s)_max", "H2D_bw (GiB/s)_num_runs", "H2D_bw (GiB/s)_min",
-        "D2H_bw (GiB/s)_p50", "D2H_bw (GiB/s)_p90", "D2H_bw (GiB/s)_p95",
-        "D2H_bw (GiB/s)_p99", "D2H_bw (GiB/s)_avg", "D2H_bw (GiB/s)_max",
-        "D2H_bw (GiB/s)_num_runs", "D2H_bw (GiB/s)_min"
+        "data_size_mib", "H2D_bw (GiB/s)_num_runs",
+        "H2D_bw (GiB/s)_p50", "H2D_bw (GiB/s)_p90", "H2D_bw (GiB/s)_p95", "H2D_bw (GiB/s)_p99", 
+        "H2D_bw (GiB/s)_avg", "H2D_bw (GiB/s)_min", "H2D_bw (GiB/s)_max",
+        "D2H_bw (GiB/s)_p50", "D2H_bw (GiB/s)_p90", "D2H_bw (GiB/s)_p95", "D2H_bw (GiB/s)_p99", 
+        "D2H_bw (GiB/s)_avg", "D2H_bw (GiB/s)_min", "D2H_bw (GiB/s)_max",
     ],
     "training": [
-        "m", "n", "k", "dtype", "StepTime(median,ms)",
-        "Throughput(median,TFLOP/s/device)", "TotalThroughput(median,TFLOP/s)",
-        "MFU", "total_flops", "step_time_ms_p50", "step_time_ms_p90",
-        "step_time_ms_p95", "step_time_ms_p99", "step_time_ms_avg",
-        "step_time_ms_max", "step_time_ms_num_runs", "step_time_ms_min",
-        "tflops_per_sec_pre_device_p50", "tflops_per_sec_pre_device_p90",
-        "tflops_per_sec_pre_device_p95", "tflops_per_sec_pre_device_p99",
-        "tflops_per_sec_pre_device_avg", "tflops_per_sec_pre_device_max",
-        "tflops_per_sec_pre_device_num_runs", "tflops_per_sec_pre_device_min",
-        "tflops_per_sec_p50", "tflops_per_sec_p90", "tflops_per_sec_p95",
-        "tflops_per_sec_p99", "tflops_per_sec_avg", "tflops_per_sec_max",
-        "tflops_per_sec_num_runs", "tflops_per_sec_min", "MFU_p50", "MFU_p90",
-        "MFU_p95", "MFU_p99", "MFU_avg", "MFU_max", "MFU_num_runs", "MFU_min"
+        "m", "n", "k", "dtype", "step_time_ms_num_runs",
+        "tflops_per_sec_per_device_p50", "tflops_per_sec_per_device_p90",
+        "tflops_per_sec_per_device_p95", "tflops_per_sec_per_device_p99",
+        "tflops_per_sec_per_device_avg", "tflops_per_sec_per_device_min",
+        "tflops_per_sec_per_device_max",
     ],
 }
 
@@ -59,7 +50,7 @@ def aggregate_collectives(directories: list[str], picked_columns: list[str]) ->
         file = glob.glob(f"{directory}/*.tsv")[0]
         df = pd.read_csv(file, sep='\t')
         df["topology"] = [file.split('/')[-4].split('-')[1] for _ in range(df.shape[0])]
-        aggregated_df = pd.concat([aggregated_df, df[picked_columns]], ignore_index=True)
+        aggregated_df = pd.concat([aggregated_df, df[picked_columns].rename(columns={"step_time_ms_num_runs": "num_runs"})], ignore_index=True)
     return aggregated_df
 
 def aggregate_hbm(directories: list[str], picked_columns: list[str]) -> pd.DataFrame:
@@ -69,7 +60,7 @@ def aggregate_hbm(directories: list[str], picked_columns: list[str]) -> pd.DataF
     for directory in directories:
         file = glob.glob(f"{directory}/*.tsv")[0]
         df = pd.read_csv(file, sep='\t')
-        aggregated_df = pd.concat([aggregated_df, df[picked_columns]], ignore_index=True)
+        aggregated_df = pd.concat([aggregated_df, df[picked_columns].rename(columns={"time_ms_num_runs": "num_runs"})], ignore_index=True)
     return aggregated_df
 
 def aggregate_host_device(directories: list[str], picked_columns: list[str]) -> pd.DataFrame:
@@ -79,10 +70,10 @@ def aggregate_host_device(directories: list[str], picked_columns: list[str]) ->
     for directory in directories:
         file = glob.glob(f"{directory}/*.tsv")[0]
         df = pd.read_csv(file, sep='\t')
-        aggregated_df = pd.concat([aggregated_df, df[picked_columns]], ignore_index=True)
+        aggregated_df = pd.concat([aggregated_df, df[picked_columns].rename(columns={"H2D_bw (GiB/s)_num_runs": "num_runs"})], ignore_index=True)
     return aggregated_df
 
-def aggregate_training(directories: list[str], picked_columns: list[str]) -> pd.DataFrame:
+def aggregate_gemm(directories: list[str], picked_columns: list[str]) -> pd.DataFrame:
     if len(directories) == 0:
         return None
     aggregated_df = pd.DataFrame()
@@ -92,14 +83,14 @@ def aggregate_training(directories: list[str], picked_columns: list[str]) -> pd.
             df = pd.read_csv(file, sep='\t')
             if "topology" in picked_columns:
                 df["topology"] = [file.split('/')[-4].split('-')[1] for _ in range(df.shape[0])]
-            aggregated_df = pd.concat([aggregated_df, df[picked_columns]], ignore_index=True)
+            aggregated_df = pd.concat([aggregated_df, df[picked_columns].rename(columns={"step_time_ms_num_runs": "num_runs"})], ignore_index=True)
     return aggregated_df
 
 aggregate_function = {
     "collectives": aggregate_collectives,
     "hbm": aggregate_hbm,
     "host_device": aggregate_host_device,
-    "training": aggregate_training
+    "training": aggregate_gemm,
 }
 
 def aggregate_results(bucket_path: str, local_dir: str):
diff --git a/Ironwood/src/benchmark_utils.py b/Ironwood/src/benchmark_utils.py
index e28f39e4..fa1fb81c 100644
--- a/Ironwood/src/benchmark_utils.py
+++ b/Ironwood/src/benchmark_utils.py
@@ -1134,7 +1134,7 @@ def unified_flops_metrics(
         metrics_list=time_ms_list, metrics_name="step_time_ms"
     )
     tflops_per_sec_statistics = MetricsStatistics(
-        metrics_list=tflops_per_sec_list, metrics_name="tflops_per_sec_pre_device"
+        metrics_list=tflops_per_sec_list, metrics_name="tflops_per_sec_per_device"
     )
     tflops_per_sec_all_devices_statistics = MetricsStatistics(
         metrics_list=tflops_per_sec_all_devices, metrics_name="tflops_per_sec"

From aad5d9db02dd3bb016e64bddcdabd6693c0444e2 Mon Sep 17 00:00:00 2001
From: Hong-Yi Lin <horacehylin@google.com>
Date: Fri, 30 Jan 2026 08:01:11 +0000
Subject: [PATCH 22/88] Create config folder and modify Kubernetes yaml for gemm
 test

---
 .../configs/gemm/gemm_multiple_run_more.yaml  | 75 +++++++++++++++++++
 .../guides/automation/tpu7x-2x2x1-gemm.yaml   |  2 +-
 2 files changed, 76 insertions(+), 1 deletion(-)
 create mode 100644 Ironwood/configs/gemm/gemm_multiple_run_more.yaml

diff --git a/Ironwood/configs/gemm/gemm_multiple_run_more.yaml b/Ironwood/configs/gemm/gemm_multiple_run_more.yaml
new file mode 100644
index 00000000..ea89f98b
--- /dev/null
+++ b/Ironwood/configs/gemm/gemm_multiple_run_more.yaml
@@ -0,0 +1,75 @@
+benchmarks:
+- benchmark_name: "gemm_multiple_run"
+  trace_dir: "../microbenchmarks/gemm_multiple_run_bf16"
+  csv_path: "../microbenchmarks/gemm_multiple_run_bf16"
+  xlml_metrics_dir: "../microbenchmarks/gemm_multiple_run_bf16"
+  xla_dump_dir: "../microbenchmarks/gemm_multiple_run_bf16/hlo_graphs"
+  benchmark_sweep_params:
+  - {m: 128, k: 128, n: 128, num_runs: 100, dtype: 'bfloat16'}
+  - {m: 256, k: 256, n: 256, num_runs: 100, dtype: 'bfloat16'}
+  - {m: 512, k: 512, n: 512, num_runs: 100, dtype: 'bfloat16'}
+  - {m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'bfloat16'}
+  - {m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'bfloat16'}
+  - {m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'bfloat16'}
+  - {m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'bfloat16'}
+  - {m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'bfloat16'}
+
+- benchmark_name: "gemm_multiple_run"
+  trace_dir: "../microbenchmarks/gemm_multiple_run_f32"
+  csv_path: "../microbenchmarks/gemm_multiple_run_f32"
+  xlml_metrics_dir: "../microbenchmarks/gemm_multiple_run_f32"
+  xla_dump_dir: "../microbenchmarks/gemm_multiple_run_f32/hlo_graphs"
+  benchmark_sweep_params:
+  - {m: 128, k: 128, n: 128, num_runs: 100, dtype: 'float32'}
+  - {m: 256, k: 256, n: 256, num_runs: 100, dtype: 'float32'}
+  - {m: 512, k: 512, n: 512, num_runs: 100, dtype: 'float32'}
+  - {m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'float32'}
+  - {m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'float32'}
+  - {m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'float32'}
+  - {m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float32'}
+  - {m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float32'}
+
+- benchmark_name: "gemm_multiple_run"
+  trace_dir: "../microbenchmarks/gemm_multiple_run_fp16"
+  csv_path: "../microbenchmarks/gemm_multiple_run_fp16"
+  xlml_metrics_dir: "../microbenchmarks/gemm_multiple_run_fp16"
+  xla_dump_dir: "../microbenchmarks/gemm_multiple_run_fp16/hlo_graphs"
+  benchmark_sweep_params:
+  - {m: 128, k: 128, n: 128, num_runs: 100, dtype: 'float16'}
+  - {m: 256, k: 256, n: 256, num_runs: 100, dtype: 'float16'}
+  - {m: 512, k: 512, n: 512, num_runs: 100, dtype: 'float16'}
+  - {m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'float16'}
+  - {m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'float16'}
+  - {m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'float16'}
+  - {m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float16'}
+  - {m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float16'}
+
+- benchmark_name: "gemm_multiple_run"
+  trace_dir: "../microbenchmarks/gemm_multiple_run_fp8"
+  csv_path: "../microbenchmarks/gemm_multiple_run_fp8"
+  xlml_metrics_dir: "../microbenchmarks/gemm_multiple_run_fp8"
+  xla_dump_dir: "../microbenchmarks/gemm_multiple_run_fp8/hlo_graphs"
+  benchmark_sweep_params:
+  - {m: 128, k: 128, n: 128, num_runs: 100, dtype: 'float8'}
+  - {m: 256, k: 256, n: 256, num_runs: 100, dtype: 'float8'}
+  - {m: 512, k: 512, n: 512, num_runs: 100, dtype: 'float8'}
+  - {m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'float8'}
+  - {m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'float8'}
+  - {m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'float8'}
+  - {m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float8'}
+  - {m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float8'}
+
+- benchmark_name: "gemm_multiple_run"
+  trace_dir: "../microbenchmarks/gemm_multiple_run_fp4"
+  csv_path: "../microbenchmarks/gemm_multiple_run_fp4"
+  xlml_metrics_dir: "../microbenchmarks/gemm_multiple_run_fp4"
+  xla_dump_dir: "../microbenchmarks/gemm_multiple_run_fp4/hlo_graphs"
+  benchmark_sweep_params:
+  - {m: 128, k: 128, n: 128, num_runs: 100, dtype: 'float4'}
+  - {m: 256, k: 256, n: 256, num_runs: 100, dtype: 'float4'}
+  - {m: 512, k: 512, n: 512, num_runs: 100, dtype: 'float4'}
+  - {m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'float4'}
+  - {m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'float4'}
+  - {m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'float4'}
+  - {m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float4'}
+  - {m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float4'}
\ No newline at end of file
diff --git a/Ironwood/guides/automation/tpu7x-2x2x1-gemm.yaml b/Ironwood/guides/automation/tpu7x-2x2x1-gemm.yaml
index 1c9fa143..8181941b 100644
--- a/Ironwood/guides/automation/tpu7x-2x2x1-gemm.yaml
+++ b/Ironwood/guides/automation/tpu7x-2x2x1-gemm.yaml
@@ -52,7 +52,7 @@ spec:
           pip install -r requirements.txt
 
           GCS_BUCKET_DIR=${GCS_PATH}
-          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/training/gemm_multiple_run_more.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/gemm/gemm_multiple_run_more.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
         resources:
           requests:
             google.com/tpu: 4

From cb79abbba193efa7e12de32e792271b21e0dad5e Mon Sep 17 00:00:00 2001
From: Hong-Yi Lin <horacehylin@google.com>
Date: Fri, 30 Jan 2026 08:32:10 +0000
Subject: [PATCH 23/88] Update aggregator for gemm test

---
 Ironwood/guides/automation/aggregator.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/Ironwood/guides/automation/aggregator.py b/Ironwood/guides/automation/aggregator.py
index d0d5f8a3..931d5ab3 100644
--- a/Ironwood/guides/automation/aggregator.py
+++ b/Ironwood/guides/automation/aggregator.py
@@ -22,7 +22,7 @@
         "D2H_bw (GiB/s)_p50", "D2H_bw (GiB/s)_p90", "D2H_bw (GiB/s)_p95", "D2H_bw (GiB/s)_p99", 
         "D2H_bw (GiB/s)_avg", "D2H_bw (GiB/s)_min", "D2H_bw (GiB/s)_max",
     ],
-    "training": [
+    "gemm": [
         "m", "n", "k", "dtype", "step_time_ms_num_runs",
         "tflops_per_sec_per_device_p50", "tflops_per_sec_per_device_p90",
         "tflops_per_sec_per_device_p95", "tflops_per_sec_per_device_p99",
@@ -81,8 +81,6 @@ def aggregate_gemm(directories: list[str], picked_columns: list[str]) -> pd.Data
         files = glob.glob(f"{directory}/*.tsv")
         for file in files:
             df = pd.read_csv(file, sep='\t')
-            if "topology" in picked_columns:
-                df["topology"] = [file.split('/')[-4].split('-')[1] for _ in range(df.shape[0])]
             aggregated_df = pd.concat([aggregated_df, df[picked_columns].rename(columns={"step_time_ms_num_runs": "num_runs"})], ignore_index=True)
     return aggregated_df
 
@@ -90,11 +88,11 @@ def aggregate_gemm(directories: list[str], picked_columns: list[str]) -> pd.Data
     "collectives": aggregate_collectives,
     "hbm": aggregate_hbm,
     "host_device": aggregate_host_device,
-    "training": aggregate_gemm,
+    "gemm": aggregate_gemm,
 }
 
 def aggregate_results(bucket_path: str, local_dir: str):
-    categories = ["collectives", "hbm", "host_device", "training"]
+    categories = ["collectives", "hbm", "host_device", "gemm"]
     directories = {}
     results = {}
     for category in categories:

From b10e6bb275e83519c056faa320cb57325cc5d7a9 Mon Sep 17 00:00:00 2001
From: Hong-Yi Lin <horacehylin@google.com>
Date: Fri, 30 Jan 2026 09:11:43 +0000
Subject: [PATCH 24/88] Add dtype string in aggregated TSV file

---
 Ironwood/guides/automation/aggregator.py | 2 +-
 Ironwood/src/benchmark_collectives.py    | 2 ++
 Ironwood/src/benchmark_hbm.py            | 2 +-
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/Ironwood/guides/automation/aggregator.py b/Ironwood/guides/automation/aggregator.py
index 931d5ab3..26f53d87 100644
--- a/Ironwood/guides/automation/aggregator.py
+++ b/Ironwood/guides/automation/aggregator.py
@@ -6,7 +6,7 @@
 
 columns_mapping = {
     "collectives": [
-        "topology", "op_type", "input_num_elements", "transferred_data (GB)", "dtype_bytes", "step_time_ms_num_runs",
+        "topology", "op_type", "input_num_elements", "transferred_data (GB)", "dtype", "step_time_ms_num_runs",
         "achieved_bw (GB/s)_p50", "achieved_bw (GB/s)_p90", "achieved_bw (GB/s)_p95", "achieved_bw (GB/s)_p99", "achieved_bw (GB/s)_avg", "achieved_bw (GB/s)_min", "achieved_bw (GB/s)_max",
         "step_time_ms_p50", "step_time_ms_p90", "step_time_ms_p95", "step_time_ms_p99", "step_time_ms_avg", "step_time_ms_min", "step_time_ms_max",
     ],
diff --git a/Ironwood/src/benchmark_collectives.py b/Ironwood/src/benchmark_collectives.py
index 69b4d212..e142f59b 100644
--- a/Ironwood/src/benchmark_collectives.py
+++ b/Ironwood/src/benchmark_collectives.py
@@ -98,6 +98,7 @@ def unified_ici_collectives_metrics(
   hlo_first_replica_group = []
 
   input_num_elements = matrix_shape[0] * matrix_shape[1] * matrix_shape[2]
+  dtype_name = dtype.dtype.name
   dtype_bytes = dtype.dtype.itemsize
   if xla_output:
     xla_output_json = json.loads(xla_output)
@@ -169,6 +170,7 @@ def unified_ici_collectives_metrics(
       "input_num_elements": input_num_elements,
       "matrix_shape": json.dumps(f"({matrix_shape})"),
       "transferred_data (GB)": transferred_data,
+      "dtype": dtype_name,
       "dtype_bytes": dtype_bytes,
       "hlo_input_shape": json.dumps(hlo_input_shape),
       "hlo_output_shape": json.dumps(hlo_output_shape),
diff --git a/Ironwood/src/benchmark_hbm.py b/Ironwood/src/benchmark_hbm.py
index bb279f42..53744b5d 100644
--- a/Ironwood/src/benchmark_hbm.py
+++ b/Ironwood/src/benchmark_hbm.py
@@ -29,7 +29,7 @@ def get_metrics_helper(
         for key, value in params
         if value is not None and key not in exclude_keys
     }
-    metadata["dtype"] = metadata["dtype"].dtype.itemsize
+    metadata["dtype"] = metadata["dtype"].dtype.name
     return metadata
 
 
From 6f525bf9eb34ba79df9132e8c70a716b308b6658 Mon Sep 17 00:00:00 2001
From: Hong-Yi Lin <horacehylin@google.com>
Date: Fri, 30 Jan 2026 09:23:55 +0000
Subject: [PATCH 25/88] Add multiple precisions for HBM test

---
 Ironwood/configs/hbm/hbm.yaml            | 29 ++++++++++++++++++++----
 Ironwood/guides/automation/aggregator.py | 14 +++++++-----
 2 files changed, 33 insertions(+), 10 deletions(-)

diff --git a/Ironwood/configs/hbm/hbm.yaml b/Ironwood/configs/hbm/hbm.yaml
index d2c76e29..1912cb17 100644
--- a/Ironwood/configs/hbm/hbm.yaml
+++ b/Ironwood/configs/hbm/hbm.yaml
@@ -2,7 +2,28 @@ benchmarks:
 - benchmark_name: "single_device_hbm_copy"
   benchmark_sweep_params:
   - {num_elements_range: {start: 1048576, end: 4294967296, multiplier: 2}, dtype: "bfloat16", num_runs: 1}
-  trace_dir: "../microbenchmarks/hbm"
-  csv_path: "../microbenchmarks/hbm"
-  xlml_metrics_dir: "../microbenchmarks/hbm"
-  xla_dump_dir: "../microbenchmarks/hbm/hlo_graphs"
\ No newline at end of file
+  trace_dir: "../microbenchmarks/hbm_bfloat16"
+  csv_path: "../microbenchmarks/hbm_bfloat16"
+  xlml_metrics_dir: "../microbenchmarks/hbm_bfloat16"
+  xla_dump_dir: "../microbenchmarks/hbm_bfloat16/hlo_graphs"
+- benchmark_name: "single_device_hbm_copy"
+  benchmark_sweep_params:
+  - {num_elements_range: {start: 1048576, end: 4294967296, multiplier: 2}, dtype: "float32", num_runs: 1}
+  trace_dir: "../microbenchmarks/hbm_float32"
+  csv_path: "../microbenchmarks/hbm_float32"
+  xlml_metrics_dir: "../microbenchmarks/hbm_float32"
+  xla_dump_dir: "../microbenchmarks/hbm_float32/hlo_graphs"
+- benchmark_name: "single_device_hbm_copy"
+  benchmark_sweep_params:
+  - {num_elements_range: {start: 1048576, end: 4294967296, multiplier: 2}, dtype: "float8", num_runs: 1}
+  trace_dir: "../microbenchmarks/hbm_float8"
+  csv_path: "../microbenchmarks/hbm_float8"
+  xlml_metrics_dir: "../microbenchmarks/hbm_float8"
+  xla_dump_dir: "../microbenchmarks/hbm_float8/hlo_graphs"
+- benchmark_name: "single_device_hbm_copy"
+  benchmark_sweep_params:
+  - {num_elements_range: {start: 1048576, end: 4294967296, multiplier: 2}, dtype: "float16", num_runs: 1}
+  trace_dir: "../microbenchmarks/hbm_float16"
+  csv_path: "../microbenchmarks/hbm_float16"
+  xlml_metrics_dir: "../microbenchmarks/hbm_float16"
+  xla_dump_dir: "../microbenchmarks/hbm_float16/hlo_graphs"
\ No newline at end of file
diff --git a/Ironwood/guides/automation/aggregator.py b/Ironwood/guides/automation/aggregator.py
index 26f53d87..bc5e5e24 100644
--- a/Ironwood/guides/automation/aggregator.py
+++ b/Ironwood/guides/automation/aggregator.py
@@ -58,9 +58,10 @@ def aggregate_hbm(directories: list[str], picked_columns: list[str]) -> pd.DataF
         return None
     aggregated_df = pd.DataFrame()
     for directory in directories:
-        file = glob.glob(f"{directory}/*.tsv")[0]
-        df = pd.read_csv(file, sep='\t')
-        aggregated_df = pd.concat([aggregated_df, df[picked_columns].rename(columns={"time_ms_num_runs": "num_runs"})], ignore_index=True)
+        files = glob.glob(f"{directory}/*.tsv")
+        for file in files:
+            df = pd.read_csv(file, sep='\t')
+            aggregated_df = pd.concat([aggregated_df, df[picked_columns].rename(columns={"time_ms_num_runs": "num_runs"})], ignore_index=True)
     return aggregated_df
 
 def aggregate_host_device(directories: list[str], picked_columns: list[str]) -> pd.DataFrame:
@@ -68,9 +69,10 @@ def aggregate_host_device(directories: list[str], picked_columns: list[str]) ->
         return None
     aggregated_df = pd.DataFrame()
     for directory in directories:
-        file = glob.glob(f"{directory}/*.tsv")[0]
-        df = pd.read_csv(file, sep='\t')
-        aggregated_df = pd.concat([aggregated_df, df[picked_columns].rename(columns={"H2D_bw (GiB/s)_num_runs": "num_runs"})], ignore_index=True)
+        files = glob.glob(f"{directory}/*.tsv")
+        for file in files:
+            df = pd.read_csv(file, sep='\t')
+            aggregated_df = pd.concat([aggregated_df, df[picked_columns].rename(columns={"H2D_bw (GiB/s)_num_runs": "num_runs"})], ignore_index=True)
     return aggregated_df
 
 def aggregate_gemm(directories: list[str], picked_columns: list[str]) -> pd.DataFrame:

From 69661f97afa44e434898c97226bfecab7f7a015a Mon Sep 17 00:00:00 2001
From: "Yu-Hsuan (Amy) Lin" <amylin@google.com>
Date: Fri, 30 Jan 2026 13:44:00 +0000
Subject: [PATCH 26/88] Print pending process status every minute

---
 .../guides/automation/automation_launch.sh    | 85 +++++++++++++------
 1 file changed, 61 insertions(+), 24 deletions(-)

diff --git a/Ironwood/guides/automation/automation_launch.sh b/Ironwood/guides/automation/automation_launch.sh
index 1f544499..9b65bbf7 100644
--- a/Ironwood/guides/automation/automation_launch.sh
+++ b/Ironwood/guides/automation/automation_launch.sh
@@ -83,7 +83,6 @@ wait_for_job_completion() {
 # Returns a list of failed yaml files in the variable FAILED_JOBS
 apply_and_wait() {
     local yaml_files=("$@")
-    local pids=()
     local job_names_in_batch=()
     FAILED_JOBS=()
 
@@ -102,29 +101,67 @@ apply_and_wait() {
         job_names_in_batch+=("${JOB_NAME}")
     done
 
-    # Wait for completion in background
+    # Poll job status until every job has finished or the batch times out
+    local start_time=$(date +%s)
+    local end_time=$((start_time + TIMEOUT_SECOND))
+    local last_print_time=0
+    
+    while true; do
+        local current_time=$(date +%s)
+        if [[ $current_time -gt $end_time ]]; then
+            echo "Timeout waiting for batch completion"
+            break
+        fi
+
+        # Identify active jobs
+        local active_jobs=()
+        for job_name in "${job_names_in_batch[@]}"; do
+            # Check for Complete
+            if kubectl get job "${job_name}" -o jsonpath='{.status.conditions[?(@.type=="Complete")].status}' 2>/dev/null | grep -q "True"; then
+                continue
+            fi
+            
+            # Check for Failed
+            if kubectl get job "${job_name}" -o jsonpath='{.status.conditions[?(@.type=="Failed")].status}' 2>/dev/null | grep -q "True"; then
+                continue
+            fi
+            
+            # If neither, it's pending/running
+            active_jobs+=("${job_name}")
+        done
+
+        if [[ ${#active_jobs[@]} -eq 0 ]]; then
+            break
+        fi
+
+        # Print a status table of the still-active jobs at most once every 60 seconds
+        if [[ $((current_time - last_print_time)) -ge 60 ]]; then
+            echo "======================================================================"
+            date "+%Y-%m-%d %H:%M:%S"
+            echo "----------------------------------------------------------------------"
+            kubectl get jobs "${active_jobs[@]}"
+            echo "======================================================================"
+            last_print_time=$current_time
+        fi
+        
+        sleep 10
+    done
+
+    # Collect results and clean up
+    FAILED_JOBS=()
     for i in "${!yaml_files[@]}"; do
         local yaml_file="${yaml_files[$i]}"
-        local filepath="${SCRIPT_DIR}/${yaml_file}"
         local job_name="${job_names_in_batch[$i]}"
-        export GCS_PATH="${GCS_BUCKET_ROOT_DIR}/${job_name}"
-        (
-            wait_for_job_completion "${job_name}" ${TIMEOUT_SECOND}
-            wait_status=$?
-
-            export JOB_NAME="${job_name}"
-            envsubst '${JOB_NAME} ${GCS_PATH}' < "${filepath}" | kubectl delete -f - &> /dev/null
-            exit $wait_status
-        ) &
-        pids+=($!)
-    done
-
-    # Collect results
-    for i in "${!pids[@]}"; do
-        wait "${pids[$i]}"
-        if [[ $? -ne 0 ]]; then
-            FAILED_JOBS+=("${yaml_files[$i]}")
+        local filepath="${SCRIPT_DIR}/${yaml_file}"
+        
+        # Any job not marked Complete (failed, or still running at timeout) is recorded in FAILED_JOBS
+        if ! kubectl get job "${job_name}" -o jsonpath='{.status.conditions[?(@.type=="Complete")].status}' 2>/dev/null | grep -q "True"; then
+             FAILED_JOBS+=("${yaml_files[$i]}")
         fi
+        
+        export JOB_NAME="${job_name}"
+        export GCS_PATH="${GCS_BUCKET_ROOT_DIR}/${job_name}"
+        envsubst '${JOB_NAME} ${GCS_PATH}' < "${filepath}" | kubectl delete -f - &> /dev/null
     done
 }
 
@@ -133,15 +170,15 @@ current_batch=("${yaml_names[@]}")
 
 for (( retry=1; retry<=MAX_RETRIES; retry++ )); do
     apply_and_wait "${current_batch[@]}"
-    
+
     if [[ ${#FAILED_JOBS[@]} -eq 0 ]]; then
         echo "All jobs completed successfully in Round ${retry}!"
         break
     fi
-    
+
     echo "Round ${retry} finished. ${#FAILED_JOBS[@]} jobs failed."
     current_batch=("${FAILED_JOBS[@]}")
-    
+
     if [[ ${retry} -lt ${MAX_RETRIES} ]]; then
         echo "Retrying failed jobs..."
         echo "========================================"
@@ -174,4 +211,4 @@ if [[ ${#FAILED_JOBS[@]} -gt 0 ]]; then
     done
 else
     echo "Success! All jobs finished."
-fi
\ No newline at end of file
+fi

From 3e4b59a0d8a5e87bea51f8c934895a4c9fda3ac5 Mon Sep 17 00:00:00 2001
From: Chi Shuen Lee <chishuen@google.com>
Date: Tue, 27 Jan 2026 15:38:39 +0800
Subject: [PATCH 27/88] Revert the changes that were made for an urgent demo
 (#90)

---
 Ironwood/configs/collectives/all_gather_1d.yaml        |  3 ++-
 Ironwood/configs/collectives/all_gather_2d.yaml        |  3 ++-
 Ironwood/configs/collectives/all_gather_3d.yaml        |  3 ++-
 Ironwood/configs/collectives/all_gather_demo.yaml      | 10 +++++++---
 .../configs/collectives/all_gather_tpu7x_2x2x1.yaml    |  3 ++-
 .../configs/collectives/all_gather_tpu7x_2x2x2.yaml    |  3 ++-
 .../configs/collectives/all_gather_tpu7x_2x2x4.yaml    |  3 ++-
 .../configs/collectives/all_gather_tpu7x_2x4x4.yaml    |  3 ++-
 .../configs/collectives/all_gather_tpu7x_4x4x4.yaml    |  5 +++--
 .../configs/collectives/all_gather_tpu7x_4x4x8.yaml    |  3 ++-
 Ironwood/configs/collectives/all_reduce_1d.yaml        |  3 ++-
 Ironwood/configs/collectives/all_reduce_2d.yaml        |  3 ++-
 Ironwood/configs/collectives/all_reduce_3d.yaml        |  3 ++-
 .../configs/collectives/all_reduce_tpu7x_2x2x1.yaml    |  3 ++-
 .../configs/collectives/all_reduce_tpu7x_2x2x2.yaml    |  3 ++-
 .../configs/collectives/all_reduce_tpu7x_2x2x4.yaml    |  3 ++-
 .../configs/collectives/all_reduce_tpu7x_2x4x4.yaml    |  3 ++-
 .../configs/collectives/all_reduce_tpu7x_4x4x4.yaml    |  3 ++-
 .../configs/collectives/all_reduce_tpu7x_4x4x8.yaml    |  3 ++-
 Ironwood/configs/collectives/all_to_all_1d.yaml        |  2 +-
 Ironwood/configs/collectives/all_to_all_2d.yaml        |  2 +-
 Ironwood/configs/collectives/all_to_all_3d.yaml        |  2 +-
 .../configs/collectives/all_to_all_tpu7x_2x2x1.yaml    |  3 ++-
 .../configs/collectives/all_to_all_tpu7x_2x2x2.yaml    |  3 ++-
 .../configs/collectives/all_to_all_tpu7x_2x2x4.yaml    |  3 ++-
 .../configs/collectives/all_to_all_tpu7x_2x4x4.yaml    |  3 ++-
 .../configs/collectives/all_to_all_tpu7x_4x4x4.yaml    |  3 ++-
 .../configs/collectives/all_to_all_tpu7x_4x4x8.yaml    |  3 ++-
 Ironwood/configs/collectives/reduce_scatter_1d.yaml    |  3 ++-
 Ironwood/configs/collectives/reduce_scatter_2d.yaml    |  3 ++-
 30 files changed, 63 insertions(+), 33 deletions(-)

diff --git a/Ironwood/configs/collectives/all_gather_1d.yaml b/Ironwood/configs/collectives/all_gather_1d.yaml
index 0b1313dc..85d8fc3e 100644
--- a/Ironwood/configs/collectives/all_gather_1d.yaml
+++ b/Ironwood/configs/collectives/all_gather_1d.yaml
@@ -1,7 +1,8 @@
 benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "1x1x4", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
   trace_dir: "../microbenchmarks/all_gather_1d"
   csv_path: "../microbenchmarks/all_gather_1d"
   xlml_metrics_dir: "../microbenchmarks/all_gather_1d"
diff --git a/Ironwood/configs/collectives/all_gather_2d.yaml b/Ironwood/configs/collectives/all_gather_2d.yaml
index c45f3e70..2d7a0e7a 100644
--- a/Ironwood/configs/collectives/all_gather_2d.yaml
+++ b/Ironwood/configs/collectives/all_gather_2d.yaml
@@ -1,7 +1,8 @@
 benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2, num_runs: 5}
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2, num_runs: 5} # Parallel Replica Groups
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x32", ici_size_range: 128, sharding_strategy: "1x32", op_dimension: 2, num_runs: 5} # Non Parallel Replica Groups
   trace_dir: "../microbenchmarks/all_gather_2d"
   csv_path: "../microbenchmarks/all_gather_2d"
   xlml_metrics_dir: "../microbenchmarks/all_gather_2d"
diff --git a/Ironwood/configs/collectives/all_gather_3d.yaml b/Ironwood/configs/collectives/all_gather_3d.yaml
index e159adfd..cc876a08 100644
--- a/Ironwood/configs/collectives/all_gather_3d.yaml
+++ b/Ironwood/configs/collectives/all_gather_3d.yaml
@@ -1,7 +1,8 @@
 benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "64x2", ici_size_range: 128, sharding_strategy: "64x1", op_dimension: 3, num_runs: 5}
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "64x2", ici_size_range: 128, sharding_strategy: "64x1", op_dimension: 3, num_runs: 5} # Parallel Replica Groups
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "4x4x8", op_dimension: 3, num_runs: 5} # Non Parallel Replica Groups
   trace_dir: "../microbenchmarks/all_gather_3d"
   csv_path: "../microbenchmarks/all_gather_3d"
   xlml_metrics_dir: "../microbenchmarks/all_gather_3d"
diff --git a/Ironwood/configs/collectives/all_gather_demo.yaml b/Ironwood/configs/collectives/all_gather_demo.yaml
index a9d776cd..6fb5a757 100644
--- a/Ironwood/configs/collectives/all_gather_demo.yaml
+++ b/Ironwood/configs/collectives/all_gather_demo.yaml
@@ -1,9 +1,13 @@
 benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1}
-  - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32",  mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2}
-  - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32",  mesh_shape: "64x2", ici_size_range: 128, sharding_strategy: "64x1", op_dimension: 3}
+  - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1} # Parallel Replica
+  - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "1x1x4", op_dimension: 1} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32",  mesh_shape: "4x32", ici_size_range: 128, sharding_strategy: "1x32", op_dimension: 2} # Non Parallel Replica Groups
+  - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32",  mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2} # Parallel Replica Groups
+  - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32",  mesh_shape: "64x2", ici_size_range: 128, sharding_strategy: "64x1", op_dimension: 3} # Parallel Replica Groups
+  - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "4x4x8", op_dimension: 3} # Non Parallel Replica Groups
+
 
   warmup_tries: 10
   trace_dir: "../microbenchmarks/all_gather_demo"
diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml
index 0338aef1..9bc586a1 100644
--- a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml
+++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml
@@ -1,7 +1,8 @@
 benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5} # Parallel Replica
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
   trace_dir: "../microbenchmarks/all_gather_tpu7x_2x2x1"
   csv_path: "../microbenchmarks/all_gather_tpu7x_2x2x1"
   xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x2x1"
diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml
index 9253bac5..b5be0c8d 100644
--- a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml
+++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml
@@ -1,7 +1,8 @@
 benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5} # Parallel Replica
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
   trace_dir: "../microbenchmarks/all_gather_tpu7x_2x2x2"
   csv_path: "../microbenchmarks/all_gather_tpu7x_2x2x2"
   xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x2x2"
diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml
index 9f8af67f..09b02979 100644
--- a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml
+++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml
@@ -1,7 +1,8 @@
 benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
+  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
   trace_dir: "../microbenchmarks/all_gather_tpu7x_2x2x4"
   csv_path: "../microbenchmarks/all_gather_tpu7x_2x2x4"
   xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x2x4"
diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml
index 724fff00..4f6cf11a 100644
--- a/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml
+++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml
@@ -1,7 +1,8 @@
 benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
+  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
   trace_dir: "../microbenchmarks/all_gather_tpu7x_2x4x4"
   csv_path: "../microbenchmarks/all_gather_tpu7x_2x4x4"
   xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x4x4"
diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml
index 65189cc9..77f3ed13 100644
--- a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml
+++ b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml
@@ -1,8 +1,9 @@
 benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 10}
+  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
+  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
   trace_dir: "../microbenchmarks/all_gather_tpu7x_4x4x4"
   csv_path: "../microbenchmarks/all_gather_tpu7x_4x4x4"
   xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_4x4x4"
-  xla_dump_dir: "../microbenchmarks/all_gather_tpu7x_4x4x4/hlo_graphs"
+  xla_dump_dir: "../microbenchmarks/all_gather_tpu7x_4x4x4/hlo_graphs"
\ No newline at end of file
diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml
index 77c4da6f..12743d61 100644
--- a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml
+++ b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml
@@ -1,7 +1,8 @@
 benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
+  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
   trace_dir: "../microbenchmarks/all_gather_tpu7x_4x4x8"
   csv_path: "../microbenchmarks/all_gather_tpu7x_4x4x8"
   xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_4x4x8"
diff --git a/Ironwood/configs/collectives/all_reduce_1d.yaml b/Ironwood/configs/collectives/all_reduce_1d.yaml
index d12d4221..7b1d3068 100644
--- a/Ironwood/configs/collectives/all_reduce_1d.yaml
+++ b/Ironwood/configs/collectives/all_reduce_1d.yaml
@@ -1,7 +1,8 @@
 benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1" , op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1" , op_dimension: 1, num_runs: 5} # Parallel Replica
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "1x1x4", op_dimension: 1, num_runs: 5} # Non Parallel Replica
   trace_dir: "../microbenchmarks/all_reduce_1d"
   csv_path: "../microbenchmarks/all_reduce_1d"
   xlml_metrics_dir: "../microbenchmarks/all_reduce_1d"
diff --git a/Ironwood/configs/collectives/all_reduce_2d.yaml b/Ironwood/configs/collectives/all_reduce_2d.yaml
index 5aa9654e..93e1a7c9 100644
--- a/Ironwood/configs/collectives/all_reduce_2d.yaml
+++ b/Ironwood/configs/collectives/all_reduce_2d.yaml
@@ -1,7 +1,8 @@
 benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2, num_runs: 5}
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2, num_runs: 5} # Parallel Replica Groups
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x32", ici_size_range: 128, sharding_strategy: "1x32", op_dimension: 2, num_runs: 5}  # Non Parallel Replica Groups
   trace_dir: "../microbenchmarks/all_reduce_2d"
   csv_path: "../microbenchmarks/all_reduce_2d"
   xlml_metrics_dir: "../microbenchmarks/all_reduce_2d"
diff --git a/Ironwood/configs/collectives/all_reduce_3d.yaml b/Ironwood/configs/collectives/all_reduce_3d.yaml
index 4e76b55f..f6a4ad9d 100644
--- a/Ironwood/configs/collectives/all_reduce_3d.yaml
+++ b/Ironwood/configs/collectives/all_reduce_3d.yaml
@@ -1,7 +1,8 @@
 benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "64x2", ici_size_range: 128, sharding_strategy: "64x1", op_dimension: 3, num_runs: 5}
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "64x2", ici_size_range: 128, sharding_strategy: "64x1", op_dimension: 3, num_runs: 5} # Parallel Replica Groups
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "4x4x8", op_dimension: 3, num_runs: 5} # Non Parallel Replica Groups
   trace_dir: "../microbenchmarks/all_reduce_3d"
   csv_path: "../microbenchmarks/all_reduce_3d"
   xlml_metrics_dir: "../microbenchmarks/all_reduce_3d"
diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml
index 6d2d506c..f7389925 100644
--- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml
+++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml
@@ -1,7 +1,8 @@
 benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5} # Parallel Replica
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
   trace_dir: "../microbenchmarks/psum_tpu7x_2x2x1"
   csv_path: "../microbenchmarks/psum_tpu7x_2x2x1"
   xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x2x1"
diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml
index d11981b0..b2cb202c 100644
--- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml
+++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml
@@ -1,7 +1,8 @@
 benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5} # Parallel Replica
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
   trace_dir: "../microbenchmarks/psum_tpu7x_2x2x2"
   csv_path: "../microbenchmarks/psum_tpu7x_2x2x2"
   xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x2x2"
diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml
index ab243b6f..946fd5ed 100644
--- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml
+++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml
@@ -1,7 +1,8 @@
 benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
+  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
   trace_dir: "../microbenchmarks/psum_tpu7x_2x2x4"
   csv_path: "../microbenchmarks/psum_tpu7x_2x2x4"
   xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x2x4"
diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml
index c731c622..613717cf 100644
--- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml
+++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml
@@ -1,7 +1,8 @@
 benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
+  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
   trace_dir: "../microbenchmarks/psum_tpu7x_2x4x4"
   csv_path: "../microbenchmarks/psum_tpu7x_2x4x4"
   xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x4x4"
diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml
index 53d8dd3d..3f4822c0 100644
--- a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml
+++ b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml
@@ -1,7 +1,8 @@
 benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
+  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
   trace_dir: "../microbenchmarks/psum_tpu7x_4x4x4"
   csv_path: "../microbenchmarks/psum_tpu7x_4x4x4"
   xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_4x4x4"
diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml
index f87878a4..a14bbfe8 100644
--- a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml
+++ b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml
@@ -1,7 +1,8 @@
 benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
+  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
   trace_dir: "../microbenchmarks/psum_tpu7x_4x4x8"
   csv_path: "../microbenchmarks/psum_tpu7x_4x4x8"
   xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_4x4x8"
diff --git a/Ironwood/configs/collectives/all_to_all_1d.yaml b/Ironwood/configs/collectives/all_to_all_1d.yaml
index 8d222613..3c28194d 100644
--- a/Ironwood/configs/collectives/all_to_all_1d.yaml
+++ b/Ironwood/configs/collectives/all_to_all_1d.yaml
@@ -1,7 +1,7 @@
 benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "1x1x4", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "1x1x4", op_dimension: 1, num_runs: 5} # Non Parallel Replica
   trace_dir: "../microbenchmarks/all_to_all_1d"
   csv_path: "../microbenchmarks/all_to_all_1d"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_1d"
diff --git a/Ironwood/configs/collectives/all_to_all_2d.yaml b/Ironwood/configs/collectives/all_to_all_2d.yaml
index d23115fe..b4a1bc0e 100644
--- a/Ironwood/configs/collectives/all_to_all_2d.yaml
+++ b/Ironwood/configs/collectives/all_to_all_2d.yaml
@@ -1,7 +1,7 @@
 benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x32", ici_size_range: 128, sharding_strategy: "1x32", op_dimension: 2, num_runs: 5}
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x32", ici_size_range: 128, sharding_strategy: "1x32", op_dimension: 2, num_runs: 5} # Non Parallel Replica
   trace_dir: "../microbenchmarks/all_to_all_2d"
   csv_path: "../microbenchmarks/all_to_all_2d"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_2d"
diff --git a/Ironwood/configs/collectives/all_to_all_3d.yaml b/Ironwood/configs/collectives/all_to_all_3d.yaml
index c705754c..3aa0e2a7 100644
--- a/Ironwood/configs/collectives/all_to_all_3d.yaml
+++ b/Ironwood/configs/collectives/all_to_all_3d.yaml
@@ -1,7 +1,7 @@
 benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "4x4x8", op_dimension: 3, num_runs: 5}
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "4x4x8", op_dimension: 3, num_runs: 5} # Non Parallel Replica Groups
   trace_dir: "../microbenchmarks/all_to_all_3d"
   csv_path: "../microbenchmarks/all_to_all_3d"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_3d"
diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml
index f9786b29..96da2c38 100644
--- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml
+++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml
@@ -1,7 +1,8 @@
 benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5} # Parallel Replica
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
   trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x1"
   csv_path: "../microbenchmarks/all_to_all_tpu7x_2x2x1"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x1"
diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml
index b530a698..388a4468 100644
--- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml
+++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml
@@ -1,7 +1,8 @@
 benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5} # Parallel Replica
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
   trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x2"
   csv_path: "../microbenchmarks/all_to_all_tpu7x_2x2x2"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x2"
diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml
index 86e3dbbc..e0cc48c9 100644
--- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml
+++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml
@@ -1,7 +1,8 @@
 benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
+  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
   trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x4"
   csv_path: "../microbenchmarks/all_to_all_tpu7x_2x2x4"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x4"
diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml
index 6d4b79fb..5ae19b6e 100644
--- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml
+++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml
@@ -1,7 +1,8 @@
 benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
+  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
   trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x4x4"
   csv_path: "../microbenchmarks/all_to_all_tpu7x_2x4x4"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x4x4"
diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml
index 3460ddb6..4cc8f6bb 100644
--- a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml
+++ b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml
@@ -1,7 +1,8 @@
 benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
+  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
   trace_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x4"
   csv_path: "../microbenchmarks/all_to_all_tpu7x_4x4x4"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x4"
diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml
index 93ef7cb7..212cd92d 100644
--- a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml
+++ b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml
@@ -1,7 +1,8 @@
 benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
+  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
   trace_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x8"
   csv_path: "../microbenchmarks/all_to_all_tpu7x_4x4x8"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x8"
diff --git a/Ironwood/configs/collectives/reduce_scatter_1d.yaml b/Ironwood/configs/collectives/reduce_scatter_1d.yaml
index 063d73fc..9c2c0dea 100644
--- a/Ironwood/configs/collectives/reduce_scatter_1d.yaml
+++ b/Ironwood/configs/collectives/reduce_scatter_1d.yaml
@@ -1,7 +1,8 @@
 benchmarks:
 - benchmark_name: psum_scatter
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "1x1x4", op_dimension: 1, num_runs: 5} # Non-Parallel Replica
   trace_dir: "../microbenchmarks/reduce_scatter_1d"
   csv_path: "../microbenchmarks/reduce_scatter_1d"
   xlml_metrics_dir: "../microbenchmarks/reduce_scatter_1d"
diff --git a/Ironwood/configs/collectives/reduce_scatter_2d.yaml b/Ironwood/configs/collectives/reduce_scatter_2d.yaml
index 027ac991..f329b571 100644
--- a/Ironwood/configs/collectives/reduce_scatter_2d.yaml
+++ b/Ironwood/configs/collectives/reduce_scatter_2d.yaml
@@ -1,7 +1,8 @@
 benchmarks:
 - benchmark_name: psum_scatter
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2, num_runs: 5}
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2, num_runs: 5} # Parallel Replica Groups
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x32", ici_size_range: 128, sharding_strategy: "1x32", op_dimension: 2, num_runs: 5} # Non Parallel Replica Groups
   trace_dir: "../microbenchmarks/reduce_scatter_2d"
   csv_path: "../microbenchmarks/reduce_scatter_2d"
   xlml_metrics_dir: "../microbenchmarks/reduce_scatter_2d"

From 0b24e562e9fc3442ff487069828aa7035f4dc3a9 Mon Sep 17 00:00:00 2001
From: Leonard Chan <leoch@google.com>
Date: Fri, 30 Jan 2026 09:51:14 +0000
Subject: [PATCH 28/88] [Ironwood] Add pipelined H2D mode to H2D benchmark

---
 Ironwood/configs/host_device/host_device.yaml |  3 +-
 Ironwood/src/benchmark_host_device.py         | 82 +++++++++++++------
 2 files changed, 60 insertions(+), 25 deletions(-)

diff --git a/Ironwood/configs/host_device/host_device.yaml b/Ironwood/configs/host_device/host_device.yaml
index 0b48800c..8d572ed7 100644
--- a/Ironwood/configs/host_device/host_device.yaml
+++ b/Ironwood/configs/host_device/host_device.yaml
@@ -3,7 +3,8 @@ benchmarks:
   num_runs: 20
   benchmark_sweep_params:
   - {
-      data_size_mib_list: [1, 16, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768]
+      data_size_mib_list: [1, 16, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768],
+      h2d_type: ["simple", "pipelined"]
     }
   csv_path: "../microbenchmarks/host_device"
   trace_dir: "../microbenchmarks/host_device/trace"
diff --git a/Ironwood/src/benchmark_host_device.py b/Ironwood/src/benchmark_host_device.py
index 16352e2a..ec50619a 100644
--- a/Ironwood/src/benchmark_host_device.py
+++ b/Ironwood/src/benchmark_host_device.py
@@ -5,7 +5,6 @@
 from typing import Any, Dict, Tuple, List
 
 import jax
-from jax import sharding
 import numpy as np
 from benchmark_utils import MetricsStatistics
 
@@ -23,6 +22,7 @@ def benchmark_host_device(
     data_size_mib: int,
     num_runs: int = 100,
     trace_dir: str = None,
+    h2d_type: str = "simple",
 ) -> Dict[str, Any]:
     """Benchmarks H2D/D2H transfer using simple device_put/device_get."""
     
@@ -32,8 +32,18 @@ def benchmark_host_device(
     column = 128
     host_data = np.random.normal(size=(num_elements // column, column)).astype(np.float32)
     
+    # Used in pipelined flow
+    num_devices_to_perform_h2d = 1
+    tensor_size = 4 * 1024 * 1024
+    target_device = jax.devices()[:num_devices_to_perform_h2d]
+    mesh = jax.sharding.Mesh(np.array(target_device), axis_names=["x"])
+    sharding = jax.sharding.NamedSharding(mesh, jax.sharding.PartitionSpec("x"))
+    pipelined_array = None
+    if h2d_type == "pipelined":
+        pipelined_array = np.random.normal(size=(tensor_size,)).astype(np.float32)
+
     print(
-        f"Benchmarking Transfer with Data Size: {data_size_mib} MB for {num_runs} iterations",
+        f"Benchmarking Transfer with Data Size: {data_size_mib} MB for {num_runs} iterations with {h2d_type=}",
         flush=True
     )
 
@@ -65,29 +75,52 @@ def benchmark_host_device(
             
             with step_context:
                  # H2D
-                t0 = time.perf_counter()
-                
-                # Simple device_put
-                device_array = jax.device_put(host_data)
-                device_array.block_until_ready()
-                
-                t1 = time.perf_counter()
-                h2d_perf.append((t1 - t0) * 1000)
-                
-                # Verify H2D shape
-                assert device_array.shape == host_data.shape
-                
-                # D2H
-                t2 = time.perf_counter()
-                
-                # Simple device_get
-                # Note: device_get returns a numpy array (copy)
-                _ = jax.device_get(device_array)
-                
-                t3 = time.perf_counter()
-                d2h_perf.append((t3 - t2) * 1000)
+                if h2d_type == "simple":
+                    t0 = time.perf_counter()
+                    # Simple device_put
+                    device_array = jax.device_put(host_data)
+                    device_array.block_until_ready()
+                    t1 = time.perf_counter()
+                    
+                    # Verify H2D shape
+                    assert device_array.shape == host_data.shape
+
+                    h2d_perf.append((t1 - t0) * 1000)
                 
-                device_array.delete()
+                    # D2H
+                    t2 = time.perf_counter()
+                    
+                    # Simple device_get
+                    # Note: device_get returns a numpy array (copy)
+                    _ = jax.device_get(device_array)
+                    
+                    t3 = time.perf_counter()
+                    d2h_perf.append((t3 - t2) * 1000)
+                    
+                    device_array.delete()
+                elif h2d_type == "pipelined":
+                    tensors_on_device = []
+                    if data_size_mib * 1024 * 1024 < pipelined_array.nbytes:
+                        print(f"Warning: {data_size_mib=} is smaller than pipeline unit, no data will be transferred.")
+                    t0 = time.perf_counter()
+                    # Assume data_size_mib is total across devices for now
+                    bytes_left = 1024 * 1024 * data_size_mib
+                    while bytes_left >= pipelined_array.nbytes:
+                        with jax.profiler.StepTraceAnnotation("device_put", step_num=1):
+                            x_device = jax.device_put(pipelined_array, sharding)
+                            tensors_on_device.append(x_device)
+                            bytes_left -= pipelined_array.nbytes
+                    
+                    total_bytes_transferred = 0
+                    for tensor in tensors_on_device:
+                        tensor.block_until_ready()
+                        total_bytes_transferred += tensor.nbytes
+                        tensor.delete()
+                    t1 = time.perf_counter()
+
+                    h2d_perf.append((t1 - t0) * 1000)
+                    # TODO: implement pipelined D2H once the H2D path is established; the 0 below is a placeholder, not a measurement.
+                    d2h_perf.append(0)
 
     return {
         "H2D_Bandwidth_ms": h2d_perf,
@@ -98,6 +131,7 @@ def benchmark_host_device_calculate_metrics(
     data_size_mib: int,
     H2D_Bandwidth_ms: List[float],
     D2H_Bandwidth_ms: List[float],
+    h2d_type: str = "simple",
 ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
     """Calculates metrics for Host-Device transfer."""
     params = locals().items()

From a70b70151923a8f346db42597ee59e1d931473e1 Mon Sep 17 00:00:00 2001
From: Junjie Qian <qjj1983@gmail.com>
Date: Sat, 31 Jan 2026 22:20:26 -0800
Subject: [PATCH 29/88] add extra datatypes in configs (#94)

---
 Ironwood/configs/collectives/all_gather_1d.yaml          | 4 ++++
 Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml | 4 ++++
 Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml | 4 ++++
 Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml | 4 ++++
 Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml | 4 ++++
 Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml | 4 ++++
 Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml | 4 ++++
 Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml | 4 ++++
 Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml | 4 ++++
 Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml | 4 ++++
 Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml | 4 ++++
 Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml | 4 ++++
 Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml | 4 ++++
 Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml | 4 ++++
 Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml | 4 ++++
 Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml | 4 ++++
 Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml | 4 ++++
 Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml | 4 ++++
 Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml | 4 ++++
 Ironwood/configs/hbm/hbm.yaml                            | 9 ++++++++-
 20 files changed, 84 insertions(+), 1 deletion(-)

diff --git a/Ironwood/configs/collectives/all_gather_1d.yaml b/Ironwood/configs/collectives/all_gather_1d.yaml
index 0b1313dc..8ab9faeb 100644
--- a/Ironwood/configs/collectives/all_gather_1d.yaml
+++ b/Ironwood/configs/collectives/all_gather_1d.yaml
@@ -2,6 +2,10 @@ benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
   - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float16",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float8",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float4",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "bfloat16",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/all_gather_1d"
   csv_path: "../microbenchmarks/all_gather_1d"
   xlml_metrics_dir: "../microbenchmarks/all_gather_1d"
diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml
index 0338aef1..d4ef5f2c 100644
--- a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml
+++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml
@@ -2,6 +2,10 @@ benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
   - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float16",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float8",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float4",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "bfloat16",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/all_gather_tpu7x_2x2x1"
   csv_path: "../microbenchmarks/all_gather_tpu7x_2x2x1"
   xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x2x1"
diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml
index 9253bac5..0048163e 100644
--- a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml
+++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml
@@ -2,6 +2,10 @@ benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
   - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float16",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float8",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float4",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "bfloat16",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/all_gather_tpu7x_2x2x2"
   csv_path: "../microbenchmarks/all_gather_tpu7x_2x2x2"
   xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x2x2"
diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml
index 9f8af67f..3e18b0fa 100644
--- a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml
+++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml
@@ -2,6 +2,10 @@ benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
   - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float16",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float8",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float4",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "bfloat16",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/all_gather_tpu7x_2x2x4"
   csv_path: "../microbenchmarks/all_gather_tpu7x_2x2x4"
   xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x2x4"
diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml
index 724fff00..97a6ec2b 100644
--- a/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml
+++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml
@@ -2,6 +2,10 @@ benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
   - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float16",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float8",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float4",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "bfloat16",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/all_gather_tpu7x_2x4x4"
   csv_path: "../microbenchmarks/all_gather_tpu7x_2x4x4"
   xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x4x4"
diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml
index 65189cc9..b51ebbe5 100644
--- a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml
+++ b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml
@@ -2,6 +2,10 @@ benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
   - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 10}
+  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float16",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 10}
+  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float8",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 10}
+  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float4",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 10}
+  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "bfloat16",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 10}
   trace_dir: "../microbenchmarks/all_gather_tpu7x_4x4x4"
   csv_path: "../microbenchmarks/all_gather_tpu7x_4x4x4"
   xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_4x4x4"
diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml
index 77c4da6f..f6f151a8 100644
--- a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml
+++ b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml
@@ -2,6 +2,10 @@ benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
   - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float16",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float8",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float4",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "bfloat16",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/all_gather_tpu7x_4x4x8"
   csv_path: "../microbenchmarks/all_gather_tpu7x_4x4x8"
   xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_4x4x8"
diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml
index 6d2d506c..08139afb 100644
--- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml
+++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml
@@ -2,6 +2,10 @@ benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
   - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float16",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float8",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float4",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "bfloat16",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/psum_tpu7x_2x2x1"
   csv_path: "../microbenchmarks/psum_tpu7x_2x2x1"
   xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x2x1"
diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml
index d11981b0..e510a975 100644
--- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml
+++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml
@@ -2,6 +2,10 @@ benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
   - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float16",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float8",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float4",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "bfloat16",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/psum_tpu7x_2x2x2"
   csv_path: "../microbenchmarks/psum_tpu7x_2x2x2"
   xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x2x2"
diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml
index ab243b6f..5f53e826 100644
--- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml
+++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml
@@ -2,6 +2,10 @@ benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
   - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float16",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float8",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float4",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "bfloat16",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/psum_tpu7x_2x2x4"
   csv_path: "../microbenchmarks/psum_tpu7x_2x2x4"
   xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x2x4"
diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml
index c731c622..5ea9c71b 100644
--- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml
+++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml
@@ -2,6 +2,10 @@ benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
   - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float16",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float8",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float4",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "bfloat16",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/psum_tpu7x_2x4x4"
   csv_path: "../microbenchmarks/psum_tpu7x_2x4x4"
   xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x4x4"
diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml
index 53d8dd3d..9ddbe192 100644
--- a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml
+++ b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml
@@ -2,6 +2,10 @@ benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
   - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float16",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float8",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float4",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "bfloat16",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/psum_tpu7x_4x4x4"
   csv_path: "../microbenchmarks/psum_tpu7x_4x4x4"
   xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_4x4x4"
diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml
index f87878a4..997bb480 100644
--- a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml
+++ b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml
@@ -2,6 +2,10 @@ benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
   - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float16",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float8",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float4",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "bfloat16",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/psum_tpu7x_4x4x8"
   csv_path: "../microbenchmarks/psum_tpu7x_4x4x8"
   xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_4x4x8"
diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml
index f9786b29..0e753645 100644
--- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml
+++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml
@@ -2,6 +2,10 @@ benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
   - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float16",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float8",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float4",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "bfloat16",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x1"
   csv_path: "../microbenchmarks/all_to_all_tpu7x_2x2x1"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x1"
diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml
index b530a698..c6410449 100644
--- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml
+++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml
@@ -2,6 +2,10 @@ benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
   - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float16",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float8",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float4",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "bfloat16",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x2"
   csv_path: "../microbenchmarks/all_to_all_tpu7x_2x2x2"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x2"
diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml
index 86e3dbbc..8dc83348 100644
--- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml
+++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml
@@ -2,6 +2,10 @@ benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
   - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float16",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float8",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float4",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "bfloat16",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x4"
   csv_path: "../microbenchmarks/all_to_all_tpu7x_2x2x4"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x4"
diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml
index 6d4b79fb..ccd5ab14 100644
--- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml
+++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml
@@ -2,6 +2,10 @@ benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
   - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float16",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float8",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float4",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "bfloat16",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x4x4"
   csv_path: "../microbenchmarks/all_to_all_tpu7x_2x4x4"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x4x4"
diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml
index 3460ddb6..54a82e5d 100644
--- a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml
+++ b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml
@@ -2,6 +2,10 @@ benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
   - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float16",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float8",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float4",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "bfloat16",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x4"
   csv_path: "../microbenchmarks/all_to_all_tpu7x_4x4x4"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x4"
diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml
index 93ef7cb7..b7120534 100644
--- a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml
+++ b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml
@@ -2,6 +2,10 @@ benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
   - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float16",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float8",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float4",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5}
+  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "bfloat16",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x8"
   csv_path: "../microbenchmarks/all_to_all_tpu7x_4x4x8"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x8"
diff --git a/Ironwood/configs/hbm/hbm.yaml b/Ironwood/configs/hbm/hbm.yaml
index 1912cb17..0e42b2f0 100644
--- a/Ironwood/configs/hbm/hbm.yaml
+++ b/Ironwood/configs/hbm/hbm.yaml
@@ -26,4 +26,11 @@ benchmarks:
   trace_dir: "../microbenchmarks/hbm_float16"
   csv_path: "../microbenchmarks/hbm_float16"
   xlml_metrics_dir: "../microbenchmarks/hbm_float16"
-  xla_dump_dir: "../microbenchmarks/hbm_float16/hlo_graphs"
\ No newline at end of file
+  xla_dump_dir: "../microbenchmarks/hbm_float16/hlo_graphs"
+- benchmark_name: "single_device_hbm_copy"
+  benchmark_sweep_params:
+  - {num_elements_range: {start: 1048576, end: 4294967296, multiplier: 2}, dtype: "float4", num_runs: 1}
+  trace_dir: "../microbenchmarks/hbm_float4"
+  csv_path: "../microbenchmarks/hbm_float4"
+  xlml_metrics_dir: "../microbenchmarks/hbm_float4"
+  xla_dump_dir: "../microbenchmarks/hbm_float4/hlo_graphs"
\ No newline at end of file

From 94ddadabb9ea054f2b3f49003a60b4e19b476c5e Mon Sep 17 00:00:00 2001
From: Junjie Qian <qjj1983@gmail.com>
Date: Sun, 1 Feb 2026 18:37:09 -0800
Subject: [PATCH 30/88] add GCS service account name to job yamls (#95)

---
 Ironwood/guides/automation/automation_launch.sh         | 6 ++++--
 Ironwood/guides/automation/tpu7x-2x2x1-collectives.yaml | 1 +
 Ironwood/guides/automation/tpu7x-2x2x1-gemm.yaml        | 1 +
 Ironwood/guides/automation/tpu7x-2x2x1-hbm.yaml         | 1 +
 Ironwood/guides/automation/tpu7x-2x2x1-host_device.yaml | 1 +
 Ironwood/guides/automation/tpu7x-2x2x2-collectives.yaml | 1 +
 Ironwood/guides/automation/tpu7x-2x2x4-collectives.yaml | 1 +
 Ironwood/guides/automation/tpu7x-2x4x4-collectives.yaml | 1 +
 Ironwood/guides/automation/tpu7x-4x4x4-collectives.yaml | 1 +
 Ironwood/guides/automation/tpu7x-4x4x8-collectives.yaml | 1 +
 10 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/Ironwood/guides/automation/automation_launch.sh b/Ironwood/guides/automation/automation_launch.sh
index 9b65bbf7..56cee946 100644
--- a/Ironwood/guides/automation/automation_launch.sh
+++ b/Ironwood/guides/automation/automation_launch.sh
@@ -5,6 +5,7 @@
 ######################################################################
 TIMESTAMP=$(date +%Y-%m-%d_%H-%M-%S)
 export GCS_BUCKET_ROOT_DIR=""
+export GCS_SA_NAME="gcs-writer"  # Service account with write access to GCS_BUCKET_ROOT_DIR
 
 MAX_RETRIES=3
 TIMEOUT_SECOND=3600
@@ -93,11 +94,12 @@ apply_and_wait() {
         local filepath="${SCRIPT_DIR}/${yaml_file}"
         # Derive job name: remove .yaml, lowercase, replace _ with -
         local job_name=$(basename "${yaml_file}" .yaml | tr '[:upper:]' '[:lower:]' | tr '_' '-')
-        export JOB_NAME="${job_name}"
+        random_suffix=$(head /dev/urandom | tr -dc a-z0-9 | head -c 5)
+        export JOB_NAME="${job_name}-${random_suffix}"
         export GCS_PATH="${GCS_BUCKET_ROOT_DIR}/${job_name}"
         
         echo "Launching job: ${filepath} (name: ${JOB_NAME})"
-        envsubst '${JOB_NAME} ${GCS_PATH}' < "${filepath}" | kubectl apply -f -
+        envsubst '${JOB_NAME} ${GCS_PATH} ${GCS_SA_NAME}' < "${filepath}" | kubectl apply -f -
         job_names_in_batch+=("${JOB_NAME}")
     done
 
diff --git a/Ironwood/guides/automation/tpu7x-2x2x1-collectives.yaml b/Ironwood/guides/automation/tpu7x-2x2x1-collectives.yaml
index fc878ea0..a99704d0 100644
--- a/Ironwood/guides/automation/tpu7x-2x2x1-collectives.yaml
+++ b/Ironwood/guides/automation/tpu7x-2x2x1-collectives.yaml
@@ -22,6 +22,7 @@ spec:
   template:
     spec:
       subdomain: headless-svc-${JOB_NAME}
+      serviceAccountName: ${GCS_SA_NAME}
       restartPolicy: Never
       nodeSelector:
         cloud.google.com/gke-tpu-accelerator: tpu7x
diff --git a/Ironwood/guides/automation/tpu7x-2x2x1-gemm.yaml b/Ironwood/guides/automation/tpu7x-2x2x1-gemm.yaml
index 8181941b..1a119d2e 100644
--- a/Ironwood/guides/automation/tpu7x-2x2x1-gemm.yaml
+++ b/Ironwood/guides/automation/tpu7x-2x2x1-gemm.yaml
@@ -22,6 +22,7 @@ spec:
   template:
     spec:
       subdomain: headless-svc-${JOB_NAME}
+      serviceAccountName: ${GCS_SA_NAME}
       restartPolicy: Never
       nodeSelector:
         cloud.google.com/gke-tpu-accelerator: tpu7x
diff --git a/Ironwood/guides/automation/tpu7x-2x2x1-hbm.yaml b/Ironwood/guides/automation/tpu7x-2x2x1-hbm.yaml
index e84e13ea..85efe69a 100644
--- a/Ironwood/guides/automation/tpu7x-2x2x1-hbm.yaml
+++ b/Ironwood/guides/automation/tpu7x-2x2x1-hbm.yaml
@@ -22,6 +22,7 @@ spec:
   template:
     spec:
       subdomain: headless-svc-${JOB_NAME}
+      serviceAccountName: ${GCS_SA_NAME}
       restartPolicy: Never
       nodeSelector:
         cloud.google.com/gke-tpu-accelerator: tpu7x
diff --git a/Ironwood/guides/automation/tpu7x-2x2x1-host_device.yaml b/Ironwood/guides/automation/tpu7x-2x2x1-host_device.yaml
index 3f662be5..a6b8febd 100644
--- a/Ironwood/guides/automation/tpu7x-2x2x1-host_device.yaml
+++ b/Ironwood/guides/automation/tpu7x-2x2x1-host_device.yaml
@@ -22,6 +22,7 @@ spec:
   template:
     spec:
       subdomain: headless-svc-${JOB_NAME}
+      serviceAccountName: ${GCS_SA_NAME}
       restartPolicy: Never
       nodeSelector:
         cloud.google.com/gke-tpu-accelerator: tpu7x
diff --git a/Ironwood/guides/automation/tpu7x-2x2x2-collectives.yaml b/Ironwood/guides/automation/tpu7x-2x2x2-collectives.yaml
index c2efba03..937d572c 100644
--- a/Ironwood/guides/automation/tpu7x-2x2x2-collectives.yaml
+++ b/Ironwood/guides/automation/tpu7x-2x2x2-collectives.yaml
@@ -22,6 +22,7 @@ spec:
   template:
     spec:
       subdomain: headless-svc-${JOB_NAME}
+      serviceAccountName: ${GCS_SA_NAME}
       restartPolicy: Never
       nodeSelector:
         cloud.google.com/gke-tpu-accelerator: tpu7x
diff --git a/Ironwood/guides/automation/tpu7x-2x2x4-collectives.yaml b/Ironwood/guides/automation/tpu7x-2x2x4-collectives.yaml
index a42b04ae..6502170a 100644
--- a/Ironwood/guides/automation/tpu7x-2x2x4-collectives.yaml
+++ b/Ironwood/guides/automation/tpu7x-2x2x4-collectives.yaml
@@ -22,6 +22,7 @@ spec:
   template:
     spec:
       subdomain: headless-svc-${JOB_NAME}
+      serviceAccountName: ${GCS_SA_NAME}
       restartPolicy: Never
       nodeSelector:
         cloud.google.com/gke-tpu-accelerator: tpu7x
diff --git a/Ironwood/guides/automation/tpu7x-2x4x4-collectives.yaml b/Ironwood/guides/automation/tpu7x-2x4x4-collectives.yaml
index de6f7106..2de39037 100644
--- a/Ironwood/guides/automation/tpu7x-2x4x4-collectives.yaml
+++ b/Ironwood/guides/automation/tpu7x-2x4x4-collectives.yaml
@@ -22,6 +22,7 @@ spec:
   template:
     spec:
       subdomain: headless-svc-${JOB_NAME}
+      serviceAccountName: ${GCS_SA_NAME}
       restartPolicy: Never
       nodeSelector:
         cloud.google.com/gke-tpu-accelerator: tpu7x
diff --git a/Ironwood/guides/automation/tpu7x-4x4x4-collectives.yaml b/Ironwood/guides/automation/tpu7x-4x4x4-collectives.yaml
index 828d1352..7afdac9d 100644
--- a/Ironwood/guides/automation/tpu7x-4x4x4-collectives.yaml
+++ b/Ironwood/guides/automation/tpu7x-4x4x4-collectives.yaml
@@ -22,6 +22,7 @@ spec:
   template:
     spec:
       subdomain: headless-svc-${JOB_NAME}
+      serviceAccountName: ${GCS_SA_NAME}
       restartPolicy: Never
       nodeSelector:
         cloud.google.com/gke-tpu-accelerator: tpu7x
diff --git a/Ironwood/guides/automation/tpu7x-4x4x8-collectives.yaml b/Ironwood/guides/automation/tpu7x-4x4x8-collectives.yaml
index f3bf721d..14735274 100644
--- a/Ironwood/guides/automation/tpu7x-4x4x8-collectives.yaml
+++ b/Ironwood/guides/automation/tpu7x-4x4x8-collectives.yaml
@@ -22,6 +22,7 @@ spec:
   template:
     spec:
       subdomain: headless-svc-${JOB_NAME}
+      serviceAccountName: ${GCS_SA_NAME}
       restartPolicy: Never
       nodeSelector:
         cloud.google.com/gke-tpu-accelerator: tpu7x

From e7e10f9c5350d16966eaaecdab95dc19099ebb28 Mon Sep 17 00:00:00 2001
From: "Yu-Hsuan (Amy) Lin" <amylin@google.com>
Date: Mon, 2 Feb 2026 02:40:23 +0000
Subject: [PATCH 31/88] [Automation] GCS Permission check and fix

---
 Ironwood/guides/automation/README.md          |   2 +-
 .../guides/automation/automation_launch.sh    |  14 +++
 .../automation/check_gcs_permissions.sh       | 116 ++++++++++++++++++
 Ironwood/guides/automation/gcs-write.yaml     |  41 +++++++
 4 files changed, 172 insertions(+), 1 deletion(-)
 mode change 100644 => 100755 Ironwood/guides/automation/automation_launch.sh
 create mode 100755 Ironwood/guides/automation/check_gcs_permissions.sh
 create mode 100644 Ironwood/guides/automation/gcs-write.yaml

diff --git a/Ironwood/guides/automation/README.md b/Ironwood/guides/automation/README.md
index 3c463c67..81d63c93 100644
--- a/Ironwood/guides/automation/README.md
+++ b/Ironwood/guides/automation/README.md
@@ -52,7 +52,7 @@ You can configure the behavior using the following environment variable:
 
 | Variable | Description | Required | Default |
 | :--- | :--- | :--- | :--- |
-| `GCS_BUCKET_ROOT_DIR` | The root GCS path where results will be stored. Must start with `gs://`. | **Yes** | `gs://amylin-microbenchmark` (Change this!) |
+| `GCS_BUCKET_ROOT_DIR` | The root GCS path where results will be stored. Must start with `gs://`. | **Yes** | `gs://example-microbenchmark` (Change this!) |
 
 ## Usage Guide
 
diff --git a/Ironwood/guides/automation/automation_launch.sh b/Ironwood/guides/automation/automation_launch.sh
old mode 100644
new mode 100755
index 56cee946..745a3f20
--- a/Ironwood/guides/automation/automation_launch.sh
+++ b/Ironwood/guides/automation/automation_launch.sh
@@ -6,6 +6,7 @@
 TIMESTAMP=$(date +%Y-%m-%d_%H-%M-%S)
 export GCS_BUCKET_ROOT_DIR=""
 export GCS_SA_NAME="gcs-writer"  # Service account with write access to GCS_BUCKET_ROOT_DIR
+export PROJECT_ID=$(gcloud config get-value project 2>/dev/null)
 
 MAX_RETRIES=3
 TIMEOUT_SECOND=3600
@@ -45,6 +46,19 @@ for topology in "${required_topologies[@]}"; do
     envsubst '${TOPOLOGY} ${TPUS}' < ${SCRIPT_DIR}/job-queue.yaml | kubectl apply -f -
 done
 
+######################################################################
+#                  GCS PERMISSION CHECK
+######################################################################
+
+# Run the GCS permission check
+export SA_NAME="${GCS_SA_NAME}"
+export PROJECT_ID="${PROJECT_ID}"
+if ! bash "${SCRIPT_DIR}/check_gcs_permissions.sh"; then
+    echo "GCS Permission Check Failed. Exiting."
+    exit 1
+fi
+
+
 ######################################################################
 #                 LAUNCH JOBS & WAIT FOR COMPLETION
 ######################################################################
diff --git a/Ironwood/guides/automation/check_gcs_permissions.sh b/Ironwood/guides/automation/check_gcs_permissions.sh
new file mode 100755
index 00000000..0cca1968
--- /dev/null
+++ b/Ironwood/guides/automation/check_gcs_permissions.sh
@@ -0,0 +1,116 @@
+#!/usr/bin/env bash
+
+# This script checks if the configured Service Account has write permissions to the specified GCS bucket.
+# If permissions are missing, it attempts to fix them by creating the SA and granting roles/storage.admin.
+#
+# Expected Environment Variables:
+#   GCS_BUCKET_ROOT_DIR: The GCS path (must start with gs://)
+#   SA_NAME: The Service Account name (default: gcs-writer)
+#   PROJECT_ID: The GCP Project ID (optional, will try to detect if not set)
+
+SCRIPT_DIR="$(dirname "$(realpath "$0")")"
+SA_NAME="${SA_NAME:-gcs-writer}"
+PROJECT_ID="${PROJECT_ID:-$(gcloud config get-value project 2>/dev/null)}"
+
+if [[ -z "${GCS_BUCKET_ROOT_DIR}" || "${GCS_BUCKET_ROOT_DIR}" != "gs://"* ]]; then
+  echo "Error: GCS_BUCKET_ROOT_DIR must be set and start with gs://"
+  exit 1
+fi
+
+# Ensure ServiceAccount ${SA_NAME} exists and grant it roles/storage.admin on the bucket of GCS_BUCKET_ROOT_DIR.
+fix_gcs_permissions() {
+    # See more context in https://docs.cloud.google.com/kubernetes-engine/docs/how-to/workload-identity#authenticating_to
+    echo "Attempting to fix GCS permissions..."
+    if [[ -z "${PROJECT_ID}" ]]; then
+        echo "Error: PROJECT_ID is not set and could not be detected."
+        echo "Please export PROJECT_ID=<your-project-id> and rerun."
+        exit 1
+    fi
+    local bucket_name=$(echo "${GCS_BUCKET_ROOT_DIR}" | sed 's|^gs://||' | cut -d/ -f1)
+    local ns_name="default"
+    echo "Ensuring ServiceAccount ${SA_NAME} exists in namespace ${ns_name}..."
+    kubectl create serviceaccount "${SA_NAME}" --namespace "${ns_name}" --dry-run=client -o yaml | kubectl apply -f -
+    # Declare first, assign second: "local var=$(cmd)" always returns 0 and would hide a gcloud failure.
+    local project_number
+    if ! project_number=$(gcloud projects describe "${PROJECT_ID}" --format="value(projectNumber)") || [[ -z "${project_number}" ]]; then
+        echo "Error: could not resolve the project number for project '${PROJECT_ID}'."
+        exit 1
+    fi
+    echo "Granting roles/storage.admin to ${SA_NAME} on gs://${bucket_name}..."
+    gcloud storage buckets add-iam-policy-binding "gs://${bucket_name}" \
+        --role=roles/storage.admin \
+        --member="principal://iam.googleapis.com/projects/${project_number}/locations/global/workloadIdentityPools/${PROJECT_ID}.svc.id.goog/subject/ns/${ns_name}/sa/${SA_NAME}"
+    echo "Permission fix command executed."
+}
+
+# Verify that a pod running as ServiceAccount ${SA_NAME} can write to GCS_BUCKET_ROOT_DIR.
+# Launches the gcs-write.yaml probe pod; returns 0 if its logs contain "GCS test complete!", else 1.
+check_gcs_permission() {
+    echo "Checking GCS write permissions..."
+    export GCS_CHECK_PATH="${GCS_BUCKET_ROOT_DIR}/permission-check-$(date +%s).txt"
+    export SA_NAME="${SA_NAME}"
+    # gcs-write.yaml pins the probe pod to this namespace, so every kubectl call targets it
+    # explicitly instead of relying on the namespace of the current kubectl context.
+    local ns_name="default"
+
+    # Check if ServiceAccount exists first to fail fast
+    if ! kubectl get serviceaccount "${SA_NAME}" --namespace "${ns_name}" &> /dev/null; then
+        echo "ServiceAccount '${SA_NAME}' not found."
+        return 1
+    fi
+
+    # Launch check pod; kubectl create prints e.g. "pod/gcs-writer-test-abcde created"
+    local apply_output=$(envsubst '${SA_NAME} ${GCS_CHECK_PATH}' < "${SCRIPT_DIR}/gcs-write.yaml" | kubectl create -f -)
+    local pod_name=$(echo "${apply_output}" | awk -F'/' '{print $2}' | awk '{print $1}')
+    if [[ -z "${pod_name}" ]]; then
+        echo "Failed to create the GCS check pod."
+        return 1
+    fi
+    echo "Launched GCS check pod: ${pod_name}"
+
+    # Wait (up to 20 x 5s) for the pod to reach a terminal phase
+    local phase=""
+    for i in {1..20}; do
+        sleep 5
+        phase=$(kubectl get pod "${pod_name}" --namespace "${ns_name}" -o jsonpath='{.status.phase}' 2>/dev/null)
+        if [[ "${phase}" == "Succeeded" || "${phase}" == "Failed" ]]; then
+            break
+        fi
+    done
+
+    # The completion marker in the pod logs decides the result
+    local check_status="FAILED"
+    if kubectl logs "${pod_name}" --namespace "${ns_name}" 2>/dev/null | grep -q "GCS test complete!"; then
+         echo "GCS permission check PASSED."
+         check_status="SUCCESS"
+    else
+         echo "GCS permission check FAILED."
+         echo "Logs from ${pod_name}:"
+         kubectl logs "${pod_name}" --namespace "${ns_name}" 2>/dev/null | tail -n 10
+    fi
+
+    # Cleanup
+    kubectl delete pod "${pod_name}" --namespace "${ns_name}" --grace-period=0 --force &> /dev/null
+
+    [[ "${check_status}" == "SUCCESS" ]] || return 1
+    return 0
+}
+
+# Main Logic
+echo "======================================================================"
+echo "Starting GCS Permission Check (SA: ${SA_NAME}, Bucket: ${GCS_BUCKET_ROOT_DIR})"
+echo "======================================================================"
+
+if ! check_gcs_permission; then
+    echo "GCS check failed. Attempting to fix..."
+    fix_gcs_permissions
+    
+    echo "Retrying GCS check..."
+    if ! check_gcs_permission; then
+        echo "GCS permissions check failed even after attempted fix."
+        echo "Please verify your Service Account '${SA_NAME}' has proper permissions on ${GCS_BUCKET_ROOT_DIR}"
+        exit 1
+    fi
+fi
+
+echo "GCS Check Verified Successfully."
diff --git a/Ironwood/guides/automation/gcs-write.yaml b/Ironwood/guides/automation/gcs-write.yaml
new file mode 100644
index 00000000..8d27586c
--- /dev/null
+++ b/Ironwood/guides/automation/gcs-write.yaml
@@ -0,0 +1,41 @@
+apiVersion: v1
+kind: Pod
+metadata:
+  generateName: gcs-writer-test-
+  namespace: default
+spec:
+  serviceAccountName: ${SA_NAME}
+  containers:
+  - name: gcs-test-container
+    image: google/cloud-sdk:slim
+    command:
+    - bash
+    - -c
+    - |
+      set -ex
+      TIMESTAMP=$(date +%s)
+      LOCAL_FILE="/tmp/test-file-${TIMESTAMP}.txt"
+      
+      # GCS_CHECK_PATH is substituted by envsubst
+      echo "Using GCS Path: ${GCS_CHECK_PATH}"
+
+      echo "Testing GCS write from pod at $(date)" > "${LOCAL_FILE}"
+
+      echo "--- Configuration ---"
+      gcloud auth list
+      gcloud config list
+      # Try to get service account email, but don't fail if metadata server is slow/unreachable (though it should be reachable)
+      curl -s -H "Metadata-Flavor: Google" "http://metadata.google.internal/computeMetadata/v1/instance/service-accounts/default/email" || echo "Could not fetch SA email"
+      echo
+
+      echo "--- Writing to GCS ---"
+      gsutil cp "${LOCAL_FILE}" "${GCS_CHECK_PATH}"
+
+      echo "--- Verifying from GCS ---"
+      gsutil cat "${GCS_CHECK_PATH}"
+
+      echo "--- Cleaning up GCS object ---"
+      gsutil rm "${GCS_CHECK_PATH}"
+
+      echo "GCS test complete!"
+  restartPolicy: Never

From 30db8d01cfe39506a0507eeba8aa6fef43233ae7 Mon Sep 17 00:00:00 2001
From: "Yu-Hsuan (Amy) Lin" <amylin@google.com>
Date: Mon, 2 Feb 2026 02:57:14 +0000
Subject: [PATCH 32/88] Inject service account spec to Aggregator

Addresses the following GCS permission error:

```
Caller does not have storage.objects.list access to the Google Cloud Storage bucket. Permission 'storage.objects.list' denied on resource (or it may not exist).
```
---
 Ironwood/guides/automation/aggregator.yaml      | 2 ++
 Ironwood/guides/automation/automation_launch.sh | 7 +++++--
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/Ironwood/guides/automation/aggregator.yaml b/Ironwood/guides/automation/aggregator.yaml
index 5a2c89a3..120e94b7 100644
--- a/Ironwood/guides/automation/aggregator.yaml
+++ b/Ironwood/guides/automation/aggregator.yaml
@@ -3,8 +3,10 @@ kind: Job
 metadata:
   name: aggregator
 spec:
+  backoffLimit: 0
   template:
     spec:
+      serviceAccountName: ${GCS_SA_NAME}
       containers:
       - name: main-app
         image: python:3.12
diff --git a/Ironwood/guides/automation/automation_launch.sh b/Ironwood/guides/automation/automation_launch.sh
index 745a3f20..21af18ee 100755
--- a/Ironwood/guides/automation/automation_launch.sh
+++ b/Ironwood/guides/automation/automation_launch.sh
@@ -209,9 +209,12 @@ echo ""
 echo "Jobs completed. Aggregating results..."
 echo ""
 
-envsubst '${GCS_BUCKET_ROOT_DIR}' < ${SCRIPT_DIR}/aggregator.yaml | kubectl apply -f -
+# Ensure cleanup of any previous aggregator job to avoid immutable field errors
+kubectl delete job aggregator --ignore-not-found=true
+
+envsubst '${GCS_BUCKET_ROOT_DIR} ${GCS_SA_NAME}' < ${SCRIPT_DIR}/aggregator.yaml | kubectl apply -f -
 wait_for_job_completion "aggregator" ${TIMEOUT_SECOND}
-envsubst '${GCS_BUCKET_ROOT_DIR}' < ${SCRIPT_DIR}/aggregator.yaml | kubectl delete -f -
+envsubst '${GCS_BUCKET_ROOT_DIR} ${GCS_SA_NAME}' < ${SCRIPT_DIR}/aggregator.yaml | kubectl delete -f -
 
 # Print the failed jobs at the end for better visibility.
 

From 5dd6f85d158a624ae4169868e50c778b633a57c1 Mon Sep 17 00:00:00 2001
From: Hong-Yi Lin <horacehylin@google.com>
Date: Wed, 4 Feb 2026 16:19:53 +0800
Subject: [PATCH 33/88] Add bmm microbenchmark. (#97)

* [BMM] Add bmm microbenchmark

* Update hook in benchmark entry.

* Update BMM config

* Update timeit logic
---
 Ironwood/configs/bmm/single_device_bmm.yaml |  75 +++++++++++
 Ironwood/src/benchmark_bmm.py               | 134 ++++++++++++++++++++
 Ironwood/src/run_benchmark.py               |   5 +
 3 files changed, 214 insertions(+)
 create mode 100644 Ironwood/configs/bmm/single_device_bmm.yaml
 create mode 100644 Ironwood/src/benchmark_bmm.py

diff --git a/Ironwood/configs/bmm/single_device_bmm.yaml b/Ironwood/configs/bmm/single_device_bmm.yaml
new file mode 100644
index 00000000..f0a4156c
--- /dev/null
+++ b/Ironwood/configs/bmm/single_device_bmm.yaml
@@ -0,0 +1,75 @@
+benchmarks:
+- benchmark_name: "single_device_bmm"
+  trace_dir: "../microbenchmarks/single_device_bmm_bf16"
+  csv_path: "../microbenchmarks/single_device_bmm_bf16"
+  xlml_metrics_dir: "../microbenchmarks/single_device_bmm_bf16"
+  xla_dump_dir: "../microbenchmarks/single_device_bmm_bf16/hlo_graphs"
+  benchmark_sweep_params:
+  - {b: 1, m: 128, k: 128, n: 128, num_runs: 100, dtype: 'bfloat16'}
+  - {b: 1, m: 256, k: 256, n: 256, num_runs: 100, dtype: 'bfloat16'}
+  - {b: 1, m: 512, k: 512, n: 512, num_runs: 100, dtype: 'bfloat16'}
+  - {b: 1, m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'bfloat16'}
+  - {b: 1, m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'bfloat16'}
+  - {b: 1, m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'bfloat16'}
+  - {b: 1, m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'bfloat16'}
+  - {b: 1, m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'bfloat16'}
+
+- benchmark_name: "single_device_bmm"
+  trace_dir: "../microbenchmarks/single_device_bmm_f32"
+  csv_path: "../microbenchmarks/single_device_bmm_f32"
+  xlml_metrics_dir: "../microbenchmarks/single_device_bmm_f32"
+  xla_dump_dir: "../microbenchmarks/single_device_bmm_f32/hlo_graphs"
+  benchmark_sweep_params:
+  - {b: 1, m: 128, k: 128, n: 128, num_runs: 100, dtype: 'float32'}
+  - {b: 1, m: 256, k: 256, n: 256, num_runs: 100, dtype: 'float32'}
+  - {b: 1, m: 512, k: 512, n: 512, num_runs: 100, dtype: 'float32'}
+  - {b: 1, m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'float32'}
+  - {b: 1, m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'float32'}
+  - {b: 1, m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'float32'}
+  - {b: 1, m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float32'}
+  - {b: 1, m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float32'}
+
+- benchmark_name: "single_device_bmm"
+  trace_dir: "../microbenchmarks/single_device_bmm_fp16"
+  csv_path: "../microbenchmarks/single_device_bmm_fp16"
+  xlml_metrics_dir: "../microbenchmarks/single_device_bmm_fp16"
+  xla_dump_dir: "../microbenchmarks/single_device_bmm_fp16/hlo_graphs"
+  benchmark_sweep_params:
+  - {b: 1, m: 128, k: 128, n: 128, num_runs: 100, dtype: 'float16'}
+  - {b: 1, m: 256, k: 256, n: 256, num_runs: 100, dtype: 'float16'}
+  - {b: 1, m: 512, k: 512, n: 512, num_runs: 100, dtype: 'float16'}
+  - {b: 1, m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'float16'}
+  - {b: 1, m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'float16'}
+  - {b: 1, m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'float16'}
+  - {b: 1, m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float16'}
+  - {b: 1, m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float16'}
+
+- benchmark_name: "single_device_bmm"
+  trace_dir: "../microbenchmarks/single_device_bmm_fp8"
+  csv_path: "../microbenchmarks/single_device_bmm_fp8"
+  xlml_metrics_dir: "../microbenchmarks/single_device_bmm_fp8"
+  xla_dump_dir: "../microbenchmarks/single_device_bmm_fp8/hlo_graphs"
+  benchmark_sweep_params:
+  - {b: 1, m: 128, k: 128, n: 128, num_runs: 100, dtype: 'float8'}
+  - {b: 1, m: 256, k: 256, n: 256, num_runs: 100, dtype: 'float8'}
+  - {b: 1, m: 512, k: 512, n: 512, num_runs: 100, dtype: 'float8'}
+  - {b: 1, m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'float8'}
+  - {b: 1, m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'float8'}
+  - {b: 1, m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'float8'}
+  - {b: 1, m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float8'}
+  - {b: 1, m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float8'}
+
+- benchmark_name: "single_device_bmm"
+  trace_dir: "../microbenchmarks/single_device_bmm_fp4"
+  csv_path: "../microbenchmarks/single_device_bmm_fp4"
+  xlml_metrics_dir: "../microbenchmarks/single_device_bmm_fp4"
+  xla_dump_dir: "../microbenchmarks/single_device_bmm_fp4/hlo_graphs"
+  benchmark_sweep_params:
+  - {b: 1, m: 128, k: 128, n: 128, num_runs: 100, dtype: 'float4'}
+  - {b: 1, m: 256, k: 256, n: 256, num_runs: 100, dtype: 'float4'}
+  - {b: 1, m: 512, k: 512, n: 512, num_runs: 100, dtype: 'float4'}
+  - {b: 1, m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'float4'}
+  - {b: 1, m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'float4'}
+  - {b: 1, m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'float4'}
+  - {b: 1, m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float4'}
+  - {b: 1, m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float4'}
\ No newline at end of file
diff --git a/Ironwood/src/benchmark_bmm.py b/Ironwood/src/benchmark_bmm.py
new file mode 100644
index 00000000..40046821
--- /dev/null
+++ b/Ironwood/src/benchmark_bmm.py
@@ -0,0 +1,134 @@
+"""
+Benchmarks bmm in various flavors.
+Considered ops:
+1. bmm
+"""
+
+import os
+from typing import Any, Dict
+
+# pylint: disable=g-importing-member
+from benchmark_utils import (
+    iteration_timeit,
+    multiple_iteration_timeit_from_trace,
+    ShardingStrategy,
+    get_lhs_named_shading,
+    get_rhs_named_shading,
+    get_output_named_shading,
+    get_out_sharding,
+    create_mesh,
+    handle_based_on_sharding,
+    unified_flops_metrics,
+    str_to_dtype,
+    get_peak_flops_multiplier
+)
+from common import MARKER
+import jax
+from jax.experimental.shard_map import shard_map
+import jax.numpy as jnp
+from jax.sharding import NamedSharding
+from jax.sharding import PartitionSpec as P
+
+
+# pylint: disable=g-importing-member
+
+os.environ["LIBTPU_INIT_ARGS"] = (  # assigned at import time; replaces any LIBTPU_INIT_ARGS already in the environment
+    "--xla_tpu_enable_async_collective_fusion=true "
+    "--xla_tpu_enable_async_collective_fusion_fuse_all_gather=true "
+    "--xla_tpu_enable_async_collective_fusion_multiple_steps=true "
+    "--xla_tpu_overlap_compute_collective_tc=true "
+    "--xla_enable_async_all_gather=true "
+    "--xla_enable_async_collective_permute=true "
+    "--xla_tpu_enable_all_experimental_scheduler_features=true "
+    "--xla_tpu_accumulate_into_mrb=true "
+    "--xla_tpu_scoped_vmem_limit_kib=65536 "
+    "--xla_tpu_vmem_scavenging_mode=NONE "
+    "--xla_tpu_dvfs_p_state=7"
+)
+
+TRACE_BASE_DIR = None  # not referenced anywhere else in this file
+METRICS_JSONL_DIR = None  # not referenced anywhere else in this file
+SHARDING_STRATEGY = ShardingStrategy.NO_SHARDING  # forwarded to handle_based_on_sharding when computing metrics
+SEED = 0  # seeds jax.random.key() in single_device_bmm
+PEAK_FLOPS_PER_DEVICE = 2307  # TFLOP/s for single core(device) of FP8; NOTE(review): passed as-is for every dtype in this file - confirm any per-dtype scaling happens downstream
+
+def single_device_bmm(
+    b: int,
+    m: int,
+    k: int,
+    n: int,
+    dtype: jnp.dtype = jax.numpy.float8_e4m3fn,
+    num_runs: int = 1,
+    trace_dir: str = None,
+) -> Dict[str, Any]:
+    """Benchmarks OUT<B, M, N>:BF16 = IN0<B, M, K> x IN1<B, K, N>, both inputs in `dtype` (FP8 by default). Accumulation is FP32."""
+
+    def f(x, y):
+        with jax.named_scope(MARKER):
+            acc = jax.numpy.einsum(
+                "bij,bjk->bik", x, y, preferred_element_type=jnp.float32
+            )
+            return acc.astype(jnp.bfloat16)
+
+    jit_sharded_f = jax.jit(f)
+
+    lhs_shape = (b, m, k)
+    rhs_shape = (b, k, n)
+
+    lhs_dtype = dtype
+    rhs_dtype = dtype
+
+    key = jax.random.key(SEED)
+
+    def data_generator():
+        """Returns a fresh (lhs, rhs) pair of random normal arrays, advancing the outer PRNG key."""
+        nonlocal key  # Use and update the outer 'key'
+        key, key_lhs, key_rhs = jax.random.split(key, 3)
+
+        # Draw random normal data with jax.random and cast it to the requested dtype
+        lhs_host = jax.random.normal(key_lhs, lhs_shape).astype(lhs_dtype)
+        rhs_host = jax.random.normal(key_rhs, rhs_shape).astype(rhs_dtype)
+
+        # NOTE(review): no explicit jax.device_put is done here; the arrays are returned as produced above
+
+        return (lhs_host, rhs_host)
+    
+    # Run the benchmark
+
+    # num_runs is forwarded unchanged as `tries` below
+
+    dtype_str = dtype.dtype.name  # assumes `dtype` exposes `.dtype` (e.g. a jnp scalar type) - TODO confirm against caller
+    time_ms_list = multiple_iteration_timeit_from_trace(
+        jit_sharded_f,
+        data_generator,
+        matrix_dim=f"{dtype_str}_{b}x{m}x{n}x{k}",  # label order is b, m, n, k
+        tries=num_runs,
+        task="single_device_bmm",
+        trace_dir=trace_dir,
+    )
+
+    return {"time_ms_list": time_ms_list}
+
+
+def single_device_bmm_calculate_metrics(
+    b: int,
+    m: int,
+    k: int,
+    n: int,
+    dtype: jnp.dtype,
+    time_ms_list: list[float],
+) -> Dict[str, Any]:
+    # Derive FLOPs-based metrics for the batched matmul from the measured time_ms_list
+    total_flops = 2 * b * m * k * n  # 2 FLOPs (one multiply, one add) per (b, m, k, n) index
+    total_flops, total_flops_all_devices = handle_based_on_sharding(
+        total_flops, SHARDING_STRATEGY
+    )
+    return unified_flops_metrics(
+        m,
+        n,
+        k,
+        time_ms_list,
+        total_flops,
+        total_flops_all_devices,
+        PEAK_FLOPS_PER_DEVICE,
+    )
diff --git a/Ironwood/src/run_benchmark.py b/Ironwood/src/run_benchmark.py
index 5b3c3b7f..efef1dcc 100644
--- a/Ironwood/src/run_benchmark.py
+++ b/Ironwood/src/run_benchmark.py
@@ -32,6 +32,10 @@
     "send_recv": "benchmark_send_recv.send_recv_benchmark",
 }
 
+BMM_BENCHMARK_MAP = {
+    "single_device_bmm": "benchmark_bmm.single_device_bmm",
+}
+
 MATMUL_BENCHMARK_MAP = {
     "naive_matmul": "benchmark_matmul.naive_matmul",
     "single_host_naive_matmul": "benchmark_matmul.single_host_naive_matmul",
@@ -99,6 +103,7 @@
     "host_device": "benchmark_host_device.benchmark_host_device",
 }
 BENCHMARK_MAP = {}
+BENCHMARK_MAP.update(BMM_BENCHMARK_MAP)
 BENCHMARK_MAP.update(COLLECTIVE_BENCHMARK_MAP)
 BENCHMARK_MAP.update(MATMUL_BENCHMARK_MAP)
 BENCHMARK_MAP.update(CONVOLUTION_BENCHMARK_MAP)

From ef5ad1c41585ac570fa7ab199bdb840e6aa9c80c Mon Sep 17 00:00:00 2001
From: Leonard Chan <leoch@google.com>
Date: Wed, 4 Feb 2026 09:46:14 +0000
Subject: [PATCH 34/88] Add --numactl_bind flag to H2D benchmark script

---
 Ironwood/scripts/run_host_device_benchmark.sh | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/Ironwood/scripts/run_host_device_benchmark.sh b/Ironwood/scripts/run_host_device_benchmark.sh
index 1275160b..607244c2 100755
--- a/Ironwood/scripts/run_host_device_benchmark.sh
+++ b/Ironwood/scripts/run_host_device_benchmark.sh
@@ -4,12 +4,14 @@
 CONFIG_DIR="Ironwood/configs/host_device"
 SPECIFIC_CONFIG=""
 INTERLEAVED=false
+NUMACTL_BIND=false
 
 # Helper function for usage
 usage() {
     echo "Usage: $0 [OPTIONS]"
     echo "Options:"
     echo "  --config <path>       Path to specific config file (optional)"
+    echo "  --numactl_bind        Run with numactl --cpunodebind=0 --membind=0"
     echo "  --interleaved         Run with numactl --interleave=all"
     echo "  --help                Show this help message"
     exit 1
@@ -20,6 +22,7 @@ while [[ "$#" -gt 0 ]]; do
     case $1 in
         --config) SPECIFIC_CONFIG="$2"; shift ;;
         --interleaved) INTERLEAVED=true ;;
+        --numactl_bind) NUMACTL_BIND=true ;;
         --help) usage ;;
         *) echo "Unknown parameter passed: $1"; usage ;;
     esac
@@ -33,6 +36,7 @@ echo "********************************************************"
 echo ""
 echo "Configuration:"
 echo "    Interleaved: $INTERLEAVED"
+echo "    numactl Bound: $NUMACTL_BIND"
 echo ""
 
 if [ -n "$SPECIFIC_CONFIG" ]; then
@@ -44,6 +48,11 @@ else
     shopt -u nullglob
 fi
 
+if [[ "$INTERLEAVED" = true && "$NUMACTL_BIND" = true ]]; then
+    echo "Only one of --interleaved and --numactl_bind is allowed to be set at once."
+    exit 1
+fi
+
 if [ ${#CONFIGS[@]} -eq 0 ]; then
     echo "No configuration files found!"
     exit 1
@@ -61,6 +70,14 @@ for CONFIG_FILE in "${CONFIGS[@]}"; do
             echo "Warning: numactl not found. Running without interleaving."
             $CMD
         fi
+    elif [ "$NUMACTL_BIND" = true ]; then
+        if command -v numactl &> /dev/null; then
+            echo "Running with numactl --cpunodebind=0 --membind=0"
+            numactl --cpunodebind=0 --membind=0 $CMD
+        else
+            echo "Warning: numactl not found. Running without binding."
+            $CMD
+        fi
     else
         $CMD
     fi

From be12de015f3ba0014f0c5d7e9d928d06a655fcc4 Mon Sep 17 00:00:00 2001
From: Leonard Chan <leoch@google.com>
Date: Wed, 4 Feb 2026 09:46:14 +0000
Subject: [PATCH 35/88] Add --numactl_bind flag to H2D benchmark script

---
 Ironwood/scripts/run_host_device_benchmark.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Ironwood/scripts/run_host_device_benchmark.sh b/Ironwood/scripts/run_host_device_benchmark.sh
index 607244c2..77abb52d 100755
--- a/Ironwood/scripts/run_host_device_benchmark.sh
+++ b/Ironwood/scripts/run_host_device_benchmark.sh
@@ -13,6 +13,7 @@ usage() {
     echo "  --config <path>       Path to specific config file (optional)"
     echo "  --numactl_bind        Run with numactl --cpunodebind=0 --membind=0"
     echo "  --interleaved         Run with numactl --interleave=all"
+    echo "  --numactl_bind        Run with numactl --cpunodebind=0 --membind=0"
     echo "  --help                Show this help message"
     exit 1
 }

From 21c694061a8e48ca5d3e1baa75ab91d8e3ca4572 Mon Sep 17 00:00:00 2001
From: Leonard Chan <leoch@google.com>
Date: Wed, 4 Feb 2026 09:46:14 +0000
Subject: [PATCH 36/88] Add --numactl_bind flag to H2D benchmark script

---
 Ironwood/scripts/run_host_device_benchmark.sh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/Ironwood/scripts/run_host_device_benchmark.sh b/Ironwood/scripts/run_host_device_benchmark.sh
index 77abb52d..13345ec6 100755
--- a/Ironwood/scripts/run_host_device_benchmark.sh
+++ b/Ironwood/scripts/run_host_device_benchmark.sh
@@ -11,7 +11,6 @@ usage() {
     echo "Usage: $0 [OPTIONS]"
     echo "Options:"
     echo "  --config <path>       Path to specific config file (optional)"
-    echo "  --numactl_bind        Run with numactl --cpunodebind=0 --membind=0"
     echo "  --interleaved         Run with numactl --interleave=all"
     echo "  --numactl_bind        Run with numactl --cpunodebind=0 --membind=0"
     echo "  --help                Show this help message"

From 7b090f1d96761ddd0e6d32efa3a49de25bed2649 Mon Sep 17 00:00:00 2001
From: Hong-Yi Lin <horacehylin@google.com>
Date: Wed, 4 Feb 2026 17:12:47 +0000
Subject: [PATCH 37/88] [Automation] Add BMM into automation script

---
 Ironwood/guides/automation/aggregator.py      | 21 ++++++-
 .../guides/automation/automation_launch.sh    |  1 +
 .../guides/automation/tpu7x-2x2x1-bmm.yaml    | 61 +++++++++++++++++++
 Ironwood/src/benchmark_bmm.py                 |  2 +
 Ironwood/src/benchmark_utils.py               |  1 +
 5 files changed, 85 insertions(+), 1 deletion(-)
 create mode 100644 Ironwood/guides/automation/tpu7x-2x2x1-bmm.yaml

diff --git a/Ironwood/guides/automation/aggregator.py b/Ironwood/guides/automation/aggregator.py
index bc5e5e24..5382ce30 100644
--- a/Ironwood/guides/automation/aggregator.py
+++ b/Ironwood/guides/automation/aggregator.py
@@ -29,6 +29,13 @@
         "tflops_per_sec_per_device_avg", "tflops_per_sec_per_device_min",
         "tflops_per_sec_per_device_max",
     ],
+    "bmm": [
+        "b", "m", "n", "k", "dtype", "step_time_ms_num_runs",
+        "tflops_per_sec_per_device_p50", "tflops_per_sec_per_device_p90",
+        "tflops_per_sec_per_device_p95", "tflops_per_sec_per_device_p99",
+        "tflops_per_sec_per_device_avg", "tflops_per_sec_per_device_min",
+        "tflops_per_sec_per_device_max",
+    ],
 }
 
 def download_from_gcs(bucket_path: str, local_dir: str):
@@ -86,15 +93,27 @@ def aggregate_gemm(directories: list[str], picked_columns: list[str]) -> pd.Data
             aggregated_df = pd.concat([aggregated_df, df[picked_columns].rename(columns={"step_time_ms_num_runs": "num_runs"})], ignore_index=True)
     return aggregated_df
 
+def aggregate_bmm(directories: list[str], picked_columns: list[str]) -> pd.DataFrame:
+    if len(directories) == 0:  # no input directories: nothing to aggregate
+        return None  # NOTE: returns None here, not an empty DataFrame, despite the return annotation
+    aggregated_df = pd.DataFrame()  # accumulator, extended once per TSV file
+    for directory in directories:
+        files = glob.glob(f"{directory}/*.tsv")  # only *.tsv files directly inside `directory` are read
+        for file in files:
+            df = pd.read_csv(file, sep='\t')  # tab-separated input; every name in picked_columns must exist in it
+            aggregated_df = pd.concat([aggregated_df, df[picked_columns].rename(columns={"step_time_ms_num_runs": "num_runs"})], ignore_index=True)
+    return aggregated_df
+
 aggregate_function = {
     "collectives": aggregate_collectives,
     "hbm": aggregate_hbm,
     "host_device": aggregate_host_device,
     "gemm": aggregate_gemm,
+    "bmm": aggregate_bmm,
 }
 
 def aggregate_results(bucket_path: str, local_dir: str):
-    categories = ["collectives", "hbm", "host_device", "gemm"]
+    categories = ["collectives", "hbm", "host_device", "gemm", "bmm"]
     directories = {}
     results = {}
     for category in categories:
diff --git a/Ironwood/guides/automation/automation_launch.sh b/Ironwood/guides/automation/automation_launch.sh
index 21af18ee..0ca76210 100755
--- a/Ironwood/guides/automation/automation_launch.sh
+++ b/Ironwood/guides/automation/automation_launch.sh
@@ -15,6 +15,7 @@ yaml_names=(
     "tpu7x-2x2x1-hbm.yaml"
     "tpu7x-2x2x1-host_device.yaml"
     "tpu7x-2x2x1-gemm.yaml"
+    "tpu7x-2x2x1-bmm.yaml"
     "tpu7x-2x2x1-collectives.yaml"
     "tpu7x-2x2x2-collectives.yaml"
     "tpu7x-2x2x4-collectives.yaml"
diff --git a/Ironwood/guides/automation/tpu7x-2x2x1-bmm.yaml b/Ironwood/guides/automation/tpu7x-2x2x1-bmm.yaml
new file mode 100644
index 00000000..1b5b9774
--- /dev/null
+++ b/Ironwood/guides/automation/tpu7x-2x2x1-bmm.yaml
@@ -0,0 +1,61 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: headless-svc-${JOB_NAME}
+spec:
+  clusterIP: None
+  selector:
+    job-name: ${JOB_NAME}
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: ${JOB_NAME}
+  labels:
+    kueue.x-k8s.io/queue-name: user-queue-2x2x1
+spec:
+  completionMode: Indexed
+  suspend: true
+  parallelism: 1
+  completions: 1
+  backoffLimit: 0
+  template:
+    spec:
+      subdomain: headless-svc-${JOB_NAME}
+      serviceAccountName: ${GCS_SA_NAME}
+      restartPolicy: Never
+      nodeSelector:
+        cloud.google.com/gke-tpu-accelerator: tpu7x
+        cloud.google.com/gke-tpu-topology: 2x2x1
+      containers:
+      - name: jax-tpu
+        image: python:3.12
+        securityContext:
+          privileged: false
+        env:
+        - name: JAX_PLATFORMS
+          value: "tpu,cpu"
+        - name: TPU_VMODULE
+          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
+        - name: XLA_IR_DEBUG
+          value: "1"
+        - name: XLA_HLO_DEBUG
+          value: "1"
+        command:
+        - bash
+        - -c
+        - |
+          set -ex
+
+          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
+          cd accelerator-microbenchmarks
+          git checkout tpu7x-auto
+          pip install -r requirements.txt
+
+          GCS_BUCKET_DIR=${GCS_PATH}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/bmm/single_device_bmm.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+        resources:
+          requests:
+            google.com/tpu: 4
+          limits:
+            google.com/tpu: 4
\ No newline at end of file
diff --git a/Ironwood/src/benchmark_bmm.py b/Ironwood/src/benchmark_bmm.py
index 40046821..264daf2b 100644
--- a/Ironwood/src/benchmark_bmm.py
+++ b/Ironwood/src/benchmark_bmm.py
@@ -131,4 +131,6 @@ def single_device_bmm_calculate_metrics(
         total_flops,
         total_flops_all_devices,
         PEAK_FLOPS_PER_DEVICE,
+        dtype=dtype.dtype.name,
+        b=b,
     )
diff --git a/Ironwood/src/benchmark_utils.py b/Ironwood/src/benchmark_utils.py
index fa1fb81c..60fc2d14 100644
--- a/Ironwood/src/benchmark_utils.py
+++ b/Ironwood/src/benchmark_utils.py
@@ -1111,6 +1111,7 @@ def unified_flops_metrics(
     total_flops_all_devices: int,
     peak_TFLOPS_per_device: float,
     dtype: str = None,
+    b: int = None,
 ) -> Dict[str, Any]:
     """Calculates the metrics for the naive matmul benchmark."""
     # Build dictionary of all the parameters in the function

From a86475d6a50bb644617d6ee9e63427072526fc61 Mon Sep 17 00:00:00 2001
From: Leonard Chan <leoch@google.com>
Date: Thu, 5 Feb 2026 00:15:58 +0000
Subject: [PATCH 38/88] Add baseline pipelined flow to H2D benchmark

---
 Ironwood/configs/host_device/host_device.yaml |   3 +-
 Ironwood/src/benchmark_host_device.py         | 126 ++++++++++++++----
 2 files changed, 103 insertions(+), 26 deletions(-)

diff --git a/Ironwood/configs/host_device/host_device.yaml b/Ironwood/configs/host_device/host_device.yaml
index 0b48800c..ff97df1b 100644
--- a/Ironwood/configs/host_device/host_device.yaml
+++ b/Ironwood/configs/host_device/host_device.yaml
@@ -3,7 +3,8 @@ benchmarks:
   num_runs: 20
   benchmark_sweep_params:
   - {
-      data_size_mib_list: [1, 16, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768]
+      h2d_type: ["simple", "pipelined"],
+      data_size_mib_list: [1, 16, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768],
     }
   csv_path: "../microbenchmarks/host_device"
   trace_dir: "../microbenchmarks/host_device/trace"
diff --git a/Ironwood/src/benchmark_host_device.py b/Ironwood/src/benchmark_host_device.py
index 16352e2a..1d72b5eb 100644
--- a/Ironwood/src/benchmark_host_device.py
+++ b/Ironwood/src/benchmark_host_device.py
@@ -5,7 +5,7 @@
 from typing import Any, Dict, Tuple, List
 
 import jax
-from jax import sharding
+from jax import numpy as jnp
 import numpy as np
 from benchmark_utils import MetricsStatistics
 
@@ -23,8 +23,9 @@ def benchmark_host_device(
     data_size_mib: int,
     num_runs: int = 100,
     trace_dir: str = None,
+    h2d_type: str = "simple",
 ) -> Dict[str, Any]:
-    """Benchmarks H2D/D2H transfer using simple device_put/device_get."""
+    """Benchmarks H2D/D2H transfer using device_put/device_get."""
     
     num_elements = 1024 * 1024 * data_size_mib // np.dtype(np.float32).itemsize
     
@@ -32,8 +33,13 @@ def benchmark_host_device(
     column = 128
     host_data = np.random.normal(size=(num_elements // column, column)).astype(np.float32)
     
+    # Used in pipelined flow
+    # TODO: turn into a param
+    num_devices_to_perform_h2d = 1
+    target_devices = jax.devices()[:num_devices_to_perform_h2d]
+
     print(
-        f"Benchmarking Transfer with Data Size: {data_size_mib} MB for {num_runs} iterations",
+        f"Benchmarking Transfer with Data Size: {data_size_mib} MB for {num_runs} iterations with {h2d_type=}",
         flush=True
     )
 
@@ -65,29 +71,98 @@ def benchmark_host_device(
             
             with step_context:
                  # H2D
-                t0 = time.perf_counter()
-                
-                # Simple device_put
-                device_array = jax.device_put(host_data)
-                device_array.block_until_ready()
-                
-                t1 = time.perf_counter()
-                h2d_perf.append((t1 - t0) * 1000)
-                
-                # Verify H2D shape
-                assert device_array.shape == host_data.shape
-                
-                # D2H
-                t2 = time.perf_counter()
-                
-                # Simple device_get
-                # Note: device_get returns a numpy array (copy)
-                _ = jax.device_get(device_array)
-                
-                t3 = time.perf_counter()
-                d2h_perf.append((t3 - t2) * 1000)
+                if h2d_type == "simple":
+                    t0 = time.perf_counter()
+                    # Simple device_put
+                    device_array = jax.device_put(host_data)
+                    device_array.block_until_ready()
+                    t1 = time.perf_counter()
+                    
+                    # Verify H2D shape
+                    assert device_array.shape == host_data.shape
+
+                    h2d_perf.append((t1 - t0) * 1000)
                 
-                device_array.delete()
+                    # D2H
+                    t2 = time.perf_counter()
+                    
+                    # Simple device_get
+                    # Note: device_get returns a numpy array (copy)
+                    _ = jax.device_get(device_array)
+                    
+                    t3 = time.perf_counter()
+                    d2h_perf.append((t3 - t2) * 1000)
+                    
+                    device_array.delete()
+                elif h2d_type == "pipelined":
+                    target_chunk_size_mib = 16  # Sweet spot from profiling
+                    num_devices = len(target_devices)
+
+                    tensors_on_device = []
+                    
+                    # Calculate chunks per device
+                    data_per_dev = data_size_mib / num_devices
+                    chunks_per_dev = int(data_per_dev / target_chunk_size_mib)
+                    chunks_per_dev = max(1, chunks_per_dev)
+
+                    chunks = np.array_split(host_data, chunks_per_dev * num_devices, axis=0)
+
+                    t0 = time.perf_counter()
+                    if chunks_per_dev > 1:    
+                        # We need to map chunks to the correct device
+                        # This simple example assumes chunks are perfectly divisible and ordered
+                        # In production, use `jax.sharding` mesh logic for complex layouts
+
+                        # approach 1: simple for loop
+                        for idx, chunk in enumerate(chunks):
+                            if num_devices > 1:
+                                dev = target_devices[idx % num_devices]
+                            else:
+                                dev = target_devices[0]
+                            tensors_on_device.append(jax.device_put(chunk, dev))
+                        # Re-assemble array
+                        result = jnp.vstack(tensors_on_device)
+                        # Wait for all chunks to be transferred
+                        result.block_until_ready()
+
+                        # approach 2: generator (slightly less overhead)
+                        # def chunk_generator(num_devices, chunks_per_dev):
+                        #     for n in range(chunks_per_dev):
+                        #         for d in range(num_devices):
+                        #             # 1. Get the specific small chunk
+                        #             chunk = chunks[d*chunks_per_dev+n]
+
+                        #             # 2. Trigger an individual DMA transfer for this specific chunk
+                        #             # This is where NUMA-local memory access matters
+                        #             yield jax.device_put(chunk, target_devices[d])
+
+                        # # Re-assemble array
+                        # result = jnp.vstack(list(chunk_generator(num_devices, chunks_per_dev)))
+                        # # Wait for all chunks to be transferred
+                        # result.block_until_ready()
+                    else:
+                        print(f"Warning: {data_size_mib=} is not larger than {target_chunk_size_mib=}, falling back to standard JAX put.")
+                        # Fallback to standard JAX put for small data
+                        result = jax.device_put(host_data, target_devices[0])
+                        result.block_until_ready()
+
+                    t1 = time.perf_counter()
+                    h2d_perf.append((t1 - t0) * 1000)
+
+                    # D2H
+                    t2 = time.perf_counter()
+                    # Simple device_get
+                    # Note: device_get returns a numpy array (copy)
+                    _ = jax.device_get(result)
+
+                    t3 = time.perf_counter()
+                    if not np.allclose(result, host_data):
+                        print("pipelined result not equal to host_data")
+                    d2h_perf.append((t3 - t2) * 1000)
+
+                    for r in tensors_on_device:
+                        r.delete()
+                    del tensors_on_device
 
     return {
         "H2D_Bandwidth_ms": h2d_perf,
@@ -98,6 +173,7 @@ def benchmark_host_device_calculate_metrics(
     data_size_mib: int,
     H2D_Bandwidth_ms: List[float],
     D2H_Bandwidth_ms: List[float],
+    h2d_type: str = "simple",
 ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
     """Calculates metrics for Host-Device transfer."""
     params = locals().items()

From 0fe95cce4fbd2cf9256dca509754823604aef1a3 Mon Sep 17 00:00:00 2001
From: "Amy (Yu-Hsuan) Lin" <amylin@google.com>
Date: Thu, 5 Feb 2026 11:05:03 +0800
Subject: [PATCH 39/88] Correct fp4 tensor size calculation (#99)

The new utility will use jnp.finfo and jnp.iinfo to determine the accurate bit
width of any dtype, ensuring correct bandwidth metrics for current and future
sub-byte types (like int4 or float4).
---
 Ironwood/src/benchmark_collectives.py |  5 +++--
 Ironwood/src/benchmark_hbm.py         |  3 ++-
 Ironwood/src/benchmark_send_recv.py   |  7 ++++---
 Ironwood/src/benchmark_utils.py       | 12 ++++++++++++
 4 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/Ironwood/src/benchmark_collectives.py b/Ironwood/src/benchmark_collectives.py
index e142f59b..8e8399d2 100644
--- a/Ironwood/src/benchmark_collectives.py
+++ b/Ironwood/src/benchmark_collectives.py
@@ -11,6 +11,7 @@
 from benchmark_utils import MetricsStatistics
 from benchmark_utils import multiple_iteration_timeit_from_trace
 from benchmark_utils import ShardingStrategy
+from benchmark_utils import get_real_dtype_bytes
 from common import MARKER
 import jax
 from jax import core
@@ -72,7 +73,7 @@ def get_metrics_helper(
       for key, value in params
       if value is not None and key not in exclude_keys
   }
-  metadata["dtype"] = metadata["dtype"].dtype.itemsize
+  metadata["dtype"] = get_real_dtype_bytes(metadata["dtype"].dtype)
   return metadata
 
 
@@ -99,7 +100,7 @@ def unified_ici_collectives_metrics(
 
   input_num_elements = matrix_shape[0] * matrix_shape[1] * matrix_shape[2]
   dtype_name = dtype.dtype.name
-  dtype_bytes = dtype.dtype.itemsize
+  dtype_bytes = get_real_dtype_bytes(dtype.dtype)
   if xla_output:
     xla_output_json = json.loads(xla_output)
     hlo_input_shape = xla_output_json.get("hlo_input_shape")
diff --git a/Ironwood/src/benchmark_hbm.py b/Ironwood/src/benchmark_hbm.py
index 53744b5d..5e6574ac 100644
--- a/Ironwood/src/benchmark_hbm.py
+++ b/Ironwood/src/benchmark_hbm.py
@@ -6,6 +6,7 @@
 from benchmark_utils import (
     MetricsStatistics,
     multiple_iteration_timeit_from_trace,
+    get_real_dtype_bytes,
 )
 from common import MARKER
 import jax
@@ -76,7 +77,7 @@ def single_device_hbm_copy_calculate_metrics(
     metrics = {}
 
     # Calculate throughput.
-    tensor_size_bytes = num_elements * dtype.dtype.itemsize
+    tensor_size_bytes = num_elements * get_real_dtype_bytes(dtype.dtype)
 
     tensor_size_gbytes = (tensor_size_bytes * 2) / 10**9
     time_statistics = MetricsStatistics(
diff --git a/Ironwood/src/benchmark_send_recv.py b/Ironwood/src/benchmark_send_recv.py
index 90950007..c7dd5db3 100644
--- a/Ironwood/src/benchmark_send_recv.py
+++ b/Ironwood/src/benchmark_send_recv.py
@@ -8,6 +8,7 @@
 import jax.sharding
 from benchmark_utils import (
     get_trace,
+    get_real_dtype_bytes,
 )
 from common import MARKER
 import tempfile
@@ -68,7 +69,7 @@ def get_metrics_helper(
         for key, value in params
         if value is not None and key not in exclude_keys
     }
-    metadata['dtype'] = metadata['dtype'].dtype.itemsize
+    metadata['dtype'] = get_real_dtype_bytes(metadata['dtype'].dtype)
     return metadata
 
 
@@ -84,7 +85,7 @@ def send_recv_benchmark(
     device_count = jax.local_device_count()
     devices = mesh_utils.create_device_mesh((device_count,))
     mesh = jax.sharding.Mesh(devices, 'x')
-    item_size = jnp.dtype(dtype).itemsize
+    item_size = get_real_dtype_bytes(jnp.dtype(dtype))
     tensor_size_bytes = num_elements * item_size
     last_dim = tensor_size_bytes // (1 * 8 * item_size)
 
@@ -161,7 +162,7 @@ def send_recv_benchmark_calculate_metrics(
     metadata = get_metrics_helper(params)
     metrics = {}
 
-    tensor_size_bytes = num_elements * jnp.dtype(dtype).itemsize
+    tensor_size_bytes = num_elements * get_real_dtype_bytes(jnp.dtype(dtype))
     tensor_size_gbytes = tensor_size_bytes / 10**9
 
     metrics['runtime_ms (ms)'] = runtime_ms
diff --git a/Ironwood/src/benchmark_utils.py b/Ironwood/src/benchmark_utils.py
index 60fc2d14..4091aefb 100644
--- a/Ironwood/src/benchmark_utils.py
+++ b/Ironwood/src/benchmark_utils.py
@@ -28,6 +28,18 @@
 import jax.extend
 from tensorflow.tsl.profiler.protobuf import xplane_pb2
 
+
+def get_real_dtype_bytes(dtype) -> float:
+  """Returns the real byte size of a dtype, handling sub-byte types."""
+  try:
+    return jnp.finfo(dtype).bits / 8  # float-type bit width; `/` is true division, so this is always a float (32 bits -> 4.0)
+  except Exception:  # finfo did not accept dtype; presumably a non-float dtype -- TODO confirm which exceptions occur
+    try:
+      return jnp.iinfo(dtype).bits / 8  # integer-type bit width, likewise returned as a float
+    except Exception:  # neither finfo nor iinfo accepted dtype
+      return dtype.itemsize  # last resort: whatever the dtype reports as its itemsize
+
+
 # The dictionary to map a JAX (collective) function to its main HLO.
 TARGET_TASK_NAME_COLLECTIVES_MAP = {
     "all_to_all_ici_op": r"all-to-all.[0-9]+",

From 9a7c4afb29571d682733046ca0b2d51caf546437 Mon Sep 17 00:00:00 2001
From: Leonard Chan <leoch@google.com>
Date: Thu, 5 Feb 2026 00:15:58 +0000
Subject: [PATCH 40/88] Add baseline pipelined flow to H2D benchmark

---
 Ironwood/configs/host_device/host_device.yaml |  2 +-
 Ironwood/src/benchmark_host_device.py         | 92 ++++++++++++++-----
 2 files changed, 68 insertions(+), 26 deletions(-)

diff --git a/Ironwood/configs/host_device/host_device.yaml b/Ironwood/configs/host_device/host_device.yaml
index 8d572ed7..ff97df1b 100644
--- a/Ironwood/configs/host_device/host_device.yaml
+++ b/Ironwood/configs/host_device/host_device.yaml
@@ -3,8 +3,8 @@ benchmarks:
   num_runs: 20
   benchmark_sweep_params:
   - {
+      h2d_type: ["simple", "pipelined"],
       data_size_mib_list: [1, 16, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768],
-      h2d_type: ["simple", "pipelined"]
     }
   csv_path: "../microbenchmarks/host_device"
   trace_dir: "../microbenchmarks/host_device/trace"
diff --git a/Ironwood/src/benchmark_host_device.py b/Ironwood/src/benchmark_host_device.py
index 8a36a2c7..1d72b5eb 100644
--- a/Ironwood/src/benchmark_host_device.py
+++ b/Ironwood/src/benchmark_host_device.py
@@ -5,6 +5,7 @@
 from typing import Any, Dict, Tuple, List
 
 import jax
+from jax import numpy as jnp
 import numpy as np
 from benchmark_utils import MetricsStatistics
 
@@ -33,14 +34,9 @@ def benchmark_host_device(
     host_data = np.random.normal(size=(num_elements // column, column)).astype(np.float32)
     
     # Used in pipelined flow
+    # TODO: turn into a param
     num_devices_to_perform_h2d = 1
-    tensor_size = 4 * 1024 * 1024
-    target_device = jax.devices()[:num_devices_to_perform_h2d]
-    mesh = jax.sharding.Mesh(np.array(target_device), axis_names=["x"])
-    sharding = jax.sharding.NamedSharding(mesh, jax.sharding.PartitionSpec("x"))
-    pipelined_array = None
-    if h2d_type == "pipelined":
-        pipelined_array = np.random.normal(size=(tensor_size,)).astype(np.float32)
+    target_devices = jax.devices()[:num_devices_to_perform_h2d]
 
     print(
         f"Benchmarking Transfer with Data Size: {data_size_mib} MB for {num_runs} iterations with {h2d_type=}",
@@ -99,28 +95,74 @@ def benchmark_host_device(
                     
                     device_array.delete()
                 elif h2d_type == "pipelined":
+                    target_chunk_size_mib = 16  # Sweet spot from profiling
+                    num_devices = len(target_devices)
+
                     tensors_on_device = []
-                    if data_size_mib * 1024 * 1024 < pipelined_array.nbytes:
-                        print(f"Warning: {data_size_mib=} is smaller than pipeline unit, no data will be transferred.")
-                    t0 = time.perf_counter()
-                    # Assume data_size_mib is total across devices for now
-                    bytes_left = 1024 * 1024 * data_size_mib
-                    while bytes_left >= pipelined_array.nbytes:
-                        with jax.profiler.StepTraceAnnotation("device_put", step_num=1):
-                            x_device = jax.device_put(pipelined_array, sharding)
-                            tensors_on_device.append(x_device)
-                            bytes_left -= pipelined_array.nbytes
                     
-                    total_bytes_transferred = 0
-                    for tensor in tensors_on_device:
-                        tensor.block_until_ready()
-                        total_bytes_transferred += tensor.nbytes
-                        tensor.delete()
-                    t1 = time.perf_counter()
+                    # Calculate chunks per device
+                    data_per_dev = data_size_mib / num_devices
+                    chunks_per_dev = int(data_per_dev / target_chunk_size_mib)
+                    chunks_per_dev = max(1, chunks_per_dev)
+
+                    chunks = np.array_split(host_data, chunks_per_dev * num_devices, axis=0)
 
+                    t0 = time.perf_counter()
+                    if chunks_per_dev > 1:    
+                        # We need to map chunks to the correct device
+                        # This simple example assumes chunks are perfectly divisible and ordered
+                        # In production, use `jax.sharding` mesh logic for complex layouts
+
+                        # approach 1: simple for loop
+                        for idx, chunk in enumerate(chunks):
+                            if num_devices > 1:
+                                dev = target_devices[idx % num_devices]
+                            else:
+                                dev = target_devices[0]
+                            tensors_on_device.append(jax.device_put(chunk, dev))
+                        # Re-assemble array
+                        result = jnp.vstack(tensors_on_device)
+                        # Wait for all chunks to be transferred
+                        result.block_until_ready()
+
+                        # approach 2: generator (slightly less overhead)
+                        # def chunk_generator(num_devices, chunks_per_dev):
+                        #     for n in range(chunks_per_dev):
+                        #         for d in range(num_devices):
+                        #             # 1. Get the specific small chunk
+                        #             chunk = chunks[d*chunks_per_dev+n]
+
+                        #             # 2. Trigger an individual DMA transfer for this specific chunk
+                        #             # This is where NUMA-local memory access matters
+                        #             yield jax.device_put(chunk, target_devices[d])
+
+                        # # Re-assemble array
+                        # result = jnp.vstack(list(chunk_generator(num_devices, chunks_per_dev)))
+                        # # Wait for all chunks to be transferred
+                        # result.block_until_ready()
+                    else:
+                        print(f"Warning: {data_size_mib=} is not larger than {target_chunk_size_mib=}, falling back to standard JAX put.")
+                        # Fallback to standard JAX put for small data
+                        result = jax.device_put(host_data, target_devices[0])
+                        result.block_until_ready()
+
+                    t1 = time.perf_counter()
                     h2d_perf.append((t1 - t0) * 1000)
-                    # Implement D2H at a later time after we establish H2D
-                    d2h_perf.append(0)
+
+                    # D2H
+                    t2 = time.perf_counter()
+                    # Simple device_get
+                    # Note: device_get returns a numpy array (copy)
+                    _ = jax.device_get(result)
+
+                    t3 = time.perf_counter()
+                    if not np.allclose(result, host_data):
+                        print("pipelined result not equal to host_data")
+                    d2h_perf.append((t3 - t2) * 1000)
+
+                    for r in tensors_on_device:
+                        r.delete()
+                    del tensors_on_device
 
     return {
         "H2D_Bandwidth_ms": h2d_perf,

From 90eb07a8b5f7b02eb71cdd76cfae799a6cce9428 Mon Sep 17 00:00:00 2001
From: Leonard Chan <leoch@google.com>
Date: Thu, 5 Feb 2026 02:52:30 +0000
Subject: [PATCH 41/88] Add --numactl_binding flag to host_device YAMLs

---
 Ironwood/guides/automation/tpu7x-2x2x1-host_device.yaml      | 2 +-
 Ironwood/guides/host_device/tpu7x-host-device-benchmark.yaml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Ironwood/guides/automation/tpu7x-2x2x1-host_device.yaml b/Ironwood/guides/automation/tpu7x-2x2x1-host_device.yaml
index a6b8febd..1084fdf9 100644
--- a/Ironwood/guides/automation/tpu7x-2x2x1-host_device.yaml
+++ b/Ironwood/guides/automation/tpu7x-2x2x1-host_device.yaml
@@ -53,7 +53,7 @@ spec:
           pip install -r requirements.txt
 
           GCS_BUCKET_DIR=${GCS_PATH}
-          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/host_device/host_device.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/host_device/host_device.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} --numactl_binding
         resources:
           requests:
             google.com/tpu: 4
diff --git a/Ironwood/guides/host_device/tpu7x-host-device-benchmark.yaml b/Ironwood/guides/host_device/tpu7x-host-device-benchmark.yaml
index 8c027c01..83d06065 100644
--- a/Ironwood/guides/host_device/tpu7x-host-device-benchmark.yaml
+++ b/Ironwood/guides/host_device/tpu7x-host-device-benchmark.yaml
@@ -24,7 +24,7 @@ spec:
       cd accelerator-microbenchmarks
       pip install -r requirements.txt
 
-      bash ./Ironwood/scripts/run_host_device_benchmark.sh --config Ironwood/configs/host_device/host_device.yaml
+      bash ./Ironwood/scripts/run_host_device_benchmark.sh --config Ironwood/configs/host_device/host_device.yaml --numactl_binding
 
     resources:
       requests:

From 040002fdcd635bc5a350d1567af716feb04149f5 Mon Sep 17 00:00:00 2001
From: Leonard Chan <leoch@google.com>
Date: Thu, 5 Feb 2026 06:07:41 +0000
Subject: [PATCH 42/88] Add h2d_type column to H2D/D2H output

---
 Ironwood/src/benchmark_host_device.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Ironwood/src/benchmark_host_device.py b/Ironwood/src/benchmark_host_device.py
index 1d72b5eb..d80c2819 100644
--- a/Ironwood/src/benchmark_host_device.py
+++ b/Ironwood/src/benchmark_host_device.py
@@ -184,6 +184,7 @@ def benchmark_host_device_calculate_metrics(
     }
     metadata = {k: v for k, v in params if k in metadata_keys}
     metadata["dtype"] = "float32"
+    metadata["h2d_type"] = h2d_type
     
     metrics = {}
     

From 38e8038d3ecb4652fe6d4f2f0baa15f9d58e7159 Mon Sep 17 00:00:00 2001
From: Leonard Chan <leoch@google.com>
Date: Thu, 5 Feb 2026 06:21:56 +0000
Subject: [PATCH 43/88] Revert "Add baseline pipelined flow to H2D benchmark"

This reverts commit a86475d6a50bb644617d6ee9e63427072526fc61.
---
 Ironwood/configs/host_device/host_device.yaml |   3 +-
 Ironwood/src/benchmark_host_device.py         | 126 ++++--------------
 2 files changed, 26 insertions(+), 103 deletions(-)

diff --git a/Ironwood/configs/host_device/host_device.yaml b/Ironwood/configs/host_device/host_device.yaml
index ff97df1b..0b48800c 100644
--- a/Ironwood/configs/host_device/host_device.yaml
+++ b/Ironwood/configs/host_device/host_device.yaml
@@ -3,8 +3,7 @@ benchmarks:
   num_runs: 20
   benchmark_sweep_params:
   - {
-      h2d_type: ["simple", "pipelined"],
-      data_size_mib_list: [1, 16, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768],
+      data_size_mib_list: [1, 16, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768]
     }
   csv_path: "../microbenchmarks/host_device"
   trace_dir: "../microbenchmarks/host_device/trace"
diff --git a/Ironwood/src/benchmark_host_device.py b/Ironwood/src/benchmark_host_device.py
index d80c2819..0c7eacc5 100644
--- a/Ironwood/src/benchmark_host_device.py
+++ b/Ironwood/src/benchmark_host_device.py
@@ -5,7 +5,7 @@
 from typing import Any, Dict, Tuple, List
 
 import jax
-from jax import numpy as jnp
+from jax import sharding
 import numpy as np
 from benchmark_utils import MetricsStatistics
 
@@ -23,9 +23,8 @@ def benchmark_host_device(
     data_size_mib: int,
     num_runs: int = 100,
     trace_dir: str = None,
-    h2d_type: str = "simple",
 ) -> Dict[str, Any]:
-    """Benchmarks H2D/D2H transfer using device_put/device_get."""
+    """Benchmarks H2D/D2H transfer using simple device_put/device_get."""
     
     num_elements = 1024 * 1024 * data_size_mib // np.dtype(np.float32).itemsize
     
@@ -33,13 +32,8 @@ def benchmark_host_device(
     column = 128
     host_data = np.random.normal(size=(num_elements // column, column)).astype(np.float32)
     
-    # Used in pipelined flow
-    # TODO: turn into a param
-    num_devices_to_perform_h2d = 1
-    target_devices = jax.devices()[:num_devices_to_perform_h2d]
-
     print(
-        f"Benchmarking Transfer with Data Size: {data_size_mib} MB for {num_runs} iterations with {h2d_type=}",
+        f"Benchmarking Transfer with Data Size: {data_size_mib} MB for {num_runs} iterations",
         flush=True
     )
 
@@ -71,98 +65,29 @@ def benchmark_host_device(
             
             with step_context:
                  # H2D
-                if h2d_type == "simple":
-                    t0 = time.perf_counter()
-                    # Simple device_put
-                    device_array = jax.device_put(host_data)
-                    device_array.block_until_ready()
-                    t1 = time.perf_counter()
-                    
-                    # Verify H2D shape
-                    assert device_array.shape == host_data.shape
-
-                    h2d_perf.append((t1 - t0) * 1000)
+                t0 = time.perf_counter()
                 
-                    # D2H
-                    t2 = time.perf_counter()
-                    
-                    # Simple device_get
-                    # Note: device_get returns a numpy array (copy)
-                    _ = jax.device_get(device_array)
-                    
-                    t3 = time.perf_counter()
-                    d2h_perf.append((t3 - t2) * 1000)
-                    
-                    device_array.delete()
-                elif h2d_type == "pipelined":
-                    target_chunk_size_mib = 16  # Sweet spot from profiling
-                    num_devices = len(target_devices)
-
-                    tensors_on_device = []
-                    
-                    # Calculate chunks per device
-                    data_per_dev = data_size_mib / num_devices
-                    chunks_per_dev = int(data_per_dev / target_chunk_size_mib)
-                    chunks_per_dev = max(1, chunks_per_dev)
-
-                    chunks = np.array_split(host_data, chunks_per_dev * num_devices, axis=0)
-
-                    t0 = time.perf_counter()
-                    if chunks_per_dev > 1:    
-                        # We need to map chunks to the correct device
-                        # This simple example assumes chunks are perfectly divisible and ordered
-                        # In production, use `jax.sharding` mesh logic for complex layouts
-
-                        # approach 1: simple for loop
-                        for idx, chunk in enumerate(chunks):
-                            if num_devices > 1:
-                                dev = target_devices[idx % num_devices]
-                            else:
-                                dev = target_devices[0]
-                            tensors_on_device.append(jax.device_put(chunk, dev))
-                        # Re-assemble array
-                        result = jnp.vstack(tensors_on_device)
-                        # Wait for all chunks to be transferred
-                        result.block_until_ready()
-
-                        # approach 2: generator (slightly less overhead)
-                        # def chunk_generator(num_devices, chunks_per_dev):
-                        #     for n in range(chunks_per_dev):
-                        #         for d in range(num_devices):
-                        #             # 1. Get the specific small chunk
-                        #             chunk = chunks[d*chunks_per_dev+n]
-
-                        #             # 2. Trigger an individual DMA transfer for this specific chunk
-                        #             # This is where NUMA-local memory access matters
-                        #             yield jax.device_put(chunk, target_devices[d])
-
-                        # # Re-assemble array
-                        # result = jnp.vstack(list(chunk_generator(num_devices, chunks_per_dev)))
-                        # # Wait for all chunks to be transferred
-                        # result.block_until_ready()
-                    else:
-                        print(f"Warning: {data_size_mib=} is not larger than {target_chunk_size_mib=}, falling back to standard JAX put.")
-                        # Fallback to standard JAX put for small data
-                        result = jax.device_put(host_data, target_devices[0])
-                        result.block_until_ready()
-
-                    t1 = time.perf_counter()
-                    h2d_perf.append((t1 - t0) * 1000)
-
-                    # D2H
-                    t2 = time.perf_counter()
-                    # Simple device_get
-                    # Note: device_get returns a numpy array (copy)
-                    _ = jax.device_get(result)
-
-                    t3 = time.perf_counter()
-                    if not np.allclose(result, host_data):
-                        print("pipelined result not equal to host_data")
-                    d2h_perf.append((t3 - t2) * 1000)
-
-                    for r in tensors_on_device:
-                        r.delete()
-                    del tensors_on_device
+                # Simple device_put
+                device_array = jax.device_put(host_data)
+                device_array.block_until_ready()
+                
+                t1 = time.perf_counter()
+                h2d_perf.append((t1 - t0) * 1000)
+                
+                # Verify H2D shape
+                assert device_array.shape == host_data.shape
+                
+                # D2H
+                t2 = time.perf_counter()
+                
+                # Simple device_get
+                # Note: device_get returns a numpy array (copy)
+                _ = jax.device_get(device_array)
+                
+                t3 = time.perf_counter()
+                d2h_perf.append((t3 - t2) * 1000)
+                
+                device_array.delete()
 
     return {
         "H2D_Bandwidth_ms": h2d_perf,
@@ -173,7 +98,6 @@ def benchmark_host_device_calculate_metrics(
     data_size_mib: int,
     H2D_Bandwidth_ms: List[float],
     D2H_Bandwidth_ms: List[float],
-    h2d_type: str = "simple",
 ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
     """Calculates metrics for Host-Device transfer."""
     params = locals().items()

From 09b3331da8cf3cc6b166f300379e85baf3f68a44 Mon Sep 17 00:00:00 2001
From: Leonard Chan <leoch@google.com>
Date: Thu, 5 Feb 2026 06:31:07 +0000
Subject: [PATCH 44/88] Revert "Add --numactl_binding flag to host_device
 YAMLs"

This reverts commit bbb316347a5277bcf2af5a829b0c4b25653f236d.
---
 Ironwood/guides/automation/tpu7x-2x2x1-host_device.yaml      | 2 +-
 Ironwood/guides/host_device/tpu7x-host-device-benchmark.yaml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Ironwood/guides/automation/tpu7x-2x2x1-host_device.yaml b/Ironwood/guides/automation/tpu7x-2x2x1-host_device.yaml
index 1084fdf9..a6b8febd 100644
--- a/Ironwood/guides/automation/tpu7x-2x2x1-host_device.yaml
+++ b/Ironwood/guides/automation/tpu7x-2x2x1-host_device.yaml
@@ -53,7 +53,7 @@ spec:
           pip install -r requirements.txt
 
           GCS_BUCKET_DIR=${GCS_PATH}
-          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/host_device/host_device.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} --numactl_binding
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/host_device/host_device.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
         resources:
           requests:
             google.com/tpu: 4
diff --git a/Ironwood/guides/host_device/tpu7x-host-device-benchmark.yaml b/Ironwood/guides/host_device/tpu7x-host-device-benchmark.yaml
index 83d06065..8c027c01 100644
--- a/Ironwood/guides/host_device/tpu7x-host-device-benchmark.yaml
+++ b/Ironwood/guides/host_device/tpu7x-host-device-benchmark.yaml
@@ -24,7 +24,7 @@ spec:
       cd accelerator-microbenchmarks
       pip install -r requirements.txt
 
-      bash ./Ironwood/scripts/run_host_device_benchmark.sh --config Ironwood/configs/host_device/host_device.yaml --numactl_binding
+      bash ./Ironwood/scripts/run_host_device_benchmark.sh --config Ironwood/configs/host_device/host_device.yaml
 
     resources:
       requests:

From 462d771b657844ed689cd8133e5bc754403650f4 Mon Sep 17 00:00:00 2001
From: Leonard Chan <leoch@google.com>
Date: Thu, 5 Feb 2026 06:32:58 +0000
Subject: [PATCH 45/88] Revert "Add h2d_type column to H2D/D2H output"

This reverts commit bf5c79cf574359c52227938bcef47bc4d2392386.
---
 Ironwood/src/benchmark_host_device.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/Ironwood/src/benchmark_host_device.py b/Ironwood/src/benchmark_host_device.py
index 0c7eacc5..16352e2a 100644
--- a/Ironwood/src/benchmark_host_device.py
+++ b/Ironwood/src/benchmark_host_device.py
@@ -108,7 +108,6 @@ def benchmark_host_device_calculate_metrics(
     }
     metadata = {k: v for k, v in params if k in metadata_keys}
     metadata["dtype"] = "float32"
-    metadata["h2d_type"] = h2d_type
     
     metrics = {}
     

From 75c47aad66d8a556a3eef0595b9aa92a5b210907 Mon Sep 17 00:00:00 2001
From: Hong-Yi Lin <horacehylin@google.com>
Date: Thu, 5 Feb 2026 09:11:48 +0000
Subject: [PATCH 46/88] Add upload log for aggregated results

---
 Ironwood/guides/automation/aggregator.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Ironwood/guides/automation/aggregator.py b/Ironwood/guides/automation/aggregator.py
index 5382ce30..069a13a1 100644
--- a/Ironwood/guides/automation/aggregator.py
+++ b/Ironwood/guides/automation/aggregator.py
@@ -120,6 +120,7 @@ def aggregate_results(bucket_path: str, local_dir: str):
         directories[category] = sorted(glob.glob(f"{local_dir}/*/{category}/*", recursive=True))
         results[category] = aggregate_function[category](directories[category], columns_mapping[category])
         if results[category] is not None:
+            print(f"Writing {category} results to {bucket_path}/aggregated_results/{category}.tsv")
             results[category].to_csv(f"{bucket_path}/aggregated_results/{category}.tsv", index=False, sep='\t')
 
 if __name__ == "__main__":

From 6eb8ac60c3626dfe620b20538166c78e2a4cde24 Mon Sep 17 00:00:00 2001
From: Hong-Yi Lin <horacehylin@google.com>
Date: Thu, 5 Feb 2026 10:01:05 +0000
Subject: [PATCH 47/88] Update num_runs for collectives and matmul
 configuration

---
 .../configs/collectives/all_gather_tpu7x_2x2x1.yaml    |  4 ++--
 .../configs/collectives/all_gather_tpu7x_2x2x2.yaml    |  4 ++--
 .../configs/collectives/all_gather_tpu7x_2x2x4.yaml    |  4 ++--
 .../configs/collectives/all_gather_tpu7x_2x4x4.yaml    |  4 ++--
 .../configs/collectives/all_gather_tpu7x_4x4x4.yaml    |  4 ++--
 .../configs/collectives/all_gather_tpu7x_4x4x8.yaml    |  4 ++--
 .../configs/collectives/all_reduce_tpu7x_2x2x1.yaml    |  4 ++--
 .../configs/collectives/all_reduce_tpu7x_2x2x2.yaml    |  4 ++--
 .../configs/collectives/all_reduce_tpu7x_2x2x4.yaml    |  4 ++--
 .../configs/collectives/all_reduce_tpu7x_2x4x4.yaml    |  4 ++--
 .../configs/collectives/all_reduce_tpu7x_4x4x4.yaml    |  4 ++--
 .../configs/collectives/all_reduce_tpu7x_4x4x8.yaml    |  4 ++--
 .../configs/collectives/all_to_all_tpu7x_2x2x1.yaml    |  4 ++--
 .../configs/collectives/all_to_all_tpu7x_2x2x2.yaml    |  4 ++--
 .../configs/collectives/all_to_all_tpu7x_2x2x4.yaml    |  4 ++--
 .../configs/collectives/all_to_all_tpu7x_2x4x4.yaml    |  4 ++--
 .../configs/collectives/all_to_all_tpu7x_4x4x4.yaml    |  4 ++--
 .../configs/collectives/all_to_all_tpu7x_4x4x8.yaml    |  4 ++--
 Ironwood/configs/hbm/hbm.yaml                          | 10 +++++-----
 19 files changed, 41 insertions(+), 41 deletions(-)

diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml
index 9bc586a1..b0858716 100644
--- a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml
+++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml
@@ -1,8 +1,8 @@
 benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 20} # Parallel Replica
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x2", op_dimension: 1,  num_runs: 20} # Non-Parallel Replica
   trace_dir: "../microbenchmarks/all_gather_tpu7x_2x2x1"
   csv_path: "../microbenchmarks/all_gather_tpu7x_2x2x1"
   xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x2x1"
diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml
index b5be0c8d..ab282dec 100644
--- a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml
+++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml
@@ -1,8 +1,8 @@
 benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 20} # Parallel Replica
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x2", op_dimension: 1,  num_runs: 20} # Non-Parallel Replica
   trace_dir: "../microbenchmarks/all_gather_tpu7x_2x2x2"
   csv_path: "../microbenchmarks/all_gather_tpu7x_2x2x2"
   xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x2x2"
diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml
index 09b02979..d2f65afe 100644
--- a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml
+++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml
@@ -1,8 +1,8 @@
 benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 20} # Parallel Replica
+  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x2", op_dimension: 1,  num_runs: 20} # Non-Parallel Replica
   trace_dir: "../microbenchmarks/all_gather_tpu7x_2x2x4"
   csv_path: "../microbenchmarks/all_gather_tpu7x_2x2x4"
   xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x2x4"
diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml
index 4f6cf11a..35414ff3 100644
--- a/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml
+++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml
@@ -1,8 +1,8 @@
 benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 20} # Parallel Replica
+  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x2", op_dimension: 1,  num_runs: 20} # Non-Parallel Replica
   trace_dir: "../microbenchmarks/all_gather_tpu7x_2x4x4"
   csv_path: "../microbenchmarks/all_gather_tpu7x_2x4x4"
   xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x4x4"
diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml
index 77f3ed13..b561942d 100644
--- a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml
+++ b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml
@@ -1,8 +1,8 @@
 benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 20} # Parallel Replica
+  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x2", op_dimension: 1,  num_runs: 20} # Non-Parallel Replica
   trace_dir: "../microbenchmarks/all_gather_tpu7x_4x4x4"
   csv_path: "../microbenchmarks/all_gather_tpu7x_4x4x4"
   xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_4x4x4"
diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml
index 12743d61..5838cafd 100644
--- a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml
+++ b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml
@@ -1,8 +1,8 @@
 benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 20} # Parallel Replica
+  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x2", op_dimension: 1,  num_runs: 20} # Non-Parallel Replica
   trace_dir: "../microbenchmarks/all_gather_tpu7x_4x4x8"
   csv_path: "../microbenchmarks/all_gather_tpu7x_4x4x8"
   xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_4x4x8"
diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml
index f7389925..b713c549 100644
--- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml
+++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml
@@ -1,8 +1,8 @@
 benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 20} # Parallel Replica
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x2", op_dimension: 1,  num_runs: 20} # Non-Parallel Replica
   trace_dir: "../microbenchmarks/psum_tpu7x_2x2x1"
   csv_path: "../microbenchmarks/psum_tpu7x_2x2x1"
   xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x2x1"
diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml
index b2cb202c..165e0e72 100644
--- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml
+++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml
@@ -1,8 +1,8 @@
 benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 20} # Parallel Replica
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x2", op_dimension: 1,  num_runs: 20} # Non-Parallel Replica
   trace_dir: "../microbenchmarks/psum_tpu7x_2x2x2"
   csv_path: "../microbenchmarks/psum_tpu7x_2x2x2"
   xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x2x2"
diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml
index 946fd5ed..0002ae3a 100644
--- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml
+++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml
@@ -1,8 +1,8 @@
 benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 20} # Parallel Replica
+  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x2", op_dimension: 1,  num_runs: 20} # Non-Parallel Replica
   trace_dir: "../microbenchmarks/psum_tpu7x_2x2x4"
   csv_path: "../microbenchmarks/psum_tpu7x_2x2x4"
   xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x2x4"
diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml
index 613717cf..e5652a92 100644
--- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml
+++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml
@@ -1,8 +1,8 @@
 benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 20} # Parallel Replica
+  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x2", op_dimension: 1,  num_runs: 20} # Non-Parallel Replica
   trace_dir: "../microbenchmarks/psum_tpu7x_2x4x4"
   csv_path: "../microbenchmarks/psum_tpu7x_2x4x4"
   xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x4x4"
diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml
index 3f4822c0..137bae19 100644
--- a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml
+++ b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml
@@ -1,8 +1,8 @@
 benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 20} # Parallel Replica
+  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x2", op_dimension: 1,  num_runs: 20} # Non-Parallel Replica
   trace_dir: "../microbenchmarks/psum_tpu7x_4x4x4"
   csv_path: "../microbenchmarks/psum_tpu7x_4x4x4"
   xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_4x4x4"
diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml
index a14bbfe8..25758453 100644
--- a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml
+++ b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml
@@ -1,8 +1,8 @@
 benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 20} # Parallel Replica
+  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x2", op_dimension: 1,  num_runs: 20} # Non-Parallel Replica
   trace_dir: "../microbenchmarks/psum_tpu7x_4x4x8"
   csv_path: "../microbenchmarks/psum_tpu7x_4x4x8"
   xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_4x4x8"
diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml
index 96da2c38..a30a17cf 100644
--- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml
+++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml
@@ -1,8 +1,8 @@
 benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 20} # Parallel Replica
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x2", op_dimension: 1,  num_runs: 20} # Non-Parallel Replica
   trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x1"
   csv_path: "../microbenchmarks/all_to_all_tpu7x_2x2x1"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x1"
diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml
index 388a4468..01fb9b80 100644
--- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml
+++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml
@@ -1,8 +1,8 @@
 benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 20} # Parallel Replica
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x2", op_dimension: 1,  num_runs: 20} # Non-Parallel Replica
   trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x2"
   csv_path: "../microbenchmarks/all_to_all_tpu7x_2x2x2"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x2"
diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml
index e0cc48c9..43beeb27 100644
--- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml
+++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml
@@ -1,8 +1,8 @@
 benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 20} # Parallel Replica
+  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x2", op_dimension: 1,  num_runs: 20} # Non-Parallel Replica
   trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x4"
   csv_path: "../microbenchmarks/all_to_all_tpu7x_2x2x4"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x4"
diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml
index 5ae19b6e..614caa6b 100644
--- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml
+++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml
@@ -1,8 +1,8 @@
 benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 20} # Parallel Replica
+  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x2", op_dimension: 1,  num_runs: 20} # Non-Parallel Replica
   trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x4x4"
   csv_path: "../microbenchmarks/all_to_all_tpu7x_2x4x4"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x4x4"
diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml
index 4cc8f6bb..12dd149d 100644
--- a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml
+++ b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml
@@ -1,8 +1,8 @@
 benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 20} # Parallel Replica
+  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x2", op_dimension: 1,  num_runs: 20} # Non-Parallel Replica
   trace_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x4"
   csv_path: "../microbenchmarks/all_to_all_tpu7x_4x4x4"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x4"
diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml
index 212cd92d..32c63d74 100644
--- a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml
+++ b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml
@@ -1,8 +1,8 @@
 benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 20} # Parallel Replica
+  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x2", op_dimension: 1,  num_runs: 20} # Non-Parallel Replica
   trace_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x8"
   csv_path: "../microbenchmarks/all_to_all_tpu7x_4x4x8"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x8"
diff --git a/Ironwood/configs/hbm/hbm.yaml b/Ironwood/configs/hbm/hbm.yaml
index 0e42b2f0..02cbaf59 100644
--- a/Ironwood/configs/hbm/hbm.yaml
+++ b/Ironwood/configs/hbm/hbm.yaml
@@ -1,35 +1,35 @@
 benchmarks:
 - benchmark_name: "single_device_hbm_copy"
   benchmark_sweep_params:
-  - {num_elements_range: {start: 1048576, end: 4294967296, multiplier: 2}, dtype: "bfloat16", num_runs: 1}
+  - {num_elements_range: {start: 1048576, end: 4294967296, multiplier: 2}, dtype: "bfloat16", num_runs: 20}
   trace_dir: "../microbenchmarks/hbm_bfloat16"
   csv_path: "../microbenchmarks/hbm_bfloat16"
   xlml_metrics_dir: "../microbenchmarks/hbm_bfloat16"
   xla_dump_dir: "../microbenchmarks/hbm_bfloat16/hlo_graphs"
 - benchmark_name: "single_device_hbm_copy"
   benchmark_sweep_params:
-  - {num_elements_range: {start: 1048576, end: 4294967296, multiplier: 2}, dtype: "float32", num_runs: 1}
+  - {num_elements_range: {start: 1048576, end: 4294967296, multiplier: 2}, dtype: "float32", num_runs: 20}
   trace_dir: "../microbenchmarks/hbm_float32"
   csv_path: "../microbenchmarks/hbm_float32"
   xlml_metrics_dir: "../microbenchmarks/hbm_float32"
   xla_dump_dir: "../microbenchmarks/hbm_float32/hlo_graphs"
 - benchmark_name: "single_device_hbm_copy"
   benchmark_sweep_params:
-  - {num_elements_range: {start: 1048576, end: 4294967296, multiplier: 2}, dtype: "float8", num_runs: 1}
+  - {num_elements_range: {start: 1048576, end: 4294967296, multiplier: 2}, dtype: "float8", num_runs: 20}
   trace_dir: "../microbenchmarks/hbm_float8"
   csv_path: "../microbenchmarks/hbm_float8"
   xlml_metrics_dir: "../microbenchmarks/hbm_float8"
   xla_dump_dir: "../microbenchmarks/hbm_float8/hlo_graphs"
 - benchmark_name: "single_device_hbm_copy"
   benchmark_sweep_params:
-  - {num_elements_range: {start: 1048576, end: 4294967296, multiplier: 2}, dtype: "float16", num_runs: 1}
+  - {num_elements_range: {start: 1048576, end: 4294967296, multiplier: 2}, dtype: "float16", num_runs: 20}
   trace_dir: "../microbenchmarks/hbm_float16"
   csv_path: "../microbenchmarks/hbm_float16"
   xlml_metrics_dir: "../microbenchmarks/hbm_float16"
   xla_dump_dir: "../microbenchmarks/hbm_float16/hlo_graphs"
 - benchmark_name: "single_device_hbm_copy"
   benchmark_sweep_params:
-  - {num_elements_range: {start: 1048576, end: 4294967296, multiplier: 2}, dtype: "float4", num_runs: 1}
+  - {num_elements_range: {start: 1048576, end: 4294967296, multiplier: 2}, dtype: "float4", num_runs: 20}
   trace_dir: "../microbenchmarks/hbm_float4"
   csv_path: "../microbenchmarks/hbm_float4"
   xlml_metrics_dir: "../microbenchmarks/hbm_float4"

From b17d35b6553032221cb7aa15d46c5273e97dc0e3 Mon Sep 17 00:00:00 2001
From: Hong-Yi Lin <horacehylin@google.com>
Date: Thu, 5 Feb 2026 10:58:02 +0000
Subject: [PATCH 48/88] Set batch size in bmm configuration to be 8

---
 Ironwood/configs/bmm/single_device_bmm.yaml | 80 ++++++++++-----------
 1 file changed, 40 insertions(+), 40 deletions(-)

diff --git a/Ironwood/configs/bmm/single_device_bmm.yaml b/Ironwood/configs/bmm/single_device_bmm.yaml
index f0a4156c..f4f946b5 100644
--- a/Ironwood/configs/bmm/single_device_bmm.yaml
+++ b/Ironwood/configs/bmm/single_device_bmm.yaml
@@ -5,14 +5,14 @@ benchmarks:
   xlml_metrics_dir: "../microbenchmarks/single_device_bmm_bf16"
   xla_dump_dir: "../microbenchmarks/single_device_bmm_bf16/hlo_graphs"
   benchmark_sweep_params:
-  - {b: 1, m: 128, k: 128, n: 128, num_runs: 100, dtype: 'bfloat16'}
-  - {b: 1, m: 256, k: 256, n: 256, num_runs: 100, dtype: 'bfloat16'}
-  - {b: 1, m: 512, k: 512, n: 512, num_runs: 100, dtype: 'bfloat16'}
-  - {b: 1, m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'bfloat16'}
-  - {b: 1, m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'bfloat16'}
-  - {b: 1, m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'bfloat16'}
-  - {b: 1, m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'bfloat16'}
-  - {b: 1, m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'bfloat16'}
+  - {b: 8, m: 128, k: 128, n: 128, num_runs: 100, dtype: 'bfloat16'}
+  - {b: 8, m: 256, k: 256, n: 256, num_runs: 100, dtype: 'bfloat16'}
+  - {b: 8, m: 512, k: 512, n: 512, num_runs: 100, dtype: 'bfloat16'}
+  - {b: 8, m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'bfloat16'}
+  - {b: 8, m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'bfloat16'}
+  - {b: 8, m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'bfloat16'}
+  - {b: 8, m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'bfloat16'}
+  - {b: 8, m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'bfloat16'}
 
 - benchmark_name: "single_device_bmm"
   trace_dir: "../microbenchmarks/single_device_bmm_f32"
@@ -20,14 +20,14 @@ benchmarks:
   xlml_metrics_dir: "../microbenchmarks/single_device_bmm_f32"
   xla_dump_dir: "../microbenchmarks/single_device_bmm_f32/hlo_graphs"
   benchmark_sweep_params:
-  - {b: 1, m: 128, k: 128, n: 128, num_runs: 100, dtype: 'float32'}
-  - {b: 1, m: 256, k: 256, n: 256, num_runs: 100, dtype: 'float32'}
-  - {b: 1, m: 512, k: 512, n: 512, num_runs: 100, dtype: 'float32'}
-  - {b: 1, m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'float32'}
-  - {b: 1, m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'float32'}
-  - {b: 1, m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'float32'}
-  - {b: 1, m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float32'}
-  - {b: 1, m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float32'}
+  - {b: 8, m: 128, k: 128, n: 128, num_runs: 100, dtype: 'float32'}
+  - {b: 8, m: 256, k: 256, n: 256, num_runs: 100, dtype: 'float32'}
+  - {b: 8, m: 512, k: 512, n: 512, num_runs: 100, dtype: 'float32'}
+  - {b: 8, m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'float32'}
+  - {b: 8, m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'float32'}
+  - {b: 8, m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'float32'}
+  - {b: 8, m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float32'}
+  - {b: 8, m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float32'}
 
 - benchmark_name: "single_device_bmm"
   trace_dir: "../microbenchmarks/single_device_bmm_fp16"
@@ -35,14 +35,14 @@ benchmarks:
   xlml_metrics_dir: "../microbenchmarks/single_device_bmm_fp16"
   xla_dump_dir: "../microbenchmarks/single_device_bmm_fp16/hlo_graphs"
   benchmark_sweep_params:
-  - {b: 1, m: 128, k: 128, n: 128, num_runs: 100, dtype: 'float16'}
-  - {b: 1, m: 256, k: 256, n: 256, num_runs: 100, dtype: 'float16'}
-  - {b: 1, m: 512, k: 512, n: 512, num_runs: 100, dtype: 'float16'}
-  - {b: 1, m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'float16'}
-  - {b: 1, m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'float16'}
-  - {b: 1, m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'float16'}
-  - {b: 1, m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float16'}
-  - {b: 1, m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float16'}
+  - {b: 8, m: 128, k: 128, n: 128, num_runs: 100, dtype: 'float16'}
+  - {b: 8, m: 256, k: 256, n: 256, num_runs: 100, dtype: 'float16'}
+  - {b: 8, m: 512, k: 512, n: 512, num_runs: 100, dtype: 'float16'}
+  - {b: 8, m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'float16'}
+  - {b: 8, m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'float16'}
+  - {b: 8, m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'float16'}
+  - {b: 8, m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float16'}
+  - {b: 8, m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float16'}
 
 - benchmark_name: "single_device_bmm"
   trace_dir: "../microbenchmarks/single_device_bmm_fp8"
@@ -50,14 +50,14 @@ benchmarks:
   xlml_metrics_dir: "../microbenchmarks/single_device_bmm_fp8"
   xla_dump_dir: "../microbenchmarks/single_device_bmm_fp8/hlo_graphs"
   benchmark_sweep_params:
-  - {b: 1, m: 128, k: 128, n: 128, num_runs: 100, dtype: 'float8'}
-  - {b: 1, m: 256, k: 256, n: 256, num_runs: 100, dtype: 'float8'}
-  - {b: 1, m: 512, k: 512, n: 512, num_runs: 100, dtype: 'float8'}
-  - {b: 1, m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'float8'}
-  - {b: 1, m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'float8'}
-  - {b: 1, m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'float8'}
-  - {b: 1, m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float8'}
-  - {b: 1, m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float8'}
+  - {b: 8, m: 128, k: 128, n: 128, num_runs: 100, dtype: 'float8'}
+  - {b: 8, m: 256, k: 256, n: 256, num_runs: 100, dtype: 'float8'}
+  - {b: 8, m: 512, k: 512, n: 512, num_runs: 100, dtype: 'float8'}
+  - {b: 8, m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'float8'}
+  - {b: 8, m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'float8'}
+  - {b: 8, m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'float8'}
+  - {b: 8, m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float8'}
+  - {b: 8, m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float8'}
 
 - benchmark_name: "single_device_bmm"
   trace_dir: "../microbenchmarks/single_device_bmm_fp4"
@@ -65,11 +65,11 @@ benchmarks:
   xlml_metrics_dir: "../microbenchmarks/single_device_bmm_fp4"
   xla_dump_dir: "../microbenchmarks/single_device_bmm_fp4/hlo_graphs"
   benchmark_sweep_params:
-  - {b: 1, m: 128, k: 128, n: 128, num_runs: 100, dtype: 'float4'}
-  - {b: 1, m: 256, k: 256, n: 256, num_runs: 100, dtype: 'float4'}
-  - {b: 1, m: 512, k: 512, n: 512, num_runs: 100, dtype: 'float4'}
-  - {b: 1, m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'float4'}
-  - {b: 1, m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'float4'}
-  - {b: 1, m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'float4'}
-  - {b: 1, m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float4'}
-  - {b: 1, m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float4'}
\ No newline at end of file
+  - {b: 8, m: 128, k: 128, n: 128, num_runs: 100, dtype: 'float4'}
+  - {b: 8, m: 256, k: 256, n: 256, num_runs: 100, dtype: 'float4'}
+  - {b: 8, m: 512, k: 512, n: 512, num_runs: 100, dtype: 'float4'}
+  - {b: 8, m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'float4'}
+  - {b: 8, m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'float4'}
+  - {b: 8, m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'float4'}
+  - {b: 8, m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float4'}
+  - {b: 8, m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float4'}
\ No newline at end of file

From de75930b2960e42a1f7a19a7eb89d07e68d75b9a Mon Sep 17 00:00:00 2001
From: "Yu-Hsuan (Amy) Lin" <amylin@google.com>
Date: Fri, 6 Feb 2026 03:31:55 +0000
Subject: [PATCH 49/88] Implement gemm_all_reduce benchmark (Single Chip)

---
 .../gemm_all_reduce/gemm_all_reduce.yaml      |  60 +++++
 Ironwood/guides/automation/aggregator.py      |  10 +-
 .../guides/automation/automation_launch.sh    |   1 +
 .../tpu7x-2x2x1-gemm_all_reduce.yaml          |  61 +++++
 Ironwood/src/benchmark_gemm.py                |   2 +-
 Ironwood/src/benchmark_gemm_all_reduce.py     | 227 ++++++++++++++++++
 Ironwood/src/benchmark_utils.py               |  42 +++-
 Ironwood/src/run_benchmark.py                 |   2 +
 8 files changed, 393 insertions(+), 12 deletions(-)
 create mode 100644 Ironwood/configs/gemm_all_reduce/gemm_all_reduce.yaml
 create mode 100644 Ironwood/guides/automation/tpu7x-2x2x1-gemm_all_reduce.yaml
 create mode 100644 Ironwood/src/benchmark_gemm_all_reduce.py

diff --git a/Ironwood/configs/gemm_all_reduce/gemm_all_reduce.yaml b/Ironwood/configs/gemm_all_reduce/gemm_all_reduce.yaml
new file mode 100644
index 00000000..93466840
--- /dev/null
+++ b/Ironwood/configs/gemm_all_reduce/gemm_all_reduce.yaml
@@ -0,0 +1,60 @@
+benchmarks:
+- benchmark_name: "gemm_all_reduce"
+  trace_dir: "../microbenchmarks/gemm_all_reduce_bf16"
+  csv_path: "../microbenchmarks/gemm_all_reduce_bf16"
+  xlml_metrics_dir: "../microbenchmarks/gemm_all_reduce_bf16"
+  xla_dump_dir: "../microbenchmarks/gemm_all_reduce_bf16/hlo_graphs"
+  benchmark_sweep_params:
+  - {m: 2048, k: 2048, n: 2048, num_runs: 20, dtype: 'bfloat16'}
+  - {m: 4096, k: 4096, n: 4096, num_runs: 20, dtype: 'bfloat16'}
+  - {m: 8192, k: 8192, n: 8192, num_runs: 20, dtype: 'bfloat16'}
+  - {m: 16384, k: 16384, n: 16384, num_runs: 20, dtype: 'bfloat16'}
+  - {m: 32768, k: 32768, n: 32768, num_runs: 20, dtype: 'bfloat16'}
+
+- benchmark_name: "gemm_all_reduce"
+  trace_dir: "../microbenchmarks/gemm_all_reduce_f32"
+  csv_path: "../microbenchmarks/gemm_all_reduce_f32"
+  xlml_metrics_dir: "../microbenchmarks/gemm_all_reduce_f32"
+  xla_dump_dir: "../microbenchmarks/gemm_all_reduce_f32/hlo_graphs"
+  benchmark_sweep_params:
+  - {m: 2048, k: 2048, n: 2048, num_runs: 20, dtype: 'float32'}
+  - {m: 4096, k: 4096, n: 4096, num_runs: 20, dtype: 'float32'}
+  - {m: 8192, k: 8192, n: 8192, num_runs: 20, dtype: 'float32'}
+  - {m: 16384, k: 16384, n: 16384, num_runs: 20, dtype: 'float32'}
+  - {m: 32768, k: 32768, n: 32768, num_runs: 20, dtype: 'float32'}
+
+- benchmark_name: "gemm_all_reduce"
+  trace_dir: "../microbenchmarks/gemm_all_reduce_fp16"
+  csv_path: "../microbenchmarks/gemm_all_reduce_fp16"
+  xlml_metrics_dir: "../microbenchmarks/gemm_all_reduce_fp16"
+  xla_dump_dir: "../microbenchmarks/gemm_all_reduce_fp16/hlo_graphs"
+  benchmark_sweep_params:
+  - {m: 2048, k: 2048, n: 2048, num_runs: 20, dtype: 'float16'}
+  - {m: 4096, k: 4096, n: 4096, num_runs: 20, dtype: 'float16'}
+  - {m: 8192, k: 8192, n: 8192, num_runs: 20, dtype: 'float16'}
+  - {m: 16384, k: 16384, n: 16384, num_runs: 20, dtype: 'float16'}
+  - {m: 32768, k: 32768, n: 32768, num_runs: 20, dtype: 'float16'}
+
+- benchmark_name: "gemm_all_reduce"
+  trace_dir: "../microbenchmarks/gemm_all_reduce_fp8"
+  csv_path: "../microbenchmarks/gemm_all_reduce_fp8"
+  xlml_metrics_dir: "../microbenchmarks/gemm_all_reduce_fp8"
+  xla_dump_dir: "../microbenchmarks/gemm_all_reduce_fp8/hlo_graphs"
+  benchmark_sweep_params:
+  - {m: 2048, k: 2048, n: 2048, num_runs: 20, dtype: 'float8'}
+  - {m: 4096, k: 4096, n: 4096, num_runs: 20, dtype: 'float8'}
+  - {m: 8192, k: 8192, n: 8192, num_runs: 20, dtype: 'float8'}
+  - {m: 16384, k: 16384, n: 16384, num_runs: 20, dtype: 'float8'}
+  - {m: 32768, k: 32768, n: 32768, num_runs: 20, dtype: 'float8'}
+
+- benchmark_name: "gemm_all_reduce"
+  trace_dir: "../microbenchmarks/gemm_all_reduce_fp4"
+  csv_path: "../microbenchmarks/gemm_all_reduce_fp4"
+  xlml_metrics_dir: "../microbenchmarks/gemm_all_reduce_fp4"
+  xla_dump_dir: "../microbenchmarks/gemm_all_reduce_fp4/hlo_graphs"
+  benchmark_sweep_params:
+  - {m: 2048, k: 2048, n: 2048, num_runs: 20, dtype: 'float4'}
+  - {m: 4096, k: 4096, n: 4096, num_runs: 20, dtype: 'float4'}
+  - {m: 8192, k: 8192, n: 8192, num_runs: 20, dtype: 'float4'}
+  - {m: 16384, k: 16384, n: 16384, num_runs: 20, dtype: 'float4'}
+  - {m: 32768, k: 32768, n: 32768, num_runs: 20, dtype: 'float4'}
diff --git a/Ironwood/guides/automation/aggregator.py b/Ironwood/guides/automation/aggregator.py
index 069a13a1..bd291cfc 100644
--- a/Ironwood/guides/automation/aggregator.py
+++ b/Ironwood/guides/automation/aggregator.py
@@ -36,6 +36,13 @@
         "tflops_per_sec_per_device_avg", "tflops_per_sec_per_device_min",
         "tflops_per_sec_per_device_max",
     ],
+    "gemm_all_reduce": [
+        "m", "n", "k", "dtype", "step_time_ms_num_runs",
+        "tflops_per_sec_per_device_p50", "tflops_per_sec_per_device_p90",
+        "tflops_per_sec_per_device_p95", "tflops_per_sec_per_device_p99",
+        "tflops_per_sec_per_device_avg", "tflops_per_sec_per_device_min",
+        "tflops_per_sec_per_device_max",
+    ],
 }
 
 def download_from_gcs(bucket_path: str, local_dir: str):
@@ -110,10 +117,11 @@ def aggregate_bmm(directories: list[str], picked_columns: list[str]) -> pd.DataF
     "host_device": aggregate_host_device,
     "gemm": aggregate_gemm,
     "bmm": aggregate_bmm,
+    "gemm_all_reduce": aggregate_gemm,
 }
 
 def aggregate_results(bucket_path: str, local_dir: str):
-    categories = ["collectives", "hbm", "host_device", "gemm", "bmm"]
+    categories = ["collectives", "hbm", "host_device", "gemm", "bmm", "gemm_all_reduce"]
     directories = {}
     results = {}
     for category in categories:
diff --git a/Ironwood/guides/automation/automation_launch.sh b/Ironwood/guides/automation/automation_launch.sh
index 0ca76210..0e49fb12 100755
--- a/Ironwood/guides/automation/automation_launch.sh
+++ b/Ironwood/guides/automation/automation_launch.sh
@@ -14,6 +14,7 @@ TIMEOUT_SECOND=3600
 yaml_names=(
     "tpu7x-2x2x1-hbm.yaml"
     "tpu7x-2x2x1-host_device.yaml"
+    "tpu7x-2x2x1-gemm_all_reduce.yaml"
     "tpu7x-2x2x1-gemm.yaml"
     "tpu7x-2x2x1-bmm.yaml"
     "tpu7x-2x2x1-collectives.yaml"
diff --git a/Ironwood/guides/automation/tpu7x-2x2x1-gemm_all_reduce.yaml b/Ironwood/guides/automation/tpu7x-2x2x1-gemm_all_reduce.yaml
new file mode 100644
index 00000000..186c63e2
--- /dev/null
+++ b/Ironwood/guides/automation/tpu7x-2x2x1-gemm_all_reduce.yaml
@@ -0,0 +1,61 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: headless-svc-${JOB_NAME}
+spec:
+  clusterIP: None
+  selector:
+    job-name: ${JOB_NAME}
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: ${JOB_NAME}
+  labels:
+    kueue.x-k8s.io/queue-name: user-queue-2x2x1
+spec:
+  completionMode: Indexed
+  suspend: true
+  parallelism: 1
+  completions: 1
+  backoffLimit: 0
+  template:
+    spec:
+      subdomain: headless-svc-${JOB_NAME}
+      serviceAccountName: ${GCS_SA_NAME}
+      restartPolicy: Never
+      nodeSelector:
+        cloud.google.com/gke-tpu-accelerator: tpu7x
+        cloud.google.com/gke-tpu-topology: 2x2x1
+      containers:
+      - name: jax-tpu
+        image: python:3.12
+        securityContext:
+          privileged: false
+        env:
+        - name: JAX_PLATFORMS
+          value: "tpu,cpu"
+        - name: TPU_VMODULE
+          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
+        - name: XLA_IR_DEBUG
+          value: "1"
+        - name: XLA_HLO_DEBUG
+          value: "1"
+        command:
+        - bash
+        - -c
+        - |
+          set -ex
+
+          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
+          cd accelerator-microbenchmarks
+          git checkout tpu7x-auto
+          pip install -r requirements.txt
+
+          GCS_BUCKET_DIR=${GCS_PATH}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/gemm_all_reduce/gemm_all_reduce.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+        resources:
+          requests:
+            google.com/tpu: 4
+          limits:
+            google.com/tpu: 4
diff --git a/Ironwood/src/benchmark_gemm.py b/Ironwood/src/benchmark_gemm.py
index b802ddc0..0b19637d 100644
--- a/Ironwood/src/benchmark_gemm.py
+++ b/Ironwood/src/benchmark_gemm.py
@@ -146,7 +146,7 @@ def gemm_multiple_run_calculate_metrics(
     total_flops, total_flops_all_devices = handle_based_on_sharding(
         total_flops, SHARDING_STRATEGY
     )
-    peak_flops = PEAK_FLOPS_PER_DEVICE if dtype==jax.numpy.float8_e4m3fn else PEAK_FLOPS_PER_DEVICE/2
+    peak_flops_multiplier = get_peak_flops_multiplier(dtype.dtype.name)
     return unified_flops_metrics(
         m,
         n,
diff --git a/Ironwood/src/benchmark_gemm_all_reduce.py b/Ironwood/src/benchmark_gemm_all_reduce.py
new file mode 100644
index 00000000..851849ca
--- /dev/null
+++ b/Ironwood/src/benchmark_gemm_all_reduce.py
@@ -0,0 +1,227 @@
+"""Benchmarks gemm + all_reduce for DP gradient sync simulation."""
+
+import os
+from typing import Any, Dict, Optional, Callable
+
+# pylint: disable=g-importing-member
+from benchmark_utils import (
+    iteration_timeit,
+    multiple_iteration_timeit_from_trace,
+    ShardingStrategy,
+    get_lhs_named_shading,
+    get_rhs_named_shading,
+    get_out_sharding,
+    create_mesh,
+    handle_based_on_sharding,
+    unified_flops_metrics,
+    MetricsStatistics,
+    get_metrics_helper,
+    str_to_dtype,
+    get_peak_flops_multiplier,
+    unified_bytes_metrics,
+)
+from common import MARKER
+import jax
+from jax.experimental.shard_map import shard_map
+from jax.sharding import PartitionSpec as P
+import jax.numpy as jnp
+
+
+# pylint: disable=g-importing-member
+
+
+# Matmul shapes: A(M,K) x B(K,N) = C(M,N)
+# Then AllReduce(C)
+SHARDING_STRATEGY = ShardingStrategy.NO_SHARDING
+SEED = 0
+PEAK_FLOPS_PER_DEVICE = 2307  # TFLOP/s for single core(device) of FP8
+
+_INITIALIZED = False
+
+def setup_tpu_env():
+    """Sets LIBTPU_INIT_ARGS, then calls jax.distributed.initialize(); a no-op once _INITIALIZED is set."""
+    global _INITIALIZED
+    if _INITIALIZED:
+        return
+    print("Setting LIBTPU_INIT_ARGS...", flush=True)
+    # NOTE(review): this assignment replaces any LIBTPU_INIT_ARGS already present
+    # in the environment instead of appending to it -- confirm that is intended.
+    os.environ["LIBTPU_INIT_ARGS"] = (
+        "--xla_tpu_enable_async_collective_fusion=true "
+        "--xla_tpu_enable_async_collective_fusion_fuse_all_reduce=true "
+        "--xla_tpu_enable_async_collective_fusion_multiple_steps=true "
+        "--xla_tpu_overlap_compute_collective_tc=true "
+        "--xla_enable_async_all_reduce=true "
+        "--xla_enable_async_collective_permute=true "
+        "--xla_tpu_enable_all_experimental_scheduler_features=true "
+        "--xla_tpu_should_accumulate_into_mrb=true "
+        "--xla_tpu_scoped_vmem_limit_kib=65536 "
+        "--xla_tpu_vmem_scavenging_mode=NONE "
+        "--xla_tpu_dvfs_p_state=7 "
+        "--xla_tpu_impure_enable_packed_bf16_math_ops=true "
+        "--xla_tpu_enable_pincer_short_fusion_emitter=true "
+        "--xla_tpu_enable_sparse_core_hierarchical_all_reduce=true "
+        "--xla_tpu_use_single_sparse_core_for_all_reduce_offload=true " # Test effect on SC
+        "--xla_jf_debug_level=1 "
+        "--xla_sc_disable_megacore_partitioning=true "
+        "--xla_tpu_disable_sparse_core_collective_offload_remover=true "
+        "--xla_tpu_enable_all_reduce_scatter_fusion=false "
+        "--xla_tpu_enable_sparse_core_collective_offload_all_reduce=true "
+        "--xla_tpu_pad_operations_input_tiles=true "
+        "--xla_tpu_sparse_core_all_reduce_offload_min_size_in_bytes=0 "
+        "--xla_tpu_use_tc_device_shape_on_sc=true "
+    )
+
+    print("Step 1: Calling jax.distributed.initialize(initialization_timeout=300)...", flush=True)
+    jax.distributed.initialize(initialization_timeout=300)
+    print("Step 1: jax.distributed.initialize() completed.", flush=True)
+    _INITIALIZED = True
+
+
+def _run_gemm_base(
+    m: int,
+    k: int,
+    n: int,
+    dtype: jnp.dtype,
+    num_runs: int,
+    trace_dir: str,
+    sharding_strategy: ShardingStrategy,
+    task_name_suffix: str,
+) -> Dict[str, Any]:
+    """Times einsum(lhs, rhs) followed by psum over the "device" axis; returns {"time_ms_list": ...}."""
+    setup_tpu_env()
+    dtype_str = dtype.dtype.name
+    # NOTE: despite its name, task_name_suffix forms the leading part of task_name.
+    task_name = f"{task_name_suffix}_{dtype_str}"
+    print(f"Running {task_name} benchmark with m={m}, k={k}, n={n}, dtype={dtype_str}, runs={num_runs}", flush=True)
+
+    def f(x, y):
+        with jax.named_scope(MARKER):
+            # Matmul
+            acc = jax.numpy.einsum(
+                "ij,jk->ik", x, y, preferred_element_type=jnp.float32
+            )
+            c = acc.astype(dtype)  # cast the float32 accumulator back to the input dtype
+
+            # AllReduce (psum)
+            out = jax.lax.psum(c, axis_name="device")
+            return out
+
+    print("Step 2: Creating Mesh and Shardings...", flush=True)
+    mesh = create_mesh(sharding_strategy)
+    lhs_sharding = get_lhs_named_shading(mesh, sharding_strategy)
+    rhs_sharding = get_rhs_named_shading(mesh, sharding_strategy)
+    out_sharding = get_out_sharding(sharding_strategy)
+
+    jit_sharded_f = jax.jit(
+        shard_map(
+            f,
+            mesh,
+            in_specs=(
+                lhs_sharding.spec,
+                rhs_sharding.spec,
+            ),
+            out_specs=out_sharding,
+            check_rep=False,
+        )
+    )
+
+    lhs_shape = (m, k)
+    rhs_shape = (k, n)
+    lhs_dtype = dtype
+    rhs_dtype = dtype
+    key = jax.random.key(SEED)
+
+    def data_generator():
+        """Creates new random data on host and puts it on device."""
+        nonlocal key
+        key, key_lhs, key_rhs = jax.random.split(key, 3)
+        # Create random data on host
+        lhs_host = jax.random.normal(key_lhs, lhs_shape).astype(lhs_dtype)
+        rhs_host = jax.random.normal(key_rhs, rhs_shape).astype(rhs_dtype)
+
+        # Put on device (HBM)
+        lhs_device = jax.device_put(lhs_host, lhs_sharding)
+        rhs_device = jax.device_put(rhs_host, rhs_sharding)
+
+        return (lhs_device, rhs_device)
+
+    print("Step 3: Starting Execution Loop (includes JIT)...", flush=True)
+    time_ms_list = multiple_iteration_timeit_from_trace(
+        jit_sharded_f,
+        data_generator,
+        matrix_dim=f"{dtype_str}_{m}x{n}x{k}",
+        tries=num_runs,
+        task=task_name,
+        trace_dir=trace_dir,
+        multi_op=True,  # per-iteration time = sum of that iteration's marker events
+    )
+    print("Step 4: Execution Loop Completed.", flush=True)
+
+    return {
+        "time_ms_list": time_ms_list,
+    }
+
+
+def gemm_all_reduce(
+    m: int,
+    k: int,
+    n: int,
+    dtype: jnp.dtype = jnp.bfloat16,
+    num_runs: int = 1,
+    trace_dir: Optional[str] = None,
+) -> Dict[str, Any]:
+    """Benchmarks Matmul(A, B) followed by AllReduce(C) with ShardingStrategy.NO_SHARDING."""
+    return _run_gemm_base(
+        m, k, n, dtype, num_runs, trace_dir,
+        sharding_strategy=ShardingStrategy.NO_SHARDING,
+        task_name_suffix="gemm_all_reduce"
+    )
+
+
+
+
+
+def _calculate_metrics_base(
+    m: int,
+    k: int,
+    n: int,
+    dtype: jnp.dtype,
+    time_ms_list: list[float],
+    sharding_strategy: ShardingStrategy,
+) -> tuple[Dict[str, Any], Dict[str, Any]]:
+    """Builds FLOPs metrics for an (m, k) x (k, n) GEMM from the measured step times."""
+    total_flops = 2 * m * k * n  # one multiply and one add per (m, k, n) triple
+    total_flops_per_device, total_flops_all_devices = handle_based_on_sharding(
+        total_flops, sharding_strategy
+    )
+    dtype_str = dtype.dtype.name
+    # PEAK_FLOPS_PER_DEVICE is the FP8 peak; scale it to the benchmarked dtype.
+    peak_flops_multiplier = get_peak_flops_multiplier(dtype_str)
+    peak_flops = PEAK_FLOPS_PER_DEVICE * peak_flops_multiplier
+
+    return unified_flops_metrics(
+        m, n, k, time_ms_list, total_flops_per_device, total_flops_all_devices, peak_flops, dtype=dtype_str,
+    )
+
+
+def gemm_all_reduce_calculate_metrics(
+    m: int,
+    k: int,
+    n: int,
+    dtype: jnp.dtype,
+    time_ms_list: list[float],
+) -> tuple[Dict[str, Any], Dict[str, Any]]:
+    """Computes FLOPs metrics for the gemm_all_reduce benchmark.
+
+    Only GEMM FLOPs are reported; no all-reduce bandwidth metric is derived here.
+    Returns a (metadata, metrics) pair with metadata["type"] set to "gemm_all_reduce".
+    """
+    metadata, metrics = _calculate_metrics_base(
+        m, k, n, dtype, time_ms_list, ShardingStrategy.NO_SHARDING
+    )
+    metadata["type"] = "gemm_all_reduce"
+    return metadata, metrics
+
+
+
diff --git a/Ironwood/src/benchmark_utils.py b/Ironwood/src/benchmark_utils.py
index 4091aefb..0a45678d 100644
--- a/Ironwood/src/benchmark_utils.py
+++ b/Ironwood/src/benchmark_utils.py
@@ -62,6 +62,7 @@ class ShardingStrategy(Enum):
   SHARDING_ON_SINGLE_CHIP_WITH_N = auto()
 
 
+
 def multiple_iteration_timeit_from_trace_throttling(
     compute_func: Callable,
     data_generator: Callable,
@@ -150,6 +151,7 @@ def multiple_iteration_timeit_from_trace(
     tries: int = 17,
     task: str = None,
     trace_dir: str = None,
+    multi_op: bool = False,
 ) -> list[float]:
     """
     Time a function with jax.profiler and get the run time from the trace.
@@ -189,10 +191,11 @@ def multiple_iteration_timeit_from_trace(
     if trace_full_dir != tmp_trace_dir:
         # Upload the traces to desired location
         upload_to_storage(trace_dir=trace_full_dir, local_file=tmp_trace_dir)
-    return multiple_iteration_get_metrics_from_trace(trace, task)
+    return multiple_iteration_get_metrics_from_trace(trace, task, tries, multi_op)
 
 
-def multiple_iteration_get_metrics_from_trace(trace: dict[str, Any], task: str = None) -> list[float]:
+def multiple_iteration_get_metrics_from_trace(
+    trace: dict[str, Any], task: str = None, tries = 17, multi_op: bool = False) -> list[float]:
     marker_done_events = []
     for event in trace["traceEvents"]:
         args = event.get("args", {})
@@ -203,7 +206,7 @@ def multiple_iteration_get_metrics_from_trace(trace: dict[str, Any], task: str =
     marker_call_done_events = [
         e for e in marker_done_events if e.get("name", "").endswith("call-done")
     ]
-    if marker_call_done_events:
+    if not multi_op and marker_call_done_events:
         marker_done_events = marker_call_done_events
     unique_pids = set([e["pid"] for e in marker_done_events])
     print(f"Unique PIDs: {unique_pids}")
@@ -232,9 +235,19 @@ def multiple_iteration_get_metrics_from_trace(trace: dict[str, Any], task: str =
 
     min_pid = min([e["pid"] for e in marker_done_events])
     events_from_min_pid = [e for e in marker_done_events if e["pid"] == min_pid]
-    durations_ms = [
-        float(e["args"]["device_duration_ps"]) / 1e9 for e in events_from_min_pid
-    ]
+
+    if multi_op and len(events_from_min_pid) > tries:
+        if len(events_from_min_pid) % tries != 0:
+            raise ValueError(f"Number of events {len(events_from_min_pid)} is not a multiple of tries {tries}.")
+        events_from_min_pid.sort(key=lambda t: t["ts"])
+        durations_ms = []
+        num_ops = len(events_from_min_pid) // tries
+        for i in range(0, len(events_from_min_pid), num_ops):
+            durations_ms.append(sum([float(e["args"]["device_duration_ps"]) / 1e9 for e in events_from_min_pid[i:i+num_ops]]))
+    else:
+        durations_ms = [
+            float(e["args"]["device_duration_ps"]) / 1e9 for e in events_from_min_pid
+        ]
     print(f"Collected {len(durations_ms)} events from trace for pid {min_pid}.")
     print(durations_ms)
 
@@ -984,6 +997,8 @@ def get_lhs_named_shading(mesh, strategy: ShardingStrategy):
             return NamedSharding(mesh, P(None, None))
         case ShardingStrategy.SHARDING_ON_SINGLE_CHIP_WITH_N:
             return NamedSharding(mesh, P(None, None))
+        case ShardingStrategy.SHARDING_ON_ALL_DEVICES_WITH_K:
+            return NamedSharding(mesh, P(None, "device"))
 
 
 def get_rhs_named_shading(mesh, strategy: ShardingStrategy):
@@ -1000,6 +1015,7 @@ def get_rhs_named_shading(mesh, strategy: ShardingStrategy):
             return NamedSharding(mesh, P(None, "device"))
 
 
+
 def get_out_sharding(strategy: ShardingStrategy):
     match strategy:
         case ShardingStrategy.NO_SHARDING:
@@ -1014,6 +1030,7 @@ def get_out_sharding(strategy: ShardingStrategy):
             return P(None, "device")
 
 
+
 def get_rowwise_named_shading(mesh, strategy: ShardingStrategy):
     match strategy:
         case ShardingStrategy.NO_SHARDING:
@@ -1056,6 +1073,7 @@ def handle_per_device_based_on_sharding(value, strategy: ShardingStrategy):
             return value // 2
 
 
+
 def handle_all_devices_based_on_sharding(value: int, strategy: ShardingStrategy):
     match strategy:
         case ShardingStrategy.NO_SHARDING:
@@ -1070,6 +1088,7 @@ def handle_all_devices_based_on_sharding(value: int, strategy: ShardingStrategy)
             return value * jax.device_count() // 2
 
 
+
 def handle_based_on_sharding(value: int, strategy: ShardingStrategy):
     total_value = value
     value = handle_per_device_based_on_sharding(value, strategy)
@@ -1273,16 +1292,19 @@ def get_peak_flops_multiplier(in_dtype_str: str) -> float:
     (PEAK_FLOPS_PER_DEVICE) based on the input data type.
     """
     in_dtype_lower = in_dtype_str.lower()
-    if in_dtype_lower == "fp8":
+    if in_dtype_lower in ("fp8", "float8_e4m3fn"):
         # FP8 is 2x faster than BF16
         # The baseline PEAK_FLOPS_PER_DEVICE is 1153.5 * 2 = 2307, which is FP8 peak.
         # So the multiplier should be 1.0
         return 1.0
-    elif in_dtype_lower == "bf16" or in_dtype_lower == "fp16":
+    elif in_dtype_lower in ("bf16", "bfloat16", "fp16", "float16"):
         # BF16/FP16 is 2x slower than FP8 peak
         return 0.5
-    elif in_dtype_lower == "fp32":
+    elif in_dtype_lower in ("fp32", "float32"):
         # FP32 is 4x slower than FP8 peak
         return 0.25
+    elif in_dtype_lower in ("fp4", "float4_e2m1fn"):
+        # FP4/INT4 is treated the same as FP8
+        return 1.0
     else:
-        raise RuntimeError(f"{in_dtype_lower} is not supported for setting peak_flops_multiplier.")
+        raise RuntimeError(f"{in_dtype_lower} is not supported for setting peak_flops_multiplier.")
\ No newline at end of file
diff --git a/Ironwood/src/run_benchmark.py b/Ironwood/src/run_benchmark.py
index efef1dcc..9c3f41bb 100644
--- a/Ironwood/src/run_benchmark.py
+++ b/Ironwood/src/run_benchmark.py
@@ -63,6 +63,8 @@
     "gemm_simple": "benchmark_gemm.gemm_simple",
     "gemm_simple_with_dtype": "benchmark_gemm.gemm_simple_with_dtype",
     "gemm_multiple_run": "benchmark_gemm.gemm_multiple_run",
+    "gemm_all_reduce": "benchmark_gemm_all_reduce.gemm_all_reduce",
+
     "gemm_throttling": "benchmark_gemm_throttling.gemm_throttling",
     "gemm": "benchmark_gemm.gemm",
     "gemm_accum": "benchmark_gemm.gemm_accum",

From 9a0b8aecd2fb0cea6d4757b4b621e27d22ccc5b7 Mon Sep 17 00:00:00 2001
From: Hong-Yi Lin <horacehylin@google.com>
Date: Fri, 6 Feb 2026 16:06:49 +0800
Subject: [PATCH 50/88] Add multi-host BMM into automation (#105)

* Add multi-host BMM into automation
---
 Ironwood/configs/bmm/multi_host_bmm.yaml      | 75 +++++++++++++++
 .../guides/automation/tpu7x-2x2x1-bmm.yaml    |  1 +
 Ironwood/src/benchmark_bmm.py                 | 93 +++++++++++++++++++
 Ironwood/src/run_benchmark.py                 |  1 +
 4 files changed, 170 insertions(+)
 create mode 100644 Ironwood/configs/bmm/multi_host_bmm.yaml

diff --git a/Ironwood/configs/bmm/multi_host_bmm.yaml b/Ironwood/configs/bmm/multi_host_bmm.yaml
new file mode 100644
index 00000000..eb332383
--- /dev/null
+++ b/Ironwood/configs/bmm/multi_host_bmm.yaml
@@ -0,0 +1,75 @@
+benchmarks:
+- benchmark_name: "multi_host_bmm"
+  trace_dir: "../microbenchmarks/multi_host_bmm_bf16"
+  csv_path: "../microbenchmarks/multi_host_bmm_bf16"
+  xlml_metrics_dir: "../microbenchmarks/multi_host_bmm_bf16"
+  xla_dump_dir: "../microbenchmarks/multi_host_bmm_bf16/hlo_graphs"
+  benchmark_sweep_params:
+  - {b: 8, m: 128, k: 128, n: 128, num_runs: 100, dtype: 'bfloat16'}
+  - {b: 8, m: 256, k: 256, n: 256, num_runs: 100, dtype: 'bfloat16'}
+  - {b: 8, m: 512, k: 512, n: 512, num_runs: 100, dtype: 'bfloat16'}
+  - {b: 8, m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'bfloat16'}
+  - {b: 8, m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'bfloat16'}
+  - {b: 8, m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'bfloat16'}
+  - {b: 8, m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'bfloat16'}
+  - {b: 8, m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'bfloat16'}
+
+- benchmark_name: "multi_host_bmm"
+  trace_dir: "../microbenchmarks/multi_host_bmm_f32"
+  csv_path: "../microbenchmarks/multi_host_bmm_f32"
+  xlml_metrics_dir: "../microbenchmarks/multi_host_bmm_f32"
+  xla_dump_dir: "../microbenchmarks/multi_host_bmm_f32/hlo_graphs"
+  benchmark_sweep_params:
+  - {b: 8, m: 128, k: 128, n: 128, num_runs: 100, dtype: 'float32'}
+  - {b: 8, m: 256, k: 256, n: 256, num_runs: 100, dtype: 'float32'}
+  - {b: 8, m: 512, k: 512, n: 512, num_runs: 100, dtype: 'float32'}
+  - {b: 8, m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'float32'}
+  - {b: 8, m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'float32'}
+  - {b: 8, m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'float32'}
+  - {b: 8, m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float32'}
+  - {b: 8, m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float32'}
+
+- benchmark_name: "multi_host_bmm"
+  trace_dir: "../microbenchmarks/multi_host_bmm_fp16"
+  csv_path: "../microbenchmarks/multi_host_bmm_fp16"
+  xlml_metrics_dir: "../microbenchmarks/multi_host_bmm_fp16"
+  xla_dump_dir: "../microbenchmarks/multi_host_bmm_fp16/hlo_graphs"
+  benchmark_sweep_params:
+  - {b: 8, m: 128, k: 128, n: 128, num_runs: 100, dtype: 'float16'}
+  - {b: 8, m: 256, k: 256, n: 256, num_runs: 100, dtype: 'float16'}
+  - {b: 8, m: 512, k: 512, n: 512, num_runs: 100, dtype: 'float16'}
+  - {b: 8, m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'float16'}
+  - {b: 8, m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'float16'}
+  - {b: 8, m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'float16'}
+  - {b: 8, m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float16'}
+  - {b: 8, m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float16'}
+
+- benchmark_name: "multi_host_bmm"
+  trace_dir: "../microbenchmarks/multi_host_bmm_fp8"
+  csv_path: "../microbenchmarks/multi_host_bmm_fp8"
+  xlml_metrics_dir: "../microbenchmarks/multi_host_bmm_fp8"
+  xla_dump_dir: "../microbenchmarks/multi_host_bmm_fp8/hlo_graphs"
+  benchmark_sweep_params:
+  - {b: 8, m: 128, k: 128, n: 128, num_runs: 100, dtype: 'float8'}
+  - {b: 8, m: 256, k: 256, n: 256, num_runs: 100, dtype: 'float8'}
+  - {b: 8, m: 512, k: 512, n: 512, num_runs: 100, dtype: 'float8'}
+  - {b: 8, m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'float8'}
+  - {b: 8, m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'float8'}
+  - {b: 8, m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'float8'}
+  - {b: 8, m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float8'}
+  - {b: 8, m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float8'}
+
+- benchmark_name: "multi_host_bmm"
+  trace_dir: "../microbenchmarks/multi_host_bmm_fp4"
+  csv_path: "../microbenchmarks/multi_host_bmm_fp4"
+  xlml_metrics_dir: "../microbenchmarks/multi_host_bmm_fp4"
+  xla_dump_dir: "../microbenchmarks/multi_host_bmm_fp4/hlo_graphs"
+  benchmark_sweep_params:
+  - {b: 8, m: 128, k: 128, n: 128, num_runs: 100, dtype: 'float4'}
+  - {b: 8, m: 256, k: 256, n: 256, num_runs: 100, dtype: 'float4'}
+  - {b: 8, m: 512, k: 512, n: 512, num_runs: 100, dtype: 'float4'}
+  - {b: 8, m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'float4'}
+  - {b: 8, m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'float4'}
+  - {b: 8, m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'float4'}
+  - {b: 8, m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float4'}
+  - {b: 8, m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float4'}
\ No newline at end of file
diff --git a/Ironwood/guides/automation/tpu7x-2x2x1-bmm.yaml b/Ironwood/guides/automation/tpu7x-2x2x1-bmm.yaml
index 1b5b9774..6257acfc 100644
--- a/Ironwood/guides/automation/tpu7x-2x2x1-bmm.yaml
+++ b/Ironwood/guides/automation/tpu7x-2x2x1-bmm.yaml
@@ -54,6 +54,7 @@ spec:
 
           GCS_BUCKET_DIR=${GCS_PATH}
           python Ironwood/src/run_benchmark.py --config="Ironwood/configs/bmm/single_device_bmm.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/bmm/multi_host_bmm.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
         resources:
           requests:
             google.com/tpu: 4
diff --git a/Ironwood/src/benchmark_bmm.py b/Ironwood/src/benchmark_bmm.py
index 264daf2b..f988008e 100644
--- a/Ironwood/src/benchmark_bmm.py
+++ b/Ironwood/src/benchmark_bmm.py
@@ -134,3 +134,96 @@ def single_device_bmm_calculate_metrics(
         dtype=dtype.dtype.name,
         b=b,
     )
+
+
+def multi_host_bmm(
+    b: int,
+    m: int,
+    k: int,
+    n: int,
+    dtype: jnp.dtype = jax.numpy.float8_e4m3fn,
+    num_runs: int = 1,
+    trace_dir: str | None = None,
+    sharding_strategy: ShardingStrategy = ShardingStrategy.NO_SHARDING,
+) -> Dict[str, Any]:
+    """Times a batched matmul (b, m, k) x (b, k, n) under the given sharding strategy."""
+    mesh = create_mesh(sharding_strategy)
+
+    lhs_sharding = get_lhs_named_shading(mesh, sharding_strategy)
+    rhs_sharding = get_rhs_named_shading(mesh, sharding_strategy)
+    output_sharding = get_output_named_shading(mesh, sharding_strategy)
+
+    def f(x, y):
+        with jax.named_scope(MARKER):
+            acc = jax.numpy.einsum(
+                "bij,bjk->bik", x, y, preferred_element_type=jnp.float32
+            )
+            # NOTE(review): output is always bfloat16, independent of `dtype` -- confirm.
+            return acc.astype(jnp.bfloat16)
+
+    jit_sharded_f = jax.jit(
+        f,
+        in_shardings=(lhs_sharding, rhs_sharding),
+        out_shardings=output_sharding,
+    )
+
+    lhs_shape = (b, m, k)
+    rhs_shape = (b, k, n)
+    lhs_dtype = dtype
+    rhs_dtype = dtype
+
+    key = jax.random.key(SEED)
+
+    def data_generator():
+        """Creates new random data on host and puts it on device."""
+        nonlocal key  # Use and update the outer 'key'
+        key, key_lhs, key_rhs = jax.random.split(key, 3)
+
+        # Create random data on host
+        lhs_host = jax.random.normal(key_lhs, lhs_shape).astype(lhs_dtype)
+        rhs_host = jax.random.normal(key_rhs, rhs_shape).astype(rhs_dtype)
+
+        # Put on device (HBM) with sharding
+        lhs = jax.device_put(lhs_host, lhs_sharding)
+        rhs = jax.device_put(rhs_host, rhs_sharding)
+
+        return (lhs, rhs)
+
+    dtype_str = dtype.dtype.name
+    time_ms_list = multiple_iteration_timeit_from_trace(
+        jit_sharded_f,
+        data_generator,
+        matrix_dim=f"{dtype_str}_{b}x{m}x{n}x{k}",
+        tries=num_runs,
+        task="multi_host_bmm",
+        trace_dir=trace_dir,
+    )
+
+    return {"time_ms_list": time_ms_list}
+
+
+def multi_host_bmm_calculate_metrics(
+    b: int,
+    m: int,
+    k: int,
+    n: int,
+    dtype: jnp.dtype,
+    time_ms_list: list[float],
+    sharding_strategy: ShardingStrategy = ShardingStrategy.NO_SHARDING
+) -> tuple[Dict[str, Any], Dict[str, Any]]:
+    """Computes FLOPs metrics for the batched matmul from the measured step times."""
+    total_flops = 2 * b * m * k * n  # Total floating-point operations
+    total_flops, total_flops_all_devices = handle_based_on_sharding(
+        total_flops, sharding_strategy
+    )
+    return unified_flops_metrics(
+        m,
+        n,
+        k,
+        time_ms_list,
+        total_flops,
+        total_flops_all_devices,
+        PEAK_FLOPS_PER_DEVICE,  # NOTE(review): not scaled per dtype here -- confirm
+        dtype=dtype.dtype.name,
+        b=b,
+    )
diff --git a/Ironwood/src/run_benchmark.py b/Ironwood/src/run_benchmark.py
index 9c3f41bb..31338d21 100644
--- a/Ironwood/src/run_benchmark.py
+++ b/Ironwood/src/run_benchmark.py
@@ -34,6 +34,7 @@
 
 BMM_BENCHMARK_MAP = {
     "single_device_bmm": "benchmark_bmm.single_device_bmm",
+    "multi_host_bmm": "benchmark_bmm.multi_host_bmm"
 }
 
 MATMUL_BENCHMARK_MAP = {

From 2d45945df4bef612334c0c69f99739fc995bbe57 Mon Sep 17 00:00:00 2001
From: Leonard Chan <leoch@google.com>
Date: Fri, 6 Feb 2026 02:25:11 +0000
Subject: [PATCH 51/88] Update pipelined flow with optimized approach

---
 Ironwood/src/benchmark_host_device.py | 106 ++++++++++++++++++++------
 1 file changed, 84 insertions(+), 22 deletions(-)

diff --git a/Ironwood/src/benchmark_host_device.py b/Ironwood/src/benchmark_host_device.py
index 16352e2a..ba1be121 100644
--- a/Ironwood/src/benchmark_host_device.py
+++ b/Ironwood/src/benchmark_host_device.py
@@ -21,6 +21,7 @@
 
 def benchmark_host_device(
     data_size_mib: int,
+    h2d_type: str = "simple",
     num_runs: int = 100,
     trace_dir: str = None,
 ) -> Dict[str, Any]:
@@ -65,29 +66,89 @@ def benchmark_host_device(
             
             with step_context:
                  # H2D
-                t0 = time.perf_counter()
+                if h2d_type == "simple":
+                    t0 = time.perf_counter()
+                    # Simple device_put
+                    device_array = jax.device_put(host_data)
+                    device_array.block_until_ready()
+                    t1 = time.perf_counter()
+                    
+                    # Verify H2D shape
+                    assert device_array.shape == host_data.shape
+                    h2d_perf.append((t1 - t0) * 1000)
                 
-                # Simple device_put
-                device_array = jax.device_put(host_data)
-                device_array.block_until_ready()
-                
-                t1 = time.perf_counter()
-                h2d_perf.append((t1 - t0) * 1000)
-                
-                # Verify H2D shape
-                assert device_array.shape == host_data.shape
-                
-                # D2H
-                t2 = time.perf_counter()
-                
-                # Simple device_get
-                # Note: device_get returns a numpy array (copy)
-                _ = jax.device_get(device_array)
-                
-                t3 = time.perf_counter()
-                d2h_perf.append((t3 - t2) * 1000)
-                
-                device_array.delete()
+                    # D2H
+                    t2 = time.perf_counter()
+                    # Simple device_get
+                    # Note: device_get returns a numpy array (copy)
+                    _ = jax.device_get(device_array)
+                    t3 = time.perf_counter()
+                    d2h_perf.append((t3 - t2) * 1000)
+                    
+                    device_array.delete()
+                elif h2d_type == "pipelined":
+                    target_chunk_size_mib = 16  # Sweet spot from profiling
+                    num_devices = len(target_devices)
+
+                    tensors_on_device = []
+                    
+                    # Calculate chunks per device
+                    data_per_dev = data_size_mib / num_devices
+                    chunks_per_dev = int(data_per_dev / target_chunk_size_mib)
+                    chunks_per_dev = max(1, chunks_per_dev)
+
+                    chunks = np.array_split(host_data, chunks_per_dev * num_devices, axis=0)
+                    if chunks_per_dev > 1:    
+                        t0 = time.perf_counter()
+                        # We need to map chunks to the correct device
+                        # This simple example assumes chunks are perfectly divisible and ordered
+                        # In production, use `jax.sharding` mesh logic for complex layouts
+                        for idx, chunk in enumerate(chunks):
+                            if num_devices > 1:
+                                dev = target_devices[idx % num_devices]
+                            else:
+                                dev = target_devices[0]
+                            tensors_on_device.append(jax.device_put(chunk, dev))
+                        for device_tensor in tensors_on_device:
+                            device_tensor.block_until_ready()
+                        t1 = time.perf_counter()
+                        h2d_perf.append((t1 - t0) * 1000)
+                        del chunks
+
+                        # D2H
+                        tensor_stack = jnp.vstack(tensors_on_device)
+                        
+                        t2 = time.perf_counter()
+                        _ = jax.device_get(tensor_stack)
+                        t3 = time.perf_counter()
+
+                        d2h_perf.append((t3 - t2) * 1000)
+                        tensor_stack.delete()
+                        for device_tensor in tensors_on_device:
+                            device_tensor.delete()
+                        del tensors_on_device
+                    else:
+                        # Warn before starting the timer so console I/O is not counted as H2D time.
+                        print(f"Warning: {data_size_mib=} is not larger than {target_chunk_size_mib=}, falling back to standard JAX put.")
+
+                        # Fallback to standard JAX put for small data
+                        t0 = time.perf_counter()
+                        result = jax.device_put(host_data, target_devices[0])
+                        result.block_until_ready()
+                        t1 = time.perf_counter()
+                        h2d_perf.append((t1 - t0) * 1000)
+
+                        # D2H
+                        t2 = time.perf_counter()
+                        # Simple device_get
+                        # Note: device_get returns a numpy array (copy)
+                        _ = jax.device_get(result)
+
+                        t3 = time.perf_counter()
+                        d2h_perf.append((t3 - t2) * 1000)
+                        result.delete()
+                    
+                jax.clear_caches()
 
     return {
         "H2D_Bandwidth_ms": h2d_perf,
@@ -108,6 +169,7 @@ def benchmark_host_device_calculate_metrics(
     }
     metadata = {k: v for k, v in params if k in metadata_keys}
     metadata["dtype"] = "float32"
+    metadata["h2d_type"] = h2d_type
     
     metrics = {}
     

From 3d82d5b11940840720ffbf3208cac708604f34e5 Mon Sep 17 00:00:00 2001
From: Leonard Chan <leoch@google.com>
Date: Fri, 6 Feb 2026 08:09:39 +0000
Subject: [PATCH 52/88] Add missing h2d_type to H2D metrics

---
 Ironwood/guides/automation/aggregator.py | 2 +-
 Ironwood/src/benchmark_host_device.py    | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/Ironwood/guides/automation/aggregator.py b/Ironwood/guides/automation/aggregator.py
index bd291cfc..97235665 100644
--- a/Ironwood/guides/automation/aggregator.py
+++ b/Ironwood/guides/automation/aggregator.py
@@ -16,7 +16,7 @@
         "time_ms_p50", "time_ms_p90", "time_ms_p95", "time_ms_p99", "time_ms_avg", "time_ms_min", "time_ms_max",
     ],
     "host_device": [
-        "data_size_mib", "H2D_bw (GiB/s)_num_runs",
+        "data_size_mib", "h2d_type", "H2D_bw (GiB/s)_num_runs",
         "H2D_bw (GiB/s)_p50", "H2D_bw (GiB/s)_p90", "H2D_bw (GiB/s)_p95", "H2D_bw (GiB/s)_p99", 
         "H2D_bw (GiB/s)_avg", "H2D_bw (GiB/s)_min", "H2D_bw (GiB/s)_max",
         "D2H_bw (GiB/s)_p50", "D2H_bw (GiB/s)_p90", "D2H_bw (GiB/s)_p95", "D2H_bw (GiB/s)_p99", 
diff --git a/Ironwood/src/benchmark_host_device.py b/Ironwood/src/benchmark_host_device.py
index ba1be121..4e4e0a63 100644
--- a/Ironwood/src/benchmark_host_device.py
+++ b/Ironwood/src/benchmark_host_device.py
@@ -157,6 +157,7 @@ def benchmark_host_device(
 
 def benchmark_host_device_calculate_metrics(
     data_size_mib: int,
+    h2d_type: str,
     H2D_Bandwidth_ms: List[float],
     D2H_Bandwidth_ms: List[float],
 ) -> Tuple[Dict[str, Any], Dict[str, Any]]:

From adc084d6462540a16c96531ca23113a36ed5e88c Mon Sep 17 00:00:00 2001
From: "Yu-Hsuan (Amy) Lin" <amylin@google.com>
Date: Fri, 6 Feb 2026 09:02:42 +0000
Subject: [PATCH 53/88] Revert unintended commit

This reverts commit 3e4b59a0d8a5e87bea51f8c934895a4c9fda3ac5.
---
 Ironwood/configs/collectives/all_gather_1d.yaml        |  3 +--
 Ironwood/configs/collectives/all_gather_2d.yaml        |  3 +--
 Ironwood/configs/collectives/all_gather_3d.yaml        |  3 +--
 Ironwood/configs/collectives/all_gather_demo.yaml      | 10 +++-------
 .../configs/collectives/all_gather_tpu7x_2x2x1.yaml    |  3 +--
 .../configs/collectives/all_gather_tpu7x_2x2x2.yaml    |  3 +--
 .../configs/collectives/all_gather_tpu7x_2x2x4.yaml    |  3 +--
 .../configs/collectives/all_gather_tpu7x_2x4x4.yaml    |  3 +--
 .../configs/collectives/all_gather_tpu7x_4x4x4.yaml    |  5 ++---
 .../configs/collectives/all_gather_tpu7x_4x4x8.yaml    |  3 +--
 Ironwood/configs/collectives/all_reduce_1d.yaml        |  3 +--
 Ironwood/configs/collectives/all_reduce_2d.yaml        |  3 +--
 Ironwood/configs/collectives/all_reduce_3d.yaml        |  3 +--
 .../configs/collectives/all_reduce_tpu7x_2x2x1.yaml    |  3 +--
 .../configs/collectives/all_reduce_tpu7x_2x2x2.yaml    |  3 +--
 .../configs/collectives/all_reduce_tpu7x_2x2x4.yaml    |  3 +--
 .../configs/collectives/all_reduce_tpu7x_2x4x4.yaml    |  3 +--
 .../configs/collectives/all_reduce_tpu7x_4x4x4.yaml    |  3 +--
 .../configs/collectives/all_reduce_tpu7x_4x4x8.yaml    |  3 +--
 Ironwood/configs/collectives/all_to_all_1d.yaml        |  2 +-
 Ironwood/configs/collectives/all_to_all_2d.yaml        |  2 +-
 Ironwood/configs/collectives/all_to_all_3d.yaml        |  2 +-
 .../configs/collectives/all_to_all_tpu7x_2x2x1.yaml    |  3 +--
 .../configs/collectives/all_to_all_tpu7x_2x2x2.yaml    |  3 +--
 .../configs/collectives/all_to_all_tpu7x_2x2x4.yaml    |  3 +--
 .../configs/collectives/all_to_all_tpu7x_2x4x4.yaml    |  3 +--
 .../configs/collectives/all_to_all_tpu7x_4x4x4.yaml    |  3 +--
 .../configs/collectives/all_to_all_tpu7x_4x4x8.yaml    |  3 +--
 Ironwood/configs/collectives/reduce_scatter_1d.yaml    |  3 +--
 Ironwood/configs/collectives/reduce_scatter_2d.yaml    |  3 +--
 30 files changed, 33 insertions(+), 63 deletions(-)

diff --git a/Ironwood/configs/collectives/all_gather_1d.yaml b/Ironwood/configs/collectives/all_gather_1d.yaml
index 85d8fc3e..0b1313dc 100644
--- a/Ironwood/configs/collectives/all_gather_1d.yaml
+++ b/Ironwood/configs/collectives/all_gather_1d.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "1x1x4", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/all_gather_1d"
   csv_path: "../microbenchmarks/all_gather_1d"
   xlml_metrics_dir: "../microbenchmarks/all_gather_1d"
diff --git a/Ironwood/configs/collectives/all_gather_2d.yaml b/Ironwood/configs/collectives/all_gather_2d.yaml
index 2d7a0e7a..c45f3e70 100644
--- a/Ironwood/configs/collectives/all_gather_2d.yaml
+++ b/Ironwood/configs/collectives/all_gather_2d.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2, num_runs: 5} # Parallel Replica Groups
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x32", ici_size_range: 128, sharding_strategy: "1x32", op_dimension: 2, num_runs: 5} # Non Parallel Replica Groups
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2, num_runs: 5}
   trace_dir: "../microbenchmarks/all_gather_2d"
   csv_path: "../microbenchmarks/all_gather_2d"
   xlml_metrics_dir: "../microbenchmarks/all_gather_2d"
diff --git a/Ironwood/configs/collectives/all_gather_3d.yaml b/Ironwood/configs/collectives/all_gather_3d.yaml
index cc876a08..e159adfd 100644
--- a/Ironwood/configs/collectives/all_gather_3d.yaml
+++ b/Ironwood/configs/collectives/all_gather_3d.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "64x2", ici_size_range: 128, sharding_strategy: "64x1", op_dimension: 3, num_runs: 5} # Parallel Replica Groups
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "4x4x8", op_dimension: 3, num_runs: 5} # Non Parallel Replica Groups
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "64x2", ici_size_range: 128, sharding_strategy: "64x1", op_dimension: 3, num_runs: 5}
   trace_dir: "../microbenchmarks/all_gather_3d"
   csv_path: "../microbenchmarks/all_gather_3d"
   xlml_metrics_dir: "../microbenchmarks/all_gather_3d"
diff --git a/Ironwood/configs/collectives/all_gather_demo.yaml b/Ironwood/configs/collectives/all_gather_demo.yaml
index 6fb5a757..a9d776cd 100644
--- a/Ironwood/configs/collectives/all_gather_demo.yaml
+++ b/Ironwood/configs/collectives/all_gather_demo.yaml
@@ -1,13 +1,9 @@
 benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1} # Parallel Replica
-  - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "1x1x4", op_dimension: 1} # Non-Parallel Replica
-  - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32",  mesh_shape: "4x32", ici_size_range: 128, sharding_strategy: "1x32", op_dimension: 2} # Non Parallel Replica Groups
-  - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32",  mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2} # Parallel Replica Groups
-  - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32",  mesh_shape: "64x2", ici_size_range: 128, sharding_strategy: "64x1", op_dimension: 3} # Parallel Replica Groups
-  - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "4x4x8", op_dimension: 3} # Non Parallel Replica Groups
-
+  - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1}
+  - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32",  mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2}
+  - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32",  mesh_shape: "64x2", ici_size_range: 128, sharding_strategy: "64x1", op_dimension: 3}
 
   warmup_tries: 10
   trace_dir: "../microbenchmarks/all_gather_demo"
diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml
index b0858716..5b11ac8d 100644
--- a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml
+++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 20} # Parallel Replica
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x2", op_dimension: 1,  num_runs: 20} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 20}
   trace_dir: "../microbenchmarks/all_gather_tpu7x_2x2x1"
   csv_path: "../microbenchmarks/all_gather_tpu7x_2x2x1"
   xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x2x1"
diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml
index ab282dec..3747b754 100644
--- a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml
+++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 20} # Parallel Replica
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x2", op_dimension: 1,  num_runs: 20} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 20}
   trace_dir: "../microbenchmarks/all_gather_tpu7x_2x2x2"
   csv_path: "../microbenchmarks/all_gather_tpu7x_2x2x2"
   xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x2x2"
diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml
index d2f65afe..9c25eb6b 100644
--- a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml
+++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 20} # Parallel Replica
-  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x2", op_dimension: 1,  num_runs: 20} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 20}
   trace_dir: "../microbenchmarks/all_gather_tpu7x_2x2x4"
   csv_path: "../microbenchmarks/all_gather_tpu7x_2x2x4"
   xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x2x4"
diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml
index 35414ff3..0ad03f56 100644
--- a/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml
+++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 20} # Parallel Replica
-  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x2", op_dimension: 1,  num_runs: 20} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 20}
   trace_dir: "../microbenchmarks/all_gather_tpu7x_2x4x4"
   csv_path: "../microbenchmarks/all_gather_tpu7x_2x4x4"
   xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x4x4"
diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml
index b561942d..ceb7bb52 100644
--- a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml
+++ b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml
@@ -1,9 +1,8 @@
 benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 20} # Parallel Replica
-  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x2", op_dimension: 1,  num_runs: 20} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 20}
   trace_dir: "../microbenchmarks/all_gather_tpu7x_4x4x4"
   csv_path: "../microbenchmarks/all_gather_tpu7x_4x4x4"
   xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_4x4x4"
-  xla_dump_dir: "../microbenchmarks/all_gather_tpu7x_4x4x4/hlo_graphs"
\ No newline at end of file
+  xla_dump_dir: "../microbenchmarks/all_gather_tpu7x_4x4x4/hlo_graphs"
diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml
index 5838cafd..0218d6b0 100644
--- a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml
+++ b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 20} # Parallel Replica
-  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x2", op_dimension: 1,  num_runs: 20} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 20}
   trace_dir: "../microbenchmarks/all_gather_tpu7x_4x4x8"
   csv_path: "../microbenchmarks/all_gather_tpu7x_4x4x8"
   xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_4x4x8"
diff --git a/Ironwood/configs/collectives/all_reduce_1d.yaml b/Ironwood/configs/collectives/all_reduce_1d.yaml
index 7b1d3068..d12d4221 100644
--- a/Ironwood/configs/collectives/all_reduce_1d.yaml
+++ b/Ironwood/configs/collectives/all_reduce_1d.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1" , op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "1x1x4", op_dimension: 1, num_runs: 5} # Non Parallel Replica
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1" , op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/all_reduce_1d"
   csv_path: "../microbenchmarks/all_reduce_1d"
   xlml_metrics_dir: "../microbenchmarks/all_reduce_1d"
diff --git a/Ironwood/configs/collectives/all_reduce_2d.yaml b/Ironwood/configs/collectives/all_reduce_2d.yaml
index 93e1a7c9..5aa9654e 100644
--- a/Ironwood/configs/collectives/all_reduce_2d.yaml
+++ b/Ironwood/configs/collectives/all_reduce_2d.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2, num_runs: 5} # Parallel Replica Groups
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x32", ici_size_range: 128, sharding_strategy: "1x32", op_dimension: 2, num_runs: 5}  # Non Parallel Replica Groups
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2, num_runs: 5}
   trace_dir: "../microbenchmarks/all_reduce_2d"
   csv_path: "../microbenchmarks/all_reduce_2d"
   xlml_metrics_dir: "../microbenchmarks/all_reduce_2d"
diff --git a/Ironwood/configs/collectives/all_reduce_3d.yaml b/Ironwood/configs/collectives/all_reduce_3d.yaml
index f6a4ad9d..4e76b55f 100644
--- a/Ironwood/configs/collectives/all_reduce_3d.yaml
+++ b/Ironwood/configs/collectives/all_reduce_3d.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "64x2", ici_size_range: 128, sharding_strategy: "64x1", op_dimension: 3, num_runs: 5} # Parallel Replica Groups
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "4x4x8", op_dimension: 3, num_runs: 5} # Non Parallel Replica Groups
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "64x2", ici_size_range: 128, sharding_strategy: "64x1", op_dimension: 3, num_runs: 5}
   trace_dir: "../microbenchmarks/all_reduce_3d"
   csv_path: "../microbenchmarks/all_reduce_3d"
   xlml_metrics_dir: "../microbenchmarks/all_reduce_3d"
diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml
index b713c549..dbeb0407 100644
--- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml
+++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 20} # Parallel Replica
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x2", op_dimension: 1,  num_runs: 20} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 20}
   trace_dir: "../microbenchmarks/psum_tpu7x_2x2x1"
   csv_path: "../microbenchmarks/psum_tpu7x_2x2x1"
   xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x2x1"
diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml
index 165e0e72..cca20bc2 100644
--- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml
+++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 20} # Parallel Replica
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x2", op_dimension: 1,  num_runs: 20} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 20}
   trace_dir: "../microbenchmarks/psum_tpu7x_2x2x2"
   csv_path: "../microbenchmarks/psum_tpu7x_2x2x2"
   xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x2x2"
diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml
index 0002ae3a..1cb29b11 100644
--- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml
+++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 20} # Parallel Replica
-  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x2", op_dimension: 1,  num_runs: 20} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 20}
   trace_dir: "../microbenchmarks/psum_tpu7x_2x2x4"
   csv_path: "../microbenchmarks/psum_tpu7x_2x2x4"
   xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x2x4"
diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml
index e5652a92..8366350a 100644
--- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml
+++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 20} # Parallel Replica
-  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x2", op_dimension: 1,  num_runs: 20} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 20}
   trace_dir: "../microbenchmarks/psum_tpu7x_2x4x4"
   csv_path: "../microbenchmarks/psum_tpu7x_2x4x4"
   xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x4x4"
diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml
index 137bae19..93deef9f 100644
--- a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml
+++ b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 20} # Parallel Replica
-  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x2", op_dimension: 1,  num_runs: 20} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 20}
   trace_dir: "../microbenchmarks/psum_tpu7x_4x4x4"
   csv_path: "../microbenchmarks/psum_tpu7x_4x4x4"
   xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_4x4x4"
diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml
index 25758453..7b629828 100644
--- a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml
+++ b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 20} # Parallel Replica
-  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x2", op_dimension: 1,  num_runs: 20} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 20}
   trace_dir: "../microbenchmarks/psum_tpu7x_4x4x8"
   csv_path: "../microbenchmarks/psum_tpu7x_4x4x8"
   xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_4x4x8"
diff --git a/Ironwood/configs/collectives/all_to_all_1d.yaml b/Ironwood/configs/collectives/all_to_all_1d.yaml
index 3c28194d..8d222613 100644
--- a/Ironwood/configs/collectives/all_to_all_1d.yaml
+++ b/Ironwood/configs/collectives/all_to_all_1d.yaml
@@ -1,7 +1,7 @@
 benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "1x1x4", op_dimension: 1, num_runs: 5} # Non Parallel Replica
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "1x1x4", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/all_to_all_1d"
   csv_path: "../microbenchmarks/all_to_all_1d"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_1d"
diff --git a/Ironwood/configs/collectives/all_to_all_2d.yaml b/Ironwood/configs/collectives/all_to_all_2d.yaml
index b4a1bc0e..d23115fe 100644
--- a/Ironwood/configs/collectives/all_to_all_2d.yaml
+++ b/Ironwood/configs/collectives/all_to_all_2d.yaml
@@ -1,7 +1,7 @@
 benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x32", ici_size_range: 128, sharding_strategy: "1x32", op_dimension: 2, num_runs: 5} # Non Parallel Replica
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x32", ici_size_range: 128, sharding_strategy: "1x32", op_dimension: 2, num_runs: 5}
   trace_dir: "../microbenchmarks/all_to_all_2d"
   csv_path: "../microbenchmarks/all_to_all_2d"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_2d"
diff --git a/Ironwood/configs/collectives/all_to_all_3d.yaml b/Ironwood/configs/collectives/all_to_all_3d.yaml
index 3aa0e2a7..c705754c 100644
--- a/Ironwood/configs/collectives/all_to_all_3d.yaml
+++ b/Ironwood/configs/collectives/all_to_all_3d.yaml
@@ -1,7 +1,7 @@
 benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "4x4x8", op_dimension: 3, num_runs: 5} # Non Parallel Replica Groups
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "4x4x8", op_dimension: 3, num_runs: 5}
   trace_dir: "../microbenchmarks/all_to_all_3d"
   csv_path: "../microbenchmarks/all_to_all_3d"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_3d"
diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml
index a30a17cf..42dcf9e1 100644
--- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml
+++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 20} # Parallel Replica
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x2", op_dimension: 1,  num_runs: 20} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 20}
   trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x1"
   csv_path: "../microbenchmarks/all_to_all_tpu7x_2x2x1"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x1"
diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml
index 01fb9b80..5b1bbb82 100644
--- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml
+++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 20} # Parallel Replica
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x2", op_dimension: 1,  num_runs: 20} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 20}
   trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x2"
   csv_path: "../microbenchmarks/all_to_all_tpu7x_2x2x2"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x2"
diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml
index 43beeb27..f6004ce2 100644
--- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml
+++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 20} # Parallel Replica
-  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x2", op_dimension: 1,  num_runs: 20} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 20}
   trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x4"
   csv_path: "../microbenchmarks/all_to_all_tpu7x_2x2x4"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x4"
diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml
index 614caa6b..a0e16a92 100644
--- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml
+++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 20} # Parallel Replica
-  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x2", op_dimension: 1,  num_runs: 20} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 20}
   trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x4x4"
   csv_path: "../microbenchmarks/all_to_all_tpu7x_2x4x4"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x4x4"
diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml
index 12dd149d..5b627a9a 100644
--- a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml
+++ b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 20} # Parallel Replica
-  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x2", op_dimension: 1,  num_runs: 20} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 20}
   trace_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x4"
   csv_path: "../microbenchmarks/all_to_all_tpu7x_4x4x4"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x4"
diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml
index 32c63d74..65742f12 100644
--- a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml
+++ b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 20} # Parallel Replica
-  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x2", op_dimension: 1,  num_runs: 20} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 20}
   trace_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x8"
   csv_path: "../microbenchmarks/all_to_all_tpu7x_4x4x8"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x8"
diff --git a/Ironwood/configs/collectives/reduce_scatter_1d.yaml b/Ironwood/configs/collectives/reduce_scatter_1d.yaml
index 9c2c0dea..063d73fc 100644
--- a/Ironwood/configs/collectives/reduce_scatter_1d.yaml
+++ b/Ironwood/configs/collectives/reduce_scatter_1d.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: psum_scatter
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "1x1x4", op_dimension: 1, num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/reduce_scatter_1d"
   csv_path: "../microbenchmarks/reduce_scatter_1d"
   xlml_metrics_dir: "../microbenchmarks/reduce_scatter_1d"
diff --git a/Ironwood/configs/collectives/reduce_scatter_2d.yaml b/Ironwood/configs/collectives/reduce_scatter_2d.yaml
index f329b571..027ac991 100644
--- a/Ironwood/configs/collectives/reduce_scatter_2d.yaml
+++ b/Ironwood/configs/collectives/reduce_scatter_2d.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: psum_scatter
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2, num_runs: 5} # Parallel Replica Groups
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x32", ici_size_range: 128, sharding_strategy: "1x32", op_dimension: 2, num_runs: 5} # Non Parallel Replica Groups
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2, num_runs: 5}
   trace_dir: "../microbenchmarks/reduce_scatter_2d"
   csv_path: "../microbenchmarks/reduce_scatter_2d"
   xlml_metrics_dir: "../microbenchmarks/reduce_scatter_2d"

From 16c614714c12076d1fef29045bd81cccd622fa23 Mon Sep 17 00:00:00 2001
From: Leonard Chan <leoch@google.com>
Date: Fri, 6 Feb 2026 09:03:51 +0000
Subject: [PATCH 54/88] Remove 32768 data_size_mib from H2D YAML

---
 Ironwood/configs/host_device/host_device.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Ironwood/configs/host_device/host_device.yaml b/Ironwood/configs/host_device/host_device.yaml
index 0b48800c..b703ea96 100644
--- a/Ironwood/configs/host_device/host_device.yaml
+++ b/Ironwood/configs/host_device/host_device.yaml
@@ -3,7 +3,7 @@ benchmarks:
   num_runs: 20
   benchmark_sweep_params:
   - {
-      data_size_mib_list: [1, 16, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768]
+      data_size_mib_list: [1, 16, 128, 256, 512, 1024, 2048, 4096, 8192, 16384]
     }
   csv_path: "../microbenchmarks/host_device"
   trace_dir: "../microbenchmarks/host_device/trace"

From 8c3be375dc6f090dd9492dcb206b333c2d454d3b Mon Sep 17 00:00:00 2001
From: Leonard Chan <leoch@google.com>
Date: Mon, 9 Feb 2026 17:32:43 +0800
Subject: [PATCH 55/88] Fix inadvertent removal of target_devices (#108)

---
 Ironwood/src/benchmark_host_device.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/Ironwood/src/benchmark_host_device.py b/Ironwood/src/benchmark_host_device.py
index 4e4e0a63..67eb980b 100644
--- a/Ironwood/src/benchmark_host_device.py
+++ b/Ironwood/src/benchmark_host_device.py
@@ -5,7 +5,7 @@
 from typing import Any, Dict, Tuple, List
 
 import jax
-from jax import sharding
+from jax import numpy as jnp
 import numpy as np
 from benchmark_utils import MetricsStatistics
 
@@ -25,7 +25,7 @@ def benchmark_host_device(
     num_runs: int = 100,
     trace_dir: str = None,
 ) -> Dict[str, Any]:
-    """Benchmarks H2D/D2H transfer using simple device_put/device_get."""
+    """Benchmarks H2D/D2H transfer using device_put/device_get."""
     
     num_elements = 1024 * 1024 * data_size_mib // np.dtype(np.float32).itemsize
     
@@ -33,8 +33,13 @@ def benchmark_host_device(
     column = 128
     host_data = np.random.normal(size=(num_elements // column, column)).astype(np.float32)
     
+    # Used in pipelined flow
+    # TODO: turn into a param
+    num_devices_to_perform_h2d = 1
+    target_devices = jax.devices()[:num_devices_to_perform_h2d]
+
     print(
-        f"Benchmarking Transfer with Data Size: {data_size_mib} MB for {num_runs} iterations",
+        f"Benchmarking Transfer with Data Size: {data_size_mib} MB for {num_runs} iterations with {h2d_type=}",
         flush=True
     )
 

From b6bd6ae9dce456f1575daab674ff4bab10e46029 Mon Sep 17 00:00:00 2001
From: Hong-Yi Lin <horacehylin@google.com>
Date: Tue, 10 Feb 2026 07:05:21 +0000
Subject: [PATCH 56/88] Add log to show best hyperparameters after tuning

---
 Ironwood/configs/attention/attention.yaml | 12 ++++++++++++
 Ironwood/src/benchmark_attention.py       | 10 ++++++++++
 2 files changed, 22 insertions(+)
 create mode 100644 Ironwood/configs/attention/attention.yaml

diff --git a/Ironwood/configs/attention/attention.yaml b/Ironwood/configs/attention/attention.yaml
new file mode 100644
index 00000000..92c19dd8
--- /dev/null
+++ b/Ironwood/configs/attention/attention.yaml
@@ -0,0 +1,12 @@
+benchmarks:
+- benchmark_name: "tokamax_splash_attention"
+  benchmark_sweep_params:
+  - {batch_size: 1, q_seq_len: 4096, kv_seq_len: 4096, q_heads: 128, kv_heads: 8, qk_head_dim: 256, v_head_dim: 256, mode: ["fwd", "bwd"], causal: [true, false], num_samples: 3500, tune_pallas_only: true}
+  - {batch_size: 1, q_seq_len: 8192, kv_seq_len: 8192, q_heads: 128, kv_heads: 8, qk_head_dim: 256, v_head_dim: 256, mode: ["fwd", "bwd"], causal: [true, false], num_samples: 3500, tune_pallas_only: true}
+  - {batch_size: 1, q_seq_len: 16384, kv_seq_len: 16384, q_heads: 128, kv_heads: 8, qk_head_dim: 256, v_head_dim: 256, mode: ["fwd", "bwd"], causal: [true, false], num_samples: 3500, tune_pallas_only: true}
+  - {batch_size: 1, q_seq_len: 32768, kv_seq_len: 32768, q_heads: 128, kv_heads: 8, qk_head_dim: 256, v_head_dim: 256, mode: ["fwd", "bwd"], causal: [true, false], num_samples: 3500, tune_pallas_only: true}
+  - {batch_size: 1, q_seq_len: 65536, kv_seq_len: 65536, q_heads: 128, kv_heads: 8, qk_head_dim: 256, v_head_dim: 256, mode: ["fwd", "bwd"], causal: [true, false], num_samples: 3500, tune_pallas_only: true}
+  - {batch_size: 1, q_seq_len: 131072, kv_seq_len: 131072, q_heads: 128, kv_heads: 8, qk_head_dim: 256, v_head_dim: 256, mode: ["fwd", "bwd"], causal: [true, false], num_samples: 3500, tune_pallas_only: true}
+  trace_dir: "../microbenchmarks/attention"
+  csv_path: "../microbenchmarks/attention"
+  xlml_metrics_dir: "../microbenchmarks/attention"
diff --git a/Ironwood/src/benchmark_attention.py b/Ironwood/src/benchmark_attention.py
index 95ae5248..525c7757 100644
--- a/Ironwood/src/benchmark_attention.py
+++ b/Ironwood/src/benchmark_attention.py
@@ -252,6 +252,16 @@ def attention_fn(
     output = tuned_splash(q, k, v)
     jax.block_until_ready(output)
 
+    
+    print("-" * 50)
+    print(
+        f"batch_size={batch_size}, q_seq_len={q_seq_len}, kv_seq_len={kv_seq_len}, "
+        f"q_heads={q_heads}, kv_heads={kv_heads}, qk_head_dim={qk_head_dim}, "
+        f"v_head_dim={v_head_dim}, mode={mode}, causal={causal}"
+    )
+    print(f"tuned_splash.optimal_hyperparams={tuned_splash.optimal_hyperparams}")
+    print("-" * 50)
+
     # Run benchmark
     time_ms_list = timeit_from_trace(
         tuned_splash,

From 5d73eaeb9b2708113f2671c06851e3c9447c5226 Mon Sep 17 00:00:00 2001
From: Hong-Yi Lin <horacehylin@google.com>
Date: Tue, 10 Feb 2026 09:33:30 +0000
Subject: [PATCH 57/88] Add attention into aggregator

---
 Ironwood/guides/automation/aggregator.py      | 19 ++++++
 .../automation/tpu7x-2x2x1-attention.yaml     | 61 +++++++++++++++++++
 2 files changed, 80 insertions(+)
 create mode 100644 Ironwood/guides/automation/tpu7x-2x2x1-attention.yaml

diff --git a/Ironwood/guides/automation/aggregator.py b/Ironwood/guides/automation/aggregator.py
index 97235665..ed889d79 100644
--- a/Ironwood/guides/automation/aggregator.py
+++ b/Ironwood/guides/automation/aggregator.py
@@ -43,6 +43,13 @@
         "tflops_per_sec_per_device_avg", "tflops_per_sec_per_device_min",
         "tflops_per_sec_per_device_max",
     ],
+    "attention": [
+        "batch_size", "q_seq_len", "kv_seq_len", "q_heads", "kv_heads", "qk_head_dim", "v_head_dim", "mode", "causal", "step_time_ms_num_runs", "num_samples", 
+        "time_ms_p50", "time_ms_p90",
+        "time_ms_p95", "time_ms_p99",
+        "time_ms_avg", "time_ms_min",
+        "time_ms_max",
+    ],
 }
 
 def download_from_gcs(bucket_path: str, local_dir: str):
@@ -111,6 +118,17 @@ def aggregate_bmm(directories: list[str], picked_columns: list[str]) -> pd.DataF
             aggregated_df = pd.concat([aggregated_df, df[picked_columns].rename(columns={"step_time_ms_num_runs": "num_runs"})], ignore_index=True)
     return aggregated_df
 
+def aggregate_attention(directories: list[str], picked_columns: list[str]) -> pd.DataFrame:
+    if len(directories) == 0:
+        return None
+    aggregated_df = pd.DataFrame()
+    for directory in directories:
+        files = glob.glob(f"{directory}/*.tsv")
+        for file in files:
+            df = pd.read_csv(file, sep='\t')
+            aggregated_df = pd.concat([aggregated_df, df[picked_columns].rename(columns={"step_time_ms_num_runs": "num_runs"})], ignore_index=True)
+    return aggregated_df
+
 aggregate_function = {
     "collectives": aggregate_collectives,
     "hbm": aggregate_hbm,
@@ -118,6 +136,7 @@ def aggregate_bmm(directories: list[str], picked_columns: list[str]) -> pd.DataF
     "gemm": aggregate_gemm,
     "bmm": aggregate_bmm,
     "gemm_all_reduce": aggregate_gemm,
+    "attention": aggregate_attention,
 }
 
 def aggregate_results(bucket_path: str, local_dir: str):
diff --git a/Ironwood/guides/automation/tpu7x-2x2x1-attention.yaml b/Ironwood/guides/automation/tpu7x-2x2x1-attention.yaml
new file mode 100644
index 00000000..d3aff3ed
--- /dev/null
+++ b/Ironwood/guides/automation/tpu7x-2x2x1-attention.yaml
@@ -0,0 +1,61 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: headless-svc-${JOB_NAME}
+spec:
+  clusterIP: None
+  selector:
+    job-name: ${JOB_NAME}
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: ${JOB_NAME}
+  labels:
+    kueue.x-k8s.io/queue-name: user-queue-2x2x1
+spec:
+  completionMode: Indexed
+  suspend: true
+  parallelism: 1
+  completions: 1
+  backoffLimit: 0
+  template:
+    spec:
+      subdomain: headless-svc-${JOB_NAME}
+      serviceAccountName: ${GCS_SA_NAME}
+      restartPolicy: Never
+      nodeSelector:
+        cloud.google.com/gke-tpu-accelerator: tpu7x
+        cloud.google.com/gke-tpu-topology: 2x2x1
+      containers:
+      - name: jax-tpu
+        image: python:3.12
+        securityContext:
+          privileged: false
+        env:
+        - name: JAX_PLATFORMS
+          value: "tpu,cpu"
+        - name: TPU_VMODULE
+          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
+        - name: XLA_IR_DEBUG
+          value: "1"
+        - name: XLA_HLO_DEBUG
+          value: "1"
+        command:
+        - bash
+        - -c
+        - |
+          set -ex
+
+          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
+          cd accelerator-microbenchmarks
+          git checkout tpu7x-auto
+          pip install -r requirements.txt
+
+          GCS_BUCKET_DIR=${GCS_PATH}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/attention/attention.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+        resources:
+          requests:
+            google.com/tpu: 4
+          limits:
+            google.com/tpu: 4
\ No newline at end of file

From d9d22465bc60504870c12da7c2d0b761102c4457 Mon Sep 17 00:00:00 2001
From: "Yu-Hsuan (Amy) Lin" <amylin@google.com>
Date: Fri, 6 Feb 2026 03:31:55 +0000
Subject: [PATCH 58/88] Gemm+All Reduce for 4x4x4 and fix minor bugs

---
 .../gemm_all_reduce/gemm_all_reduce.yaml      | 10 +--
 Ironwood/configs/host_device/host_device.yaml |  7 ++-
 Ironwood/guides/automation/aggregator.py      | 16 ++++-
 .../tpu7x-4x4x4-gemm_all_reduce.yaml          | 61 +++++++++++++++++++
 Ironwood/src/benchmark_gemm.py                |  1 +
 Ironwood/src/benchmark_gemm_all_reduce.py     | 54 ++++++++--------
 Ironwood/src/benchmark_host_device.py         |  5 +-
 Ironwood/src/run_benchmark.py                 | 30 ++++++++-
 8 files changed, 145 insertions(+), 39 deletions(-)
 create mode 100644 Ironwood/guides/automation/tpu7x-4x4x4-gemm_all_reduce.yaml

diff --git a/Ironwood/configs/gemm_all_reduce/gemm_all_reduce.yaml b/Ironwood/configs/gemm_all_reduce/gemm_all_reduce.yaml
index 93466840..cde1101f 100644
--- a/Ironwood/configs/gemm_all_reduce/gemm_all_reduce.yaml
+++ b/Ironwood/configs/gemm_all_reduce/gemm_all_reduce.yaml
@@ -5,11 +5,11 @@ benchmarks:
   xlml_metrics_dir: "../microbenchmarks/gemm_all_reduce_bf16"
   xla_dump_dir: "../microbenchmarks/gemm_all_reduce_bf16/hlo_graphs"
   benchmark_sweep_params:
+  - {m: 1024, k: 1024, n: 1024, num_runs: 20, dtype: 'bfloat16'}
   - {m: 2048, k: 2048, n: 2048, num_runs: 20, dtype: 'bfloat16'}
   - {m: 4096, k: 4096, n: 4096, num_runs: 20, dtype: 'bfloat16'}
   - {m: 8192, k: 8192, n: 8192, num_runs: 20, dtype: 'bfloat16'}
   - {m: 16384, k: 16384, n: 16384, num_runs: 20, dtype: 'bfloat16'}
-  - {m: 32768, k: 32768, n: 32768, num_runs: 20, dtype: 'bfloat16'}
 
 - benchmark_name: "gemm_all_reduce"
   trace_dir: "../microbenchmarks/gemm_all_reduce_f32"
@@ -17,11 +17,11 @@ benchmarks:
   xlml_metrics_dir: "../microbenchmarks/gemm_all_reduce_f32"
   xla_dump_dir: "../microbenchmarks/gemm_all_reduce_f32/hlo_graphs"
   benchmark_sweep_params:
+  - {m: 1024, k: 1024, n: 1024, num_runs: 20, dtype: 'float32'}
   - {m: 2048, k: 2048, n: 2048, num_runs: 20, dtype: 'float32'}
   - {m: 4096, k: 4096, n: 4096, num_runs: 20, dtype: 'float32'}
   - {m: 8192, k: 8192, n: 8192, num_runs: 20, dtype: 'float32'}
   - {m: 16384, k: 16384, n: 16384, num_runs: 20, dtype: 'float32'}
-  - {m: 32768, k: 32768, n: 32768, num_runs: 20, dtype: 'float32'}
 
 - benchmark_name: "gemm_all_reduce"
   trace_dir: "../microbenchmarks/gemm_all_reduce_fp16"
@@ -29,11 +29,11 @@ benchmarks:
   xlml_metrics_dir: "../microbenchmarks/gemm_all_reduce_fp16"
   xla_dump_dir: "../microbenchmarks/gemm_all_reduce_fp16/hlo_graphs"
   benchmark_sweep_params:
+  - {m: 1024, k: 1024, n: 1024, num_runs: 20, dtype: 'float16'}
   - {m: 2048, k: 2048, n: 2048, num_runs: 20, dtype: 'float16'}
   - {m: 4096, k: 4096, n: 4096, num_runs: 20, dtype: 'float16'}
   - {m: 8192, k: 8192, n: 8192, num_runs: 20, dtype: 'float16'}
   - {m: 16384, k: 16384, n: 16384, num_runs: 20, dtype: 'float16'}
-  - {m: 32768, k: 32768, n: 32768, num_runs: 20, dtype: 'float16'}
 
 - benchmark_name: "gemm_all_reduce"
   trace_dir: "../microbenchmarks/gemm_all_reduce_fp8"
@@ -41,11 +41,11 @@ benchmarks:
   xlml_metrics_dir: "../microbenchmarks/gemm_all_reduce_fp8"
   xla_dump_dir: "../microbenchmarks/gemm_all_reduce_fp8/hlo_graphs"
   benchmark_sweep_params:
+  - {m: 1024, k: 1024, n: 1024, num_runs: 20, dtype: 'float8'}
   - {m: 2048, k: 2048, n: 2048, num_runs: 20, dtype: 'float8'}
   - {m: 4096, k: 4096, n: 4096, num_runs: 20, dtype: 'float8'}
   - {m: 8192, k: 8192, n: 8192, num_runs: 20, dtype: 'float8'}
   - {m: 16384, k: 16384, n: 16384, num_runs: 20, dtype: 'float8'}
-  - {m: 32768, k: 32768, n: 32768, num_runs: 20, dtype: 'float8'}
 
 - benchmark_name: "gemm_all_reduce"
   trace_dir: "../microbenchmarks/gemm_all_reduce_fp4"
@@ -53,8 +53,8 @@ benchmarks:
   xlml_metrics_dir: "../microbenchmarks/gemm_all_reduce_fp4"
   xla_dump_dir: "../microbenchmarks/gemm_all_reduce_fp4/hlo_graphs"
   benchmark_sweep_params:
+  - {m: 1024, k: 1024, n: 1024, num_runs: 20, dtype: 'float4'}
   - {m: 2048, k: 2048, n: 2048, num_runs: 20, dtype: 'float4'}
   - {m: 4096, k: 4096, n: 4096, num_runs: 20, dtype: 'float4'}
   - {m: 8192, k: 8192, n: 8192, num_runs: 20, dtype: 'float4'}
   - {m: 16384, k: 16384, n: 16384, num_runs: 20, dtype: 'float4'}
-  - {m: 32768, k: 32768, n: 32768, num_runs: 20, dtype: 'float4'}
diff --git a/Ironwood/configs/host_device/host_device.yaml b/Ironwood/configs/host_device/host_device.yaml
index b703ea96..c63075a0 100644
--- a/Ironwood/configs/host_device/host_device.yaml
+++ b/Ironwood/configs/host_device/host_device.yaml
@@ -2,8 +2,9 @@ benchmarks:
 - benchmark_name: host_device
   num_runs: 20
   benchmark_sweep_params:
-  - {
-      data_size_mib_list: [1, 16, 128, 256, 512, 1024, 2048, 4096, 8192, 16384]
-    }
+  - h2d_type: "simple"
+    data_size_mib_list: [1, 16, 128, 256, 512, 1024, 2048, 4096, 8192, 16384]
+  - h2d_type: "pipelined"
+    data_size_mib_list: [1, 16, 128, 256, 512, 1024, 2048, 4096, 8192, 16384]
   csv_path: "../microbenchmarks/host_device"
   trace_dir: "../microbenchmarks/host_device/trace"
diff --git a/Ironwood/guides/automation/aggregator.py b/Ironwood/guides/automation/aggregator.py
index ed889d79..9e4e9648 100644
--- a/Ironwood/guides/automation/aggregator.py
+++ b/Ironwood/guides/automation/aggregator.py
@@ -37,7 +37,7 @@
         "tflops_per_sec_per_device_max",
     ],
     "gemm_all_reduce": [
-        "m", "n", "k", "dtype", "step_time_ms_num_runs",
+        "topology", "m", "n", "k", "dtype", "step_time_ms_num_runs",
         "tflops_per_sec_per_device_p50", "tflops_per_sec_per_device_p90",
         "tflops_per_sec_per_device_p95", "tflops_per_sec_per_device_p99",
         "tflops_per_sec_per_device_avg", "tflops_per_sec_per_device_min",
@@ -107,6 +107,18 @@ def aggregate_gemm(directories: list[str], picked_columns: list[str]) -> pd.Data
             aggregated_df = pd.concat([aggregated_df, df[picked_columns].rename(columns={"step_time_ms_num_runs": "num_runs"})], ignore_index=True)
     return aggregated_df
 
+def aggregate_gemm_all_reduce(directories: list[str], picked_columns: list[str]) -> pd.DataFrame:
+    if len(directories) == 0:
+        return None
+    aggregated_df = pd.DataFrame()
+    for directory in directories:
+        files = glob.glob(f"{directory}/*.tsv")
+        for file in files:
+            df = pd.read_csv(file, sep='\t')
+            df["topology"] = [file.split('/')[-4].split('-')[1] for _ in range(df.shape[0])]
+            aggregated_df = pd.concat([aggregated_df, df[picked_columns].rename(columns={"step_time_ms_num_runs": "num_runs"})], ignore_index=True)
+    return aggregated_df
+
 def aggregate_bmm(directories: list[str], picked_columns: list[str]) -> pd.DataFrame:
     if len(directories) == 0:
         return None
@@ -135,8 +147,8 @@ def aggregate_attention(directories: list[str], picked_columns: list[str]) -> pd
     "host_device": aggregate_host_device,
     "gemm": aggregate_gemm,
     "bmm": aggregate_bmm,
-    "gemm_all_reduce": aggregate_gemm,
     "attention": aggregate_attention,
+    "gemm_all_reduce": aggregate_gemm_all_reduce,
 }
 
 def aggregate_results(bucket_path: str, local_dir: str):
diff --git a/Ironwood/guides/automation/tpu7x-4x4x4-gemm_all_reduce.yaml b/Ironwood/guides/automation/tpu7x-4x4x4-gemm_all_reduce.yaml
new file mode 100644
index 00000000..05c98427
--- /dev/null
+++ b/Ironwood/guides/automation/tpu7x-4x4x4-gemm_all_reduce.yaml
@@ -0,0 +1,61 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: headless-svc-${JOB_NAME}
+spec:
+  clusterIP: None
+  selector:
+    job-name: ${JOB_NAME}
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: ${JOB_NAME}
+  labels:
+    kueue.x-k8s.io/queue-name: user-queue-4x4x4
+spec:
+  completionMode: Indexed
+  suspend: true
+  parallelism: 16
+  completions: 16
+  backoffLimit: 0
+  template:
+    spec:
+      subdomain: headless-svc-${JOB_NAME}
+      serviceAccountName: ${GCS_SA_NAME}
+      restartPolicy: Never
+      nodeSelector:
+        cloud.google.com/gke-tpu-accelerator: tpu7x
+        cloud.google.com/gke-tpu-topology: 4x4x4
+      containers:
+      - name: jax-tpu
+        image: python:3.12
+        securityContext:
+          privileged: false
+        env:
+        - name: JAX_PLATFORMS
+          value: "tpu,cpu"
+        - name: TPU_VMODULE
+          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
+        - name: XLA_IR_DEBUG
+          value: "1"
+        - name: XLA_HLO_DEBUG
+          value: "1"
+        command:
+        - bash
+        - -c
+        - |
+          set -ex
+
+          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
+          cd accelerator-microbenchmarks
+          git checkout tpu7x-auto
+          pip install -r requirements.txt
+
+          GCS_BUCKET_DIR=${GCS_PATH}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/gemm_all_reduce/gemm_all_reduce.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+        resources:
+          requests:
+            google.com/tpu: 4
+          limits:
+            google.com/tpu: 4
\ No newline at end of file
diff --git a/Ironwood/src/benchmark_gemm.py b/Ironwood/src/benchmark_gemm.py
index 0b19637d..422b1bc4 100644
--- a/Ironwood/src/benchmark_gemm.py
+++ b/Ironwood/src/benchmark_gemm.py
@@ -147,6 +147,7 @@ def gemm_multiple_run_calculate_metrics(
         total_flops, SHARDING_STRATEGY
     )
     peak_flops_multiplier = get_peak_flops_multiplier(dtype.dtype.name)
+    peak_flops = PEAK_FLOPS_PER_DEVICE * peak_flops_multiplier
     return unified_flops_metrics(
         m,
         n,
diff --git a/Ironwood/src/benchmark_gemm_all_reduce.py b/Ironwood/src/benchmark_gemm_all_reduce.py
index 851849ca..55744593 100644
--- a/Ironwood/src/benchmark_gemm_all_reduce.py
+++ b/Ironwood/src/benchmark_gemm_all_reduce.py
@@ -1,6 +1,7 @@
 """Benchmarks gemm + all_reduce for DP gradient sync simulation."""
 
 import os
+import time
 from typing import Any, Dict, Optional, Callable
 
 # pylint: disable=g-importing-member
@@ -57,11 +58,6 @@ def setup_tpu_env():
         "--xla_tpu_vmem_scavenging_mode=NONE "
         "--xla_tpu_dvfs_p_state=7 "
 
-        "--xla_tpu_impure_enable_packed_bf16_math_ops=true "
-        "--xla_tpu_enable_pincer_short_fusion_emitter=true "
-        "--xla_tpu_enable_sparse_core_hierarchical_all_reduce=true "
-        "--xla_tpu_use_single_sparse_core_for_all_reduce_offload=true " # Test effect on SC
-
         "--xla_jf_debug_level=1 "
         "--xla_sc_disable_megacore_partitioning=true "
         "--xla_tpu_disable_sparse_core_collective_offload_remover=true "
@@ -72,9 +68,7 @@ def setup_tpu_env():
         "--xla_tpu_use_tc_device_shape_on_sc=true "
     )
 
-    print("Step 1: Calling jax.distributed.initialize(initialization_timeout=300)...", flush=True)
     jax.distributed.initialize(initialization_timeout=300)
-    print("Step 1: jax.distributed.initialize() completed.", flush=True)
     _INITIALIZED = True
 
 
@@ -106,7 +100,6 @@ def f(x, y):
             out = jax.lax.psum(c, axis_name="device")
             return out
 
-    print("Step 2: Creating Mesh and Shardings...", flush=True)
     mesh = create_mesh(sharding_strategy)
     lhs_sharding = get_lhs_named_shading(mesh, sharding_strategy)
     rhs_sharding = get_rhs_named_shading(mesh, sharding_strategy)
@@ -131,22 +124,35 @@ def f(x, y):
     rhs_dtype = dtype
     key = jax.random.key(SEED)
 
-    def data_generator():
-        """Creates new random data on host and puts it on device."""
-        nonlocal key
-        key, key_lhs, key_rhs = jax.random.split(key, 3)
-
-        # Create random data on host
-        lhs_host = jax.random.normal(key_lhs, lhs_shape).astype(lhs_dtype)
-        rhs_host = jax.random.normal(key_rhs, rhs_shape).astype(rhs_dtype)
-
-        # Put on device (HBM)
-        lhs_device = jax.device_put(lhs_host, lhs_sharding)
-        rhs_device = jax.device_put(rhs_host, rhs_sharding)
+    # Create random data on host and put on device ONCE (Double Buffered)
+    key, key_lhs_1, key_lhs_2, key_rhs_1, key_rhs_2 = jax.random.split(key, 5)
+    
+    lhs_host_1 = jax.random.normal(key_lhs_1, lhs_shape).astype(lhs_dtype)
+    lhs_host_2 = jax.random.normal(key_lhs_2, lhs_shape).astype(lhs_dtype)
+    rhs_host_1 = jax.random.normal(key_rhs_1, rhs_shape).astype(rhs_dtype)
+    rhs_host_2 = jax.random.normal(key_rhs_2, rhs_shape).astype(rhs_dtype)
+    
+    lhs_device_1 = jax.device_put(lhs_host_1, lhs_sharding)
+    lhs_device_2 = jax.device_put(lhs_host_2, lhs_sharding)
+    rhs_device_1 = jax.device_put(rhs_host_1, rhs_sharding)
+    rhs_device_2 = jax.device_put(rhs_host_2, rhs_sharding)
+    
+    jax.block_until_ready(lhs_device_1)
+    jax.block_until_ready(lhs_device_2)
+    jax.block_until_ready(rhs_device_1)
+    jax.block_until_ready(rhs_device_2)
 
-        return (lhs_device, rhs_device)
+    step = 0
+    def data_generator():
+        """Returns pre-allocated device data, toggling between two sets of buffers to avoid caching."""
+        nonlocal step
+        use_set_1 = (step % 2) == 0
+        step += 1
+        return (
+            lhs_device_1 if use_set_1 else lhs_device_2,
+            rhs_device_1 if use_set_1 else rhs_device_2
+        )
 
-    print("Step 3: Starting Execution Loop (includes JIT)...", flush=True)
     time_ms_list = multiple_iteration_timeit_from_trace(
         jit_sharded_f,
         data_generator,
@@ -156,7 +162,6 @@ def data_generator():
         trace_dir=trace_dir,
         multi_op=True,
     )
-    print("Step 4: Execution Loop Completed.", flush=True)
     
     return {
         "time_ms_list": time_ms_list,
@@ -179,9 +184,6 @@ def gemm_all_reduce(
     )
 
 
-
-
-
 def _calculate_metrics_base(
     m: int,
     k: int,
diff --git a/Ironwood/src/benchmark_host_device.py b/Ironwood/src/benchmark_host_device.py
index 67eb980b..a1f2467c 100644
--- a/Ironwood/src/benchmark_host_device.py
+++ b/Ironwood/src/benchmark_host_device.py
@@ -21,7 +21,7 @@
 
 def benchmark_host_device(
     data_size_mib: int,
-    h2d_type: str = "simple",
+    h2d_type: str,
     num_runs: int = 100,
     trace_dir: str = None,
 ) -> Dict[str, Any]:
@@ -129,6 +129,7 @@ def benchmark_host_device(
 
                         d2h_perf.append((t3 - t2) * 1000)
                         tensor_stack.delete()
+                        
                         for device_tensor in tensors_on_device:
                             device_tensor.delete()
                         del tensors_on_device
@@ -165,6 +166,7 @@ def benchmark_host_device_calculate_metrics(
     h2d_type: str,
     H2D_Bandwidth_ms: List[float],
     D2H_Bandwidth_ms: List[float],
+    d2h_pipelined_method: str = "device_put",
 ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
     """Calculates metrics for Host-Device transfer."""
     params = locals().items()
@@ -176,6 +178,7 @@ def benchmark_host_device_calculate_metrics(
     metadata = {k: v for k, v in params if k in metadata_keys}
     metadata["dtype"] = "float32"
     metadata["h2d_type"] = h2d_type
+    metadata["d2h_pipelined_method"] = d2h_pipelined_method
     
     metrics = {}
     
diff --git a/Ironwood/src/run_benchmark.py b/Ironwood/src/run_benchmark.py
index 31338d21..2d10db5d 100644
--- a/Ironwood/src/run_benchmark.py
+++ b/Ironwood/src/run_benchmark.py
@@ -331,6 +331,17 @@ def write_metrics_to_gcs(
     calculate_metrics_results: List[Dict[str, Any]],
 ):
     """Writes metrics to GCS bucket defined by gcs_bucket_csv_dir."""
+    # Only write metrics from one host.
+    try:
+        if jax.process_index() != 0:
+            return
+    except Exception:
+        # jax.process_index() may fail if JAX is not initialized (in a single-process
+        # setup it most likely just returns 0). In that case fall through and write:
+        # if this function is called we assume the caller wants the metrics written,
+        # unless we are explicitly on a non-zero process.
+        pass
+
     if not gcs_bucket_csv_dir:
         return
 
@@ -342,9 +353,24 @@ def write_metrics_to_gcs(
         config_category = os.path.basename(config_dir)
 
     config_stem = os.path.splitext(os.path.basename(config_path))[0]
-
     gcs_path = os.path.join(gcs_bucket_csv_dir, config_category, config_stem)
-    write_to_csv(f"{gcs_path}/{test_name}.tsv", calculate_metrics_results)
+
+    # Group results by dtype
+    results_by_dtype = {}
+    for result in calculate_metrics_results:
+        dtype = result.get("metadata", {}).get("dtype", "unknown_dtype")
+        if dtype not in results_by_dtype:
+            results_by_dtype[dtype] = []
+        results_by_dtype[dtype].append(result)
+
+    # Write separate files for each dtype
+    for dtype, results in results_by_dtype.items():
+        # Always append the dtype to test_name so that every dtype gets its own file,
+        # even if test_name already happens to mention the dtype somewhere.
+        # test_name is like "t_gemm_all_reduce_XYZ..."
+        # We want "t_gemm_all_reduce_XYZ..._float16.tsv"
+        dtype_test_name = f"{test_name}_{dtype}"
+        write_to_csv(f"{gcs_path}/{dtype_test_name}.tsv", results)
 
 
 def run_single_benchmark(benchmark_config: Dict[str, Any], output_path: str, gcs_bucket_csv_dir: str = None, config_path: str = None):

From 88229ec46146a38b6c8a4df503c230746a02373f Mon Sep 17 00:00:00 2001
From: "Amy (Yu-Hsuan) Lin" <amylin@google.com>
Date: Tue, 10 Feb 2026 18:24:46 +0800
Subject: [PATCH 59/88] Add 4x4 gemm_all_reduce.yaml to automation launch
 script

---
 Ironwood/guides/automation/automation_launch.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Ironwood/guides/automation/automation_launch.sh b/Ironwood/guides/automation/automation_launch.sh
index 0e49fb12..1d1fb498 100755
--- a/Ironwood/guides/automation/automation_launch.sh
+++ b/Ironwood/guides/automation/automation_launch.sh
@@ -22,6 +22,7 @@ yaml_names=(
     "tpu7x-2x2x4-collectives.yaml"
     "tpu7x-2x4x4-collectives.yaml"
     "tpu7x-4x4x4-collectives.yaml"
+    "tpu7x-4x4x4-gemm_all_reduce.yaml"
 )
 
 ######################################################################

From a585ecba660599d4e7d612721282845e33589ed1 Mon Sep 17 00:00:00 2001
From: "Yu-Hsuan (Amy) Lin" <amylin@google.com>
Date: Wed, 11 Feb 2026 09:22:08 +0000
Subject: [PATCH 60/88] Add step time to matmul series

---
 Ironwood/guides/automation/aggregator.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Ironwood/guides/automation/aggregator.py b/Ironwood/guides/automation/aggregator.py
index 9e4e9648..b221c7e8 100644
--- a/Ironwood/guides/automation/aggregator.py
+++ b/Ironwood/guides/automation/aggregator.py
@@ -28,6 +28,7 @@
         "tflops_per_sec_per_device_p95", "tflops_per_sec_per_device_p99",
         "tflops_per_sec_per_device_avg", "tflops_per_sec_per_device_min",
         "tflops_per_sec_per_device_max",
+        "step_time_ms_p50", "step_time_ms_p90", "step_time_ms_p95", "step_time_ms_p99", "step_time_ms_avg", "step_time_ms_min", "step_time_ms_max",
     ],
     "bmm": [
         "b", "m", "n", "k", "dtype", "step_time_ms_num_runs",
@@ -35,6 +36,7 @@
         "tflops_per_sec_per_device_p95", "tflops_per_sec_per_device_p99",
         "tflops_per_sec_per_device_avg", "tflops_per_sec_per_device_min",
         "tflops_per_sec_per_device_max",
+        "step_time_ms_p50", "step_time_ms_p90", "step_time_ms_p95", "step_time_ms_p99", "step_time_ms_avg", "step_time_ms_min", "step_time_ms_max",
     ],
     "gemm_all_reduce": [
         "topology", "m", "n", "k", "dtype", "step_time_ms_num_runs",
@@ -42,6 +44,7 @@
         "tflops_per_sec_per_device_p95", "tflops_per_sec_per_device_p99",
         "tflops_per_sec_per_device_avg", "tflops_per_sec_per_device_min",
         "tflops_per_sec_per_device_max",
+        "step_time_ms_p50", "step_time_ms_p90", "step_time_ms_p95", "step_time_ms_p99", "step_time_ms_avg", "step_time_ms_min", "step_time_ms_max",
     ],
     "attention": [
         "batch_size", "q_seq_len", "kv_seq_len", "q_heads", "kv_heads", "qk_head_dim", "v_head_dim", "mode", "causal", "step_time_ms_num_runs", "num_samples", 

From adb47d1a9498164709ee838e81838a9f3941d032 Mon Sep 17 00:00:00 2001
From: yuyanpeng-google <yuyanpeng@google.com>
Date: Thu, 12 Feb 2026 16:43:47 +0800
Subject: [PATCH 61/88] update benchmark_attention not sweep at the runtime
 (#111)

We add a table of tuned block sizes instead of running the sweep in each
microbenchmark. For configs that are not tuned yet, we just use the
default block sizes and report them with the flag has_optimized=false.
---
 Ironwood/configs/attention/attention.yaml |   7 +-
 Ironwood/src/benchmark_attention.py       | 236 +++++++++++++++-------
 2 files changed, 161 insertions(+), 82 deletions(-)

diff --git a/Ironwood/configs/attention/attention.yaml b/Ironwood/configs/attention/attention.yaml
index 92c19dd8..bc365cd9 100644
--- a/Ironwood/configs/attention/attention.yaml
+++ b/Ironwood/configs/attention/attention.yaml
@@ -1,12 +1,7 @@
 benchmarks:
 - benchmark_name: "tokamax_splash_attention"
   benchmark_sweep_params:
-  - {batch_size: 1, q_seq_len: 4096, kv_seq_len: 4096, q_heads: 128, kv_heads: 8, qk_head_dim: 256, v_head_dim: 256, mode: ["fwd", "bwd"], causal: [true, false], num_samples: 3500, tune_pallas_only: true}
-  - {batch_size: 1, q_seq_len: 8192, kv_seq_len: 8192, q_heads: 128, kv_heads: 8, qk_head_dim: 256, v_head_dim: 256, mode: ["fwd", "bwd"], causal: [true, false], num_samples: 3500, tune_pallas_only: true}
-  - {batch_size: 1, q_seq_len: 16384, kv_seq_len: 16384, q_heads: 128, kv_heads: 8, qk_head_dim: 256, v_head_dim: 256, mode: ["fwd", "bwd"], causal: [true, false], num_samples: 3500, tune_pallas_only: true}
-  - {batch_size: 1, q_seq_len: 32768, kv_seq_len: 32768, q_heads: 128, kv_heads: 8, qk_head_dim: 256, v_head_dim: 256, mode: ["fwd", "bwd"], causal: [true, false], num_samples: 3500, tune_pallas_only: true}
-  - {batch_size: 1, q_seq_len: 65536, kv_seq_len: 65536, q_heads: 128, kv_heads: 8, qk_head_dim: 256, v_head_dim: 256, mode: ["fwd", "bwd"], causal: [true, false], num_samples: 3500, tune_pallas_only: true}
-  - {batch_size: 1, q_seq_len: 131072, kv_seq_len: 131072, q_heads: 128, kv_heads: 8, qk_head_dim: 256, v_head_dim: 256, mode: ["fwd", "bwd"], causal: [true, false], num_samples: 3500, tune_pallas_only: true}
+  - {batch_size: 1, q_seq_len: 4096, kv_seq_len: 4096, q_heads: 128, kv_heads: 128, qk_head_dim: 256, v_head_dim: 256, mode: ["fwd", "bwd"], causal: [true, false]}
   trace_dir: "../microbenchmarks/attention"
   csv_path: "../microbenchmarks/attention"
   xlml_metrics_dir: "../microbenchmarks/attention"
diff --git a/Ironwood/src/benchmark_attention.py b/Ironwood/src/benchmark_attention.py
index 525c7757..64192266 100644
--- a/Ironwood/src/benchmark_attention.py
+++ b/Ironwood/src/benchmark_attention.py
@@ -6,27 +6,96 @@
 
 # pylint: disable=g-importing-member,g-bad-import-order
 from functools import partial
-from typing import Any, Callable, Dict, Tuple
+from typing import Any, Callable, Dict, Literal, Optional, Tuple
 import dataclasses
 
 from benchmark_utils import timeit_from_trace, MetricsStatistics
 import jax
-import logging
+import jax.numpy as jnp
+
 from tokamax._src.ops.experimental.tpu.splash_attention import (
     splash_attention_kernel as splash,
 )
 from tokamax._src.ops.experimental.tpu.splash_attention import (
     splash_attention_mask as mask_lib,
 )
-import tune_jax
-tune_jax.tune_logger.setLevel(logging.ERROR)
 
 # pylint: disable=g-importing-member,g-bad-import-order
 
 os.environ["LIBTPU_INIT_ARGS"] = (
-    "--xla_tpu_dvfs_p_state=7"
+    "--xla_tpu_dvfs_p_state=7 --xla_tpu_scoped_vmem_limit_kib=65536"
+)
+
+SplashAttentionLookupKey = tuple[
+    int, # batch_size
+    int, # q_seq_len
+    int, # kv_seq_len
+    int, # q_heads
+    int, # kv_heads
+    int, # qk_head_dim
+    int, # v_head_dim
+    bool, # causal
+]
+
+SplashAttentionLookupValue = tuple[
+    int, # block_q
+    int, # block_kv
+    int, # block_kv_compute
+    int, # block_q_dkv
+    int, # block_kv_dkv
+    int, # block_kv_dkv_compute
+    splash.QKVLayout, # q_layout
+    splash.QKVLayout, # k_layout
+    splash.QKVLayout, # v_layout
+    bool, # use_experimental_scheduler
+]
+
+# Merge the tuned block size of optimal fwd and bwd
+# The optimal layout and use_experimental_scheduler may be different between fwd and bwd
+# Use the layout and use_experimental_scheduler optimized for fwd
+SPLASH_ATTENTION_HYPERPARAMS_LOOKUP_TABLE: Dict[
+    SplashAttentionLookupKey, SplashAttentionLookupValue
+] = {
+    (1, 4096, 4096, 128, 128, 256, 256, True): (
+        2048,
+        2048,
+        256,
+        2048,
+        2048,
+        512,
+        splash.QKVLayout.HEAD_DIM_MINOR,
+        splash.QKVLayout.SEQ_MINOR,
+        splash.QKVLayout.HEAD_DIM_MINOR,
+        True,
+    ),
+    (1, 4096, 4096, 128, 128, 256, 256, False): (
+        4096,
+        4096,
+        512,
+        4096,
+        2048,
+        512,
+        splash.QKVLayout.HEAD_DIM_MINOR,
+        splash.QKVLayout.SEQ_MINOR,
+        splash.QKVLayout.HEAD_DIM_MINOR,
+        True,
+    ),
+}
+
+DEFAULT_SPLASH_ATTENTION_HYPERPARAMS: SplashAttentionLookupValue = (
+    2048,
+    2048,
+    256,
+    2048,
+    2048,
+    256,
+    splash.QKVLayout.HEAD_DIM_MINOR,
+    splash.QKVLayout.SEQ_MINOR,
+    splash.QKVLayout.HEAD_DIM_MINOR,
+    True,
 )
 
+
 def generate_qkv_separate_dims(
     batch_size: int,
     q_seq_len: int,
@@ -40,9 +109,9 @@ def generate_qkv_separate_dims(
     """Generates QKV with potentially different shapes for Q, K, and V."""
     key = jax.random.PRNGKey(seed)
     key_q, key_k, key_v = jax.random.split(key, 3)
-    q = jax.random.normal(key_q, (batch_size, q_heads, q_seq_len, qk_head_dim))
-    k = jax.random.normal(key_k, (batch_size, kv_heads, kv_seq_len, qk_head_dim))
-    v = jax.random.normal(key_v, (batch_size, kv_heads, kv_seq_len, v_head_dim))
+    q = jax.random.normal(key_q, (batch_size, q_heads, q_seq_len, qk_head_dim), dtype=jnp.bfloat16)
+    k = jax.random.normal(key_k, (batch_size, kv_heads, kv_seq_len, qk_head_dim), dtype=jnp.bfloat16)
+    v = jax.random.normal(key_v, (batch_size, kv_heads, kv_seq_len, v_head_dim), dtype=jnp.bfloat16)
     return q, k, v
 
 
@@ -106,28 +175,14 @@ def tokamax_splash_attention_benchmark(
     kv_heads: int,
     qk_head_dim: int,
     v_head_dim: int,
-    mode: str = "fwd",  # One of ('fwd', 'bwd', 'combined')
+    mode: Literal["fwd", "bwd"] = "fwd",
     causal: bool = True,
-    num_samples: int = 256,
-    tune_pallas_only: bool = True,
     num_runs: int = 10,
-    trace_dir: str = None,
+    trace_dir: Optional[str] = None,
 ) -> Dict[str, Any]:
     """Benchmarks the Tokamax Splash attention kernel."""
     event_filter_regex = _pallas_call_hlo_pattern(mode, q_heads != kv_heads)
-
-    hyperparams_override = {}
-    if mode == "bwd":
-        # Don't tune fwd only hyperparams
-        hyperparams_override = dict(
-            block_q=min(512, q_seq_len),
-            block_kv=min(1024, kv_seq_len),
-            block_kv_compute=min(512, kv_seq_len),
-        )
-    elif mode == "combined":
-        mode = "bwd"
-
-    # Generate QKV.
+    # Generate QKV in shape [batch, head_num, seq_len, head_dim].
     q, k, v = generate_qkv_separate_dims(
         batch_size,
         q_seq_len,
@@ -138,12 +193,68 @@ def tokamax_splash_attention_benchmark(
         v_head_dim,
     )
 
+    key = (
+        batch_size,
+        q_seq_len,
+        kv_seq_len,
+        q_heads,
+        kv_heads,
+        qk_head_dim,
+        v_head_dim,
+        causal,
+    )
+    hyperparams: Optional[SplashAttentionLookupValue] = (
+        SPLASH_ATTENTION_HYPERPARAMS_LOOKUP_TABLE.get(key, None)
+    )
+    has_optimized = True
+    if hyperparams is None:
+        print(f"{key=} is not tuned")
+        has_optimized = False
+        hyperparams = DEFAULT_SPLASH_ATTENTION_HYPERPARAMS
+    
+    (
+        block_q,
+        block_kv,
+        block_kv_compute,
+        block_q_dkv,
+        block_kv_dkv,
+        block_kv_dkv_compute,
+        q_layout,
+        k_layout,
+        v_layout,
+        use_experimental_scheduler,
+    ) = hyperparams
+
+    # Pad q and kv so their sequence lengths are multiples of the default block sizes
+    if not has_optimized:
+        def _ceiling_div(a: int, b: int) -> int:
+            return (a + b - 1) // b
+
+
+        def _align_to(x: int, a: int) -> int:
+            return _ceiling_div(x, a) * a
+
+        def _pad_token(t: jax.Array, size) -> jax.Array:
+            # tensor is [batch_size, num_head, token, head_dim]
+            result = jnp.pad(t, ((0, 0), (0, 0), (0, size), (0, 0)), constant_values=0)
+            return result
+
+        q_len = q.shape[-2]
+        k_len = k.shape[-2]
+
+        # Pad q, k, v sequence, align to block sizes
+        q = _pad_token(q, _align_to(q_len, block_q) - q_len)
+        k = _pad_token(k, _align_to(k_len, block_kv) - k_len)
+        v = _pad_token(v, _align_to(k_len, block_kv) - k_len)
+
+    padded_q_len = q.shape[-2]
+    padded_kv_len = k.shape[-2]
     # Attention mask
-    mask = mask_lib.FullMask(_shape=(q_seq_len, kv_seq_len))
+    mask = mask_lib.FullMask(_shape=(padded_q_len, padded_kv_len))
     if causal:
         # Pick offset for causal masks for a "representative" slice of the causal
-        offset = v.shape[-2] - q.shape[-2]
-        mask = mask_lib.CausalMask(shape=(q_seq_len, kv_seq_len), offset=offset)
+        offset = padded_kv_len - padded_q_len
+        mask = mask_lib.CausalMask(shape=(padded_q_len, padded_kv_len), offset=offset)
 
     def attention_fn(
         q: jax.Array,
@@ -155,8 +266,6 @@ def attention_fn(
         block_q_dkv: int | None,
         block_kv_dkv: int | None,
         block_kv_dkv_compute: int | None,
-        block_q_dq: int | None,
-        block_kv_dq: int | None,
         q_layout: splash.QKVLayout,
         k_layout: splash.QKVLayout,
         v_layout: splash.QKVLayout,
@@ -165,6 +274,7 @@ def attention_fn(
         mqa: bool,
         use_experimental_scheduler: bool,
     ):
+        # dq kernel is not used
         config = splash.SplashConfig(
             block_q=block_q,
             block_kv=block_kv,
@@ -172,8 +282,8 @@ def attention_fn(
             block_q_dkv=block_q_dkv,
             block_kv_dkv=block_kv_dkv,
             block_kv_dkv_compute=block_kv_dkv_compute,
-            block_q_dq=block_q_dq,
-            block_kv_dq=block_kv_dq,
+            block_q_dq=None,
+            block_kv_dq=None,
             q_layout=q_layout,
             k_layout=k_layout,
             v_layout=v_layout,
@@ -190,38 +300,6 @@ def attention_fn(
         mqa=q_heads != kv_heads,  # Determine if it's Multi-Query Attention
     )
 
-    # Define the search space for tokamax splash attention hyperparameters.
-    tiles = [256, 512, 1024, 2048, 4096, 8192]
-    layouts = [splash.QKVLayout.HEAD_DIM_MINOR, splash.QKVLayout.SEQ_MINOR]
-    hyperparams = {
-        "block_q": tiles,
-        "block_kv": tiles,
-        "block_kv_compute": tiles,
-        "block_q_dkv": [None],
-        "block_kv_dkv": [None],
-        "block_kv_dkv_compute": [None],
-        "block_q_dq": [None],
-        "block_kv_dq": [None],
-        "q_layout": layouts,
-        "k_layout": layouts,
-        "v_layout": layouts,
-        "use_experimental_scheduler": [True, False],
-    }
-
-    if mode == "bwd":
-        # If mode is backward, enable tuning for dKV-related block sizes.
-        # These parameters are only used during the backward pass.
-        hyperparams["block_q_dkv"] = tiles
-        hyperparams["block_kv_dkv"] = tiles
-        hyperparams["block_kv_dkv_compute"] = tiles
-        hyperparams["block_q_dq"] = tiles
-        hyperparams["block_kv_dq"] = tiles
-
-    # Incorporate any potentially previously tuned hyperparameters
-    hyperparams = dict(hyperparams, **hyperparams_override)
-
-    # Prepare the attention function for tuning.
-    tune_jax.CONFIG.allow_fallback_timing = False
     splash_fn = jax.jit(
         attention_fn,
         static_argnames=(
@@ -231,8 +309,6 @@ def attention_fn(
             "block_q_dkv",
             "block_kv_dkv",
             "block_kv_dkv_compute",
-            "block_q_dq",
-            "block_kv_dq",
             "q_layout",
             "k_layout",
             "v_layout",
@@ -240,26 +316,31 @@ def attention_fn(
         ),
     )
 
-    # Tune the hyperparameters with tune_jax
-    tuned_splash = tune_jax.tune(
+    tuned_splash = partial(
         splash_fn,
-        hyperparams=hyperparams,
-        event_filter_regex=event_filter_regex if tune_pallas_only else None,
-        sample_num=num_samples,
+        block_q=block_q,
+        block_kv=block_kv,
+        block_kv_compute=block_kv_compute,
+        block_q_dkv=block_q_dkv,
+        block_kv_dkv=block_kv_dkv,
+        block_kv_dkv_compute=block_kv_dkv_compute,
+        q_layout=q_layout,
+        k_layout=k_layout,
+        v_layout=v_layout,
+        use_experimental_scheduler=use_experimental_scheduler,
     )
 
     # Run once
     output = tuned_splash(q, k, v)
     jax.block_until_ready(output)
 
-    
     print("-" * 50)
     print(
         f"batch_size={batch_size}, q_seq_len={q_seq_len}, kv_seq_len={kv_seq_len}, "
         f"q_heads={q_heads}, kv_heads={kv_heads}, qk_head_dim={qk_head_dim}, "
         f"v_head_dim={v_head_dim}, mode={mode}, causal={causal}"
     )
-    print(f"tuned_splash.optimal_hyperparams={tuned_splash.optimal_hyperparams}")
+    print(f"{hyperparams=}")
     print("-" * 50)
 
     # Run benchmark
@@ -275,7 +356,11 @@ def attention_fn(
             f"{event_filter_regex}_no_residuals.1",
         ]
     )
-    return {"time_ms_list": time_ms_list, "output": output}
+    return {
+        "time_ms_list": time_ms_list,
+        "output": output,
+        "has_optimized": has_optimized,
+    }
 
 
 def tokamax_splash_attention_benchmark_calculate_metrics(
@@ -289,9 +374,8 @@ def tokamax_splash_attention_benchmark_calculate_metrics(
     v_head_dim: int,
     mode: str,
     causal: bool,
-    num_samples: int,
-    tune_pallas_only: bool,
     time_ms_list: list[float],
+    has_optimized: bool,
     # pylint: disable=unused-argument
 ) -> Dict[str, Any]:
     """Gathers metrics for the tokamax splash attention benchmark."""

From 9bf9e38d13515b638b3489484271c36379ea62ab Mon Sep 17 00:00:00 2001
From: Hong-Yi Lin <horacehylin@google.com>
Date: Thu, 12 Feb 2026 09:59:58 +0000
Subject: [PATCH 62/88] Add attention into automation

---
 Ironwood/guides/automation/aggregator.py        | 4 ++--
 Ironwood/guides/automation/automation_launch.sh | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/Ironwood/guides/automation/aggregator.py b/Ironwood/guides/automation/aggregator.py
index b221c7e8..54df2146 100644
--- a/Ironwood/guides/automation/aggregator.py
+++ b/Ironwood/guides/automation/aggregator.py
@@ -47,7 +47,7 @@
         "step_time_ms_p50", "step_time_ms_p90", "step_time_ms_p95", "step_time_ms_p99", "step_time_ms_avg", "step_time_ms_min", "step_time_ms_max",
     ],
     "attention": [
-        "batch_size", "q_seq_len", "kv_seq_len", "q_heads", "kv_heads", "qk_head_dim", "v_head_dim", "mode", "causal", "step_time_ms_num_runs", "num_samples", 
+        "batch_size", "q_seq_len", "kv_seq_len", "q_heads", "kv_heads", "qk_head_dim", "v_head_dim", "mode", "causal", "has_optimized", "step_time_ms_num_runs", 
         "time_ms_p50", "time_ms_p90",
         "time_ms_p95", "time_ms_p99",
         "time_ms_avg", "time_ms_min",
@@ -155,7 +155,7 @@ def aggregate_attention(directories: list[str], picked_columns: list[str]) -> pd
 }
 
 def aggregate_results(bucket_path: str, local_dir: str):
-    categories = ["collectives", "hbm", "host_device", "gemm", "bmm", "gemm_all_reduce"]
+    categories = ["collectives", "hbm", "host_device", "gemm", "bmm", "gemm_all_reduce", "attention"]
     directories = {}
     results = {}
     for category in categories:
diff --git a/Ironwood/guides/automation/automation_launch.sh b/Ironwood/guides/automation/automation_launch.sh
index 1d1fb498..133db969 100755
--- a/Ironwood/guides/automation/automation_launch.sh
+++ b/Ironwood/guides/automation/automation_launch.sh
@@ -17,6 +17,7 @@ yaml_names=(
     "tpu7x-2x2x1-gemm_all_reduce.yaml"
     "tpu7x-2x2x1-gemm.yaml"
     "tpu7x-2x2x1-bmm.yaml"
+    "tpu7x-2x2x1-attention.yaml"
     "tpu7x-2x2x1-collectives.yaml"
     "tpu7x-2x2x2-collectives.yaml"
     "tpu7x-2x2x4-collectives.yaml"

From 1d36fa8933b7bff032dcde0b2505f8610085ec0c Mon Sep 17 00:00:00 2001
From: Hong-Yi Lin <horacehylin@google.com>
Date: Thu, 12 Feb 2026 11:24:38 +0000
Subject: [PATCH 63/88] Update attention aggregate logic

---
 Ironwood/guides/automation/aggregator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Ironwood/guides/automation/aggregator.py b/Ironwood/guides/automation/aggregator.py
index 54df2146..c519635b 100644
--- a/Ironwood/guides/automation/aggregator.py
+++ b/Ironwood/guides/automation/aggregator.py
@@ -47,7 +47,7 @@
         "step_time_ms_p50", "step_time_ms_p90", "step_time_ms_p95", "step_time_ms_p99", "step_time_ms_avg", "step_time_ms_min", "step_time_ms_max",
     ],
     "attention": [
-        "batch_size", "q_seq_len", "kv_seq_len", "q_heads", "kv_heads", "qk_head_dim", "v_head_dim", "mode", "causal", "has_optimized", "step_time_ms_num_runs", 
+        "batch_size", "q_seq_len", "kv_seq_len", "q_heads", "kv_heads", "qk_head_dim", "v_head_dim", "mode", "causal", "has_optimized", "time_ms_num_runs", 
         "time_ms_p50", "time_ms_p90",
         "time_ms_p95", "time_ms_p99",
         "time_ms_avg", "time_ms_min",

From 5d958cc5e4878c483d47d8458af264b44b9bcd7f Mon Sep 17 00:00:00 2001
From: Hong-Yi Lin <horacehylin@google.com>
Date: Thu, 12 Feb 2026 11:27:50 +0000
Subject: [PATCH 64/88] Set automation timeout to 2 hours

---
 Ironwood/guides/automation/automation_launch.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Ironwood/guides/automation/automation_launch.sh b/Ironwood/guides/automation/automation_launch.sh
index 133db969..cb89e725 100755
--- a/Ironwood/guides/automation/automation_launch.sh
+++ b/Ironwood/guides/automation/automation_launch.sh
@@ -9,7 +9,7 @@ export GCS_SA_NAME="gcs-writer"  # Service account with write access to GCS_BUCK
 export PROJECT_ID=$(gcloud config get-value project 2>/dev/null)
 
 MAX_RETRIES=3
-TIMEOUT_SECOND=3600
+TIMEOUT_SECOND=7200
 
 yaml_names=(
     "tpu7x-2x2x1-hbm.yaml"

From 1ab8008f3de1e52bc4213e0825f161a8815e8ef5 Mon Sep 17 00:00:00 2001
From: Hong-Yi Lin <horacehylin@google.com>
Date: Thu, 12 Feb 2026 11:39:10 +0000
Subject: [PATCH 65/88] Set attention num_runs to 20

---
 Ironwood/configs/attention/attention.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Ironwood/configs/attention/attention.yaml b/Ironwood/configs/attention/attention.yaml
index bc365cd9..4ee7e100 100644
--- a/Ironwood/configs/attention/attention.yaml
+++ b/Ironwood/configs/attention/attention.yaml
@@ -1,7 +1,7 @@
 benchmarks:
 - benchmark_name: "tokamax_splash_attention"
   benchmark_sweep_params:
-  - {batch_size: 1, q_seq_len: 4096, kv_seq_len: 4096, q_heads: 128, kv_heads: 128, qk_head_dim: 256, v_head_dim: 256, mode: ["fwd", "bwd"], causal: [true, false]}
+  - {batch_size: 1, q_seq_len: 4096, kv_seq_len: 4096, q_heads: 128, kv_heads: 128, qk_head_dim: 256, v_head_dim: 256, mode: ["fwd", "bwd"], causal: [true, false], num_runs: 20}
   trace_dir: "../microbenchmarks/attention"
   csv_path: "../microbenchmarks/attention"
   xlml_metrics_dir: "../microbenchmarks/attention"

From 9b4e8ded7479fbbd7e94a66ce134aaa5df3219a4 Mon Sep 17 00:00:00 2001
From: "Yu-Hsuan (Amy) Lin" <amylin@google.com>
Date: Wed, 11 Feb 2026 10:48:09 +0000
Subject: [PATCH 66/88] Try pinned memory

---
 Ironwood/configs/host_device/host_device.yaml |   6 +-
 Ironwood/guides/automation/aggregator.py      |   2 +-
 Ironwood/src/benchmark_host_device.py         | 276 +++++++++++-------
 3 files changed, 181 insertions(+), 103 deletions(-)

diff --git a/Ironwood/configs/host_device/host_device.yaml b/Ironwood/configs/host_device/host_device.yaml
index c63075a0..e652f87f 100644
--- a/Ironwood/configs/host_device/host_device.yaml
+++ b/Ironwood/configs/host_device/host_device.yaml
@@ -2,9 +2,11 @@ benchmarks:
 - benchmark_name: host_device
   num_runs: 20
   benchmark_sweep_params:
-  - h2d_type: "simple"
+  - transfer_type: "simple"
     data_size_mib_list: [1, 16, 128, 256, 512, 1024, 2048, 4096, 8192, 16384]
-  - h2d_type: "pipelined"
+  - transfer_type: "pipelined"
+    data_size_mib_list: [1, 16, 128, 256, 512, 1024, 2048, 4096, 8192, 16384]
+  - transfer_type: "pinned_memory"
     data_size_mib_list: [1, 16, 128, 256, 512, 1024, 2048, 4096, 8192, 16384]
   csv_path: "../microbenchmarks/host_device"
   trace_dir: "../microbenchmarks/host_device/trace"
diff --git a/Ironwood/guides/automation/aggregator.py b/Ironwood/guides/automation/aggregator.py
index c519635b..5ff4ce99 100644
--- a/Ironwood/guides/automation/aggregator.py
+++ b/Ironwood/guides/automation/aggregator.py
@@ -16,7 +16,7 @@
         "time_ms_p50", "time_ms_p90", "time_ms_p95", "time_ms_p99", "time_ms_avg", "time_ms_min", "time_ms_max",
     ],
     "host_device": [
-        "data_size_mib", "h2d_type", "H2D_bw (GiB/s)_num_runs",
+        "data_size_mib", "transfer_type", "H2D_bw (GiB/s)_num_runs",
         "H2D_bw (GiB/s)_p50", "H2D_bw (GiB/s)_p90", "H2D_bw (GiB/s)_p95", "H2D_bw (GiB/s)_p99", 
         "H2D_bw (GiB/s)_avg", "H2D_bw (GiB/s)_min", "H2D_bw (GiB/s)_max",
         "D2H_bw (GiB/s)_p50", "D2H_bw (GiB/s)_p90", "D2H_bw (GiB/s)_p95", "D2H_bw (GiB/s)_p99", 
diff --git a/Ironwood/src/benchmark_host_device.py b/Ironwood/src/benchmark_host_device.py
index a1f2467c..3c1a5621 100644
--- a/Ironwood/src/benchmark_host_device.py
+++ b/Ironwood/src/benchmark_host_device.py
@@ -8,6 +8,8 @@
 from jax import numpy as jnp
 import numpy as np
 from benchmark_utils import MetricsStatistics
+from jax.sharding import Mesh, NamedSharding, PartitionSpec
+from jax.experimental import mesh_utils
 
 
 libtpu_init_args = [
@@ -19,9 +21,162 @@
 os.environ["TPU_PREMAPPED_BUFFER_TRANSFER_THRESHOLD_BYTES"] = "68719476736"
 
 
+import abc
+
+class TransferStrategy(abc.ABC):
+    """Abstract base class for transfer strategies."""
+
+    def __init__(self, trace_dir: str = None):
+        self.trace_dir = trace_dir
+        self.h2d_perf = []
+        self.d2h_perf = []
+
+    @abc.abstractmethod
+    def setup(self, data_size_mib: int, host_data: np.ndarray, devices: List[jax.Device]):
+        """Perform one-time setup before the benchmark loop."""
+        pass
+
+    @abc.abstractmethod
+    def run_h2d(self, host_data: np.ndarray, i: int) -> Any:
+        """Run H2D transfer for iteration i."""
+        pass
+
+    @abc.abstractmethod
+    def run_d2h(self, device_data: Any, i: int):
+        """Run D2H transfer for iteration i."""
+        pass
+
+    @abc.abstractmethod
+    def teardown(self):
+        """Clean up resources after the benchmark loop."""
+        pass
+
+
+class SimpleTransfer(TransferStrategy):
+    """Simple device_put/device_get strategy."""
+
+    def setup(self, data_size_mib: int, host_data: np.ndarray):
+        pass
+
+    def run_h2d(self, host_data: np.ndarray, i: int) -> Any:
+        t0 = time.perf_counter()
+        device_array = jax.device_put(host_data)
+        device_array.block_until_ready()
+        t1 = time.perf_counter()
+        
+        # Verify H2D shape
+        assert device_array.shape == host_data.shape
+        self.h2d_perf.append((t1 - t0) * 1000)
+        return device_array
+
+    def run_d2h(self, device_data: Any, i: int):
+        t2 = time.perf_counter()
+        _ = jax.device_get(device_data)
+        t3 = time.perf_counter()
+        self.d2h_perf.append((t3 - t2) * 1000)
+        device_data.delete()
+
+    def teardown(self):
+        pass
+
+
+class PipelinedTransfer(TransferStrategy):
+    """Pipelined transfer using chunking."""
+
+    def setup(self, data_size_mib: int, host_data: np.ndarray):
+        self.target_chunk_size_mib = 16
+        num_devices_to_perform_h2d = 2
+        self.target_devices = jax.devices()[:num_devices_to_perform_h2d]
+        self.num_devices = len(self.target_devices)
+        
+        data_per_dev = data_size_mib / self.num_devices
+        chunks_per_dev = int(data_per_dev / self.target_chunk_size_mib)
+        self.chunks_per_dev = max(1, chunks_per_dev)
+        if self.chunks_per_dev == 1:
+            print(f"Warning: Data size is not larger than target chunk size, falling back to standard JAX put.")
+
+    def run_h2d(self, host_data: np.ndarray, i: int) -> Any:
+        if self.chunks_per_dev > 1:
+            chunks = np.array_split(host_data, self.chunks_per_dev * self.num_devices, axis=0)
+            tensors_on_device = []
+            
+            t0 = time.perf_counter()
+            for idx, chunk in enumerate(chunks):
+                dev = self.target_devices[idx % self.num_devices]
+                tensors_on_device.append(jax.device_put(chunk, dev))
+            
+            for device_tensor in tensors_on_device:
+                device_tensor.block_until_ready()
+            t1 = time.perf_counter()
+            
+            self.h2d_perf.append((t1 - t0) * 1000)
+            del chunks
+            return tensors_on_device
+        else:
+            t0 = time.perf_counter()
+            result = jax.device_put(host_data, self.target_devices[0])
+            result.block_until_ready()
+            t1 = time.perf_counter()
+            self.h2d_perf.append((t1 - t0) * 1000)
+            return result
+
+    def run_d2h(self, device_data: Any, i: int):
+        t2 = time.perf_counter()
+        if isinstance(device_data, list):
+            _ = jax.device_get(device_data)
+            for device_tensor in device_data:
+                device_tensor.delete()
+        else:
+            _ = jax.device_get(device_data)
+            device_data.delete()
+        t3 = time.perf_counter()
+        self.d2h_perf.append((t3 - t2) * 1000)
+
+    def teardown(self):
+        pass
+
+
+class PinnedMemoryTransfer(TransferStrategy):
+    """Pinned memory host-to-device with parallelized device-to-host transfer."""
+
+    def setup(self, data_size_mib: int, host_data: np.ndarray):
+        num_devices_to_perform_h2d = 2
+        target_devices = jax.devices()[:num_devices_to_perform_h2d]
+        
+        mesh = Mesh(target_devices, ('x',))
+        partition_spec = PartitionSpec('x')
+        host_sharding = NamedSharding(mesh, partition_spec, memory_kind='pinned_host')
+        self.pinned_device_sharding = NamedSharding(mesh, partition_spec)
+
+        print("  Allocating Pinned Host Data...", flush=True)
+        self.pinned_host_input = jax.device_put(host_data, host_sharding)
+        self.pinned_host_input.block_until_ready()
+
+    def run_h2d(self, host_data: np.ndarray, i: int) -> Any:
+        t_transfer_start = time.perf_counter()
+        device_array = jax.device_put(self.pinned_host_input, self.pinned_device_sharding)
+        device_array.block_until_ready()
+        t_transfer_end = time.perf_counter()
+        
+        self.h2d_perf.append((t_transfer_end - t_transfer_start) * 1000)
+        return device_array
+
+    def run_d2h(self, device_data: Any, i: int):
+        t2 = time.perf_counter()
+        # Fetch addressable shards to enable pipelined D2H
+        _ = jax.device_get([s.data for s in device_data.addressable_shards])
+        t3 = time.perf_counter()
+        self.d2h_perf.append((t3 - t2) * 1000)
+        device_data.delete()
+
+    def teardown(self):
+        if hasattr(self, 'pinned_host_input'):
+             self.pinned_host_input.delete()
+
+
 def benchmark_host_device(
     data_size_mib: int,
-    h2d_type: str,
+    transfer_type: str,
     num_runs: int = 100,
     trace_dir: str = None,
 ) -> Dict[str, Any]:
@@ -32,19 +187,23 @@ def benchmark_host_device(
     # Allocate Host Source Buffer
     column = 128
     host_data = np.random.normal(size=(num_elements // column, column)).astype(np.float32)
-    
-    # Used in pipelined flow
-    # TODO: turn into a param
-    num_devices_to_perform_h2d = 1
-    target_devices = jax.devices()[:num_devices_to_perform_h2d]
 
     print(
-        f"Benchmarking Transfer with Data Size: {data_size_mib} MB for {num_runs} iterations with {h2d_type=}",
+        f"Benchmarking Transfer with Data Size: {data_size_mib} MB for {num_runs} iterations with {transfer_type=}",
         flush=True
     )
 
-    # Performance Lists
-    h2d_perf, d2h_perf = [], []
+    strategies = {
+        "simple": SimpleTransfer,
+        "pipelined": PipelinedTransfer,
+        "pinned_memory": PinnedMemoryTransfer,
+    }
+
+    if transfer_type not in strategies:
+        raise ValueError(f"Unknown transfer_type: {transfer_type}. Available: {list(strategies.keys())}")
+
+    strategy = strategies[transfer_type](trace_dir)
+    strategy.setup(data_size_mib, host_data)
 
     # Profiling Context
     import contextlib
@@ -70,103 +229,21 @@ def benchmark_host_device(
                 step_context = contextlib.nullcontext()
             
             with step_context:
-                 # H2D
-                if h2d_type == "simple":
-                    t0 = time.perf_counter()
-                    # Simple device_put
-                    device_array = jax.device_put(host_data)
-                    device_array.block_until_ready()
-                    t1 = time.perf_counter()
-                    
-                    # Verify H2D shape
-                    assert device_array.shape == host_data.shape
-                    h2d_perf.append((t1 - t0) * 1000)
-                
-                    # D2H
-                    t2 = time.perf_counter()
-                    # Simple device_get
-                    # Note: device_get returns a numpy array (copy)
-                    _ = jax.device_get(device_array)
-                    t3 = time.perf_counter()
-                    d2h_perf.append((t3 - t2) * 1000)
-                    
-                    device_array.delete()
-                elif h2d_type == "pipelined":
-                    target_chunk_size_mib = 16  # Sweet spot from profiling
-                    num_devices = len(target_devices)
-
-                    tensors_on_device = []
-                    
-                    # Calculate chunks per device
-                    data_per_dev = data_size_mib / num_devices
-                    chunks_per_dev = int(data_per_dev / target_chunk_size_mib)
-                    chunks_per_dev = max(1, chunks_per_dev)
-
-                    chunks = np.array_split(host_data, chunks_per_dev * num_devices, axis=0)
-                    if chunks_per_dev > 1:    
-                        t0 = time.perf_counter()
-                        # We need to map chunks to the correct device
-                        # This simple example assumes chunks are perfectly divisible and ordered
-                        # In production, use `jax.sharding` mesh logic for complex layouts
-                        for idx, chunk in enumerate(chunks):
-                            if num_devices > 1:
-                                dev = target_devices[idx % num_devices]
-                            else:
-                                dev = target_devices[0]
-                            tensors_on_device.append(jax.device_put(chunk, dev))
-                        for device_tensor in tensors_on_device:
-                            device_tensor.block_until_ready()
-                        t1 = time.perf_counter()
-                        h2d_perf.append((t1 - t0) * 1000)
-                        del chunks
-
-                        # D2H
-                        tensor_stack = jnp.vstack(tensors_on_device)
-                        
-                        t2 = time.perf_counter()
-                        _ = jax.device_get(tensor_stack)
-                        t3 = time.perf_counter()
-
-                        d2h_perf.append((t3 - t2) * 1000)
-                        tensor_stack.delete()
-                        
-                        for device_tensor in tensors_on_device:
-                            device_tensor.delete()
-                        del tensors_on_device
-                    else:
-                        t0 = time.perf_counter()
-
-                        print(f"Warning: {data_size_mib=} is not larger than {target_chunk_size_mib=}, falling back to standard JAX put.")
-                        # Fallback to standard JAX put for small data
-                        result = jax.device_put(host_data, target_devices[0])
-                        result.block_until_ready()
-
-                        t1 = time.perf_counter()
-                        h2d_perf.append((t1 - t0) * 1000)
-
-                        # D2H
-                        t2 = time.perf_counter()
-                        # Simple device_get
-                        # Note: device_get returns a numpy array (copy)
-                        _ = jax.device_get(result)
-
-                        t3 = time.perf_counter()
-                        d2h_perf.append((t3 - t2) * 1000)
-                        result.delete()
-                    
-                jax.clear_caches()
+                device_data = strategy.run_h2d(host_data, i)
+                strategy.run_d2h(device_data, i)
+
+    strategy.teardown()
 
     return {
-        "H2D_Bandwidth_ms": h2d_perf,
-        "D2H_Bandwidth_ms": d2h_perf,
+        "H2D_Bandwidth_ms": strategy.h2d_perf,
+        "D2H_Bandwidth_ms": strategy.d2h_perf,
     }
 
 def benchmark_host_device_calculate_metrics(
     data_size_mib: int,
-    h2d_type: str,
+    transfer_type: str,
     H2D_Bandwidth_ms: List[float],
     D2H_Bandwidth_ms: List[float],
-    d2h_pipelined_method: str = "device_put",
 ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
     """Calculates metrics for Host-Device transfer."""
     params = locals().items()
@@ -177,8 +254,7 @@ def benchmark_host_device_calculate_metrics(
     }
     metadata = {k: v for k, v in params if k in metadata_keys}
     metadata["dtype"] = "float32"
-    metadata["h2d_type"] = h2d_type
-    metadata["d2h_pipelined_method"] = d2h_pipelined_method
+    metadata["transfer_type"] = transfer_type
     
     metrics = {}
     

From e0a9abcbe6670888dd44c0d6a054099e2dcab557 Mon Sep 17 00:00:00 2001
From: yuyanpeng-google <yuyanpeng@google.com>
Date: Fri, 13 Feb 2026 17:05:25 +0800
Subject: [PATCH 67/88] fix numeric error cause by padding and improve default
 block size (#112)

Use segment id to filter out the padding KV if needed.
Since the segment id would affect the latency,
we should add them to reflect the padding situation.
---
 Ironwood/src/benchmark_attention.py | 63 ++++++++++++++++++++++-------
 1 file changed, 49 insertions(+), 14 deletions(-)

diff --git a/Ironwood/src/benchmark_attention.py b/Ironwood/src/benchmark_attention.py
index 64192266..87139200 100644
--- a/Ironwood/src/benchmark_attention.py
+++ b/Ironwood/src/benchmark_attention.py
@@ -151,19 +151,19 @@ def _get_tokamax_benchmark_fn(
         kernel = splash.make_splash_mqa_single_device(mask, config=config)
 
         @jax.jit
-        def f(q, k, v):
+        def f(q, k, v, segment_ids):
             q = q.reshape(q.shape[:-3] + (k.shape[-3], -1) + q.shape[-2:])
-            kernel_ = jax.vmap(kernel, in_axes=(0, 0, 0))  # batch vmap
-            kernel_ = jax.vmap(kernel_, in_axes=(0, 0, 0))  # mqa vmap
-            return kernel_(q, k, v)
+            kernel_ = jax.vmap(kernel, in_axes=(0, 0, 0, None))  # batch vmap
+            kernel_ = jax.vmap(kernel_, in_axes=(0, 0, 0, None))  # mqa vmap
+            return kernel_(q, k, v, segment_ids)
     else:
         kernel = splash.make_splash_mha_single_device(mask, config=config)
-        f = jax.jit(jax.vmap(kernel, in_axes=(0, 0, 0)))
+        f = jax.jit(jax.vmap(kernel, in_axes=(0, 0, 0, None)))
 
     if mode == "fwd":
         return f
     if mode == "bwd":
-        return jax.grad(lambda q, k, v: f(q, k, v).mean(), argnums=(0, 1, 2))
+        return jax.grad(lambda q, k, v, segment_ids: f(q, k, v, segment_ids).mean(), argnums=(0, 1, 2))
     raise ValueError(f"Invalid mode: {mode}")
 
 
@@ -211,7 +211,7 @@ def tokamax_splash_attention_benchmark(
         print(f"{key=} is not tuned")
         has_optimized = False
         hyperparams = DEFAULT_SPLASH_ATTENTION_HYPERPARAMS
-    
+
     (
         block_q,
         block_kv,
@@ -225,30 +225,62 @@ def tokamax_splash_attention_benchmark(
         use_experimental_scheduler,
     ) = hyperparams
 
+    segment_ids = None
+
     # Pad q, kv to prevent the block size are not valid
     if not has_optimized:
         def _ceiling_div(a: int, b: int) -> int:
             return (a + b - 1) // b
 
-
         def _align_to(x: int, a: int) -> int:
             return _ceiling_div(x, a) * a
 
+        q_len = q.shape[-2]
+        k_len = k.shape[-2]
+
+        # handle the block size, seq_len need to be multiple of block size
+        # bkv need to be multiple of bkv_compute
+        block_q = min(q_len, block_q)
+        # Align to 128 per kernel request
+        block_q = _align_to(block_q, 128)
+        block_kv = min(k_len, block_kv)
+        block_kv = _align_to(block_kv, 128)
+        block_kv_compute = min(block_kv, 256)
+        # Align block_kv to block_kv_compute
+        block_kv = _align_to(block_kv, block_kv_compute)
+        block_q_dkv = min(q_len, block_q_dkv)
+        # Align to 128 per kernel request
+        block_q_dkv = _align_to(block_q_dkv, 128)
+        block_kv_dkv = min(k_len, block_kv_dkv)
+        block_kv_dkv = _align_to(block_kv_dkv, 128)
+        block_kv_dkv_compute = min(block_kv_dkv, 256)
+        # Align block_kv to block_kv_compute
+        block_kv_dkv = _align_to(block_kv_dkv, block_kv_dkv_compute)
+
         def _pad_token(t: jax.Array, size) -> jax.Array:
             # tensor is [batch_size, num_head, token, head_dim]
             result = jnp.pad(t, ((0, 0), (0, 0), (0, size), (0, 0)), constant_values=0)
             return result
 
-        q_len = q.shape[-2]
-        k_len = k.shape[-2]
-
         # Pad q, k, v sequence, align to block sizes
         q = _pad_token(q, _align_to(q_len, block_q) - q_len)
         k = _pad_token(k, _align_to(k_len, block_kv) - k_len)
         v = _pad_token(v, _align_to(k_len, block_kv) - k_len)
+        # Handle the k padding to avoid numeric error
+        if k.shape[-2] > k_len:
+            # padded q doesn't matter since it can directly strip out from result
+            segment_ids = splash.SegmentIds(
+                q=jnp.ones((q.shape[-2],), dtype=jnp.int32),
+                kv=jnp.pad(
+                    jnp.ones((k_len,), dtype=jnp.int32),
+                    ((0, k.shape[-2] - k_len),),
+                    constant_values=0,
+                ),
+            )
 
     padded_q_len = q.shape[-2]
     padded_kv_len = k.shape[-2]
+    print(f"{padded_q_len=}, {padded_kv_len=}")
     # Attention mask
     mask = mask_lib.FullMask(_shape=(padded_q_len, padded_kv_len))
     if causal:
@@ -260,6 +292,7 @@ def attention_fn(
         q: jax.Array,
         k: jax.Array,
         v: jax.Array,
+        segment_ids: Optional[splash.SegmentIds],
         block_q: int,
         block_kv: int,
         block_kv_compute: int,
@@ -291,7 +324,7 @@ def attention_fn(
         )
 
         f = _get_tokamax_benchmark_fn(mask, config, mode, mqa=mqa)
-        return f(q, k, v)
+        return f(q, k, v, segment_ids)
 
     attention_fn = partial(
         attention_fn,
@@ -331,7 +364,7 @@ def attention_fn(
     )
 
     # Run once
-    output = tuned_splash(q, k, v)
+    output = tuned_splash(q, k, v, segment_ids)
     jax.block_until_ready(output)
 
     print("-" * 50)
@@ -343,17 +376,19 @@ def attention_fn(
     print(f"{hyperparams=}")
     print("-" * 50)
 
+    is_event_filter_segmented = "" if segment_ids is None else "segmented_"
     # Run benchmark
     time_ms_list = timeit_from_trace(
         tuned_splash,
         q,
         k,
         v,
+        segment_ids,
         tries=num_runs,
         task="tokamax_splash_attentionatt",
         trace_dir=trace_dir,
         event_name_str_list=[
-            f"{event_filter_regex}_no_residuals.1",
+            f"{event_filter_regex}_{is_event_filter_segmented}no_residuals.1",
         ]
     )
     return {

From e7c1649aa14e6abd7949d897c115e0f2c962b64c Mon Sep 17 00:00:00 2001
From: "Yu-Hsuan (Amy) Lin" <amylin@google.com>
Date: Fri, 13 Feb 2026 08:42:11 +0000
Subject: [PATCH 68/88] Fix retry command

---
 Ironwood/guides/automation/automation_launch.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Ironwood/guides/automation/automation_launch.sh b/Ironwood/guides/automation/automation_launch.sh
index cb89e725..03f781ca 100755
--- a/Ironwood/guides/automation/automation_launch.sh
+++ b/Ironwood/guides/automation/automation_launch.sh
@@ -230,7 +230,7 @@ if [[ ${#FAILED_JOBS[@]} -gt 0 ]]; then
     for yaml_file in "${FAILED_JOBS[@]}"; do
         job_name=$(basename "${yaml_file}" .yaml | tr '[:upper:]' '[:lower:]' | tr '_' '-')
         GCS_PATH="${GCS_BUCKET_ROOT_DIR}/${job_name}"
-        echo "JOB_NAME=\"${job_name}\" GCS_PATH=\"${GCS_PATH}\" envsubst '\${JOB_NAME} \${GCS_PATH}' < \"${SCRIPT_DIR}/${yaml_file}\" | kubectl apply -f -"
+        echo "JOB_NAME=\"${job_name}\" GCS_PATH=\"${GCS_PATH}\" GCS_SA_NAME=\"${GCS_SA_NAME}\" envsubst '\${JOB_NAME} \${GCS_PATH} \${GCS_SA_NAME}' < \"${SCRIPT_DIR}/${yaml_file}\" | kubectl apply -f -"
     done
 else
     echo "Success! All jobs finished."

From c2bec50175a5f5d51bcea84a053a45e5af502793 Mon Sep 17 00:00:00 2001
From: Chi Shuen Lee <chishuen@google.com>
Date: Fri, 13 Feb 2026 17:57:43 +0000
Subject: [PATCH 69/88] Remove BMM multi-host runs from the 2x2x1 yaml file to
 avoid confusion.

---
 Ironwood/guides/automation/tpu7x-2x2x1-bmm.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/Ironwood/guides/automation/tpu7x-2x2x1-bmm.yaml b/Ironwood/guides/automation/tpu7x-2x2x1-bmm.yaml
index 6257acfc..1b5b9774 100644
--- a/Ironwood/guides/automation/tpu7x-2x2x1-bmm.yaml
+++ b/Ironwood/guides/automation/tpu7x-2x2x1-bmm.yaml
@@ -54,7 +54,6 @@ spec:
 
           GCS_BUCKET_DIR=${GCS_PATH}
           python Ironwood/src/run_benchmark.py --config="Ironwood/configs/bmm/single_device_bmm.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
-          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/bmm/multi_host_bmm.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
         resources:
           requests:
             google.com/tpu: 4

From f4f89ee12224aabc762df8334b3829ab094fe857 Mon Sep 17 00:00:00 2001
From: Pulasthi Supun <pulasthi@google.com>
Date: Thu, 19 Feb 2026 13:02:07 -0800
Subject: [PATCH 70/88] Adding CCC based autoscaler files (#109)

* Adding CCC based autoscaler files

Signed-off-by: pulasthi <pulasthi@google.com>

* adding Readme file

---------

Signed-off-by: pulasthi <pulasthi@google.com>
---
 .../guides/automation/autoscaling/README.md   | 111 ++++++++
 .../autoscaling/automation_launch.sh          | 267 ++++++++++++++++++
 .../autoscaling/check_ccc_resources.sh        |  82 ++++++
 .../autoscaling/create_ccc_templates.sh       |  98 +++++++
 .../automation/autoscaling/job-queue-CCC.yaml |  40 +++
 .../autoscaling/tpu-ccc-template.yaml         |  19 ++
 .../autoscaling/tpu7x-2x2x1-bmm.yaml          |  62 ++++
 .../autoscaling/tpu7x-2x2x1-collectives.yaml  |  64 +++++
 .../autoscaling/tpu7x-2x2x1-gemm.yaml         |  62 ++++
 .../tpu7x-2x2x1-gemm_all_reduce.yaml          |  62 ++++
 .../autoscaling/tpu7x-2x2x1-hbm.yaml          |  62 ++++
 .../autoscaling/tpu7x-2x2x1-host_device.yaml  |  62 ++++
 .../autoscaling/tpu7x-2x2x2-collectives.yaml  |  64 +++++
 .../autoscaling/tpu7x-2x2x4-collectives.yaml  |  64 +++++
 .../autoscaling/tpu7x-2x4x4-collectives.yaml  |  64 +++++
 .../autoscaling/tpu7x-4x4x4-collectives.yaml  |  64 +++++
 .../autoscaling/tpu7x-4x4x8-collectives.yaml  |  64 +++++
 17 files changed, 1311 insertions(+)
 create mode 100644 Ironwood/guides/automation/autoscaling/README.md
 create mode 100755 Ironwood/guides/automation/autoscaling/automation_launch.sh
 create mode 100644 Ironwood/guides/automation/autoscaling/check_ccc_resources.sh
 create mode 100755 Ironwood/guides/automation/autoscaling/create_ccc_templates.sh
 create mode 100644 Ironwood/guides/automation/autoscaling/job-queue-CCC.yaml
 create mode 100644 Ironwood/guides/automation/autoscaling/tpu-ccc-template.yaml
 create mode 100644 Ironwood/guides/automation/autoscaling/tpu7x-2x2x1-bmm.yaml
 create mode 100644 Ironwood/guides/automation/autoscaling/tpu7x-2x2x1-collectives.yaml
 create mode 100644 Ironwood/guides/automation/autoscaling/tpu7x-2x2x1-gemm.yaml
 create mode 100644 Ironwood/guides/automation/autoscaling/tpu7x-2x2x1-gemm_all_reduce.yaml
 create mode 100644 Ironwood/guides/automation/autoscaling/tpu7x-2x2x1-hbm.yaml
 create mode 100644 Ironwood/guides/automation/autoscaling/tpu7x-2x2x1-host_device.yaml
 create mode 100644 Ironwood/guides/automation/autoscaling/tpu7x-2x2x2-collectives.yaml
 create mode 100644 Ironwood/guides/automation/autoscaling/tpu7x-2x2x4-collectives.yaml
 create mode 100644 Ironwood/guides/automation/autoscaling/tpu7x-2x4x4-collectives.yaml
 create mode 100644 Ironwood/guides/automation/autoscaling/tpu7x-4x4x4-collectives.yaml
 create mode 100644 Ironwood/guides/automation/autoscaling/tpu7x-4x4x8-collectives.yaml

diff --git a/Ironwood/guides/automation/autoscaling/README.md b/Ironwood/guides/automation/autoscaling/README.md
new file mode 100644
index 00000000..e8bbde50
--- /dev/null
+++ b/Ironwood/guides/automation/autoscaling/README.md
@@ -0,0 +1,111 @@
+# Ironwood Benchmark Automation With CCC for nodepool creation
+
+This directory contains the automation framework for running TPU microbenchmarks (HBM, Host-Device, Collectives, etc.) on GKE clusters with autoscaling enabled through CCC. The tool simplifies the workflow of launching multiple benchmark jobs via [Kueue](https://kueue.sigs.k8s.io/), monitoring their status, handling retries, and aggregating the final results into a unified format.
+
+The autoscaling version of the script uses CustomComputeClass (CCC) to manage the creation and deletion of the required nodepools automatically based on the workloads.
+
+## Overview
+
+The automation workflow consists of three main stages:
+1.  **Launch**: Submits Kubernetes Jobs for various benchmark configurations (e.g., different topologies like 2x2x1, 2x2x2) using Kueue for queue management.
+2.  **Monitor & Retry**: Watches the jobs until completion. If any job fails, it automatically retries them (up to 3 times by default).
+3.  **Aggregate**: Once all jobs succeed, an aggregator job is launched to collect all intermediate results from GCS and consolidate them into summary TSV files.
+
+## Prerequisites
+
+Before running the automation script, ensure the following requirements are met:
+
+### 1. Environment Setup
+*   **GKE Cluster**: You must have a GKE cluster.
+*   **Kubectl**: Ensure `kubectl` is installed and authenticated to your cluster.
+*   **GCS Bucket**: A Google Cloud Storage bucket is required to store intermediate and final aggregated results.
+    ```bash
+    gcloud storage buckets create gs://my-unique-bucket-name --location=us-central1
+    ```
+
+### 2. Install Kueue
+The automation relies on Kueue for job queuing. Check if it's already installed:
+
+```bash
+kubectl get namespace kueue-system
+```
+
+If you see `Error from server (NotFound)`, install it with:
+
+```bash
+kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.16.0/manifests.yaml
+```
+
+### 3. Verify requirements for CCC
+For CCC to work, the CCC templates for each TPU topology (2x2x1, 2x2x2, etc.) must already exist. If you have not created them yet, run
+`create_ccc_templates.sh` first; the optional pre-flight checks that the launch script prompts for only validate these resources and abort if any are missing.
+
+## Directory Structure
+
+*   `automation_launch.sh`: The main entry point script. Manages the full lifecycle of the benchmark run.
+*   `check_ccc_resources.sh`: Validation script that makes sure all CCC related resources are created.
+*   `create_ccc_templates.sh`: Create the required CCC related resources.
+*   `../aggregator.py`: Python script that downloads results from GCS and produces summary tables.
+*   `../aggregator.yaml`: Kubernetes Job definition for running the aggregator.
+*   `job-queue-CCC.yaml`: Kueue resource definitions (ClusterQueue, LocalQueue).
+*   `*.yaml`: Benchmark job configurations (e.g., `tpu7x-2x2x1-hbm.yaml`).
+
+## Configuration
+
+You can configure the behavior using the following environment variable:
+
+| Variable | Description | Required | Default |
+| :--- | :--- | :--- | :--- |
+| `GCS_BUCKET_ROOT_DIR` | The root GCS path where results will be stored. Must start with `gs://`. | **Yes** | `gs://example-microbenchmark` (Change this!) |
+
+## Usage Guide
+
+1.  **Clone the Repository**:
+    ```bash
+    git clone https://github.com/google/accelerator-microbenchmarks.git
+    cd accelerator-microbenchmarks
+    # Switch to the correct branch if necessary
+    git checkout tpu7x-auto
+    ```
+
+2.  **Set the GCS Bucket**:
+    Export the path to your GCS bucket. This is where all results will be saved.
+    ```bash
+    export GCS_BUCKET_ROOT_DIR="gs://your-unique-bucket-name/benchmark_runs/$(date +%Y%m%d_%H%M%S)"
+    ```
+
+3.  **Run the Automation Script**:
+    Execute the launch script from the root of the repository.
+    ```bash
+    bash Ironwood/guides/automation/autoscaling/automation_launch.sh
+    ```
+
+    **What happens next?**
+    *   If you opt in to the pre-flight checks, it validates that the required CCC resources exist and checks GCS permissions.
+    *   It applies the Kueue job queue.
+    *   It submits the benchmark jobs defined in the script (e.g., HBM tests).
+    *   It waits for jobs to finish, retrying any failures up to 3 times.
+    *   Finally, it launches the `aggregator` job.
+
+## Output
+
+After the automation completes, check your GCS bucket (`GCS_BUCKET_ROOT_DIR`). You will find:
+
+*   **`aggregated_results/`**: Contains the final summary CSV/TSV files (e.g., `hbm.tsv`, `collectives.tsv`).
+*   **`<job-name>/`**: Directories for each individual job containing intermediate results.
+
+## Troubleshooting
+
+### Job Failures
+If jobs fail even after retries:
+1.  Check the script output to see which specific jobs failed.
+2.  Inspect the logs of a failed job using `kubectl logs job/<job-name>`.
+3.  Manually retry a specific job if needed using the command printed by the script at the end of the run.
+
+### Missing Results
+If the `aggregated_results` folder is empty:
+1.  Check the logs of the aggregator job:
+    ```bash
+    kubectl logs job/aggregator
+    ```
+2.  Ensure the `GCS_BUCKET_ROOT_DIR` was accessible by the pods (check Workload Identity or service account permissions if running in a restricted project).
diff --git a/Ironwood/guides/automation/autoscaling/automation_launch.sh b/Ironwood/guides/automation/autoscaling/automation_launch.sh
new file mode 100755
index 00000000..2823a4ef
--- /dev/null
+++ b/Ironwood/guides/automation/autoscaling/automation_launch.sh
@@ -0,0 +1,267 @@
+#!/usr/bin/env bash
+
+######################################################################
+# automation_launch.sh: Run a series of TPU microbenchmark jobs
+######################################################################
+# This script automates the process of launching multiple TPU microbenchmark
+# jobs defined in various YAML files. It handles:
+#   - Pre-flight checks for necessary CCC resources and GCS permissions.
+#   - Applying job YAMLs to a Kubernetes cluster.
+#   - Waiting for jobs to complete, with a timeout.
+#   - Retrying failed jobs up to a configurable number of times.
+#   - Aggregating results using a separate aggregator job.
+#   - Reporting on any jobs that ultimately failed.
+#
+# User-configurable variables are at the top of the script.
+######################################################################
+
+######################################################################
+#                            USER INPUT
+######################################################################
+TIMESTAMP=$(date +%Y-%m-%d_%H-%M-%S)
+export GCS_BUCKET_ROOT_DIR="${GCS_BUCKET_ROOT_DIR:-gs://example-microbenchmark}"  # A value exported by the caller wins; otherwise use the placeholder default (change it!)
+export GCS_SA_NAME="gcs-writer"  # Service account with write access to GCS_BUCKET_ROOT_DIR
+export PROJECT_ID=$(gcloud config get-value project 2>/dev/null)
+MAX_RETRIES=3
+TIMEOUT_SECOND=3600
+# Job manifests to launch; each name is resolved relative to this script's directory.
+yaml_names=(
+    "tpu7x-2x2x1-hbm.yaml"
+    "tpu7x-2x4x4-collectives.yaml"
+    "tpu7x-2x2x1-gemm_all_reduce.yaml"
+)
+
+################################################################################
+# COLOR OUTPUT
+################################################################################
+
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+NC='\033[0m' # No Color
+
+function print_success() {
+    echo -e "${GREEN}✅ $1${NC}"
+}
+
+function print_error() {
+    echo -e "${RED}❌ $1${NC}"
+}
+
+function print_info() {
+    echo -e "${BLUE}ℹ️  $1${NC}"
+}
+
+function print_warning() {
+    echo -e "${YELLOW}⚠️  $1${NC}"
+}
+
+######################################################################
+#                        VALIDATION & SETUP
+######################################################################
+
+if [[ -z "${GCS_BUCKET_ROOT_DIR}" || "${GCS_BUCKET_ROOT_DIR}" != "gs://"* ]]; then
+  print_error "GCS_BUCKET_ROOT_DIR must be set and start with gs://"
+  exit 1
+fi
+
+print_info "The intermediate result will be written to ${GCS_BUCKET_ROOT_DIR}"
+
+read -p "Run pre-flight checks (CCC resource validation & GCS permissions)? (y/n): " run_checks
+
+if [[ "$run_checks" == "y" ]]; then
+  print_info "Running CCC resource validation..."
+  required_topologies=($(printf "%s\n" "${yaml_names[@]}" | grep -oE '[0-9]+x[0-9]+x[0-9]+' | sort -u))
+  SCRIPT_DIR="$(dirname "$(realpath "$0")")"
+  if ! bash "${SCRIPT_DIR}/check_ccc_resources.sh"; then
+    print_error "Some required CCC resources are missing. Please run create_ccc_templates.sh first. Make sure to fill the requierd variables."
+    exit 1
+  fi
+
+  print_info "Running GCS permission check..."
+  export SA_NAME="${GCS_SA_NAME}"
+  export PROJECT_ID="${PROJECT_ID}"
+  if ! bash "${SCRIPT_DIR}/../check_gcs_permissions.sh"; then
+      print_error "GCS Permission Check Failed. Exiting."
+      exit 1
+  fi
+else
+  print_warning "Skipping pre-flight checks."
+fi
+
+SCRIPT_DIR="$(dirname "$(realpath "$0")")"
+kubectl apply -f ${SCRIPT_DIR}/job-queue-CCC.yaml
+
+######################################################################
+#                 LAUNCH JOBS & WAIT FOR COMPLETION
+######################################################################
+
+
+# Poll job $1 every 5 s: return 0 once it is Complete, 1 once it is Failed, or 2 after $2 seconds have elapsed.
+wait_for_job_completion() {
+    local job_name="$1"
+    local timeout="$2"
+    local start_time=$(date +%s)
+    local end_time=$((start_time + timeout))
+
+    while true; do
+        current_time=$(date +%s)
+        if [[ $current_time -gt $end_time ]]; then
+            print_error "Timeout waiting for job ${job_name}"
+            return 2
+        fi
+
+        # Check for Complete condition
+        if kubectl get job "${job_name}" -o jsonpath='{.status.conditions[?(@.type=="Complete")].status}' 2>/dev/null | grep -q "True"; then
+            print_success "Job ${job_name} completed successfully!"
+            return 0
+        fi
+
+        # Check for Failed condition
+        if kubectl get job "${job_name}" -o jsonpath='{.status.conditions[?(@.type=="Failed")].status}' 2>/dev/null | grep -q "True"; then
+            print_error "Job ${job_name} FAILED!"
+            return 1
+        fi
+
+        sleep 5
+    done
+}
+
+# Function to apply jobs and wait for them to complete
+# Returns a list of failed yaml files in the variable FAILED_JOBS
+apply_and_wait() {
+    local yaml_files=("$@")
+    local job_names_in_batch=()
+    FAILED_JOBS=()
+
+    print_info "Processing batch of ${#yaml_files[@]} jobs..."
+
+    # Launch all jobs
+    for yaml_file in "${yaml_files[@]}"; do
+        local filepath="${SCRIPT_DIR}/${yaml_file}"
+        # Derive job name: remove .yaml, lowercase, replace _ with -
+        local job_name=$(basename "${yaml_file}" .yaml | tr '[:upper:]' '[:lower:]' | tr '_' '-')
+        random_suffix=$(head /dev/urandom | tr -dc a-z0-9 | head -c 5)
+        export JOB_NAME="${job_name}-${random_suffix}"
+        export GCS_PATH="${GCS_BUCKET_ROOT_DIR}/${job_name}"
+        
+        print_info "Launching job: ${filepath} (name: ${JOB_NAME})"
+        envsubst '${JOB_NAME} ${GCS_PATH} ${GCS_SA_NAME}' < "${filepath}" | kubectl apply -f -
+        job_names_in_batch+=("${JOB_NAME}")
+    done
+
+    # Monitor jobs
+    local start_time=$(date +%s)
+    local end_time=$((start_time + TIMEOUT_SECOND))
+    local last_print_time=0
+    
+    while true; do
+        local current_time=$(date +%s)
+        if [[ $current_time -gt $end_time ]]; then
+            print_error "Timeout waiting for batch completion"
+            break
+        fi
+
+        # Identify active jobs
+        local active_jobs=()
+        for job_name in "${job_names_in_batch[@]}"; do
+            # Check for Complete
+            if kubectl get job "${job_name}" -o jsonpath='{.status.conditions[?(@.type=="Complete")].status}' 2>/dev/null | grep -q "True"; then
+                continue
+            fi
+            
+            # Check for Failed
+            if kubectl get job "${job_name}" -o jsonpath='{.status.conditions[?(@.type=="Failed")].status}' 2>/dev/null | grep -q "True"; then
+                continue
+            fi
+            
+            # If neither, it's pending/running
+            active_jobs+=("${job_name}")
+        done
+
+        if [[ ${#active_jobs[@]} -eq 0 ]]; then
+            break
+        fi
+
+        # Dashboard View - Print every 60 seconds
+        if [[ $((current_time - last_print_time)) -ge 60 ]]; then
+            print_info "======================================================================"
+            date "+%Y-%m-%d %H:%M:%S"
+            print_info "----------------------------------------------------------------------"
+            kubectl get jobs "${active_jobs[@]}"
+            print_info "======================================================================"
+            last_print_time=$current_time
+        fi
+        
+        sleep 10
+    done
+
+    # Collect results and cleanup
+    FAILED_JOBS=()
+    for i in "${!yaml_files[@]}"; do
+        local yaml_file="${yaml_files[$i]}"
+        local job_name="${job_names_in_batch[$i]}"
+        local filepath="${SCRIPT_DIR}/${yaml_file}"
+        
+        # Check if failed or still running (timeout)
+        if ! kubectl get job "${job_name}" -o jsonpath='{.status.conditions[?(@.type=="Complete")].status}' 2>/dev/null | grep -q "True"; then
+             FAILED_JOBS+=("${yaml_files[$i]}")
+        fi
+        
+        export JOB_NAME="${job_name}"
+        export GCS_PATH="${GCS_BUCKET_ROOT_DIR}/${job_name}"
+        envsubst '${JOB_NAME} ${GCS_PATH}' < "${filepath}" | kubectl delete -f - &> /dev/null
+    done
+}
+
+# Retry loop
+current_batch=("${yaml_names[@]}")
+
+for (( retry=1; retry<=MAX_RETRIES; retry++ )); do
+    apply_and_wait "${current_batch[@]}"
+
+    if [[ ${#FAILED_JOBS[@]} -eq 0 ]]; then
+        print_success "All jobs completed successfully in Round ${retry}!"
+        break
+    fi
+
+    print_error "Round ${retry} finished. ${#FAILED_JOBS[@]} jobs failed."
+    current_batch=("${FAILED_JOBS[@]}")
+
+    if [[ ${retry} -lt ${MAX_RETRIES} ]]; then
+        print_info "Retrying failed jobs..."
+        print_info "========================================"
+        print_info "$((retry + 1)) / ${MAX_RETRIES} max retries"
+        print_info "========================================"
+    else
+        print_error "Max retries reached."
+    fi
+done
+
+echo ""
+print_info "Jobs completed. Aggregating results..."
+echo ""
+
+# Ensure cleanup of any previous aggregator job to avoid immutable field errors
+kubectl delete job aggregator --ignore-not-found=true
+
+envsubst '${GCS_BUCKET_ROOT_DIR} ${GCS_SA_NAME}' < ${SCRIPT_DIR}/../aggregator.yaml | kubectl apply -f -
+wait_for_job_completion "aggregator" ${TIMEOUT_SECOND}
+envsubst '${GCS_BUCKET_ROOT_DIR} ${GCS_SA_NAME}' < ${SCRIPT_DIR}/../aggregator.yaml | kubectl delete -f -
+
+# Print the jobs that still failed after the last round, with a ready-to-paste retry command for each.
+# The command sets the same variables the launch step substitutes (JOB_NAME, GCS_PATH, GCS_SA_NAME).
+if [[ ${#FAILED_JOBS[@]} -gt 0 ]]; then
+    print_error "The following jobs finally failed after ${MAX_RETRIES} rounds:"
+    printf '%s\n' "${FAILED_JOBS[@]}"
+
+    echo -e "\nTo retry manually, run:"
+    for yaml_file in "${FAILED_JOBS[@]}"; do
+        job_name=$(basename "${yaml_file}" .yaml | tr '[:upper:]' '[:lower:]' | tr '_' '-')
+        GCS_PATH="${GCS_BUCKET_ROOT_DIR}/${job_name}"
+        echo "JOB_NAME=\"${job_name}\" GCS_PATH=\"${GCS_PATH}\" GCS_SA_NAME=\"${GCS_SA_NAME}\" envsubst '\${JOB_NAME} \${GCS_PATH} \${GCS_SA_NAME}' < \"${SCRIPT_DIR}/${yaml_file}\" | kubectl apply -f -"
+    done
+else
+    print_success "Success! All jobs finished."
+fi
diff --git a/Ironwood/guides/automation/autoscaling/check_ccc_resources.sh b/Ironwood/guides/automation/autoscaling/check_ccc_resources.sh
new file mode 100644
index 00000000..cb085d66
--- /dev/null
+++ b/Ironwood/guides/automation/autoscaling/check_ccc_resources.sh
@@ -0,0 +1,82 @@
+#!/bin/bash
+
+######################################################################
+# check_ccc_resources.sh: Validate existence of CCC resources
+######################################################################
+# This script checks if the required Google Cloud Compute resource policies
+# and Kubernetes Custom Compute Class (CCC) manifests exist for a given
+# list of TPU topologies.
+#
+# It iterates through the TOPOLOGIES array defined below:
+#   - For multi-host topologies, it verifies the presence of the
+#     expected workload policy using gcloud.
+#   - It checks for the existence of the Custom Compute Class resource
+#     in the Kubernetes cluster using kubectl.
+#
+# The script exits with status 1 if any required resource is missing,
+# and status 0 if all resources are found.
+######################################################################
+
+export TOPOLOGIES=(2x2x1 2x2x2 2x2x4 2x4x4 4x4x4 4x4x8)  # topologies whose resources are checked below
+PROJECT_ID="${PROJECT_ID:-$(gcloud config get-value project 2>/dev/null)}"  # caller's value wins; otherwise the active gcloud project
+export REGION=$(kubectl get nodes -o jsonpath='{.items[0].metadata.labels.topology\.kubernetes\.io/region}')  # region label of the first node kubectl returns
+CLUSTER_NAME=$(kubectl config current-context | cut -d '_' -f 4)  # 4th "_"-separated field; assumes a gke_<project>_<location>_<cluster> context name - TODO confirm
+export RESOURCE_NAME=${CLUSTER_NAME%-gke}  # cluster name with a trailing "-gke" removed; prefix of the workload policy names
+
+################################################################################
+# COLOR OUTPUT
+################################################################################
+
+RED='\033[0;31m'    # used by print_error
+GREEN='\033[0;32m'  # used by print_success
+BLUE='\033[0;34m'   # used by print_info
+NC='\033[0m' # No Color (appended after every message to reset the terminal color)
+
+function print_success() {
+    echo -e "${GREEN}✅ $1${NC}"
+}
+
+function print_error() {
+    echo -e "${RED}❌ $1${NC}"
+}
+
+function print_info() {
+    echo -e "${BLUE}ℹ️  $1${NC}"
+}
+
+print_info "Checking CCC resources for all topologies"
+missing_resources=false
+# Walk every topology and remember whether anything was missing, so the exit status can reflect it.
+for TOPOLOGY in "${TOPOLOGIES[@]}"
+do
+    print_info "Checking resources for topology: ${TOPOLOGY}"
+    # Check workload policy for multi-host topologies (2x2x1 is treated as single-host and has none)
+    if [[ "${TOPOLOGY}" != "2x2x1" ]]; then
+        WORKLOAD_POLICY_NAME="${RESOURCE_NAME}-workload-policy${TOPOLOGY}"
+        if gcloud compute resource-policies describe "${WORKLOAD_POLICY_NAME}" --project="${PROJECT_ID}" --region="${REGION}" &> /dev/null; then
+            print_success "Workload policy ${WORKLOAD_POLICY_NAME} exists."
+        else  # output is discarded, so any describe failure (not only "not found") is reported as missing
+            print_error "Workload policy ${WORKLOAD_POLICY_NAME} is MISSING."
+            missing_resources=true
+        fi
+    else
+        print_info "Skipping workload policy check for single-host topology ${TOPOLOGY}."
+    fi
+
+    # Check Custom Compute Class (name must match metadata.name rendered from tpu-ccc-template.yaml)
+    CCC_NAME="tpuv7-${TOPOLOGY}-class"
+    if kubectl get computeclass "${CCC_NAME}" &> /dev/null; then
+        print_success "Custom Compute Class ${CCC_NAME} exists."
+    else  # as above: any kubectl failure lands here
+        print_error "Custom Compute Class ${CCC_NAME} is MISSING."
+        missing_resources=true
+    fi
+done
+
+if [[ "${missing_resources}" == "true" ]]; then
+    print_error "One or more required resources are missing. Please create them."
+    exit 1
+else
+    print_success "All required CCC resources exist."
+    exit 0
+fi
diff --git a/Ironwood/guides/automation/autoscaling/create_ccc_templates.sh b/Ironwood/guides/automation/autoscaling/create_ccc_templates.sh
new file mode 100755
index 00000000..5630885d
--- /dev/null
+++ b/Ironwood/guides/automation/autoscaling/create_ccc_templates.sh
@@ -0,0 +1,98 @@
+#!/bin/bash
+
+######################################################################
+# create_ccc_templates.sh: Create Custom Compute Class templates
+######################################################################
+# This script creates the necessary Google Cloud Compute resource policies
+# and Kubernetes Custom Compute Class (CCC) manifests for various TPU
+# topologies.
+#
+# It iterates through a predefined list of TOPOLOGIES:
+#   - For multi-host topologies, it creates a HIGH_THROUGHPUT
+#     workload policy if it doesn't already exist.
+#   - It then uses envsubst to populate a template YAML
+#     (tpu-ccc-template.yaml) with the correct TPU_TOPOLOGY,
+#     RESERVATION_NAME, PROJECT_ID, and POLICY_NAME.
+#   - The resulting manifest is applied to the Kubernetes cluster using
+#     kubectl apply.
+#
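+# Variables used by this script:
+#   - RESERVATION_NAME: The name of the GCE reservation to use (see the export below).
+#   - PROJECT_ID: The Google Cloud Project ID; defaults to the active gcloud project.
+#   - REGION: Derived from the first node's topology.kubernetes.io/region label.
+#   - RESOURCE_NAME: Derived from the current kubectl context (cluster name minus "-gke").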
+######################################################################
+
+# Reservation to consume: taken from the environment when set; otherwise replace the placeholder before running.
+export RESERVATION_NAME="${RESERVATION_NAME:-<RESERVATION_NAME>}"
+export TOPOLOGIES=(2x2x1 2x2x2 2x2x4 2x4x4 4x4x4 4x4x8)
+SCRIPT_DIR="$(dirname "$(realpath "$0")")"
+# Exported because envsubst runs as a child process and only sees exported variables.
+export PROJECT_ID="${PROJECT_ID:-$(gcloud config get-value project 2>/dev/null)}"
+export REGION=$(kubectl get nodes -o jsonpath='{.items[0].metadata.labels.topology\.kubernetes\.io/region}')
+CLUSTER_NAME=$(kubectl config current-context | cut -d '_' -f 4)
+export RESOURCE_NAME=${CLUSTER_NAME%-gke} # assumes cluster was created with setup script which creates cluster with ${RESOURCE_NAME}-gke as name
+################################################################################
+# COLOR OUTPUT
+################################################################################
+
+RED='\033[0;31m'     # used by print_error
+GREEN='\033[0;32m'   # used by print_success
+YELLOW='\033[1;33m'  # used by print_warning
+BLUE='\033[0;34m'    # used by print_header and print_info
+NC='\033[0m' # No Color (appended after every message to reset the terminal color)
+
+function print_header() {
+    echo -e "\n${BLUE}========================================${NC}"
+    echo -e "${BLUE}$1${NC}"
+    echo -e "${BLUE}========================================${NC}\n"
+}
+
+function print_success() {
+    echo -e "${GREEN}✅ $1${NC}"
+}
+
+function print_error() {
+    echo -e "${RED}❌ $1${NC}"
+}
+
+function print_warning() {
+    echo -e "${YELLOW}⚠️  $1${NC}"
+}
+
+function print_info() {
+    echo -e "${BLUE}ℹ️  $1${NC}"
+}
+
+print_info "Creating CCC templates for all topoligies"
+# Create workload policy
+for TOPOLOGY in "${TOPOLOGIES[@]}"
+do
+    export TPU_TOPOLOGY=${TOPOLOGY}
+    if [[ "${TOPOLOGY}" == "2x2x1" ]]; then
+        print_warning "Skipping workload policy creation for ${TOPOLOGY} as it is not needed for single host topologies."
+        export POLICY_NAME="" # No policy for single host
+    else
+        WORKLOAD_POLICY_NAME="${RESOURCE_NAME}-workload-policy${TOPOLOGY}"
+        if gcloud compute resource-policies describe ${WORKLOAD_POLICY_NAME} --project=${PROJECT_ID} --region=${REGION} &> /dev/null; then
+            print_info "Workload policy ${WORKLOAD_POLICY_NAME} already exists."
+        else
+            print_info "Creating workload policy ${WORKLOAD_POLICY_NAME}..."
+            gcloud compute resource-policies create workload-policy ${WORKLOAD_POLICY_NAME} \
+    --type HIGH_THROUGHPUT \
+    --accelerator-topology ${TOPOLOGY} \
+    --project ${PROJECT_ID} \
+    --region ${REGION}
+            print_success "Workload policy ${WORKLOAD_POLICY_NAME} created."
+        fi
+        export POLICY_NAME=${WORKLOAD_POLICY_NAME}
+    fi
+
+    echo "${TPU_TOPOLOGY} ${RESERVATION_NAME} ${PROJECT_ID} ${POLICY_NAME}"
+    if [[ "${TOPOLOGY}" == "2x2x1" ]]; then
+        envsubst '${TPU_TOPOLOGY} ${RESERVATION_NAME} ${PROJECT_ID}' < ${SCRIPT_DIR}/tpu-ccc-template.yaml | sed '/placement:/,/policyName:/d' | kubectl apply -f -
+    else
+        envsubst '${TPU_TOPOLOGY} ${RESERVATION_NAME} ${PROJECT_ID} ${POLICY_NAME}' < ${SCRIPT_DIR}/tpu-ccc-template.yaml | kubectl apply -f -
+    fi
+    print_success "Applied TPU Compute Class for ${TOPOLOGY}"
+done
diff --git a/Ironwood/guides/automation/autoscaling/job-queue-CCC.yaml b/Ironwood/guides/automation/autoscaling/job-queue-CCC.yaml
new file mode 100644
index 00000000..c1c155ce
--- /dev/null
+++ b/Ironwood/guides/automation/autoscaling/job-queue-CCC.yaml
@@ -0,0 +1,40 @@
+apiVersion: kueue.x-k8s.io/v1beta2
+kind: ResourceFlavor
+metadata:
+  name: "flavor-tpu7x"  # referenced by the ClusterQueue's flavors list in this file
+spec:
+  nodeLabels:
+    cloud.google.com/gke-tpu-accelerator: tpu7x  # same accelerator label the job manifests use in their nodeSelector
+---
+apiVersion: kueue.x-k8s.io/v1beta2
+kind: ClusterQueue
+metadata:
+  name: cluster-queue-tpu7x  # referenced by the LocalQueue's spec.clusterQueue in this file
+spec:
+  flavorFungibility:
+    whenCanBorrow: MayStopSearch
+    whenCanPreempt: TryNextFlavor
+  namespaceSelector: {}
+  preemption:
+    borrowWithinCohort:
+      policy: Never
+    reclaimWithinCohort: Never
+    withinClusterQueue: LowerPriority
+  queueingStrategy: BestEffortFIFO
+  resourceGroups:
+  - coveredResources:
+    - google.com/tpu  # the only resource the job manifests request
+    flavors:
+    - name: flavor-tpu7x  # the ResourceFlavor declared in this file
+      resources:
+      - name: google.com/tpu
+        nominalQuota: 128  # NOTE(review): 128 = 4*4*8, presumably sized for the largest topology used here - confirm
+  stopPolicy: None
+---
+apiVersion: kueue.x-k8s.io/v1beta1  # NOTE(review): the other two objects in this file use v1beta2 - confirm whether this should match
+kind: LocalQueue
+metadata:
+  namespace: default
+  name: "user-queue-tpu7x"  # the value of the kueue.x-k8s.io/queue-name label on the job manifests
+spec:
+  clusterQueue: "cluster-queue-tpu7x"  # must match the ClusterQueue metadata.name in this file
\ No newline at end of file
diff --git a/Ironwood/guides/automation/autoscaling/tpu-ccc-template.yaml b/Ironwood/guides/automation/autoscaling/tpu-ccc-template.yaml
new file mode 100644
index 00000000..100175f4
--- /dev/null
+++ b/Ironwood/guides/automation/autoscaling/tpu-ccc-template.yaml
@@ -0,0 +1,19 @@
+apiVersion: cloud.google.com/v1
+kind: ComputeClass
+metadata:
+  name: tpuv7-${TPU_TOPOLOGY}-class  # rendered once per topology by create_ccc_templates.sh; jobs select it via cloud.google.com/compute-class
+spec:
+  priorities:
+    - tpu:
+        type: tpu7x
+        topology: ${TPU_TOPOLOGY}
+        count: 4  # NOTE(review): presumably chips per node; the job manifests request google.com/tpu 4 per pod - confirm
+      reservations:
+        specific:
+        - name: ${RESERVATION_NAME}
+          project: ${PROJECT_ID}
+        affinity: Specific
+      placement:  # create_ccc_templates.sh deletes this line and the next one for the single-host 2x2x1 class
+        policyName: ${POLICY_NAME}  # workload policy named <RESOURCE_NAME>-workload-policy<topology> by the create script
+  nodePoolAutoCreation:
+    enabled: true
\ No newline at end of file
diff --git a/Ironwood/guides/automation/autoscaling/tpu7x-2x2x1-bmm.yaml b/Ironwood/guides/automation/autoscaling/tpu7x-2x2x1-bmm.yaml
new file mode 100644
index 00000000..1a4c777e
--- /dev/null
+++ b/Ironwood/guides/automation/autoscaling/tpu7x-2x2x1-bmm.yaml
@@ -0,0 +1,62 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: headless-svc-${JOB_NAME}
+spec:
+  clusterIP: None
+  selector:
+    job-name: ${JOB_NAME}
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: ${JOB_NAME}
+  labels:
+    kueue.x-k8s.io/queue-name: user-queue-tpu7x  # LocalQueue declared in job-queue-CCC.yaml
+spec:
+  completionMode: Indexed
+  suspend: true  # NOTE(review): presumably submitted suspended so Kueue decides when it starts - confirm
+  parallelism: 1  # 1 pod x 4 requested TPU chips = 4 chips (2*2*1)
+  completions: 1
+  backoffLimit: 0  # no pod retries within the Job
+  template:
+    spec:
+      subdomain: headless-svc-${JOB_NAME}
+      serviceAccountName: ${GCS_SA_NAME}
+      restartPolicy: Never
+      nodeSelector:
+        cloud.google.com/compute-class: tpuv7-2x2x1-class
+        cloud.google.com/gke-tpu-accelerator: tpu7x
+        cloud.google.com/gke-tpu-topology: 2x2x1
+      containers:
+      - name: jax-tpu
+        image: python:3.12
+        securityContext:
+          privileged: false
+        env:
+        - name: JAX_PLATFORMS
+          value: "tpu,cpu"
+        - name: TPU_VMODULE
+          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
+        - name: XLA_IR_DEBUG
+          value: "1"
+        - name: XLA_HLO_DEBUG
+          value: "1"
+        command:
+        - bash
+        - -c
+        - |
+          set -ex
+
+          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
+          cd accelerator-microbenchmarks
+          git checkout tpu7x-auto
+          pip install -r requirements.txt
+
+          GCS_BUCKET_DIR=${GCS_PATH}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/bmm/single_device_bmm.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+        resources:
+          requests:
+            google.com/tpu: 4
+          limits:
+            google.com/tpu: 4
\ No newline at end of file
diff --git a/Ironwood/guides/automation/autoscaling/tpu7x-2x2x1-collectives.yaml b/Ironwood/guides/automation/autoscaling/tpu7x-2x2x1-collectives.yaml
new file mode 100644
index 00000000..eb152986
--- /dev/null
+++ b/Ironwood/guides/automation/autoscaling/tpu7x-2x2x1-collectives.yaml
@@ -0,0 +1,64 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: headless-svc-${JOB_NAME}
+spec:
+  clusterIP: None
+  selector:
+    job-name: ${JOB_NAME}
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: ${JOB_NAME}
+  labels:
+    kueue.x-k8s.io/queue-name: user-queue-tpu7x  # LocalQueue declared in job-queue-CCC.yaml
+spec:
+  completionMode: Indexed
+  suspend: true  # NOTE(review): presumably submitted suspended so Kueue decides when it starts - confirm
+  parallelism: 1  # 1 pod x 4 requested TPU chips = 4 chips (2*2*1)
+  completions: 1
+  backoffLimit: 0  # no pod retries within the Job
+  template:
+    spec:
+      subdomain: headless-svc-${JOB_NAME}
+      serviceAccountName: ${GCS_SA_NAME}
+      restartPolicy: Never
+      nodeSelector:
+        cloud.google.com/compute-class: tpuv7-2x2x1-class
+        cloud.google.com/gke-tpu-accelerator: tpu7x
+        cloud.google.com/gke-tpu-topology: 2x2x1
+      containers:
+      - name: jax-tpu
+        image: python:3.12
+        securityContext:
+          privileged: false
+        env:
+        - name: JAX_PLATFORMS
+          value: "tpu,cpu"
+        - name: TPU_VMODULE
+          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
+        - name: XLA_IR_DEBUG
+          value: "1"
+        - name: XLA_HLO_DEBUG
+          value: "1"
+        command:
+        - bash
+        - -c
+        - |
+          set -ex
+
+          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
+          cd accelerator-microbenchmarks
+          git checkout tpu7x-auto
+          pip install -r requirements.txt
+
+          GCS_BUCKET_DIR=${GCS_PATH}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+        resources:
+          requests:
+            google.com/tpu: 4
+          limits:
+            google.com/tpu: 4
\ No newline at end of file
diff --git a/Ironwood/guides/automation/autoscaling/tpu7x-2x2x1-gemm.yaml b/Ironwood/guides/automation/autoscaling/tpu7x-2x2x1-gemm.yaml
new file mode 100644
index 00000000..822a2246
--- /dev/null
+++ b/Ironwood/guides/automation/autoscaling/tpu7x-2x2x1-gemm.yaml
@@ -0,0 +1,62 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: headless-svc-${JOB_NAME}
+spec:
+  clusterIP: None
+  selector:
+    job-name: ${JOB_NAME}
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: ${JOB_NAME}
+  labels:
+    kueue.x-k8s.io/queue-name: user-queue-tpu7x  # LocalQueue declared in job-queue-CCC.yaml
+spec:
+  completionMode: Indexed
+  suspend: true  # NOTE(review): presumably submitted suspended so Kueue decides when it starts - confirm
+  parallelism: 1  # 1 pod x 4 requested TPU chips = 4 chips (2*2*1)
+  completions: 1
+  backoffLimit: 0  # no pod retries within the Job
+  template:
+    spec:
+      subdomain: headless-svc-${JOB_NAME}
+      serviceAccountName: ${GCS_SA_NAME}
+      restartPolicy: Never
+      nodeSelector:
+        cloud.google.com/compute-class: tpuv7-2x2x1-class
+        cloud.google.com/gke-tpu-accelerator: tpu7x
+        cloud.google.com/gke-tpu-topology: 2x2x1
+      containers:
+      - name: jax-tpu
+        image: python:3.12
+        securityContext:
+          privileged: false
+        env:
+        - name: JAX_PLATFORMS
+          value: "tpu,cpu"
+        - name: TPU_VMODULE
+          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
+        - name: XLA_IR_DEBUG
+          value: "1"
+        - name: XLA_HLO_DEBUG
+          value: "1"
+        command:
+        - bash
+        - -c
+        - |
+          set -ex
+
+          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
+          cd accelerator-microbenchmarks
+          git checkout tpu7x-auto
+          pip install -r requirements.txt
+
+          GCS_BUCKET_DIR=${GCS_PATH}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/gemm/gemm_multiple_run_more.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+        resources:
+          requests:
+            google.com/tpu: 4
+          limits:
+            google.com/tpu: 4
\ No newline at end of file
diff --git a/Ironwood/guides/automation/autoscaling/tpu7x-2x2x1-gemm_all_reduce.yaml b/Ironwood/guides/automation/autoscaling/tpu7x-2x2x1-gemm_all_reduce.yaml
new file mode 100644
index 00000000..11b1fce7
--- /dev/null
+++ b/Ironwood/guides/automation/autoscaling/tpu7x-2x2x1-gemm_all_reduce.yaml
@@ -0,0 +1,62 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: headless-svc-${JOB_NAME}
+spec:
+  clusterIP: None
+  selector:
+    job-name: ${JOB_NAME}
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: ${JOB_NAME}
+  labels:
+    kueue.x-k8s.io/queue-name: user-queue-tpu7x  # LocalQueue declared in job-queue-CCC.yaml
+spec:
+  completionMode: Indexed
+  suspend: true  # NOTE(review): presumably submitted suspended so Kueue decides when it starts - confirm
+  parallelism: 1  # 1 pod x 4 requested TPU chips = 4 chips (2*2*1)
+  completions: 1
+  backoffLimit: 0  # no pod retries within the Job
+  template:
+    spec:
+      subdomain: headless-svc-${JOB_NAME}
+      serviceAccountName: ${GCS_SA_NAME}
+      restartPolicy: Never
+      nodeSelector:
+        cloud.google.com/compute-class: tpuv7-2x2x1-class
+        cloud.google.com/gke-tpu-accelerator: tpu7x
+        cloud.google.com/gke-tpu-topology: 2x2x1
+      containers:
+      - name: jax-tpu
+        image: python:3.12
+        securityContext:
+          privileged: false
+        env:
+        - name: JAX_PLATFORMS
+          value: "tpu,cpu"
+        - name: TPU_VMODULE
+          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
+        - name: XLA_IR_DEBUG
+          value: "1"
+        - name: XLA_HLO_DEBUG
+          value: "1"
+        command:
+        - bash
+        - -c
+        - |
+          set -ex
+
+          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
+          cd accelerator-microbenchmarks
+          git checkout tpu7x-auto
+          pip install -r requirements.txt
+
+          GCS_BUCKET_DIR=${GCS_PATH}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/gemm_all_reduce/gemm_all_reduce.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+        resources:
+          requests:
+            google.com/tpu: 4
+          limits:
+            google.com/tpu: 4
diff --git a/Ironwood/guides/automation/autoscaling/tpu7x-2x2x1-hbm.yaml b/Ironwood/guides/automation/autoscaling/tpu7x-2x2x1-hbm.yaml
new file mode 100644
index 00000000..f589cc98
--- /dev/null
+++ b/Ironwood/guides/automation/autoscaling/tpu7x-2x2x1-hbm.yaml
@@ -0,0 +1,62 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: headless-svc-${JOB_NAME}
+spec:
+  clusterIP: None
+  selector:
+    job-name: ${JOB_NAME}
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: ${JOB_NAME}
+  labels:
+    kueue.x-k8s.io/queue-name: user-queue-tpu7x  # LocalQueue declared in job-queue-CCC.yaml
+spec:
+  completionMode: Indexed
+  suspend: true  # NOTE(review): presumably submitted suspended so Kueue decides when it starts - confirm
+  parallelism: 1  # 1 pod x 4 requested TPU chips = 4 chips (2*2*1)
+  completions: 1
+  backoffLimit: 0  # no pod retries within the Job
+  template:
+    spec:
+      subdomain: headless-svc-${JOB_NAME}
+      serviceAccountName: ${GCS_SA_NAME}
+      restartPolicy: Never
+      nodeSelector:
+        cloud.google.com/compute-class: tpuv7-2x2x1-class
+        cloud.google.com/gke-tpu-accelerator: tpu7x
+        cloud.google.com/gke-tpu-topology: 2x2x1
+      containers:
+      - name: jax-tpu
+        image: python:3.12
+        securityContext:
+          privileged: false
+        env:
+        - name: JAX_PLATFORMS
+          value: "tpu,cpu"
+        - name: TPU_VMODULE
+          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
+        - name: XLA_IR_DEBUG
+          value: "1"
+        - name: XLA_HLO_DEBUG
+          value: "1"
+        command:
+        - bash
+        - -c
+        - |
+          set -ex
+
+          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
+          cd accelerator-microbenchmarks
+          git checkout tpu7x-auto
+          pip install -r requirements.txt
+
+          GCS_BUCKET_DIR=${GCS_PATH}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/hbm/hbm.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+        resources:
+          requests:
+            google.com/tpu: 4
+          limits:
+            google.com/tpu: 4
\ No newline at end of file
diff --git a/Ironwood/guides/automation/autoscaling/tpu7x-2x2x1-host_device.yaml b/Ironwood/guides/automation/autoscaling/tpu7x-2x2x1-host_device.yaml
new file mode 100644
index 00000000..bc9c0819
--- /dev/null
+++ b/Ironwood/guides/automation/autoscaling/tpu7x-2x2x1-host_device.yaml
@@ -0,0 +1,62 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: headless-svc-${JOB_NAME}
+spec:
+  clusterIP: None
+  selector:
+    job-name: ${JOB_NAME}
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: ${JOB_NAME}
+  labels:
+    kueue.x-k8s.io/queue-name: user-queue-tpu7x  # LocalQueue declared in job-queue-CCC.yaml
+spec:
+  completionMode: Indexed
+  suspend: true  # NOTE(review): presumably submitted suspended so Kueue decides when it starts - confirm
+  parallelism: 1  # 1 pod x 4 requested TPU chips = 4 chips (2*2*1)
+  completions: 1
+  backoffLimit: 0  # no pod retries within the Job
+  template:
+    spec:
+      subdomain: headless-svc-${JOB_NAME}
+      serviceAccountName: ${GCS_SA_NAME}
+      restartPolicy: Never
+      nodeSelector:
+        cloud.google.com/compute-class: tpuv7-2x2x1-class
+        cloud.google.com/gke-tpu-accelerator: tpu7x
+        cloud.google.com/gke-tpu-topology: 2x2x1
+      containers:
+      - name: jax-tpu
+        image: python:3.12
+        securityContext:
+          privileged: false
+        env:
+        - name: JAX_PLATFORMS
+          value: "tpu,cpu"
+        - name: TPU_VMODULE
+          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
+        - name: XLA_IR_DEBUG
+          value: "1"
+        - name: XLA_HLO_DEBUG
+          value: "1"
+        command:
+        - bash
+        - -c
+        - |
+          set -ex
+
+          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
+          cd accelerator-microbenchmarks
+          git checkout tpu7x-auto
+          pip install -r requirements.txt
+
+          GCS_BUCKET_DIR=${GCS_PATH}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/host_device/host_device.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+        resources:
+          requests:
+            google.com/tpu: 4
+          limits:
+            google.com/tpu: 4
\ No newline at end of file
diff --git a/Ironwood/guides/automation/autoscaling/tpu7x-2x2x2-collectives.yaml b/Ironwood/guides/automation/autoscaling/tpu7x-2x2x2-collectives.yaml
new file mode 100644
index 00000000..7915ff28
--- /dev/null
+++ b/Ironwood/guides/automation/autoscaling/tpu7x-2x2x2-collectives.yaml
@@ -0,0 +1,64 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: headless-svc-${JOB_NAME}
+spec:
+  clusterIP: None
+  selector:
+    job-name: ${JOB_NAME}
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: ${JOB_NAME}
+  labels:
+    kueue.x-k8s.io/queue-name: user-queue-tpu7x  # LocalQueue declared in job-queue-CCC.yaml
+spec:
+  completionMode: Indexed
+  suspend: true  # NOTE(review): presumably submitted suspended so Kueue decides when it starts - confirm
+  parallelism: 2  # 2 pods x 4 requested TPU chips = 8 chips (2*2*2)
+  completions: 2
+  backoffLimit: 0  # no pod retries within the Job
+  template:
+    spec:
+      subdomain: headless-svc-${JOB_NAME}
+      serviceAccountName: ${GCS_SA_NAME}
+      restartPolicy: Never
+      nodeSelector:
+        cloud.google.com/compute-class: tpuv7-2x2x2-class
+        cloud.google.com/gke-tpu-accelerator: tpu7x
+        cloud.google.com/gke-tpu-topology: 2x2x2
+      containers:
+      - name: jax-tpu
+        image: python:3.12
+        securityContext:
+          privileged: false
+        env:
+        - name: JAX_PLATFORMS
+          value: "tpu,cpu"
+        - name: TPU_VMODULE
+          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
+        - name: XLA_IR_DEBUG
+          value: "1"
+        - name: XLA_HLO_DEBUG
+          value: "1"
+        command:
+        - bash
+        - -c
+        - |
+          set -ex
+
+          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
+          cd accelerator-microbenchmarks
+          git checkout tpu7x-auto
+          pip install -r requirements.txt
+
+          GCS_BUCKET_DIR=${GCS_PATH}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+        resources:
+          requests:
+            google.com/tpu: 4
+          limits:
+            google.com/tpu: 4
\ No newline at end of file
diff --git a/Ironwood/guides/automation/autoscaling/tpu7x-2x2x4-collectives.yaml b/Ironwood/guides/automation/autoscaling/tpu7x-2x2x4-collectives.yaml
new file mode 100644
index 00000000..60282962
--- /dev/null
+++ b/Ironwood/guides/automation/autoscaling/tpu7x-2x2x4-collectives.yaml
@@ -0,0 +1,64 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: headless-svc-${JOB_NAME}
+spec:
+  clusterIP: None
+  selector:
+    job-name: ${JOB_NAME}
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: ${JOB_NAME}
+  labels:
+    kueue.x-k8s.io/queue-name: user-queue-tpu7x  # LocalQueue declared in job-queue-CCC.yaml
+spec:
+  completionMode: Indexed
+  suspend: true  # NOTE(review): presumably submitted suspended so Kueue decides when it starts - confirm
+  parallelism: 4  # 4 pods x 4 requested TPU chips = 16 chips (2*2*4)
+  completions: 4
+  backoffLimit: 0  # no pod retries within the Job
+  template:
+    spec:
+      subdomain: headless-svc-${JOB_NAME}
+      serviceAccountName: ${GCS_SA_NAME}
+      restartPolicy: Never
+      nodeSelector:
+        cloud.google.com/compute-class: tpuv7-2x2x4-class
+        cloud.google.com/gke-tpu-accelerator: tpu7x
+        cloud.google.com/gke-tpu-topology: 2x2x4
+      containers:
+      - name: jax-tpu
+        image: python:3.12
+        securityContext:
+          privileged: false
+        env:
+        - name: JAX_PLATFORMS
+          value: "tpu,cpu"
+        - name: TPU_VMODULE
+          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
+        - name: XLA_IR_DEBUG
+          value: "1"
+        - name: XLA_HLO_DEBUG
+          value: "1"
+        command:
+        - bash
+        - -c
+        - |
+          set -ex
+
+          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
+          cd accelerator-microbenchmarks
+          git checkout tpu7x-auto
+          pip install -r requirements.txt
+
+          GCS_BUCKET_DIR=${GCS_PATH}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+        resources:
+          requests:
+            google.com/tpu: 4
+          limits:
+            google.com/tpu: 4
\ No newline at end of file
diff --git a/Ironwood/guides/automation/autoscaling/tpu7x-2x4x4-collectives.yaml b/Ironwood/guides/automation/autoscaling/tpu7x-2x4x4-collectives.yaml
new file mode 100644
index 00000000..343bbf01
--- /dev/null
+++ b/Ironwood/guides/automation/autoscaling/tpu7x-2x4x4-collectives.yaml
@@ -0,0 +1,64 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: headless-svc-${JOB_NAME}
+spec:
+  clusterIP: None
+  selector:
+    job-name: ${JOB_NAME}
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: ${JOB_NAME}
+  labels:
+    kueue.x-k8s.io/queue-name: user-queue-tpu7x  # LocalQueue declared in job-queue-CCC.yaml
+spec:
+  completionMode: Indexed
+  suspend: true  # NOTE(review): presumably submitted suspended so Kueue decides when it starts - confirm
+  parallelism: 8  # 8 pods x 4 requested TPU chips = 32 chips (2*4*4)
+  completions: 8
+  backoffLimit: 0  # no pod retries within the Job
+  template:
+    spec:
+      subdomain: headless-svc-${JOB_NAME}
+      serviceAccountName: ${GCS_SA_NAME}
+      restartPolicy: Never
+      nodeSelector:
+        cloud.google.com/compute-class: tpuv7-2x4x4-class
+        cloud.google.com/gke-tpu-accelerator: tpu7x
+        cloud.google.com/gke-tpu-topology: 2x4x4
+      containers:
+      - name: jax-tpu
+        image: python:3.12
+        securityContext:
+          privileged: false
+        env:
+        - name: JAX_PLATFORMS
+          value: "tpu,cpu"
+        - name: TPU_VMODULE
+          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
+        - name: XLA_IR_DEBUG
+          value: "1"
+        - name: XLA_HLO_DEBUG
+          value: "1"
+        command:
+        - bash
+        - -c
+        - |
+          set -ex
+
+          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
+          cd accelerator-microbenchmarks
+          git checkout tpu7x-auto
+          pip install -r requirements.txt
+
+          GCS_BUCKET_DIR=${GCS_PATH}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+        resources:
+          requests:
+            google.com/tpu: 4
+          limits:
+            google.com/tpu: 4
\ No newline at end of file
diff --git a/Ironwood/guides/automation/autoscaling/tpu7x-4x4x4-collectives.yaml b/Ironwood/guides/automation/autoscaling/tpu7x-4x4x4-collectives.yaml
new file mode 100644
index 00000000..23f0fb3a
--- /dev/null
+++ b/Ironwood/guides/automation/autoscaling/tpu7x-4x4x4-collectives.yaml
@@ -0,0 +1,64 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: headless-svc-${JOB_NAME}
+spec:
+  clusterIP: None
+  selector:
+    job-name: ${JOB_NAME}
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: ${JOB_NAME}
+  labels:
+    kueue.x-k8s.io/queue-name: user-queue-tpu7x  # LocalQueue declared in job-queue-CCC.yaml
+spec:
+  completionMode: Indexed
+  suspend: true  # NOTE(review): presumably submitted suspended so Kueue decides when it starts - confirm
+  parallelism: 16  # 16 pods x 4 requested TPU chips = 64 chips (4*4*4)
+  completions: 16
+  backoffLimit: 0  # no pod retries within the Job
+  template:
+    spec:
+      subdomain: headless-svc-${JOB_NAME}
+      serviceAccountName: ${GCS_SA_NAME}
+      restartPolicy: Never
+      nodeSelector:
+        cloud.google.com/compute-class: tpuv7-4x4x4-class
+        cloud.google.com/gke-tpu-accelerator: tpu7x
+        cloud.google.com/gke-tpu-topology: 4x4x4
+      containers:
+      - name: jax-tpu
+        image: python:3.12
+        securityContext:
+          privileged: false
+        env:
+        - name: JAX_PLATFORMS
+          value: "tpu,cpu"
+        - name: TPU_VMODULE
+          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
+        - name: XLA_IR_DEBUG
+          value: "1"
+        - name: XLA_HLO_DEBUG
+          value: "1"
+        command:
+        - bash
+        - -c
+        - |
+          set -ex
+
+          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
+          cd accelerator-microbenchmarks
+          git checkout tpu7x-auto
+          pip install -r requirements.txt
+
+          GCS_BUCKET_DIR=${GCS_PATH}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+        resources:
+          requests:
+            google.com/tpu: 4
+          limits:
+            google.com/tpu: 4
\ No newline at end of file
diff --git a/Ironwood/guides/automation/autoscaling/tpu7x-4x4x8-collectives.yaml b/Ironwood/guides/automation/autoscaling/tpu7x-4x4x8-collectives.yaml
new file mode 100644
index 00000000..25655ca5
--- /dev/null
+++ b/Ironwood/guides/automation/autoscaling/tpu7x-4x4x8-collectives.yaml
@@ -0,0 +1,64 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: headless-svc-${JOB_NAME}
+spec:
+  clusterIP: None
+  selector:
+    job-name: ${JOB_NAME}
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: ${JOB_NAME}
+  labels:
+    kueue.x-k8s.io/queue-name: user-queue-tpu7x
+spec:
+  completionMode: Indexed
+  suspend: true
+  parallelism: 32
+  completions: 32
+  backoffLimit: 0
+  template:
+    spec:
+      subdomain: headless-svc-${JOB_NAME}
+      serviceAccountName: ${GCS_SA_NAME}
+      restartPolicy: Never
+      nodeSelector:
+        cloud.google.com/compute-class: tpuv7-4x4x8-class
+        cloud.google.com/gke-tpu-accelerator: tpu7x
+        cloud.google.com/gke-tpu-topology: 4x4x8
+      containers:
+      - name: jax-tpu
+        image: python:3.12
+        securityContext:
+          privileged: false
+        env:
+        - name: JAX_PLATFORMS
+          value: "tpu,cpu"
+        - name: TPU_VMODULE
+          value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10"
+        - name: XLA_IR_DEBUG
+          value: "1"
+        - name: XLA_HLO_DEBUG
+          value: "1"
+        command:
+        - bash
+        - -c
+        - |
+          set -ex
+
+          git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
+          cd accelerator-microbenchmarks
+          git checkout tpu7x-auto
+          pip install -r requirements.txt
+
+          GCS_BUCKET_DIR=${GCS_PATH}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+        resources:
+          requests:
+            google.com/tpu: 4
+          limits:
+            google.com/tpu: 4
\ No newline at end of file

From 1629d32d7fef47c18521f8b6cfe5c2c3b4946fe4 Mon Sep 17 00:00:00 2001
From: Pulasthi Supun <pulasthi@google.com>
Date: Fri, 20 Feb 2026 15:58:28 -0800
Subject: [PATCH 71/88] adding all benchmarks to automation script (#114)

---
 .../automation/autoscaling/automation_launch.sh      | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/Ironwood/guides/automation/autoscaling/automation_launch.sh b/Ironwood/guides/automation/autoscaling/automation_launch.sh
index 2823a4ef..bcd041a1 100755
--- a/Ironwood/guides/automation/autoscaling/automation_launch.sh
+++ b/Ironwood/guides/automation/autoscaling/automation_launch.sh
@@ -19,16 +19,24 @@
 #                            USER INPUT
 ######################################################################
 TIMESTAMP=$(date +%Y-%m-%d_%H-%M-%S)
-export GCS_BUCKET_ROOT_DIR="gs://pulasthi-ccc-testb1/test5"
+export GCS_BUCKET_ROOT_DIR="gs://<BUCKET_NAME>/<FOLDER_NAME>"
 export GCS_SA_NAME="gcs-writer"  # Service account with write access to GCS_BUCKET_ROOT_DIR
 export PROJECT_ID=$(gcloud config get-value project 2>/dev/null)
 MAX_RETRIES=3
 TIMEOUT_SECOND=3600
 
 yaml_names=(
+    "tpu7x-2x2x1-bmm.yaml"
+    "tpu7x-2x2x1-collectives.yaml"
+    "tpu7x-2x2x1-gemm.yaml"
+    "tpu7x-2x2x1-gemm_all_reduce.yaml"
     "tpu7x-2x2x1-hbm.yaml"
+    "tpu7x-2x2x1-host_device.yaml"
+    "tpu7x-2x2x2-collectives.yaml"
+    "tpu7x-2x2x4-collectives.yaml"
     "tpu7x-2x4x4-collectives.yaml"
-    "tpu7x-2x2x1-gemm_all_reduce.yaml"
+    "tpu7x-4x4x4-collectives.yaml"
+    "tpu7x-4x4x8-collectives.yaml"
 )
 
 ################################################################################

From 5885a2889b0635827b3e007151d7761f6515feab Mon Sep 17 00:00:00 2001
From: "Yu-Hsuan (Amy) Lin" <amylin@google.com>
Date: Wed, 25 Feb 2026 08:41:32 +0000
Subject: [PATCH 72/88] Add missing 8192 gemm

---
 Ironwood/configs/gemm/gemm_multiple_run_more.yaml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/Ironwood/configs/gemm/gemm_multiple_run_more.yaml b/Ironwood/configs/gemm/gemm_multiple_run_more.yaml
index ea89f98b..022a8285 100644
--- a/Ironwood/configs/gemm/gemm_multiple_run_more.yaml
+++ b/Ironwood/configs/gemm/gemm_multiple_run_more.yaml
@@ -11,6 +11,7 @@ benchmarks:
   - {m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'bfloat16'}
   - {m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'bfloat16'}
   - {m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'bfloat16'}
+  - {m: 8192, k: 8192, n: 8192, num_runs: 100, dtype: 'bfloat16'}
   - {m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'bfloat16'}
   - {m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'bfloat16'}
 
@@ -26,6 +27,7 @@ benchmarks:
   - {m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'float32'}
   - {m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'float32'}
   - {m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'float32'}
+  - {m: 8192, k: 8192, n: 8192, num_runs: 100, dtype: 'float32'}
   - {m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float32'}
   - {m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float32'}
 
@@ -41,6 +43,7 @@ benchmarks:
   - {m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'float16'}
   - {m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'float16'}
   - {m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'float16'}
+  - {m: 8192, k: 8192, n: 8192, num_runs: 100, dtype: 'float16'}
   - {m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float16'}
   - {m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float16'}
 
@@ -56,6 +59,7 @@ benchmarks:
   - {m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'float8'}
   - {m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'float8'}
   - {m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'float8'}
+  - {m: 8192, k: 8192, n: 8192, num_runs: 100, dtype: 'float8'}
   - {m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float8'}
   - {m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float8'}
 
@@ -71,5 +75,6 @@ benchmarks:
   - {m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'float4'}
   - {m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'float4'}
   - {m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'float4'}
+  - {m: 8192, k: 8192, n: 8192, num_runs: 100, dtype: 'float4'}
   - {m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float4'}
   - {m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float4'}
\ No newline at end of file

From 4a284030faa42e89b7b7d458c850f2eb845af50c Mon Sep 17 00:00:00 2001
From: "Amy (Yu-Hsuan) Lin" <amylin@google.com>
Date: Tue, 3 Mar 2026 16:11:20 +0800
Subject: [PATCH 73/88] Remove peak flops for fp32, which is unspecified in
 spec (#117)

---
 Ironwood/src/benchmark_bmm.py             |  4 ++-
 Ironwood/src/benchmark_gemm.py            |  4 +--
 Ironwood/src/benchmark_gemm_all_reduce.py |  2 +-
 Ironwood/src/benchmark_utils.py           | 32 ++++++++++++++---------
 4 files changed, 26 insertions(+), 16 deletions(-)

diff --git a/Ironwood/src/benchmark_bmm.py b/Ironwood/src/benchmark_bmm.py
index f988008e..486c1996 100644
--- a/Ironwood/src/benchmark_bmm.py
+++ b/Ironwood/src/benchmark_bmm.py
@@ -216,6 +216,8 @@ def multi_host_bmm_calculate_metrics(
     total_flops, total_flops_all_devices = handle_based_on_sharding(
         total_flops, sharding_strategy
     )
+    peak_flops_multiplier = get_peak_flops_multiplier(dtype.dtype.name)
+    peak_flops = PEAK_FLOPS_PER_DEVICE * peak_flops_multiplier if peak_flops_multiplier is not None else None
     return unified_flops_metrics(
         m,
         n,
@@ -223,7 +225,7 @@ def multi_host_bmm_calculate_metrics(
         time_ms_list,
         total_flops,
         total_flops_all_devices,
-        PEAK_FLOPS_PER_DEVICE,
+        peak_flops,
         dtype=dtype.dtype.name,
         b=b,
     )
diff --git a/Ironwood/src/benchmark_gemm.py b/Ironwood/src/benchmark_gemm.py
index 422b1bc4..4d4b8ccc 100644
--- a/Ironwood/src/benchmark_gemm.py
+++ b/Ironwood/src/benchmark_gemm.py
@@ -147,7 +147,7 @@ def gemm_multiple_run_calculate_metrics(
         total_flops, SHARDING_STRATEGY
     )
     peak_flops_multiplier = get_peak_flops_multiplier(dtype.dtype.name)
-    peak_flops = PEAK_FLOPS_PER_DEVICE * peak_flops_multiplier
+    peak_flops = PEAK_FLOPS_PER_DEVICE * peak_flops_multiplier if peak_flops_multiplier is not None else None
     return unified_flops_metrics(
         m,
         n,
@@ -332,7 +332,7 @@ def gemm_simple_with_dtype_calculate_metrics(
     metadata, metrics = unified_flops_metrics(
             m, n, k, time_ms_list,
             total_flops, total_flops_all_devices,
-            PEAK_FLOPS_PER_DEVICE * peak_flops_multiplier)
+            PEAK_FLOPS_PER_DEVICE * peak_flops_multiplier if peak_flops_multiplier is not None else None)
 
     # Add dtype info to metadata for logging
     metadata["in_dtype"] = in_dtype_str
diff --git a/Ironwood/src/benchmark_gemm_all_reduce.py b/Ironwood/src/benchmark_gemm_all_reduce.py
index 55744593..11d9dc7b 100644
--- a/Ironwood/src/benchmark_gemm_all_reduce.py
+++ b/Ironwood/src/benchmark_gemm_all_reduce.py
@@ -200,7 +200,7 @@ def _calculate_metrics_base(
 
     dtype_str = dtype.dtype.name
     peak_flops_multiplier = get_peak_flops_multiplier(dtype_str)
-    peak_flops = PEAK_FLOPS_PER_DEVICE * peak_flops_multiplier
+    peak_flops = PEAK_FLOPS_PER_DEVICE * peak_flops_multiplier if peak_flops_multiplier is not None else None
 
     return unified_flops_metrics(
         m, n, k, time_ms_list, total_flops_per_device, total_flops_all_devices, peak_flops, dtype=dtype_str,
diff --git a/Ironwood/src/benchmark_utils.py b/Ironwood/src/benchmark_utils.py
index 0a45678d..aa111cba 100644
--- a/Ironwood/src/benchmark_utils.py
+++ b/Ironwood/src/benchmark_utils.py
@@ -1140,7 +1140,7 @@ def unified_flops_metrics(
     time_ms_list: list[float],
     total_flops: int,
     total_flops_all_devices: int,
-    peak_TFLOPS_per_device: float,
+    peak_TFLOPS_per_device: float | None = None,
     dtype: str = None,
     b: int = None,
 ) -> Dict[str, Any]:
@@ -1158,10 +1158,19 @@ def unified_flops_metrics(
         total_flops_all_devices / average_time_s / 10**12
         for average_time_s in average_time_s_list
     ]
-    mfu = [
-        tflops_per_sec / peak_TFLOPS_per_device
-        for tflops_per_sec in tflops_per_sec_list
-    ]
+    if peak_TFLOPS_per_device is not None:
+        mfu = [
+            tflops_per_sec / peak_TFLOPS_per_device
+            for tflops_per_sec in tflops_per_sec_list
+        ]
+        mfu_statistics = MetricsStatistics(metrics_list=mfu, metrics_name="MFU")
+        mfu_val = f"{mfu_statistics.statistics['p50']:.2%}"
+        mfu_raw = mfu_statistics.statistics["p50"]
+    else:
+        mfu_statistics = None
+        mfu_val = "N/A"
+        mfu_raw = "N/A"
+        
     average_time_ms_statistics = MetricsStatistics(
         metrics_list=time_ms_list, metrics_name="step_time_ms"
     )
@@ -1171,14 +1180,13 @@ def unified_flops_metrics(
     tflops_per_sec_all_devices_statistics = MetricsStatistics(
         metrics_list=tflops_per_sec_all_devices, metrics_name="tflops_per_sec"
     )
-    mfu_statistics = MetricsStatistics(metrics_list=mfu, metrics_name="MFU")
     dtype_prefix = f"[{dtype}] " if dtype is not None else ""
     print(
         f"{dtype_prefix}"
         f"Total floating-point ops: {total_flops}, Step Time (median): {average_time_ms_statistics.statistics['p50']:.2f}, "
         f"Throughput (median): {tflops_per_sec_statistics.statistics['p50']:.2f} TFLOP / second / device, "
         f"TotalThroughput (median): {tflops_per_sec_all_devices_statistics.statistics['p50']:.2f} TFLOP / second, "
-        f"MFU: {mfu_statistics.statistics['p50']:.2%}"
+        f"MFU: {mfu_val}"
     )
     # print()
     # time_ms_list =
@@ -1193,7 +1201,7 @@ def unified_flops_metrics(
             "TotalThroughput(median,TFLOP/s)": tflops_per_sec_all_devices_statistics.statistics[
                 "p50"
             ],
-            "MFU": mfu_statistics.statistics["p50"],
+            "MFU": mfu_raw,
             "total_flops": total_flops,
             # "all_time_ms_list":  f"{json.dumps(time_ms_list)}",
         }
@@ -1201,7 +1209,8 @@ def unified_flops_metrics(
     metrics.update(average_time_ms_statistics.serialize_statistics())
     metrics.update(tflops_per_sec_statistics.serialize_statistics())
     metrics.update(tflops_per_sec_all_devices_statistics.serialize_statistics())
-    metrics.update(mfu_statistics.serialize_statistics())
+    if mfu_statistics is not None:
+        metrics.update(mfu_statistics.serialize_statistics())
     metrics = {key: value for key, value in metrics.items() if value is not None}
     return metadata, metrics
 
@@ -1286,7 +1295,7 @@ def str_to_dtype(dtype_str: str) -> jnp.dtype:
     else:
         raise ValueError(f"Unsupported dtype string: {dtype_str}")
 
-def get_peak_flops_multiplier(in_dtype_str: str) -> float:
+def get_peak_flops_multiplier(in_dtype_str: str) -> float | None:
     """
     Returns the peak FLOPS multiplier relative to the baseline
     (PEAK_FLOPS_PER_DEVICE) based on the input data type.
@@ -1301,8 +1310,7 @@ def get_peak_flops_multiplier(in_dtype_str: str) -> float:
         # BF16/FP16 is 2x slower than FP8 peak
         return 0.5
     elif in_dtype_lower in ("fp32", "float32"):
-        # FP32 is 4x slower than FP8 peak
-        return 0.25
+        return None
     elif in_dtype_lower in ("fp4", "float4_e2m1fn"):
         # FP4/INT4 is treated the same as FP8
         return 1.0

From a495fd631578cc3653b1b02c62dc2a352884b7a1 Mon Sep 17 00:00:00 2001
From: "Yu-Hsuan (Amy) Lin" <amylin@google.com>
Date: Fri, 6 Mar 2026 07:31:16 +0000
Subject: [PATCH 74/88] Increase sweeping range for all reduce

---
 Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml
index 93deef9f..8d5ccd90 100644
--- a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml
+++ b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml
@@ -1,7 +1,7 @@
 benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 20}
+  - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 20}
   trace_dir: "../microbenchmarks/psum_tpu7x_4x4x4"
   csv_path: "../microbenchmarks/psum_tpu7x_4x4x4"
   xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_4x4x4"

From c378bdbafb2e1e62c98189f9ceb0f64e2bcfe2c3 Mon Sep 17 00:00:00 2001
From: "Yu-Hsuan (Amy) Lin" <amylin@google.com>
Date: Fri, 6 Mar 2026 10:04:02 +0000
Subject: [PATCH 75/88] Extend configs for gemm and collectives

---
 .../configs/collectives/all_to_all_tpu7x_4x4x4.yaml  |  2 +-
 Ironwood/configs/gemm/gemm_multiple_run_more.yaml    | 12 +++++++++++-
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml
index 5b627a9a..bf30ef9d 100644
--- a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml
+++ b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml
@@ -1,7 +1,7 @@
 benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 20}
+  - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 20}
   trace_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x4"
   csv_path: "../microbenchmarks/all_to_all_tpu7x_4x4x4"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x4"
diff --git a/Ironwood/configs/gemm/gemm_multiple_run_more.yaml b/Ironwood/configs/gemm/gemm_multiple_run_more.yaml
index 022a8285..7b68aa5b 100644
--- a/Ironwood/configs/gemm/gemm_multiple_run_more.yaml
+++ b/Ironwood/configs/gemm/gemm_multiple_run_more.yaml
@@ -14,6 +14,8 @@ benchmarks:
   - {m: 8192, k: 8192, n: 8192, num_runs: 100, dtype: 'bfloat16'}
   - {m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'bfloat16'}
   - {m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'bfloat16'}
+  - {m: 32768, k: 4096, n: 4096, num_runs: 100, dtype: 'bfloat16'}
+  - {m: 32768, k: 8192, n: 8192, num_runs: 100, dtype: 'bfloat16'}
 
 - benchmark_name: "gemm_multiple_run"
   trace_dir: "../microbenchmarks/gemm_multiple_run_f32"
@@ -30,6 +32,8 @@ benchmarks:
   - {m: 8192, k: 8192, n: 8192, num_runs: 100, dtype: 'float32'}
   - {m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float32'}
   - {m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float32'}
+  - {m: 32768, k: 4096, n: 4096, num_runs: 100, dtype: 'float32'}
+  - {m: 32768, k: 8192, n: 8192, num_runs: 100, dtype: 'float32'}
 
 - benchmark_name: "gemm_multiple_run"
   trace_dir: "../microbenchmarks/gemm_multiple_run_fp16"
@@ -46,6 +50,8 @@ benchmarks:
   - {m: 8192, k: 8192, n: 8192, num_runs: 100, dtype: 'float16'}
   - {m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float16'}
   - {m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float16'}
+  - {m: 32768, k: 4096, n: 4096, num_runs: 100, dtype: 'float16'}
+  - {m: 32768, k: 8192, n: 8192, num_runs: 100, dtype: 'float16'}
 
 - benchmark_name: "gemm_multiple_run"
   trace_dir: "../microbenchmarks/gemm_multiple_run_fp8"
@@ -62,6 +68,8 @@ benchmarks:
   - {m: 8192, k: 8192, n: 8192, num_runs: 100, dtype: 'float8'}
   - {m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float8'}
   - {m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float8'}
+  - {m: 32768, k: 4096, n: 4096, num_runs: 100, dtype: 'float8'}
+  - {m: 32768, k: 8192, n: 8192, num_runs: 100, dtype: 'float8'}
 
 - benchmark_name: "gemm_multiple_run"
   trace_dir: "../microbenchmarks/gemm_multiple_run_fp4"
@@ -77,4 +85,6 @@ benchmarks:
   - {m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'float4'}
   - {m: 8192, k: 8192, n: 8192, num_runs: 100, dtype: 'float4'}
   - {m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float4'}
-  - {m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float4'}
\ No newline at end of file
+  - {m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float4'}
+  - {m: 32768, k: 4096, n: 4096, num_runs: 100, dtype: 'float4'}
+  - {m: 32768, k: 8192, n: 8192, num_runs: 100, dtype: 'float4'}
\ No newline at end of file

From dc795d9482258661de0b7aa869d8f7dfc89cc05f Mon Sep 17 00:00:00 2001
From: "Yu-Hsuan (Amy) Lin" <amylin@google.com>
Date: Fri, 6 Mar 2026 11:38:32 +0000
Subject: [PATCH 76/88] Extend configs for gemm and collectives

---
 Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml | 2 ++
 Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml | 2 ++
 Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml | 2 ++
 3 files changed, 6 insertions(+)

diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml
index ceb7bb52..a4dba8ff 100644
--- a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml
+++ b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml
@@ -2,6 +2,8 @@ benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
   - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 20}
+  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "bfloat16",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 20}
+  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float8",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 20}
   trace_dir: "../microbenchmarks/all_gather_tpu7x_4x4x4"
   csv_path: "../microbenchmarks/all_gather_tpu7x_4x4x4"
   xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_4x4x4"
diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml
index 8d5ccd90..d88ffa33 100644
--- a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml
+++ b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml
@@ -2,6 +2,8 @@ benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
   - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 20}
+  - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "bfloat16",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 20}
+  - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float8",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 20}
   trace_dir: "../microbenchmarks/psum_tpu7x_4x4x4"
   csv_path: "../microbenchmarks/psum_tpu7x_4x4x4"
   xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_4x4x4"
diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml
index bf30ef9d..23e5e4fa 100644
--- a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml
+++ b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml
@@ -2,6 +2,8 @@ benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
   - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 20}
+  - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "bfloat16",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 20}
+  - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float8",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 20}
   trace_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x4"
   csv_path: "../microbenchmarks/all_to_all_tpu7x_4x4x4"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x4"

From 55fa0eaacf1fc945d48bc52f6bb021e465c6bcc8 Mon Sep 17 00:00:00 2001
From: "Yu-Hsuan (Amy) Lin" <amylin@google.com>
Date: Fri, 6 Mar 2026 12:15:40 +0000
Subject: [PATCH 77/88] Fix collectives aggregator for multi dtypes

---
 Ironwood/guides/automation/aggregator.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/Ironwood/guides/automation/aggregator.py b/Ironwood/guides/automation/aggregator.py
index 5ff4ce99..8e5f6751 100644
--- a/Ironwood/guides/automation/aggregator.py
+++ b/Ironwood/guides/automation/aggregator.py
@@ -71,10 +71,11 @@ def aggregate_collectives(directories: list[str], picked_columns: list[str]) ->
         return None
     aggregated_df = pd.DataFrame()
     for directory in directories:
-        file = glob.glob(f"{directory}/*.tsv")[0]
-        df = pd.read_csv(file, sep='\t')
-        df["topology"] = [file.split('/')[-4].split('-')[1] for _ in range(df.shape[0])]
-        aggregated_df = pd.concat([aggregated_df, df[picked_columns].rename(columns={"step_time_ms_num_runs": "num_runs"})], ignore_index=True)
+        files = glob.glob(f"{directory}/*.tsv")
+        for file in files:
+            df = pd.read_csv(file, sep='\t')
+            df["topology"] = [file.split('/')[-4].split('-')[1] for _ in range(df.shape[0])]
+            aggregated_df = pd.concat([aggregated_df, df[picked_columns].rename(columns={"step_time_ms_num_runs": "num_runs"})], ignore_index=True)
     return aggregated_df
 
 def aggregate_hbm(directories: list[str], picked_columns: list[str]) -> pd.DataFrame:

From cb56a4309ec6bfd9794bc4e869d5d75f2a83f25a Mon Sep 17 00:00:00 2001
From: "Yu-Hsuan (Amy) Lin" <amylin@google.com>
Date: Fri, 6 Mar 2026 13:18:38 +0000
Subject: [PATCH 78/88] Address too many events issue

---
 Ironwood/src/benchmark_utils.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/Ironwood/src/benchmark_utils.py b/Ironwood/src/benchmark_utils.py
index aa111cba..78718aad 100644
--- a/Ironwood/src/benchmark_utils.py
+++ b/Ironwood/src/benchmark_utils.py
@@ -171,7 +171,13 @@ def multiple_iteration_timeit_from_trace(
     if trace_dir and not is_local_directory_path(trace_dir):
         tmp_trace_dir = f"{LOCAL_TRACE_DIR}/{trace_name}"
     # data_args = data_generator()
-    with jax.profiler.trace(tmp_trace_dir):
+    options = jax.profiler.ProfileOptions()
+    options.advanced_configuration = {
+        "tpu_trace_mode" : "TRACE_ONLY_XLA",
+        "tpu_num_sparse_cores_to_trace": 0,
+        "tpu_num_sparse_core_tiles_to_trace": 0,
+    }
+    with jax.profiler.trace(tmp_trace_dir, profiler_options=options):
         for i in range(tries):
             if i % 10 == 0:
                 print(f"[{task}] Running iteration {i} of {tries} with {matrix_dim}...")

From dd61804c62b876b76173436f4347c9a38f2f75cc Mon Sep 17 00:00:00 2001
From: "Yu-Hsuan (Amy) Lin" <amylin@google.com>
Date: Mon, 9 Mar 2026 08:05:47 +0000
Subject: [PATCH 79/88] Use larger transferring size

Test out larger matrix

Test out larger matrix
---
 Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml | 4 +++-
 Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml | 4 +++-
 Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml | 4 +++-
 Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml | 4 +++-
 Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml | 4 +++-
 Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml | 4 +++-
 Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml | 4 +++-
 Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml | 4 +++-
 Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml | 4 +++-
 Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml | 4 +++-
 Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml | 4 +++-
 Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml | 4 +++-
 Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml | 4 +++-
 Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml | 4 +++-
 Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml | 4 +++-
 15 files changed, 45 insertions(+), 15 deletions(-)

diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml
index 5b11ac8d..7f7e4b52 100644
--- a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml
+++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml
@@ -1,7 +1,9 @@
 benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 20}
+  - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 20}
+  - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "bfloat16",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 20}
+  - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float8",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 20}
   trace_dir: "../microbenchmarks/all_gather_tpu7x_2x2x1"
   csv_path: "../microbenchmarks/all_gather_tpu7x_2x2x1"
   xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x2x1"
diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml
index 3747b754..481d312e 100644
--- a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml
+++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml
@@ -1,7 +1,9 @@
 benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 20}
+  - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 20}
+  - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "bfloat16",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 20}
+  - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float8",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 20}
   trace_dir: "../microbenchmarks/all_gather_tpu7x_2x2x2"
   csv_path: "../microbenchmarks/all_gather_tpu7x_2x2x2"
   xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x2x2"
diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml
index 9c25eb6b..59046a73 100644
--- a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml
+++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml
@@ -1,7 +1,9 @@
 benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 20}
+  - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 20}
+  - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "bfloat16",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 20}
+  - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float8",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 20}
   trace_dir: "../microbenchmarks/all_gather_tpu7x_2x2x4"
   csv_path: "../microbenchmarks/all_gather_tpu7x_2x2x4"
   xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x2x4"
diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml
index 0ad03f56..0b917e88 100644
--- a/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml
+++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml
@@ -1,7 +1,9 @@
 benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 20}
+  - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 20}
+  - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "bfloat16",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 20}
+  - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float8",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 20}
   trace_dir: "../microbenchmarks/all_gather_tpu7x_2x4x4"
   csv_path: "../microbenchmarks/all_gather_tpu7x_2x4x4"
   xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x4x4"
diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml
index 0218d6b0..7688fc33 100644
--- a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml
+++ b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml
@@ -1,7 +1,9 @@
 benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 20}
+  - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 20}
+  - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "bfloat16",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 20}
+  - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float8",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 20}
   trace_dir: "../microbenchmarks/all_gather_tpu7x_4x4x8"
   csv_path: "../microbenchmarks/all_gather_tpu7x_4x4x8"
   xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_4x4x8"
diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml
index dbeb0407..030bfeea 100644
--- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml
+++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml
@@ -1,7 +1,9 @@
 benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 20}
+  - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 20}
+  - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "bfloat16",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 20}
+  - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float8",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 20}
   trace_dir: "../microbenchmarks/psum_tpu7x_2x2x1"
   csv_path: "../microbenchmarks/psum_tpu7x_2x2x1"
   xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x2x1"
diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml
index cca20bc2..6cc52c7c 100644
--- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml
+++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml
@@ -1,7 +1,9 @@
 benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 20}
+  - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 20}
+  - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "bfloat16",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 20}
+  - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float8",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 20}
   trace_dir: "../microbenchmarks/psum_tpu7x_2x2x2"
   csv_path: "../microbenchmarks/psum_tpu7x_2x2x2"
   xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x2x2"
diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml
index 1cb29b11..c7dfdb51 100644
--- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml
+++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml
@@ -1,7 +1,9 @@
 benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 20}
+  - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 20}
+  - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "bfloat16",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 20}
+  - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float8",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 20}
   trace_dir: "../microbenchmarks/psum_tpu7x_2x2x4"
   csv_path: "../microbenchmarks/psum_tpu7x_2x2x4"
   xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x2x4"
diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml
index 8366350a..73d437d2 100644
--- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml
+++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml
@@ -1,7 +1,9 @@
 benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 20}
+  - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 20}
+  - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "bfloat16",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 20}
+  - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float8",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 20}
   trace_dir: "../microbenchmarks/psum_tpu7x_2x4x4"
   csv_path: "../microbenchmarks/psum_tpu7x_2x4x4"
   xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x4x4"
diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml
index 7b629828..78f4d0a2 100644
--- a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml
+++ b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml
@@ -1,7 +1,9 @@
 benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 20}
+  - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 20}
+  - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "bfloat16",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 20}
+  - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float8",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 20}
   trace_dir: "../microbenchmarks/psum_tpu7x_4x4x8"
   csv_path: "../microbenchmarks/psum_tpu7x_4x4x8"
   xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_4x4x8"
diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml
index 42dcf9e1..2a5f453b 100644
--- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml
+++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml
@@ -1,7 +1,9 @@
 benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 20}
+  - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 20}
+  - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "bfloat16",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 20}
+  - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float8",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 20}
   trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x1"
   csv_path: "../microbenchmarks/all_to_all_tpu7x_2x2x1"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x1"
diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml
index 5b1bbb82..f3de48ff 100644
--- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml
+++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml
@@ -1,7 +1,9 @@
 benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 20}
+  - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 20}
+  - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "bfloat16",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 20}
+  - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float8",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 20}
   trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x2"
   csv_path: "../microbenchmarks/all_to_all_tpu7x_2x2x2"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x2"
diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml
index f6004ce2..586397d7 100644
--- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml
+++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml
@@ -1,7 +1,9 @@
 benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 20}
+  - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 20}
+  - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "bfloat16",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 20}
+  - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float8",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 20}
   trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x4"
   csv_path: "../microbenchmarks/all_to_all_tpu7x_2x2x4"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x4"
diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml
index a0e16a92..f8b990ab 100644
--- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml
+++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml
@@ -1,7 +1,9 @@
 benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 20}
+  - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 20}
+  - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "bfloat16",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 20}
+  - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float8",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 20}
   trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x4x4"
   csv_path: "../microbenchmarks/all_to_all_tpu7x_2x4x4"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x4x4"
diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml
index 65742f12..5cdd637d 100644
--- a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml
+++ b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml
@@ -1,7 +1,9 @@
 benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 20}
+  - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 20}
+  - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "bfloat16",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 20}
+  - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float8",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 20}
   trace_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x8"
   csv_path: "../microbenchmarks/all_to_all_tpu7x_4x4x8"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x8"

From f924a7ee2902dff49300c95e0fc446c4b6bade08 Mon Sep 17 00:00:00 2001
From: "Yu-Hsuan (Amy) Lin" <amylin@google.com>
Date: Mon, 16 Mar 2026 13:27:01 +0000
Subject: [PATCH 80/88] Optimize H2D/D2H transfer pipelines and add
 comprehensive benchmark configs

---
 .../comprehensive_8dev_experiments.yaml       |  9 ++++
 .../comprehensive_experiments.yaml            |  9 ++++
 Ironwood/src/benchmark_host_device.py         | 50 ++++++++++++-------
 Ironwood/src/run_benchmark.py                 |  2 +-
 4 files changed, 52 insertions(+), 18 deletions(-)
 create mode 100644 Ironwood/configs/host_device/comprehensive_8dev_experiments.yaml
 create mode 100644 Ironwood/configs/host_device/comprehensive_experiments.yaml

diff --git a/Ironwood/configs/host_device/comprehensive_8dev_experiments.yaml b/Ironwood/configs/host_device/comprehensive_8dev_experiments.yaml
new file mode 100644
index 00000000..ea3a2320
--- /dev/null
+++ b/Ironwood/configs/host_device/comprehensive_8dev_experiments.yaml
@@ -0,0 +1,9 @@
+benchmarks:
+- benchmark_name: host_device
+  num_runs: 20
+  benchmark_sweep_params:
+  - transfer_type_list: ["pinned_memory", "simple", "pipelined"]
+    num_devices_list: [8]
+    data_size_mib_list: [1, 16, 128, 256, 512, 1024, 2048, 4096, 8192, 16384]
+    input_type_list: ["numpy", "jax"]
+  csv_path: "/tmp/microbenchmarks/host_device_numactl"
diff --git a/Ironwood/configs/host_device/comprehensive_experiments.yaml b/Ironwood/configs/host_device/comprehensive_experiments.yaml
new file mode 100644
index 00000000..842ea1f4
--- /dev/null
+++ b/Ironwood/configs/host_device/comprehensive_experiments.yaml
@@ -0,0 +1,9 @@
+benchmarks:
+- benchmark_name: host_device
+  num_runs: 20
+  benchmark_sweep_params:
+  - transfer_type_list: ["pinned_memory", "simple", "pipelined"]
+    num_devices_list: [1, 2, 8]
+    data_size_mib_list: [1, 16, 128, 256, 512, 1024, 2048, 4096, 8192, 16384]
+    input_type_list: ["numpy", "jax"]
+  csv_path: "/tmp/microbenchmarks/final_experiments"
diff --git a/Ironwood/src/benchmark_host_device.py b/Ironwood/src/benchmark_host_device.py
index 3c1a5621..d1c3481f 100644
--- a/Ironwood/src/benchmark_host_device.py
+++ b/Ironwood/src/benchmark_host_device.py
@@ -32,7 +32,7 @@ def __init__(self, trace_dir: str = None):
         self.d2h_perf = []
 
     @abc.abstractmethod
-    def setup(self, data_size_mib: int, host_data: np.ndarray, devices: List[jax.Device]):
+    def setup(self, data_size_mib: int, host_data: np.ndarray, num_devices: int):
         """Perform one-time setup before the benchmark loop."""
         pass
 
@@ -55,12 +55,15 @@ def teardown(self):
 class SimpleTransfer(TransferStrategy):
     """Simple device_put/device_get strategy."""
 
-    def setup(self, data_size_mib: int, host_data: np.ndarray):
-        pass
+    def setup(self, data_size_mib: int, host_data: np.ndarray, num_devices: int):
+        target_devices = jax.devices()[:num_devices]
+        self.mesh = Mesh(target_devices, ('x',))
+        self.partition_spec = PartitionSpec('x')
+        self.sharding = NamedSharding(self.mesh, self.partition_spec)
 
     def run_h2d(self, host_data: np.ndarray, i: int) -> Any:
         t0 = time.perf_counter()
-        device_array = jax.device_put(host_data)
+        device_array = jax.device_put(host_data, self.sharding)
         device_array.block_until_ready()
         t1 = time.perf_counter()
         
@@ -71,7 +74,8 @@ def run_h2d(self, host_data: np.ndarray, i: int) -> Any:
 
     def run_d2h(self, device_data: Any, i: int):
         t2 = time.perf_counter()
-        _ = jax.device_get(device_data)
+        # Retrieving addressable shards natively supports parallelism
+        _ = jax.device_get([s.data for s in device_data.addressable_shards])
         t3 = time.perf_counter()
         self.d2h_perf.append((t3 - t2) * 1000)
         device_data.delete()
@@ -83,9 +87,9 @@ def teardown(self):
 class PipelinedTransfer(TransferStrategy):
     """Pipelined transfer using chunking."""
 
-    def setup(self, data_size_mib: int, host_data: np.ndarray):
+    def setup(self, data_size_mib: int, host_data: np.ndarray, num_devices: int):
         self.target_chunk_size_mib = 16
-        num_devices_to_perform_h2d = 2
+        num_devices_to_perform_h2d = num_devices
         self.target_devices = jax.devices()[:num_devices_to_perform_h2d]
         self.num_devices = len(self.target_devices)
         
@@ -139,8 +143,8 @@ def teardown(self):
 class PinnedMemoryTransfer(TransferStrategy):
     """Pinned memory host-to-device with parallelized device-to-host transfer."""
 
-    def setup(self, data_size_mib: int, host_data: np.ndarray):
-        num_devices_to_perform_h2d = 2
+    def setup(self, data_size_mib: int, host_data: np.ndarray, num_devices: int):
+        num_devices_to_perform_h2d = num_devices
         target_devices = jax.devices()[:num_devices_to_perform_h2d]
         
         mesh = Mesh(target_devices, ('x',))
@@ -177,6 +181,8 @@ def teardown(self):
 def benchmark_host_device(
     data_size_mib: int,
     transfer_type: str,
+    num_devices: int,
+    input_type: str,
     num_runs: int = 100,
     trace_dir: str = None,
 ) -> Dict[str, Any]:
@@ -186,7 +192,14 @@ def benchmark_host_device(
     
     # Allocate Host Source Buffer
     column = 128
-    host_data = np.random.normal(size=(num_elements // column, column)).astype(np.float32)
+    np_data = np.random.normal(size=(num_elements // column, column)).astype(np.float32)
+    
+    if input_type == "numpy":
+        host_data = np_data
+    elif input_type == "jax":
+        host_data = jax.device_put(np_data, jax.devices("cpu")[0])
+    else:
+        raise ValueError(f"Unknown input_type: {input_type}")
 
     print(
         f"Benchmarking Transfer with Data Size: {data_size_mib} MB for {num_runs} iterations with {transfer_type=}",
@@ -203,7 +216,7 @@ def benchmark_host_device(
         raise ValueError(f"Unknown transfer_type: {transfer_type}. Available: {list(strategies.keys())}")
 
     strategy = strategies[transfer_type](trace_dir)
-    strategy.setup(data_size_mib, host_data)
+    strategy.setup(data_size_mib, host_data, num_devices)
 
     # Profiling Context
     import contextlib
@@ -215,11 +228,10 @@ def benchmark_host_device(
     with profiler_context:
         # Warmup
         for _ in range(2):
-            device_array = jax.device_put(host_data)
-            device_array.block_until_ready()
-            host_out = np.array(device_array)
-            device_array.delete()
-            del host_out
+            device_array = strategy.run_h2d(host_data, -1)
+            strategy.run_d2h(device_array, -1)
+        strategy.h2d_perf.clear()
+        strategy.d2h_perf.clear()
 
         for i in range(num_runs):
             # Step Context
@@ -242,6 +254,8 @@ def benchmark_host_device(
 def benchmark_host_device_calculate_metrics(
     data_size_mib: int,
     transfer_type: str,
+    num_devices: int,
+    input_type: str,
     H2D_Bandwidth_ms: List[float],
     D2H_Bandwidth_ms: List[float],
 ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
@@ -251,10 +265,12 @@ def benchmark_host_device_calculate_metrics(
     # Filter out list params from metadata to avoid explosion
     metadata_keys = {
         "data_size_mib", 
+        "transfer_type",
+        "num_devices",
+        "input_type",
     }
     metadata = {k: v for k, v in params if k in metadata_keys}
     metadata["dtype"] = "float32"
-    metadata["transfer_type"] = transfer_type
     
     metrics = {}
     
diff --git a/Ironwood/src/run_benchmark.py b/Ironwood/src/run_benchmark.py
index 2d10db5d..aef590e8 100644
--- a/Ironwood/src/run_benchmark.py
+++ b/Ironwood/src/run_benchmark.py
@@ -410,7 +410,7 @@ def run_single_benchmark(benchmark_config: Dict[str, Any], output_path: str, gcs
     for id, benchmark_param in enumerate(benchmark_params):
         original_benchmark_param = copy.deepcopy(benchmark_param)
         benchmark_param = preprocess_benchmark_param(
-            benchmark_param, trace_dir=os.path.join(trace_dir, f"benchmark_{id}")
+            benchmark_param, trace_dir=os.path.join(trace_dir, f"benchmark_{id}") if trace_dir else None
         )
         print(f"Running benchmark: {benchmark_name} with params: {benchmark_param}")
         test_start_time = (

From 7439b2a3cf0df6d2a7d8c472bfc18127efd48b03 Mon Sep 17 00:00:00 2001
From: "Yu-Hsuan (Amy) Lin" <amylin@google.com>
Date: Mon, 16 Mar 2026 13:33:47 +0000
Subject: [PATCH 81/88] Add benchmark guide and run script

---
 .../guides/host_device_benchmark/README.md    | 39 +++++++++++++++++++
 .../run_comprehensive.sh                      | 15 +++++++
 2 files changed, 54 insertions(+)
 create mode 100644 Ironwood/guides/host_device_benchmark/README.md
 create mode 100755 Ironwood/guides/host_device_benchmark/run_comprehensive.sh

diff --git a/Ironwood/guides/host_device_benchmark/README.md b/Ironwood/guides/host_device_benchmark/README.md
new file mode 100644
index 00000000..cbac7b11
--- /dev/null
+++ b/Ironwood/guides/host_device_benchmark/README.md
@@ -0,0 +1,39 @@
+# Host-Device Benchmark Guide
+
+This directory contains instructions and a script to run comprehensive Host-to-Device (H2D) and Device-to-Host (D2H) microbenchmarks on Cloud TPUs.
+
+## Overview
+
+The benchmarks measure the transfer bandwidth for various configurations:
+- **Transfer Strategies**: `simple`, `pipelined`, `pinned_memory`
+- **Data Sizes**: Ranging from 1 MiB to 16,384 MiB.
+- **Input Types**: The host source buffer is supplied either as a `numpy` array or as a `jax` array pre-allocated on the CPU device.
+- **Device Counts**: Scaling across 1, 2, and 8 TPU devices.
+- **NUMA Settings**: Testing the impact of `--interleave=all` with `numactl`.
+
+## How to Run
+
+A convenience script `run_comprehensive.sh` is provided. It executes two suites of tests sequentially from the `Ironwood/` directory:
+
+1. **Comprehensive Suite**: Sweeps through all transfer strategies, data sizes, input types, and device configurations (1, 2, and 8 devices) without any specific NUMA configuration.
+2. **8-Device NUMA Suite**: Executes an 8-device-specific sweep with `numactl --interleave=all` enforced at the process level to balance memory allocations across NUMA nodes, which substantially alleviates the pipelined D2H transfer bottleneck on multi-chip architectures.
+
+### Execution
+
+Run the script on your TPU VM from this directory (the second suite requires `numactl` to be installed):
+
+```bash
+bash run_comprehensive.sh
+```
+
+## Configuration Files
+
+The executed configurations are located at:
+- `Ironwood/configs/host_device/comprehensive_experiments.yaml`
+- `Ironwood/configs/host_device/comprehensive_8dev_experiments.yaml`
+
+Edit these files to adjust the swept parameters or the `csv_path` output directory.
+
+## Analyzing Results
+
+Results are exported as timestamped TSV files to the `csv_path` directory specified in each configuration, and per-benchmark statistics are also printed to the console. You can use simple Pandas scripts to analyze the TSV files, for example to extract the maximum P50 bandwidths.
diff --git a/Ironwood/guides/host_device_benchmark/run_comprehensive.sh b/Ironwood/guides/host_device_benchmark/run_comprehensive.sh
new file mode 100755
index 00000000..a63af1b7
--- /dev/null
+++ b/Ironwood/guides/host_device_benchmark/run_comprehensive.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+set -e
+
+# Change to the root of the Ironwood directory assuming this script is run from anywhere
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+IRONWOOD_DIR="$(dirname $(dirname "$DIR"))"
+cd "$IRONWOOD_DIR"
+
+echo "Running comprehensive benchmarks across 1, 2, and 8 devices..."
+python3 src/run_benchmark.py --config configs/host_device/comprehensive_experiments.yaml
+
+echo "Running 8-device benchmark with numactl interleaving..."
+numactl --interleave=all python3 src/run_benchmark.py --config configs/host_device/comprehensive_8dev_experiments.yaml
+
+echo "Benchmarks completed successfully."

From 38ec530fa47714f2af1da7c11c0ef31588b5c6dc Mon Sep 17 00:00:00 2001
From: "Yu-Hsuan (Amy) Lin" <amylin@google.com>
Date: Wed, 13 May 2026 12:28:09 +0000
Subject: [PATCH 82/88] Allow sweeping dtype in host_device benchmarks

---
 .../host_device/comprehensive_8dev_experiments.yaml      | 1 +
 .../configs/host_device/comprehensive_experiments.yaml   | 1 +
 Ironwood/src/benchmark_host_device.py                    | 9 ++++++---
 3 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/Ironwood/configs/host_device/comprehensive_8dev_experiments.yaml b/Ironwood/configs/host_device/comprehensive_8dev_experiments.yaml
index ea3a2320..e15bcc4f 100644
--- a/Ironwood/configs/host_device/comprehensive_8dev_experiments.yaml
+++ b/Ironwood/configs/host_device/comprehensive_8dev_experiments.yaml
@@ -6,4 +6,5 @@ benchmarks:
     num_devices_list: [8]
     data_size_mib_list: [1, 16, 128, 256, 512, 1024, 2048, 4096, 8192, 16384]
     input_type_list: ["numpy", "jax"]
+    dtype_list: ["float32"]
   csv_path: "/tmp/microbenchmarks/host_device_numactl"
diff --git a/Ironwood/configs/host_device/comprehensive_experiments.yaml b/Ironwood/configs/host_device/comprehensive_experiments.yaml
index 842ea1f4..7974bc61 100644
--- a/Ironwood/configs/host_device/comprehensive_experiments.yaml
+++ b/Ironwood/configs/host_device/comprehensive_experiments.yaml
@@ -6,4 +6,5 @@ benchmarks:
     num_devices_list: [1, 2, 8]
     data_size_mib_list: [1, 16, 128, 256, 512, 1024, 2048, 4096, 8192, 16384]
     input_type_list: ["numpy", "jax"]
+    dtype_list: ["float32"]
   csv_path: "/tmp/microbenchmarks/final_experiments"
diff --git a/Ironwood/src/benchmark_host_device.py b/Ironwood/src/benchmark_host_device.py
index d1c3481f..8b6966b6 100644
--- a/Ironwood/src/benchmark_host_device.py
+++ b/Ironwood/src/benchmark_host_device.py
@@ -183,16 +183,18 @@ def benchmark_host_device(
     transfer_type: str,
     num_devices: int,
     input_type: str,
+    dtype: jnp.dtype = jnp.float32,
     num_runs: int = 100,
     trace_dir: str = None,
 ) -> Dict[str, Any]:
     """Benchmarks H2D/D2H transfer using device_put/device_get."""
     
-    num_elements = 1024 * 1024 * data_size_mib // np.dtype(np.float32).itemsize
+    normalized_dtype = jnp.dtype(dtype)
+    num_elements = 1024 * 1024 * data_size_mib // normalized_dtype.itemsize
     
     # Allocate Host Source Buffer
     column = 128
-    np_data = np.random.normal(size=(num_elements // column, column)).astype(np.float32)
+    np_data = np.random.normal(size=(num_elements // column, column)).astype(normalized_dtype)
     
     if input_type == "numpy":
         host_data = np_data
@@ -258,6 +260,7 @@ def benchmark_host_device_calculate_metrics(
     input_type: str,
     H2D_Bandwidth_ms: List[float],
     D2H_Bandwidth_ms: List[float],
+    dtype: jnp.dtype = jnp.float32,
 ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
     """Calculates metrics for Host-Device transfer."""
     params = locals().items()
@@ -270,7 +273,7 @@ def benchmark_host_device_calculate_metrics(
         "input_type",
     }
     metadata = {k: v for k, v in params if k in metadata_keys}
-    metadata["dtype"] = "float32"
+    metadata["dtype"] = jnp.dtype(dtype).name
     
     metrics = {}
     

From aa1e67c37f21a2f1796a242db311a660c0bbbc72 Mon Sep 17 00:00:00 2001
From: Daniel Wu <wudani@google.com>
Date: Wed, 20 May 2026 06:23:25 +0000
Subject: [PATCH 83/88] Added sample variance as a metric for h2dd2h and
 increased the num_runs from 20 to 100.

---
 .../host_device/comprehensive_experiments.yaml     |  2 +-
 Ironwood/src/benchmark_host_device.py              | 14 ++++++++++++++
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/Ironwood/configs/host_device/comprehensive_experiments.yaml b/Ironwood/configs/host_device/comprehensive_experiments.yaml
index 7974bc61..70329dac 100644
--- a/Ironwood/configs/host_device/comprehensive_experiments.yaml
+++ b/Ironwood/configs/host_device/comprehensive_experiments.yaml
@@ -1,6 +1,6 @@
 benchmarks:
 - benchmark_name: host_device
-  num_runs: 20
+  num_runs: 100
   benchmark_sweep_params:
   - transfer_type_list: ["pinned_memory", "simple", "pipelined"]
     num_devices_list: [1, 2, 8]
diff --git a/Ironwood/src/benchmark_host_device.py b/Ironwood/src/benchmark_host_device.py
index 8b6966b6..6aa88451 100644
--- a/Ironwood/src/benchmark_host_device.py
+++ b/Ironwood/src/benchmark_host_device.py
@@ -291,6 +291,20 @@ def add_metric(name, ms_list):
         )
         metrics.update(stats_bw.serialize_statistics())
 
+        if len(bw_list) > 1:
+            bw_array = np.array(bw_list)
+            sample_variance = np.var(bw_array, ddof=1)
+
+            metrics[f"{name}_bw (GiB/s)_sample_variance"] = sample_variance
+
+            print(
+                f"  {name}_bw (GiB/s) Sample Variance: {sample_variance:.4e}", 
+                flush=True
+            )
+        elif len(bw_list) == 1:
+            print(f"  {name}_bw (GiB/s): Only one sample, variance cannot be calculated.", flush=True)
+            metrics[f"{name}_bw_variance_GiBs"] = 0.0
+
     add_metric("H2D", H2D_Bandwidth_ms)
     add_metric("D2H", D2H_Bandwidth_ms)
 

From ac83feed5e9464f6cdc8add29ff6d1e5e23c0309 Mon Sep 17 00:00:00 2001
From: Daniel Wu <wudani@google.com>
Date: Wed, 20 May 2026 08:38:30 +0000
Subject: [PATCH 84/88] Triggering CLA recheck


From f302e987bbe3d3f8b173b9e14b51ea5753bd0293 Mon Sep 17 00:00:00 2001
From: Daniel Wu <wudani@google.com>
Date: Wed, 20 May 2026 08:46:00 +0000
Subject: [PATCH 85/88] Triggering CLA recheck 2


From 9e88b7be5eb1837c87164c0fbfdf62a8103917ef Mon Sep 17 00:00:00 2001
From: Daniel Wu <wudani@google.com>
Date: Thu, 21 May 2026 06:54:33 +0000
Subject: [PATCH 86/88] shorten sample_variance as variance

---
 Ironwood/src/benchmark_host_device.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Ironwood/src/benchmark_host_device.py b/Ironwood/src/benchmark_host_device.py
index 6aa88451..5e1e4fbb 100644
--- a/Ironwood/src/benchmark_host_device.py
+++ b/Ironwood/src/benchmark_host_device.py
@@ -295,7 +295,7 @@ def add_metric(name, ms_list):
             bw_array = np.array(bw_list)
             sample_variance = np.var(bw_array, ddof=1)
 
-            metrics[f"{name}_bw (GiB/s)_sample_variance"] = sample_variance
+            metrics[f"{name}_bw (GiB/s)_variance"] = sample_variance
 
             print(
                 f"  {name}_bw (GiB/s) Sample Variance: {sample_variance:.4e}", 

From c8eca6f6857ad550ceb153b72deff46b0750b863 Mon Sep 17 00:00:00 2001
From: Daniel Wu <wudani@google.com>
Date: Fri, 22 May 2026 00:29:19 +0000
Subject: [PATCH 87/88] check if the variance is nan and set the value to zero

---
 Ironwood/src/benchmark_host_device.py | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/Ironwood/src/benchmark_host_device.py b/Ironwood/src/benchmark_host_device.py
index 5e1e4fbb..83a62aa8 100644
--- a/Ironwood/src/benchmark_host_device.py
+++ b/Ironwood/src/benchmark_host_device.py
@@ -291,19 +291,17 @@ def add_metric(name, ms_list):
         )
         metrics.update(stats_bw.serialize_statistics())
 
-        if len(bw_list) > 1:
-            bw_array = np.array(bw_list)
-            sample_variance = np.var(bw_array, ddof=1)
-
-            metrics[f"{name}_bw (GiB/s)_variance"] = sample_variance
-
-            print(
-                f"  {name}_bw (GiB/s) Sample Variance: {sample_variance:.4e}", 
-                flush=True
-            )
-        elif len(bw_list) == 1:
-            print(f"  {name}_bw (GiB/s): Only one sample, variance cannot be calculated.", flush=True)
-            metrics[f"{name}_bw_variance_GiBs"] = 0.0
+        bw_array = np.array(bw_list)
+        sample_variance = np.var(bw_array, ddof=1)
+        if np.isnan(sample_variance):
+            sample_variance = 0.0
+
+        metrics[f"{name}_bw (GiB/s)_variance"] = sample_variance
+
+        print(
+            f"  {name}_bw (GiB/s) Sample Variance: {sample_variance:.4e}", 
+            flush=True
+        )
 
     add_metric("H2D", H2D_Bandwidth_ms)
     add_metric("D2H", D2H_Bandwidth_ms)

From 07fc9b3aef4ae6ad868a31f2aa1424f5a7c5a53f Mon Sep 17 00:00:00 2001
From: Daniel Wu <wudani@google.com>
Date: Mon, 25 May 2026 00:57:49 +0000
Subject: [PATCH 88/88] Updated comprehensive_8dev_experiments.yaml from 20 to
 100 num runs

---
 .../configs/host_device/comprehensive_8dev_experiments.yaml     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Ironwood/configs/host_device/comprehensive_8dev_experiments.yaml b/Ironwood/configs/host_device/comprehensive_8dev_experiments.yaml
index e15bcc4f..d3403b10 100644
--- a/Ironwood/configs/host_device/comprehensive_8dev_experiments.yaml
+++ b/Ironwood/configs/host_device/comprehensive_8dev_experiments.yaml
@@ -1,6 +1,6 @@
 benchmarks:
 - benchmark_name: host_device
-  num_runs: 20
+  num_runs: 100
   benchmark_sweep_params:
   - transfer_type_list: ["pinned_memory", "simple", "pipelined"]
     num_devices_list: [8]