From e2f2a81c7259e839be857f40f6cf1f4d71e2c3d1 Mon Sep 17 00:00:00 2001 From: Rahul Anand Sharma Date: Wed, 28 Jan 2026 23:18:26 -0800 Subject: [PATCH 01/88] fix gemm timing logic (#92) --- Ironwood/src/benchmark_gemm.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/Ironwood/src/benchmark_gemm.py b/Ironwood/src/benchmark_gemm.py index 99564490..b802ddc0 100644 --- a/Ironwood/src/benchmark_gemm.py +++ b/Ironwood/src/benchmark_gemm.py @@ -213,6 +213,8 @@ def data_generator(): return (lhs_device, rhs_device) # Run the benchmark + num_runs = 1 + ## Need to fix gemm timing logic to handle num_runs > 1 time_ms_list = iteration_timeit( jit_sharded_f, @@ -300,6 +302,9 @@ def data_generator(): return (lhs_device, rhs_device) + num_runs = 1 + ## Need to fix gemm timing logic to handle num_runs > 1 + # Run the benchmark time_ms_list = iteration_timeit( jit_sharded_f, @@ -402,6 +407,9 @@ def data_generator(): return (lhs_device, rhs_device, sf0_device, sf1_device) + num_runs = 1 + ## Need to fix gemm timing logic to handle num_runs > 1 + time_ms_list = iteration_timeit( jit_sharded_f, data_generator, @@ -513,6 +521,10 @@ def data_generator(): return (out_buffer_device, lhs_device, rhs_device, sf0_device, sf1_device) + + num_runs = 1 + ## Need to fix gemm timing logic to handle num_runs > 1 + time_ms_list = iteration_timeit( jit_sharded_f, data_generator, From 20286e50af3b7f3b0edbe0750011a6a6dbf1e057 Mon Sep 17 00:00:00 2001 From: "Yu-Hsuan (Amy) Lin" Date: Tue, 27 Jan 2026 05:33:27 +0000 Subject: [PATCH 02/88] Add `gcs-bucket-csv-dir` to support GCS upload Support run_benchmark.py to use argument `gcs-bucket-csv-dir` to configure the directory for writing csv/tsv result. 
--- Ironwood/src/run_benchmark.py | 56 ++++++++++++++++++++++++++++++----- requirements.txt | 1 + 2 files changed, 49 insertions(+), 8 deletions(-) diff --git a/Ironwood/src/run_benchmark.py b/Ironwood/src/run_benchmark.py index b44aab75..5b3c3b7f 100644 --- a/Ironwood/src/run_benchmark.py +++ b/Ironwood/src/run_benchmark.py @@ -316,7 +316,30 @@ def convert_dict_to_df(target_dict: Dict) -> pd.DataFrame: print(f"Metrics written to CSV at {csv_path}.") -def run_single_benchmark(benchmark_config: Dict[str, Any], output_path: str): +def write_metrics_to_gcs( + gcs_bucket_csv_dir: str, + config_path: str, + test_name: str, + calculate_metrics_results: List[Dict[str, Any]], +): + """Writes metrics to GCS bucket defined by gcs_bucket_csv_dir.""" + if not gcs_bucket_csv_dir: + return + + config_dir = os.path.dirname(config_path) + if not config_dir: + print("No config directory found, the config path is: ", config_path) + config_category = "root" + else: + config_category = os.path.basename(config_dir) + + config_stem = os.path.splitext(os.path.basename(config_path))[0] + + gcs_path = os.path.join(gcs_bucket_csv_dir, config_category, config_stem) + write_to_csv(f"{gcs_path}/{test_name}.tsv", calculate_metrics_results) + + +def run_single_benchmark(benchmark_config: Dict[str, Any], output_path: str, gcs_bucket_csv_dir: str = None, config_path: str = None): """Run a single benchmark with one or more configurations.""" # Extract benchmark details benchmark_name = benchmark_config.get("benchmark_name") @@ -413,14 +436,19 @@ def run_single_benchmark(benchmark_config: Dict[str, Any], output_path: str): "metrics": metrics }) - # Dump metrics to file. 
+ test_name = f"t_{benchmark_name}_" + "".join( + random.choices(string.ascii_uppercase + string.digits, k=10) + ) + if csv_path: os.makedirs(csv_path, exist_ok=True) - test_name = f"t_{benchmark_name}_" + "".join( - random.choices(string.ascii_uppercase + string.digits, k=10) - ) write_to_csv(f"{csv_path}/{test_name}.tsv", calculate_metrics_results) + if gcs_bucket_csv_dir: + write_metrics_to_gcs( + gcs_bucket_csv_dir, config_path, test_name, calculate_metrics_results + ) + def main(args): """Main function.""" @@ -428,6 +456,7 @@ def main(args): config_path = args.config multithreaded = args.multithreaded output_path = args.output_path + gcs_bucket_csv_dir = args.gcs_bucket_csv_dir config = get_benchmark_config(config_path) benchmarks = config.get("benchmarks") if not benchmarks or not isinstance(benchmarks, list): @@ -459,14 +488,14 @@ def main(args): # print("Num hosts detected: %d", num_hosts) for benchmark_config in benchmarks: - run_benchmark_multithreaded(benchmark_config, output_path) + run_benchmark_multithreaded(benchmark_config, output_path, gcs_bucket_csv_dir, config_path) else: for benchmark_config in benchmarks: - run_single_benchmark(benchmark_config, output_path) + run_single_benchmark(benchmark_config, output_path, gcs_bucket_csv_dir, config_path) -def run_benchmark_multithreaded(benchmark_config, output_path): +def run_benchmark_multithreaded(benchmark_config, output_path, gcs_bucket_csv_dir=None, config_path=None): # Extract benchmark details benchmark_name = benchmark_config.get("benchmark_name") benchmark_params = benchmark_config.get("benchmark_params", []) @@ -543,6 +572,11 @@ def run_benchmark_multithreaded(benchmark_config, output_path): os.makedirs(csv_path, exist_ok=True) write_to_csv(f"{csv_path}/{test_name}.tsv", calculate_metrics_results) + if gcs_bucket_csv_dir: + write_metrics_to_gcs( + gcs_bucket_csv_dir, config_path, test_name, calculate_metrics_results + ) + if __name__ == "__main__": parser = argparse.ArgumentParser( @@ -560,6 
+594,12 @@ def run_benchmark_multithreaded(benchmark_config, output_path): default="", help="Path to output.", ) + parser.add_argument( + "--gcs-bucket-csv-dir", + type=str, + default=None, + help="GCS bucket directory to write CSVs to.", + ) parser.add_argument( "--multithreaded", type=bool, diff --git a/requirements.txt b/requirements.txt index 3ae246a8..ff76e478 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,3 +14,4 @@ qwix@git+https://github.com/google/qwix.git tokamax tune-jax immutabledict +gcsfs From 8dedb4c4704bef5767e89ee361188a9a39c80259 Mon Sep 17 00:00:00 2001 From: Hong-Yi Lin Date: Tue, 27 Jan 2026 06:54:07 +0000 Subject: [PATCH 03/88] Add automation script and an HBM yaml example. --- .../guides/automation/automation_launch.sh | 21 +++++++ Ironwood/guides/automation/tpu7x-16-hbm.yaml | 57 +++++++++++++++++++ 2 files changed, 78 insertions(+) create mode 100644 Ironwood/guides/automation/automation_launch.sh create mode 100644 Ironwood/guides/automation/tpu7x-16-hbm.yaml diff --git a/Ironwood/guides/automation/automation_launch.sh b/Ironwood/guides/automation/automation_launch.sh new file mode 100644 index 00000000..5b72bcb3 --- /dev/null +++ b/Ironwood/guides/automation/automation_launch.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash + +TIMESTAMP=$(date +"%Y-%m-%d_%H-%M-%S") + +yaml_names=("tpu7x-16-hbm.yaml") +job_names=("tpu7x-16-hbm") + +# Fill the target GCS bucket path. 
+export GCS_BUCKET_ROOT_DIR="" +export GCS_PATH="${GCS_BUCKET_ROOT_DIR}/${TIMESTAMP}" + +for yaml_file in "${yaml_names[@]}"; do + echo "Launch job: ${yaml_file}" + envsubst '${GCS_PATH}' < ${yaml_file} | kubectl apply -f - +done + +for job_name in "${job_names[@]}"; do + kubectl wait --for=condition=complete job/${job_name} --timeout=1800s + kubectl delete job ${job_name} +done +kubectl apply -f aggregator.yaml \ No newline at end of file diff --git a/Ironwood/guides/automation/tpu7x-16-hbm.yaml b/Ironwood/guides/automation/tpu7x-16-hbm.yaml new file mode 100644 index 00000000..206d4b79 --- /dev/null +++ b/Ironwood/guides/automation/tpu7x-16-hbm.yaml @@ -0,0 +1,57 @@ +apiVersion: v1 +kind: Service +metadata: + name: headless-svc +spec: + clusterIP: None + selector: + job-name: tpu7x-16-hbm +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: tpu7x-16-hbm +spec: + completionMode: Indexed + parallelism: 2 + completions: 2 + backoffLimit: 0 + template: + spec: + subdomain: headless-svc + restartPolicy: Never + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu7x + cloud.google.com/gke-tpu-topology: 2x2x2 + containers: + - name: jax-tpu + image: python:3.12 + securityContext: + privileged: false + env: + - name: JAX_PLATFORMS + value: "tpu,cpu" + - name: TPU_VMODULE + value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" + - name: XLA_IR_DEBUG + value: "1" + - name: XLA_HLO_DEBUG + value: "1" + command: + - bash + - -c + - | + set -ex + + git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git + cd accelerator-microbenchmarks + pip install -r requirements.txt + + GCS_BUCKET_DIR=${GCS_PATH} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/hbm/hbm.yaml" + echo "Result will be written to ${GCS_BUCKET_DIR}" + resources: + requests: + google.com/tpu: 4 + limits: + google.com/tpu: 4 From 2bf6a941d5391188a4671b557d43f41a92a4afc1 Mon Sep 17 00:00:00 2001 From: 
Hong-Yi Lin Date: Tue, 27 Jan 2026 06:56:25 +0000 Subject: [PATCH 04/88] Add aggregator yaml file. --- Ironwood/guides/automation/aggregator.yaml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 Ironwood/guides/automation/aggregator.yaml diff --git a/Ironwood/guides/automation/aggregator.yaml b/Ironwood/guides/automation/aggregator.yaml new file mode 100644 index 00000000..2c0ecbd0 --- /dev/null +++ b/Ironwood/guides/automation/aggregator.yaml @@ -0,0 +1,16 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: job-waiter +spec: + template: + spec: + containers: + - name: main-app + image: ubuntu + command: ["/bin/sh", "-c"] + args: + - | + echo "Collectives job is finally done." + echo "Everything is all fine!" + restartPolicy: Never \ No newline at end of file From 00fa7b2432eee4a1effabb919f374a260c6e311e Mon Sep 17 00:00:00 2001 From: "Yu-Hsuan (Amy) Lin" Date: Tue, 27 Jan 2026 07:41:02 +0000 Subject: [PATCH 05/88] [Automation] Add readme and node-pools topology check --- Ironwood/guides/automation/README.md | 40 +++++++++++++++++++ .../guides/automation/automation_launch.sh | 26 ++++++++++-- .../automation/check_node_pool_setup.sh | 33 +++++++++++++++ 3 files changed, 96 insertions(+), 3 deletions(-) create mode 100644 Ironwood/guides/automation/README.md create mode 100644 Ironwood/guides/automation/check_node_pool_setup.sh diff --git a/Ironwood/guides/automation/README.md b/Ironwood/guides/automation/README.md new file mode 100644 index 00000000..05e2e2d1 --- /dev/null +++ b/Ironwood/guides/automation/README.md @@ -0,0 +1,40 @@ +# Ironwood Automation Tool + +This directory contains the automation scripts for running TPU microbenchmarks. The tool simplifies the process of launching multiple benchmark jobs, waiting for their completion, and aggregating the results into a unified format. + +## Prerequisites + +Before running the automation script, ensure the following requirements are met: + +1. 
**Node Pool Topology**: The script expects specific TPU node pools to be available in your cluster. + * The `check_node_pool_setup.sh` script validates this. +2. **GCS Bucket**: You must have a Google Cloud Storage (GCS) bucket for the intermediate and final results. + * This can be setup by `gcloud storage buckets create gs://my-unique-bucket-name --location=us-central1` +3. **Kubectl**: Ensure `kubectl` is configured and connected to your GKE cluster. + +## User Journey + +1. **Clone & Checkout Branch**. + ```bash + git clone https://github.com/google/accelerator-microbenchmarks.git + cd accelerator-microbenchmarks + git checkout tpu7x-auto + ``` + +2. **Setup Environment**: Ensure your node pools are set up and you have prepared a GCS bucket. + +3. **Run Automation Script**: + The main script is `automation_launch.sh`. You need to set the `GCS_BUCKET_ROOT_DIR` environment variable before running it. + + ```bash + # Replace with your actual bucket path (must start with gs://) + export GCS_BUCKET_ROOT_DIR="gs://your-bucket-name/automation_results" + + # Run the launch script + bash Ironwood/guides/automation/automation_launch.sh + ``` + +4. **Retrieve Results**: + After the script completes, the final aggregated TSV files will be available in your GCS bucket. The script generates a timestamped directory for each run. + * Check the script output for the exact path: `The intermediate result will be written to gs://...` + * Look for the `final` directory under that path (e.g., `gs://your-bucket/automation_results//final`). 
diff --git a/Ironwood/guides/automation/automation_launch.sh b/Ironwood/guides/automation/automation_launch.sh index 5b72bcb3..b069340f 100644 --- a/Ironwood/guides/automation/automation_launch.sh +++ b/Ironwood/guides/automation/automation_launch.sh @@ -1,13 +1,33 @@ #!/usr/bin/env bash -TIMESTAMP=$(date +"%Y-%m-%d_%H-%M-%S") +###################################################################### +# USER INPUT +###################################################################### +export GCS_BUCKET_ROOT_DIR="" yaml_names=("tpu7x-16-hbm.yaml") job_names=("tpu7x-16-hbm") -# Fill the target GCS bucket path. -export GCS_BUCKET_ROOT_DIR="" +###################################################################### +# VALIDATION & SETUP +###################################################################### + +if [[ -z "${GCS_BUCKET_ROOT_DIR}" || "${GCS_BUCKET_ROOT_DIR}" != "gs://"* ]]; then + echo "Error: GCS_BUCKET_ROOT_DIR must be set and start with gs://" + exit 1 +fi +TIMESTAMP=$(date +"%Y-%m-%d_%H-%M-%S") export GCS_PATH="${GCS_BUCKET_ROOT_DIR}/${TIMESTAMP}" +echo "The intermediate result will be written to ${GCS_PATH}" + +SCRIPT_DIR="$(dirname "$(realpath "$0")")" +if ! bash "${SCRIPT_DIR}/check_node_pool_setup.sh"; then + exit 1 +fi + +###################################################################### +# LAUNCH JOBS & WAIT FOR COMPLETION +###################################################################### for yaml_file in "${yaml_names[@]}"; do echo "Launch job: ${yaml_file}" diff --git a/Ironwood/guides/automation/check_node_pool_setup.sh b/Ironwood/guides/automation/check_node_pool_setup.sh new file mode 100644 index 00000000..5dd8737a --- /dev/null +++ b/Ironwood/guides/automation/check_node_pool_setup.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +required_chip="tpu7x" +required_topologies=("2x2x1") + +echo "Checking for required GKE TPU configurations..." 
+echo "Required TPU Type: ${required_chip}" +echo "-----------------------------------------------------------------" + +all_found=true + +for topology in "${required_topologies[@]}"; do + echo -n "Checking for TPU topology '${topology}' with type '${required_chip}': " + + matching_nodes=$(kubectl get nodes -l cloud.google.com/gke-tpu-topology=${topology},cloud.google.com/gke-tpu-accelerator=${required_chip} -o custom-columns=NAME:.metadata.name --no-headers 2>/dev/null) + + if [[ -n "${matching_nodes}" ]]; then + echo "FOUND" + else + echo "MISSING" + all_found=false + fi +done + +echo "-----------------------------------------------------------------" + +if [[ "${all_found}" = true ]]; then + echo "SUCCESS: All required TPU configurations (topology + type) are present in the cluster." + exit 0 +else + echo "FAILURE: One or more required TPU configurations are missing." + exit 1 +fi From 99fa6b1b1f872c30c79b664dcd5b2d9f276e972a Mon Sep 17 00:00:00 2001 From: Hong-Yi Lin Date: Tue, 27 Jan 2026 09:49:40 +0000 Subject: [PATCH 06/88] Update automation script and yaml files for different topologies. 
--- .../guides/automation/automation_launch.sh | 38 ++++++++----- .../guides/automation/tpu7x-2x2x1-hbm.yaml | 57 +++++++++++++++++++ .../guides/automation/tpu7x-2x2x2-hbm.yaml | 57 +++++++++++++++++++ .../guides/automation/tpu7x-2x2x4-hbm.yaml | 57 +++++++++++++++++++ .../guides/automation/tpu7x-2x4x4-hbm.yaml | 57 +++++++++++++++++++ .../guides/automation/tpu7x-4x4x4-hbm.yaml | 57 +++++++++++++++++++ .../guides/automation/tpu7x-4x4x8-hbm.yaml | 57 +++++++++++++++++++ 7 files changed, 367 insertions(+), 13 deletions(-) create mode 100644 Ironwood/guides/automation/tpu7x-2x2x1-hbm.yaml create mode 100644 Ironwood/guides/automation/tpu7x-2x2x2-hbm.yaml create mode 100644 Ironwood/guides/automation/tpu7x-2x2x4-hbm.yaml create mode 100644 Ironwood/guides/automation/tpu7x-2x4x4-hbm.yaml create mode 100644 Ironwood/guides/automation/tpu7x-4x4x4-hbm.yaml create mode 100644 Ironwood/guides/automation/tpu7x-4x4x8-hbm.yaml diff --git a/Ironwood/guides/automation/automation_launch.sh b/Ironwood/guides/automation/automation_launch.sh index b069340f..a904e14d 100644 --- a/Ironwood/guides/automation/automation_launch.sh +++ b/Ironwood/guides/automation/automation_launch.sh @@ -3,21 +3,20 @@ ###################################################################### # USER INPUT ###################################################################### -export GCS_BUCKET_ROOT_DIR="" +export GCS_PATH="" -yaml_names=("tpu7x-16-hbm.yaml") -job_names=("tpu7x-16-hbm") +yaml_names=("tpu7x-2x2x1-hbm.yaml" "tpu7x-2x2x2-hbm.yaml") +job_names=("tpu7x-8-hbm" "tpu7x-16-hbm") ###################################################################### # VALIDATION & SETUP ###################################################################### -if [[ -z "${GCS_BUCKET_ROOT_DIR}" || "${GCS_BUCKET_ROOT_DIR}" != "gs://"* ]]; then - echo "Error: GCS_BUCKET_ROOT_DIR must be set and start with gs://" +if [[ -z "${GCS_PATH}" || "${GCS_PATH}" != "gs://"* ]]; then + echo "Error: GCS_PATH must be set and start with 
gs://" exit 1 fi -TIMESTAMP=$(date +"%Y-%m-%d_%H-%M-%S") -export GCS_PATH="${GCS_BUCKET_ROOT_DIR}/${TIMESTAMP}" + echo "The intermediate result will be written to ${GCS_PATH}" SCRIPT_DIR="$(dirname "$(realpath "$0")")" @@ -29,13 +28,26 @@ fi # LAUNCH JOBS & WAIT FOR COMPLETION ###################################################################### -for yaml_file in "${yaml_names[@]}"; do +length=${#yaml_names[@]} +for (( i=0; i /dev/null; then + echo "Job from ${yaml_file} is completed!" + else + echo "Job from ${yaml_file} failed!" + + fi + envsubst '${JOB_NAME} ${GCS_PATH}' < ${yaml_file} | kubectl delete -f - + ) & done -for job_name in "${job_names[@]}"; do - kubectl wait --for=condition=complete job/${job_name} --timeout=1800s - kubectl delete job ${job_name} -done +echo "All jobs dispatched. Waiting for results..." +wait +echo "All processing done." + kubectl apply -f aggregator.yaml \ No newline at end of file diff --git a/Ironwood/guides/automation/tpu7x-2x2x1-hbm.yaml b/Ironwood/guides/automation/tpu7x-2x2x1-hbm.yaml new file mode 100644 index 00000000..7455961d --- /dev/null +++ b/Ironwood/guides/automation/tpu7x-2x2x1-hbm.yaml @@ -0,0 +1,57 @@ +apiVersion: v1 +kind: Service +metadata: + name: headless-svc-${JOB_NAME} +spec: + clusterIP: None + selector: + job-name: ${JOB_NAME} +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: ${JOB_NAME} +spec: + completionMode: Indexed + parallelism: 1 + completions: 1 + backoffLimit: 0 + template: + spec: + subdomain: headless-svc-${JOB_NAME} + restartPolicy: Never + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu7x + cloud.google.com/gke-tpu-topology: 2x2x1 + containers: + - name: jax-tpu + image: python:3.12 + securityContext: + privileged: false + env: + - name: JAX_PLATFORMS + value: "tpu,cpu" + - name: TPU_VMODULE + value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" + - name: XLA_IR_DEBUG + value: "1" + - name: XLA_HLO_DEBUG + 
value: "1" + command: + - bash + - -c + - | + set -ex + + git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git + cd accelerator-microbenchmarks + git checkout tpu7x-auto + pip install -r requirements.txt + + GCS_BUCKET_DIR=${GCS_PATH} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/hbm/hbm.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + resources: + requests: + google.com/tpu: 4 + limits: + google.com/tpu: 4 \ No newline at end of file diff --git a/Ironwood/guides/automation/tpu7x-2x2x2-hbm.yaml b/Ironwood/guides/automation/tpu7x-2x2x2-hbm.yaml new file mode 100644 index 00000000..3f3f7929 --- /dev/null +++ b/Ironwood/guides/automation/tpu7x-2x2x2-hbm.yaml @@ -0,0 +1,57 @@ +apiVersion: v1 +kind: Service +metadata: + name: headless-svc-${JOB_NAME} +spec: + clusterIP: None + selector: + job-name: ${JOB_NAME} +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: ${JOB_NAME} +spec: + completionMode: Indexed + parallelism: 2 + completions: 2 + backoffLimit: 0 + template: + spec: + subdomain: headless-svc-${JOB_NAME} + restartPolicy: Never + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu7x + cloud.google.com/gke-tpu-topology: 2x2x2 + containers: + - name: jax-tpu + image: python:3.12 + securityContext: + privileged: false + env: + - name: JAX_PLATFORMS + value: "tpu,cpu" + - name: TPU_VMODULE + value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" + - name: XLA_IR_DEBUG + value: "1" + - name: XLA_HLO_DEBUG + value: "1" + command: + - bash + - -c + - | + set -ex + + git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git + cd accelerator-microbenchmarks + git checkout tpu7x-auto + pip install -r requirements.txt + + GCS_BUCKET_DIR=${GCS_PATH} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/hbm/hbm.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + resources: + requests: + google.com/tpu: 4 + limits: + google.com/tpu: 
4 diff --git a/Ironwood/guides/automation/tpu7x-2x2x4-hbm.yaml b/Ironwood/guides/automation/tpu7x-2x2x4-hbm.yaml new file mode 100644 index 00000000..ed87aaf8 --- /dev/null +++ b/Ironwood/guides/automation/tpu7x-2x2x4-hbm.yaml @@ -0,0 +1,57 @@ +apiVersion: v1 +kind: Service +metadata: + name: headless-svc-${JOB_NAME} +spec: + clusterIP: None + selector: + job-name: ${JOB_NAME} +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: ${JOB_NAME} +spec: + completionMode: Indexed + parallelism: 4 + completions: 4 + backoffLimit: 0 + template: + spec: + subdomain: headless-svc-${JOB_NAME} + restartPolicy: Never + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu7x + cloud.google.com/gke-tpu-topology: 2x2x4 + containers: + - name: jax-tpu + image: python:3.12 + securityContext: + privileged: false + env: + - name: JAX_PLATFORMS + value: "tpu,cpu" + - name: TPU_VMODULE + value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" + - name: XLA_IR_DEBUG + value: "1" + - name: XLA_HLO_DEBUG + value: "1" + command: + - bash + - -c + - | + set -ex + + git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git + cd accelerator-microbenchmarks + git checkout tpu7x-auto + pip install -r requirements.txt + + GCS_BUCKET_DIR=${GCS_PATH} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/hbm/hbm.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + resources: + requests: + google.com/tpu: 4 + limits: + google.com/tpu: 4 \ No newline at end of file diff --git a/Ironwood/guides/automation/tpu7x-2x4x4-hbm.yaml b/Ironwood/guides/automation/tpu7x-2x4x4-hbm.yaml new file mode 100644 index 00000000..57182026 --- /dev/null +++ b/Ironwood/guides/automation/tpu7x-2x4x4-hbm.yaml @@ -0,0 +1,57 @@ +apiVersion: v1 +kind: Service +metadata: + name: headless-svc-${JOB_NAME} +spec: + clusterIP: None + selector: + job-name: ${JOB_NAME} +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: ${JOB_NAME} 
+spec: + completionMode: Indexed + parallelism: 8 + completions: 8 + backoffLimit: 0 + template: + spec: + subdomain: headless-svc-${JOB_NAME} + restartPolicy: Never + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu7x + cloud.google.com/gke-tpu-topology: 2x4x4 + containers: + - name: jax-tpu + image: python:3.12 + securityContext: + privileged: false + env: + - name: JAX_PLATFORMS + value: "tpu,cpu" + - name: TPU_VMODULE + value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" + - name: XLA_IR_DEBUG + value: "1" + - name: XLA_HLO_DEBUG + value: "1" + command: + - bash + - -c + - | + set -ex + + git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git + cd accelerator-microbenchmarks + git checkout tpu7x-auto + pip install -r requirements.txt + + GCS_BUCKET_DIR=${GCS_PATH} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/hbm/hbm.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + resources: + requests: + google.com/tpu: 4 + limits: + google.com/tpu: 4 \ No newline at end of file diff --git a/Ironwood/guides/automation/tpu7x-4x4x4-hbm.yaml b/Ironwood/guides/automation/tpu7x-4x4x4-hbm.yaml new file mode 100644 index 00000000..ccce9dc0 --- /dev/null +++ b/Ironwood/guides/automation/tpu7x-4x4x4-hbm.yaml @@ -0,0 +1,57 @@ +apiVersion: v1 +kind: Service +metadata: + name: headless-svc-${JOB_NAME} +spec: + clusterIP: None + selector: + job-name: ${JOB_NAME} +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: ${JOB_NAME} +spec: + completionMode: Indexed + parallelism: 16 + completions: 16 + backoffLimit: 0 + template: + spec: + subdomain: headless-svc-${JOB_NAME} + restartPolicy: Never + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu7x + cloud.google.com/gke-tpu-topology: 4x4x4 + containers: + - name: jax-tpu + image: python:3.12 + securityContext: + privileged: false + env: + - name: JAX_PLATFORMS + value: "tpu,cpu" + - name: TPU_VMODULE + value: 
"singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" + - name: XLA_IR_DEBUG + value: "1" + - name: XLA_HLO_DEBUG + value: "1" + command: + - bash + - -c + - | + set -ex + + git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git + cd accelerator-microbenchmarks + git checkout tpu7x-auto + pip install -r requirements.txt + + GCS_BUCKET_DIR=${GCS_PATH} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/hbm/hbm.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + resources: + requests: + google.com/tpu: 4 + limits: + google.com/tpu: 4 \ No newline at end of file diff --git a/Ironwood/guides/automation/tpu7x-4x4x8-hbm.yaml b/Ironwood/guides/automation/tpu7x-4x4x8-hbm.yaml new file mode 100644 index 00000000..8d8e960e --- /dev/null +++ b/Ironwood/guides/automation/tpu7x-4x4x8-hbm.yaml @@ -0,0 +1,57 @@ +apiVersion: v1 +kind: Service +metadata: + name: headless-svc-${JOB_NAME} +spec: + clusterIP: None + selector: + job-name: ${JOB_NAME} +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: ${JOB_NAME} +spec: + completionMode: Indexed + parallelism: 32 + completions: 32 + backoffLimit: 0 + template: + spec: + subdomain: headless-svc-${JOB_NAME} + restartPolicy: Never + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu7x + cloud.google.com/gke-tpu-topology: 4x4x8 + containers: + - name: jax-tpu + image: python:3.12 + securityContext: + privileged: false + env: + - name: JAX_PLATFORMS + value: "tpu,cpu" + - name: TPU_VMODULE + value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" + - name: XLA_IR_DEBUG + value: "1" + - name: XLA_HLO_DEBUG + value: "1" + command: + - bash + - -c + - | + set -ex + + git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git + cd accelerator-microbenchmarks + git checkout tpu7x-auto + pip install -r requirements.txt + + GCS_BUCKET_DIR=${GCS_PATH} + python 
Ironwood/src/run_benchmark.py --config="Ironwood/configs/hbm/hbm.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + resources: + requests: + google.com/tpu: 4 + limits: + google.com/tpu: 4 \ No newline at end of file From 91d4638aaed01ceff8f14febabcce2a2150f2380 Mon Sep 17 00:00:00 2001 From: "Yu-Hsuan (Amy) Lin" Date: Tue, 27 Jan 2026 15:33:55 +0000 Subject: [PATCH 07/88] [Automation] Error catch and failure retry * Fix the issue where `kubectl wait` could only wait for one condition. Use poll loop to check for status. * Store the failed jobs and retry with maximum 3 times TEST=Use dummy `must-fail` and `must-succeed` job which exit 1/0 directly. Make sure the script will retry on the failed one for 3 times, and eventually print out the command to retry. --- .../guides/automation/automation_launch.sh | 140 +++++++++++++++--- 1 file changed, 120 insertions(+), 20 deletions(-) diff --git a/Ironwood/guides/automation/automation_launch.sh b/Ironwood/guides/automation/automation_launch.sh index a904e14d..9815780f 100644 --- a/Ironwood/guides/automation/automation_launch.sh +++ b/Ironwood/guides/automation/automation_launch.sh @@ -6,7 +6,6 @@ export GCS_PATH="" yaml_names=("tpu7x-2x2x1-hbm.yaml" "tpu7x-2x2x2-hbm.yaml") -job_names=("tpu7x-8-hbm" "tpu7x-16-hbm") ###################################################################### # VALIDATION & SETUP @@ -28,26 +27,127 @@ fi # LAUNCH JOBS & WAIT FOR COMPLETION ###################################################################### -length=${#yaml_names[@]} -for (( i=0; i /dev/null; then - echo "Job from ${yaml_file} is completed!" - else - echo "Job from ${yaml_file} failed!" 
- + +# Function to wait for a job to complete or fail +wait_for_job_completion() { + local job_name="$1" + local timeout="$2" + local start_time=$(date +%s) + local end_time=$((start_time + timeout)) + + while true; do + current_time=$(date +%s) + if [[ $current_time -gt $end_time ]]; then + echo "Timeout waiting for job ${job_name}" + return 2 + fi + + # Check for Complete condition + if kubectl get job "${job_name}" -o jsonpath='{.status.conditions[?(@.type=="Complete")].status}' 2>/dev/null | grep -q "True"; then + echo "Job ${job_name} completed successfully!" + return 0 + fi + + # Check for Failed condition + if kubectl get job "${job_name}" -o jsonpath='{.status.conditions[?(@.type=="Failed")].status}' 2>/dev/null | grep -q "True"; then + echo "Job ${job_name} FAILED!" + return 1 + fi + + sleep 5 + done +} + +# Function to apply jobs and wait for them to complete +# Returns a list of failed yaml files in the variable FAILED_JOBS +apply_and_wait() { + local yaml_files=("$@") + local pids=() + local job_names_in_batch=() + FAILED_JOBS=() + + echo "Processing batch of ${#yaml_files[@]} jobs..." + + # Launch all jobs + for yaml_file in "${yaml_files[@]}"; do + local filepath="${SCRIPT_DIR}/${yaml_file}" + # Derive job name: remove .yaml, lowercase, replace _ with - + local job_name=$(basename "${yaml_file}" .yaml | tr '[:upper:]' '[:lower:]' | tr '_' '-') + export JOB_NAME="${job_name}" + + echo "Launching job: ${filepath} (name: ${JOB_NAME})" + envsubst '${JOB_NAME} ${GCS_PATH}' < "${filepath}" | kubectl apply -f - + job_names_in_batch+=("${JOB_NAME}") + done + + # Wait for completion in background + for i in "${!yaml_files[@]}"; do + local yaml_file="${yaml_files[$i]}" + local filepath="${SCRIPT_DIR}/${yaml_file}" + local job_name="${job_names_in_batch[$i]}" + + ( + wait_for_job_completion "${job_name}" 1800 + wait_status=$? 
+ + export JOB_NAME="${job_name}" + envsubst '${JOB_NAME} ${GCS_PATH}' < "${filepath}" | kubectl delete -f - &> /dev/null + exit $wait_status + ) & + pids+=($!) + done + + # Collect results + for i in "${!pids[@]}"; do + wait "${pids[$i]}" + if [[ $? -ne 0 ]]; then + FAILED_JOBS+=("${yaml_files[$i]}") fi - envsubst '${JOB_NAME} ${GCS_PATH}' < ${yaml_file} | kubectl delete -f - - ) & + done +} + +# Retry loop +current_batch=("${yaml_names[@]}") +MAX_RETRIES=3 + +for (( retry=1; retry<=MAX_RETRIES; retry++ )); do + apply_and_wait "${current_batch[@]}" + + if [[ ${#FAILED_JOBS[@]} -eq 0 ]]; then + echo "All jobs completed successfully in Round ${retry}!" + break + fi + + echo "Round ${retry} finished. ${#FAILED_JOBS[@]} jobs failed." + current_batch=("${FAILED_JOBS[@]}") + + if [[ ${retry} -lt ${MAX_RETRIES} ]]; then + echo "Retrying failed jobs..." + echo "========================================" + echo "$((retry + 1)) / ${MAX_RETRIES}" max retries + echo "========================================" + else + echo "Max retries reached. ¯\_(ツ)_/¯" + fi done -echo "All jobs dispatched. Waiting for results..." -wait -echo "All processing done." +echo "" +echo "Jobs completed. Aggregating results..." +echo "" + +kubectl apply -f aggregator.yaml + +# Print the failed jobs at the end for better visibility. + +if [[ ${#FAILED_JOBS[@]} -gt 0 ]]; then + echo "The following jobs finally failed after ${MAX_RETRIES} rounds:" + printf '%s\n' "${FAILED_JOBS[@]}" -kubectl apply -f aggregator.yaml \ No newline at end of file + echo -e "\nTo retry manually, run:" + for yaml_file in "${FAILED_JOBS[@]}"; do + job_name=$(basename "${yaml_file}" .yaml | tr '[:upper:]' '[:lower:]' | tr '_' '-') + echo "JOB_NAME=\"${job_name}\" GCS_PATH=\"${GCS_PATH}\" envsubst '\${JOB_NAME} \${GCS_PATH}' < \"${SCRIPT_DIR}/${yaml_file}\" | kubectl apply -f -" + done +else + echo "Success! All jobs finished." 
+fi \ No newline at end of file From 62a04612eb2a20c4047820ce5e3989bd453370a4 Mon Sep 17 00:00:00 2001 From: Hong-Yi Lin Date: Wed, 28 Jan 2026 03:47:34 +0000 Subject: [PATCH 08/88] [Automation] Add missing topology tracking in check_node_pool_setup.sh --- .../automation/check_node_pool_setup.sh | 6 +- Ironwood/guides/automation/tpu7x-16-hbm.yaml | 57 ------------------- 2 files changed, 5 insertions(+), 58 deletions(-) delete mode 100644 Ironwood/guides/automation/tpu7x-16-hbm.yaml diff --git a/Ironwood/guides/automation/check_node_pool_setup.sh b/Ironwood/guides/automation/check_node_pool_setup.sh index 5dd8737a..b318823e 100644 --- a/Ironwood/guides/automation/check_node_pool_setup.sh +++ b/Ironwood/guides/automation/check_node_pool_setup.sh @@ -1,13 +1,15 @@ #!/bin/bash required_chip="tpu7x" -required_topologies=("2x2x1") +required_topologies=("2x2x1" "2x2x2" "2x2x4" "2x4x4" "4x4x4") echo "Checking for required GKE TPU configurations..." echo "Required TPU Type: ${required_chip}" echo "-----------------------------------------------------------------" all_found=true +missing_topologies=() + for topology in "${required_topologies[@]}"; do echo -n "Checking for TPU topology '${topology}' with type '${required_chip}': " @@ -18,6 +20,7 @@ for topology in "${required_topologies[@]}"; do echo "FOUND" else echo "MISSING" + missing_topologies+=("${topology}") all_found=false fi done @@ -29,5 +32,6 @@ if [[ "${all_found}" = true ]]; then exit 0 else echo "FAILURE: One or more required TPU configurations are missing." 
+ echo "Missing topologies: ${missing_topologies[@]}" exit 1 fi diff --git a/Ironwood/guides/automation/tpu7x-16-hbm.yaml b/Ironwood/guides/automation/tpu7x-16-hbm.yaml deleted file mode 100644 index 206d4b79..00000000 --- a/Ironwood/guides/automation/tpu7x-16-hbm.yaml +++ /dev/null @@ -1,57 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: headless-svc -spec: - clusterIP: None - selector: - job-name: tpu7x-16-hbm ---- -apiVersion: batch/v1 -kind: Job -metadata: - name: tpu7x-16-hbm -spec: - completionMode: Indexed - parallelism: 2 - completions: 2 - backoffLimit: 0 - template: - spec: - subdomain: headless-svc - restartPolicy: Never - nodeSelector: - cloud.google.com/gke-tpu-accelerator: tpu7x - cloud.google.com/gke-tpu-topology: 2x2x2 - containers: - - name: jax-tpu - image: python:3.12 - securityContext: - privileged: false - env: - - name: JAX_PLATFORMS - value: "tpu,cpu" - - name: TPU_VMODULE - value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" - - name: XLA_IR_DEBUG - value: "1" - - name: XLA_HLO_DEBUG - value: "1" - command: - - bash - - -c - - | - set -ex - - git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git - cd accelerator-microbenchmarks - pip install -r requirements.txt - - GCS_BUCKET_DIR=${GCS_PATH} - python Ironwood/src/run_benchmark.py --config="Ironwood/configs/hbm/hbm.yaml" - echo "Result will be written to ${GCS_BUCKET_DIR}" - resources: - requests: - google.com/tpu: 4 - limits: - google.com/tpu: 4 From 991cb2f9c49ed3504a8113f36d2f4bbee5543503 Mon Sep 17 00:00:00 2001 From: Hong-Yi Lin Date: Wed, 28 Jan 2026 04:08:10 +0000 Subject: [PATCH 09/88] [Automation] Add topology-aware node pool validation. 
--- Ironwood/guides/automation/automation_launch.sh | 4 +++- Ironwood/guides/automation/check_node_pool_setup.sh | 6 +++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/Ironwood/guides/automation/automation_launch.sh b/Ironwood/guides/automation/automation_launch.sh index 9815780f..7ff69a19 100644 --- a/Ironwood/guides/automation/automation_launch.sh +++ b/Ironwood/guides/automation/automation_launch.sh @@ -18,8 +18,10 @@ fi echo "The intermediate result will be written to ${GCS_PATH}" +required_topologies=($(printf "%s\n" "${yaml_names[@]}" | grep -oE '[0-9]+x[0-9]+x[0-9]+' | sort -u)) + SCRIPT_DIR="$(dirname "$(realpath "$0")")" -if ! bash "${SCRIPT_DIR}/check_node_pool_setup.sh"; then +if ! bash "${SCRIPT_DIR}/check_node_pool_setup.sh" "${required_topologies[@]}"; then exit 1 fi diff --git a/Ironwood/guides/automation/check_node_pool_setup.sh b/Ironwood/guides/automation/check_node_pool_setup.sh index b318823e..a77138b4 100644 --- a/Ironwood/guides/automation/check_node_pool_setup.sh +++ b/Ironwood/guides/automation/check_node_pool_setup.sh @@ -1,7 +1,11 @@ #!/bin/bash required_chip="tpu7x" -required_topologies=("2x2x1" "2x2x2" "2x2x4" "2x4x4" "4x4x4") +if [[ $# -gt 0 ]]; then + required_topologies=("$@") +else + required_topologies=("2x2x1" "2x2x2" "2x2x4" "2x4x4" "4x4x4") +fi echo "Checking for required GKE TPU configurations..." 
echo "Required TPU Type: ${required_chip}" From 6b5c66bb96cde8749a97b8bdd56e5e40a7db4920 Mon Sep 17 00:00:00 2001 From: Hong-Yi Lin Date: Thu, 29 Jan 2026 05:52:16 +0000 Subject: [PATCH 10/88] [Automation] Update configurations for GEMM, H2D and Collectives --- .../automation/tpu7x-2x2x1-collectives.yaml | 59 +++++++++++++++++++ .../guides/automation/tpu7x-2x2x1-gemm.yaml | 57 ++++++++++++++++++ .../guides/automation/tpu7x-2x2x1-h2d.yaml | 57 ++++++++++++++++++ .../automation/tpu7x-2x2x2-collectives.yaml | 59 +++++++++++++++++++ .../guides/automation/tpu7x-2x2x2-gemm.yaml | 57 ++++++++++++++++++ .../guides/automation/tpu7x-2x2x2-h2d.yaml | 57 ++++++++++++++++++ .../automation/tpu7x-2x2x4-collectives.yaml | 59 +++++++++++++++++++ .../guides/automation/tpu7x-2x2x4-gemm.yaml | 57 ++++++++++++++++++ .../guides/automation/tpu7x-2x2x4-h2d.yaml | 57 ++++++++++++++++++ .../automation/tpu7x-2x4x4-collectives.yaml | 59 +++++++++++++++++++ .../guides/automation/tpu7x-2x4x4-gemm.yaml | 57 ++++++++++++++++++ .../guides/automation/tpu7x-2x4x4-h2d.yaml | 57 ++++++++++++++++++ .../automation/tpu7x-4x4x4-collectives.yaml | 59 +++++++++++++++++++ .../guides/automation/tpu7x-4x4x4-gemm.yaml | 57 ++++++++++++++++++ .../guides/automation/tpu7x-4x4x4-h2d.yaml | 57 ++++++++++++++++++ .../automation/tpu7x-4x4x8-collectives.yaml | 59 +++++++++++++++++++ .../guides/automation/tpu7x-4x4x8-gemm.yaml | 57 ++++++++++++++++++ .../guides/automation/tpu7x-4x4x8-h2d.yaml | 57 ++++++++++++++++++ 18 files changed, 1038 insertions(+) create mode 100644 Ironwood/guides/automation/tpu7x-2x2x1-collectives.yaml create mode 100644 Ironwood/guides/automation/tpu7x-2x2x1-gemm.yaml create mode 100644 Ironwood/guides/automation/tpu7x-2x2x1-h2d.yaml create mode 100644 Ironwood/guides/automation/tpu7x-2x2x2-collectives.yaml create mode 100644 Ironwood/guides/automation/tpu7x-2x2x2-gemm.yaml create mode 100644 Ironwood/guides/automation/tpu7x-2x2x2-h2d.yaml create mode 100644 
Ironwood/guides/automation/tpu7x-2x2x4-collectives.yaml create mode 100644 Ironwood/guides/automation/tpu7x-2x2x4-gemm.yaml create mode 100644 Ironwood/guides/automation/tpu7x-2x2x4-h2d.yaml create mode 100644 Ironwood/guides/automation/tpu7x-2x4x4-collectives.yaml create mode 100644 Ironwood/guides/automation/tpu7x-2x4x4-gemm.yaml create mode 100644 Ironwood/guides/automation/tpu7x-2x4x4-h2d.yaml create mode 100644 Ironwood/guides/automation/tpu7x-4x4x4-collectives.yaml create mode 100644 Ironwood/guides/automation/tpu7x-4x4x4-gemm.yaml create mode 100644 Ironwood/guides/automation/tpu7x-4x4x4-h2d.yaml create mode 100644 Ironwood/guides/automation/tpu7x-4x4x8-collectives.yaml create mode 100644 Ironwood/guides/automation/tpu7x-4x4x8-gemm.yaml create mode 100644 Ironwood/guides/automation/tpu7x-4x4x8-h2d.yaml diff --git a/Ironwood/guides/automation/tpu7x-2x2x1-collectives.yaml b/Ironwood/guides/automation/tpu7x-2x2x1-collectives.yaml new file mode 100644 index 00000000..1671f2ec --- /dev/null +++ b/Ironwood/guides/automation/tpu7x-2x2x1-collectives.yaml @@ -0,0 +1,59 @@ +apiVersion: v1 +kind: Service +metadata: + name: headless-svc-${JOB_NAME} +spec: + clusterIP: None + selector: + job-name: ${JOB_NAME} +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: ${JOB_NAME} +spec: + completionMode: Indexed + parallelism: 1 + completions: 1 + backoffLimit: 0 + template: + spec: + subdomain: headless-svc-${JOB_NAME} + restartPolicy: Never + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu7x + cloud.google.com/gke-tpu-topology: 2x2x1 + containers: + - name: jax-tpu + image: python:3.12 + securityContext: + privileged: false + env: + - name: JAX_PLATFORMS + value: "tpu,cpu" + - name: TPU_VMODULE + value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" + - name: XLA_IR_DEBUG + value: "1" + - name: XLA_HLO_DEBUG + value: "1" + command: + - bash + - -c + - | + set -ex + + git clone 
https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git + cd accelerator-microbenchmarks + git checkout tpu7x-auto + pip install -r requirements.txt + + GCS_BUCKET_DIR=${GCS_PATH} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + resources: + requests: + google.com/tpu: 4 + limits: + google.com/tpu: 4 \ No newline at end of file diff --git a/Ironwood/guides/automation/tpu7x-2x2x1-gemm.yaml b/Ironwood/guides/automation/tpu7x-2x2x1-gemm.yaml new file mode 100644 index 00000000..1929d886 --- /dev/null +++ b/Ironwood/guides/automation/tpu7x-2x2x1-gemm.yaml @@ -0,0 +1,57 @@ +apiVersion: v1 +kind: Service +metadata: + name: headless-svc-${JOB_NAME} +spec: + clusterIP: None + selector: + job-name: ${JOB_NAME} +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: ${JOB_NAME} +spec: + completionMode: Indexed + parallelism: 1 + completions: 1 + backoffLimit: 0 + template: + spec: + subdomain: headless-svc-${JOB_NAME} + restartPolicy: Never + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu7x + cloud.google.com/gke-tpu-topology: 2x2x1 + containers: + - name: jax-tpu + image: python:3.12 + securityContext: + privileged: false + env: + - name: JAX_PLATFORMS + value: "tpu,cpu" + - name: TPU_VMODULE + value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" + - name: XLA_IR_DEBUG + value: "1" + - name: XLA_HLO_DEBUG + value: "1" + command: + - bash + - -c + - | + set -ex + + git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git + cd accelerator-microbenchmarks + git checkout tpu7x-auto + 
pip install -r requirements.txt + + GCS_BUCKET_DIR=${GCS_PATH} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/training/gemm_multiple_run_more.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + resources: + requests: + google.com/tpu: 4 + limits: + google.com/tpu: 4 \ No newline at end of file diff --git a/Ironwood/guides/automation/tpu7x-2x2x1-h2d.yaml b/Ironwood/guides/automation/tpu7x-2x2x1-h2d.yaml new file mode 100644 index 00000000..3f36f54e --- /dev/null +++ b/Ironwood/guides/automation/tpu7x-2x2x1-h2d.yaml @@ -0,0 +1,57 @@ +apiVersion: v1 +kind: Service +metadata: + name: headless-svc-${JOB_NAME} +spec: + clusterIP: None + selector: + job-name: ${JOB_NAME} +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: ${JOB_NAME} +spec: + completionMode: Indexed + parallelism: 1 + completions: 1 + backoffLimit: 0 + template: + spec: + subdomain: headless-svc-${JOB_NAME} + restartPolicy: Never + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu7x + cloud.google.com/gke-tpu-topology: 2x2x1 + containers: + - name: jax-tpu + image: python:3.12 + securityContext: + privileged: false + env: + - name: JAX_PLATFORMS + value: "tpu,cpu" + - name: TPU_VMODULE + value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" + - name: XLA_IR_DEBUG + value: "1" + - name: XLA_HLO_DEBUG + value: "1" + command: + - bash + - -c + - | + set -ex + + git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git + cd accelerator-microbenchmarks + git checkout tpu7x-auto + pip install -r requirements.txt + + GCS_BUCKET_DIR=${GCS_PATH} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/host_device/host_device.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + resources: + requests: + google.com/tpu: 4 + limits: + google.com/tpu: 4 \ No newline at end of file diff --git a/Ironwood/guides/automation/tpu7x-2x2x2-collectives.yaml b/Ironwood/guides/automation/tpu7x-2x2x2-collectives.yaml 
new file mode 100644 index 00000000..7758e449 --- /dev/null +++ b/Ironwood/guides/automation/tpu7x-2x2x2-collectives.yaml @@ -0,0 +1,59 @@ +apiVersion: v1 +kind: Service +metadata: + name: headless-svc-${JOB_NAME} +spec: + clusterIP: None + selector: + job-name: ${JOB_NAME} +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: ${JOB_NAME} +spec: + completionMode: Indexed + parallelism: 2 + completions: 2 + backoffLimit: 0 + template: + spec: + subdomain: headless-svc-${JOB_NAME} + restartPolicy: Never + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu7x + cloud.google.com/gke-tpu-topology: 2x2x2 + containers: + - name: jax-tpu + image: python:3.12 + securityContext: + privileged: false + env: + - name: JAX_PLATFORMS + value: "tpu,cpu" + - name: TPU_VMODULE + value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" + - name: XLA_IR_DEBUG + value: "1" + - name: XLA_HLO_DEBUG + value: "1" + command: + - bash + - -c + - | + set -ex + + git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git + cd accelerator-microbenchmarks + git checkout tpu7x-auto + pip install -r requirements.txt + + GCS_BUCKET_DIR=${GCS_PATH} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + resources: + requests: + google.com/tpu: 4 + limits: + google.com/tpu: 4 \ No newline at end of file diff --git a/Ironwood/guides/automation/tpu7x-2x2x2-gemm.yaml b/Ironwood/guides/automation/tpu7x-2x2x2-gemm.yaml new file mode 100644 index 00000000..c869fbdb --- /dev/null +++ b/Ironwood/guides/automation/tpu7x-2x2x2-gemm.yaml @@ -0,0 
+1,57 @@ +apiVersion: v1 +kind: Service +metadata: + name: headless-svc-${JOB_NAME} +spec: + clusterIP: None + selector: + job-name: ${JOB_NAME} +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: ${JOB_NAME} +spec: + completionMode: Indexed + parallelism: 2 + completions: 2 + backoffLimit: 0 + template: + spec: + subdomain: headless-svc-${JOB_NAME} + restartPolicy: Never + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu7x + cloud.google.com/gke-tpu-topology: 2x2x2 + containers: + - name: jax-tpu + image: python:3.12 + securityContext: + privileged: false + env: + - name: JAX_PLATFORMS + value: "tpu,cpu" + - name: TPU_VMODULE + value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" + - name: XLA_IR_DEBUG + value: "1" + - name: XLA_HLO_DEBUG + value: "1" + command: + - bash + - -c + - | + set -ex + + git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git + cd accelerator-microbenchmarks + git checkout tpu7x-auto + pip install -r requirements.txt + + GCS_BUCKET_DIR=${GCS_PATH} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/training/gemm_multiple_run_more.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + resources: + requests: + google.com/tpu: 4 + limits: + google.com/tpu: 4 \ No newline at end of file diff --git a/Ironwood/guides/automation/tpu7x-2x2x2-h2d.yaml b/Ironwood/guides/automation/tpu7x-2x2x2-h2d.yaml new file mode 100644 index 00000000..34d19110 --- /dev/null +++ b/Ironwood/guides/automation/tpu7x-2x2x2-h2d.yaml @@ -0,0 +1,57 @@ +apiVersion: v1 +kind: Service +metadata: + name: headless-svc-${JOB_NAME} +spec: + clusterIP: None + selector: + job-name: ${JOB_NAME} +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: ${JOB_NAME} +spec: + completionMode: Indexed + parallelism: 2 + completions: 2 + backoffLimit: 0 + template: + spec: + subdomain: headless-svc-${JOB_NAME} + restartPolicy: Never + nodeSelector: + 
cloud.google.com/gke-tpu-accelerator: tpu7x + cloud.google.com/gke-tpu-topology: 2x2x2 + containers: + - name: jax-tpu + image: python:3.12 + securityContext: + privileged: false + env: + - name: JAX_PLATFORMS + value: "tpu,cpu" + - name: TPU_VMODULE + value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" + - name: XLA_IR_DEBUG + value: "1" + - name: XLA_HLO_DEBUG + value: "1" + command: + - bash + - -c + - | + set -ex + + git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git + cd accelerator-microbenchmarks + git checkout tpu7x-auto + pip install -r requirements.txt + + GCS_BUCKET_DIR=${GCS_PATH} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/host_device/host_device.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + resources: + requests: + google.com/tpu: 4 + limits: + google.com/tpu: 4 \ No newline at end of file diff --git a/Ironwood/guides/automation/tpu7x-2x2x4-collectives.yaml b/Ironwood/guides/automation/tpu7x-2x2x4-collectives.yaml new file mode 100644 index 00000000..dce47004 --- /dev/null +++ b/Ironwood/guides/automation/tpu7x-2x2x4-collectives.yaml @@ -0,0 +1,59 @@ +apiVersion: v1 +kind: Service +metadata: + name: headless-svc-${JOB_NAME} +spec: + clusterIP: None + selector: + job-name: ${JOB_NAME} +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: ${JOB_NAME} +spec: + completionMode: Indexed + parallelism: 4 + completions: 4 + backoffLimit: 0 + template: + spec: + subdomain: headless-svc-${JOB_NAME} + restartPolicy: Never + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu7x + cloud.google.com/gke-tpu-topology: 2x2x4 + containers: + - name: jax-tpu + image: python:3.12 + securityContext: + privileged: false + env: + - name: JAX_PLATFORMS + value: "tpu,cpu" + - name: TPU_VMODULE + value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" + - name: XLA_IR_DEBUG + value: "1" + 
- name: XLA_HLO_DEBUG + value: "1" + command: + - bash + - -c + - | + set -ex + + git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git + cd accelerator-microbenchmarks + git checkout tpu7x-auto + pip install -r requirements.txt + + GCS_BUCKET_DIR=${GCS_PATH} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + resources: + requests: + google.com/tpu: 4 + limits: + google.com/tpu: 4 \ No newline at end of file diff --git a/Ironwood/guides/automation/tpu7x-2x2x4-gemm.yaml b/Ironwood/guides/automation/tpu7x-2x2x4-gemm.yaml new file mode 100644 index 00000000..e514f1f6 --- /dev/null +++ b/Ironwood/guides/automation/tpu7x-2x2x4-gemm.yaml @@ -0,0 +1,57 @@ +apiVersion: v1 +kind: Service +metadata: + name: headless-svc-${JOB_NAME} +spec: + clusterIP: None + selector: + job-name: ${JOB_NAME} +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: ${JOB_NAME} +spec: + completionMode: Indexed + parallelism: 4 + completions: 4 + backoffLimit: 0 + template: + spec: + subdomain: headless-svc-${JOB_NAME} + restartPolicy: Never + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu7x + cloud.google.com/gke-tpu-topology: 2x2x4 + containers: + - name: jax-tpu + image: python:3.12 + securityContext: + privileged: false + env: + - name: JAX_PLATFORMS + value: "tpu,cpu" + - name: TPU_VMODULE + value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" + - name: XLA_IR_DEBUG + value: "1" + - name: XLA_HLO_DEBUG + value: "1" + command: + - bash + - -c + - | + set -ex + + git clone 
https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git + cd accelerator-microbenchmarks + git checkout tpu7x-auto + pip install -r requirements.txt + + GCS_BUCKET_DIR=${GCS_PATH} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/training/gemm_multiple_run_more.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + resources: + requests: + google.com/tpu: 4 + limits: + google.com/tpu: 4 \ No newline at end of file diff --git a/Ironwood/guides/automation/tpu7x-2x2x4-h2d.yaml b/Ironwood/guides/automation/tpu7x-2x2x4-h2d.yaml new file mode 100644 index 00000000..f48d047f --- /dev/null +++ b/Ironwood/guides/automation/tpu7x-2x2x4-h2d.yaml @@ -0,0 +1,57 @@ +apiVersion: v1 +kind: Service +metadata: + name: headless-svc-${JOB_NAME} +spec: + clusterIP: None + selector: + job-name: ${JOB_NAME} +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: ${JOB_NAME} +spec: + completionMode: Indexed + parallelism: 4 + completions: 4 + backoffLimit: 0 + template: + spec: + subdomain: headless-svc-${JOB_NAME} + restartPolicy: Never + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu7x + cloud.google.com/gke-tpu-topology: 2x2x4 + containers: + - name: jax-tpu + image: python:3.12 + securityContext: + privileged: false + env: + - name: JAX_PLATFORMS + value: "tpu,cpu" + - name: TPU_VMODULE + value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" + - name: XLA_IR_DEBUG + value: "1" + - name: XLA_HLO_DEBUG + value: "1" + command: + - bash + - -c + - | + set -ex + + git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git + cd accelerator-microbenchmarks + git checkout tpu7x-auto + pip install -r requirements.txt + + GCS_BUCKET_DIR=${GCS_PATH} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/host_device/host_device.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + resources: + requests: + google.com/tpu: 4 + limits: + google.com/tpu: 4 \ No newline at end of 
file diff --git a/Ironwood/guides/automation/tpu7x-2x4x4-collectives.yaml b/Ironwood/guides/automation/tpu7x-2x4x4-collectives.yaml new file mode 100644 index 00000000..4b0617c5 --- /dev/null +++ b/Ironwood/guides/automation/tpu7x-2x4x4-collectives.yaml @@ -0,0 +1,59 @@ +apiVersion: v1 +kind: Service +metadata: + name: headless-svc-${JOB_NAME} +spec: + clusterIP: None + selector: + job-name: ${JOB_NAME} +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: ${JOB_NAME} +spec: + completionMode: Indexed + parallelism: 8 + completions: 8 + backoffLimit: 0 + template: + spec: + subdomain: headless-svc-${JOB_NAME} + restartPolicy: Never + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu7x + cloud.google.com/gke-tpu-topology: 2x4x4 + containers: + - name: jax-tpu + image: python:3.12 + securityContext: + privileged: false + env: + - name: JAX_PLATFORMS + value: "tpu,cpu" + - name: TPU_VMODULE + value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" + - name: XLA_IR_DEBUG + value: "1" + - name: XLA_HLO_DEBUG + value: "1" + command: + - bash + - -c + - | + set -ex + + git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git + cd accelerator-microbenchmarks + git checkout tpu7x-auto + pip install -r requirements.txt + + GCS_BUCKET_DIR=${GCS_PATH} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + resources: + requests: + google.com/tpu: 4 + limits: + google.com/tpu: 4 \ No newline at end of file diff --git a/Ironwood/guides/automation/tpu7x-2x4x4-gemm.yaml 
b/Ironwood/guides/automation/tpu7x-2x4x4-gemm.yaml new file mode 100644 index 00000000..e07a1072 --- /dev/null +++ b/Ironwood/guides/automation/tpu7x-2x4x4-gemm.yaml @@ -0,0 +1,57 @@ +apiVersion: v1 +kind: Service +metadata: + name: headless-svc-${JOB_NAME} +spec: + clusterIP: None + selector: + job-name: ${JOB_NAME} +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: ${JOB_NAME} +spec: + completionMode: Indexed + parallelism: 8 + completions: 8 + backoffLimit: 0 + template: + spec: + subdomain: headless-svc-${JOB_NAME} + restartPolicy: Never + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu7x + cloud.google.com/gke-tpu-topology: 2x4x4 + containers: + - name: jax-tpu + image: python:3.12 + securityContext: + privileged: false + env: + - name: JAX_PLATFORMS + value: "tpu,cpu" + - name: TPU_VMODULE + value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" + - name: XLA_IR_DEBUG + value: "1" + - name: XLA_HLO_DEBUG + value: "1" + command: + - bash + - -c + - | + set -ex + + git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git + cd accelerator-microbenchmarks + git checkout tpu7x-auto + pip install -r requirements.txt + + GCS_BUCKET_DIR=${GCS_PATH} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/training/gemm_multiple_run_more.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + resources: + requests: + google.com/tpu: 4 + limits: + google.com/tpu: 4 \ No newline at end of file diff --git a/Ironwood/guides/automation/tpu7x-2x4x4-h2d.yaml b/Ironwood/guides/automation/tpu7x-2x4x4-h2d.yaml new file mode 100644 index 00000000..258aa8ee --- /dev/null +++ b/Ironwood/guides/automation/tpu7x-2x4x4-h2d.yaml @@ -0,0 +1,57 @@ +apiVersion: v1 +kind: Service +metadata: + name: headless-svc-${JOB_NAME} +spec: + clusterIP: None + selector: + job-name: ${JOB_NAME} +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: ${JOB_NAME} +spec: + completionMode: Indexed + 
parallelism: 8 + completions: 8 + backoffLimit: 0 + template: + spec: + subdomain: headless-svc-${JOB_NAME} + restartPolicy: Never + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu7x + cloud.google.com/gke-tpu-topology: 2x4x4 + containers: + - name: jax-tpu + image: python:3.12 + securityContext: + privileged: false + env: + - name: JAX_PLATFORMS + value: "tpu,cpu" + - name: TPU_VMODULE + value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" + - name: XLA_IR_DEBUG + value: "1" + - name: XLA_HLO_DEBUG + value: "1" + command: + - bash + - -c + - | + set -ex + + git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git + cd accelerator-microbenchmarks + git checkout tpu7x-auto + pip install -r requirements.txt + + GCS_BUCKET_DIR=${GCS_PATH} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/host_device/host_device.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + resources: + requests: + google.com/tpu: 4 + limits: + google.com/tpu: 4 \ No newline at end of file diff --git a/Ironwood/guides/automation/tpu7x-4x4x4-collectives.yaml b/Ironwood/guides/automation/tpu7x-4x4x4-collectives.yaml new file mode 100644 index 00000000..036bbfeb --- /dev/null +++ b/Ironwood/guides/automation/tpu7x-4x4x4-collectives.yaml @@ -0,0 +1,59 @@ +apiVersion: v1 +kind: Service +metadata: + name: headless-svc-${JOB_NAME} +spec: + clusterIP: None + selector: + job-name: ${JOB_NAME} +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: ${JOB_NAME} +spec: + completionMode: Indexed + parallelism: 16 + completions: 16 + backoffLimit: 0 + template: + spec: + subdomain: headless-svc-${JOB_NAME} + restartPolicy: Never + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu7x + cloud.google.com/gke-tpu-topology: 4x4x4 + containers: + - name: jax-tpu + image: python:3.12 + securityContext: + privileged: false + env: + - name: JAX_PLATFORMS + value: "tpu,cpu" + - name: TPU_VMODULE + value: 
"singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" + - name: XLA_IR_DEBUG + value: "1" + - name: XLA_HLO_DEBUG + value: "1" + command: + - bash + - -c + - | + set -ex + + git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git + cd accelerator-microbenchmarks + git checkout tpu7x-auto + pip install -r requirements.txt + + GCS_BUCKET_DIR=${GCS_PATH} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + resources: + requests: + google.com/tpu: 4 + limits: + google.com/tpu: 4 \ No newline at end of file diff --git a/Ironwood/guides/automation/tpu7x-4x4x4-gemm.yaml b/Ironwood/guides/automation/tpu7x-4x4x4-gemm.yaml new file mode 100644 index 00000000..00f09803 --- /dev/null +++ b/Ironwood/guides/automation/tpu7x-4x4x4-gemm.yaml @@ -0,0 +1,57 @@ +apiVersion: v1 +kind: Service +metadata: + name: headless-svc-${JOB_NAME} +spec: + clusterIP: None + selector: + job-name: ${JOB_NAME} +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: ${JOB_NAME} +spec: + completionMode: Indexed + parallelism: 16 + completions: 16 + backoffLimit: 0 + template: + spec: + subdomain: headless-svc-${JOB_NAME} + restartPolicy: Never + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu7x + cloud.google.com/gke-tpu-topology: 4x4x4 + containers: + - name: jax-tpu + image: python:3.12 + securityContext: + privileged: false + env: + - name: JAX_PLATFORMS + value: "tpu,cpu" + - name: TPU_VMODULE + value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" + - name: 
XLA_IR_DEBUG + value: "1" + - name: XLA_HLO_DEBUG + value: "1" + command: + - bash + - -c + - | + set -ex + + git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git + cd accelerator-microbenchmarks + git checkout tpu7x-auto + pip install -r requirements.txt + + GCS_BUCKET_DIR=${GCS_PATH} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/training/gemm_multiple_run_more.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + resources: + requests: + google.com/tpu: 4 + limits: + google.com/tpu: 4 \ No newline at end of file diff --git a/Ironwood/guides/automation/tpu7x-4x4x4-h2d.yaml b/Ironwood/guides/automation/tpu7x-4x4x4-h2d.yaml new file mode 100644 index 00000000..21ddef16 --- /dev/null +++ b/Ironwood/guides/automation/tpu7x-4x4x4-h2d.yaml @@ -0,0 +1,57 @@ +apiVersion: v1 +kind: Service +metadata: + name: headless-svc-${JOB_NAME} +spec: + clusterIP: None + selector: + job-name: ${JOB_NAME} +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: ${JOB_NAME} +spec: + completionMode: Indexed + parallelism: 16 + completions: 16 + backoffLimit: 0 + template: + spec: + subdomain: headless-svc-${JOB_NAME} + restartPolicy: Never + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu7x + cloud.google.com/gke-tpu-topology: 4x4x4 + containers: + - name: jax-tpu + image: python:3.12 + securityContext: + privileged: false + env: + - name: JAX_PLATFORMS + value: "tpu,cpu" + - name: TPU_VMODULE + value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" + - name: XLA_IR_DEBUG + value: "1" + - name: XLA_HLO_DEBUG + value: "1" + command: + - bash + - -c + - | + set -ex + + git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git + cd accelerator-microbenchmarks + git checkout tpu7x-auto + pip install -r requirements.txt + + GCS_BUCKET_DIR=${GCS_PATH} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/host_device/host_device.yaml" 
--gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + resources: + requests: + google.com/tpu: 4 + limits: + google.com/tpu: 4 \ No newline at end of file diff --git a/Ironwood/guides/automation/tpu7x-4x4x8-collectives.yaml b/Ironwood/guides/automation/tpu7x-4x4x8-collectives.yaml new file mode 100644 index 00000000..97adb283 --- /dev/null +++ b/Ironwood/guides/automation/tpu7x-4x4x8-collectives.yaml @@ -0,0 +1,59 @@ +apiVersion: v1 +kind: Service +metadata: + name: headless-svc-${JOB_NAME} +spec: + clusterIP: None + selector: + job-name: ${JOB_NAME} +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: ${JOB_NAME} +spec: + completionMode: Indexed + parallelism: 32 + completions: 32 + backoffLimit: 0 + template: + spec: + subdomain: headless-svc-${JOB_NAME} + restartPolicy: Never + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu7x + cloud.google.com/gke-tpu-topology: 4x4x8 + containers: + - name: jax-tpu + image: python:3.12 + securityContext: + privileged: false + env: + - name: JAX_PLATFORMS + value: "tpu,cpu" + - name: TPU_VMODULE + value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" + - name: XLA_IR_DEBUG + value: "1" + - name: XLA_HLO_DEBUG + value: "1" + command: + - bash + - -c + - | + set -ex + + git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git + cd accelerator-microbenchmarks + git checkout tpu7x-auto + pip install -r requirements.txt + + GCS_BUCKET_DIR=${GCS_PATH} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + resources: + requests: + google.com/tpu: 4 + limits: + 
google.com/tpu: 4 \ No newline at end of file diff --git a/Ironwood/guides/automation/tpu7x-4x4x8-gemm.yaml b/Ironwood/guides/automation/tpu7x-4x4x8-gemm.yaml new file mode 100644 index 00000000..e186c51c --- /dev/null +++ b/Ironwood/guides/automation/tpu7x-4x4x8-gemm.yaml @@ -0,0 +1,57 @@ +apiVersion: v1 +kind: Service +metadata: + name: headless-svc-${JOB_NAME} +spec: + clusterIP: None + selector: + job-name: ${JOB_NAME} +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: ${JOB_NAME} +spec: + completionMode: Indexed + parallelism: 32 + completions: 32 + backoffLimit: 0 + template: + spec: + subdomain: headless-svc-${JOB_NAME} + restartPolicy: Never + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu7x + cloud.google.com/gke-tpu-topology: 4x4x8 + containers: + - name: jax-tpu + image: python:3.12 + securityContext: + privileged: false + env: + - name: JAX_PLATFORMS + value: "tpu,cpu" + - name: TPU_VMODULE + value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" + - name: XLA_IR_DEBUG + value: "1" + - name: XLA_HLO_DEBUG + value: "1" + command: + - bash + - -c + - | + set -ex + + git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git + cd accelerator-microbenchmarks + git checkout tpu7x-auto + pip install -r requirements.txt + + GCS_BUCKET_DIR=${GCS_PATH} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/training/gemm_multiple_run_more.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + resources: + requests: + google.com/tpu: 4 + limits: + google.com/tpu: 4 \ No newline at end of file diff --git a/Ironwood/guides/automation/tpu7x-4x4x8-h2d.yaml b/Ironwood/guides/automation/tpu7x-4x4x8-h2d.yaml new file mode 100644 index 00000000..712d94b7 --- /dev/null +++ b/Ironwood/guides/automation/tpu7x-4x4x8-h2d.yaml @@ -0,0 +1,57 @@ +apiVersion: v1 +kind: Service +metadata: + name: headless-svc-${JOB_NAME} +spec: + clusterIP: None + selector: + job-name: 
${JOB_NAME} +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: ${JOB_NAME} +spec: + completionMode: Indexed + parallelism: 32 + completions: 32 + backoffLimit: 0 + template: + spec: + subdomain: headless-svc-${JOB_NAME} + restartPolicy: Never + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu7x + cloud.google.com/gke-tpu-topology: 4x4x8 + containers: + - name: jax-tpu + image: python:3.12 + securityContext: + privileged: false + env: + - name: JAX_PLATFORMS + value: "tpu,cpu" + - name: TPU_VMODULE + value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" + - name: XLA_IR_DEBUG + value: "1" + - name: XLA_HLO_DEBUG + value: "1" + command: + - bash + - -c + - | + set -ex + + git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git + cd accelerator-microbenchmarks + git checkout tpu7x-auto + pip install -r requirements.txt + + GCS_BUCKET_DIR=${GCS_PATH} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/host_device/host_device.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + resources: + requests: + google.com/tpu: 4 + limits: + google.com/tpu: 4 \ No newline at end of file From 926a7c02949497169ce16e9b30abba74ff79cf3b Mon Sep 17 00:00:00 2001 From: Hong-Yi Lin Date: Thu, 29 Jan 2026 08:34:47 +0000 Subject: [PATCH 11/88] [Automation] Update `automation_launch.sh` --- .../guides/automation/automation_launch.sh | 26 +++++++++++++------ 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/Ironwood/guides/automation/automation_launch.sh b/Ironwood/guides/automation/automation_launch.sh index 7ff69a19..678285cf 100644 --- a/Ironwood/guides/automation/automation_launch.sh +++ b/Ironwood/guides/automation/automation_launch.sh @@ -3,20 +3,29 @@ ###################################################################### # USER INPUT ###################################################################### -export GCS_PATH="" +TIMESTAMP=$(date +%Y-%m-%d_%H-%M-%S) 
+export GCS_BUCKET_ROOT_DIR="" -yaml_names=("tpu7x-2x2x1-hbm.yaml" "tpu7x-2x2x2-hbm.yaml") +MAX_RETRIES=3 +TIMEOUT_SECOND=3600 + +yaml_names=( + "tpu7x-2x2x1-hbm.yaml" "tpu7x-2x2x1-h2d.yaml" "tpu7x-2x2x1-gemm.yaml" "tpu7x-2x2x1-collectives.yaml" + "tpu7x-2x2x2-hbm.yaml" "tpu7x-2x2x2-h2d.yaml" "tpu7x-2x2x2-gemm.yaml" "tpu7x-2x2x2-collectives.yaml" + "tpu7x-2x2x4-hbm.yaml" "tpu7x-2x2x4-h2d.yaml" "tpu7x-2x2x4-gemm.yaml" "tpu7x-2x2x4-collectives.yaml" + "tpu7x-2x4x4-hbm.yaml" "tpu7x-2x4x4-h2d.yaml" "tpu7x-2x4x4-gemm.yaml" "tpu7x-2x4x4-collectives.yaml" +) ###################################################################### # VALIDATION & SETUP ###################################################################### -if [[ -z "${GCS_PATH}" || "${GCS_PATH}" != "gs://"* ]]; then - echo "Error: GCS_PATH must be set and start with gs://" +if [[ -z "${GCS_BUCKET_ROOT_DIR}" || "${GCS_BUCKET_ROOT_DIR}" != "gs://"* ]]; then + echo "Error: GCS_BUCKET_ROOT_DIR must be set and start with gs://" exit 1 fi -echo "The intermediate result will be written to ${GCS_PATH}" +echo "The intermediate result will be written to ${GCS_BUCKET_ROOT_DIR}" required_topologies=($(printf "%s\n" "${yaml_names[@]}" | grep -oE '[0-9]+x[0-9]+x[0-9]+' | sort -u)) @@ -76,6 +85,7 @@ apply_and_wait() { # Derive job name: remove .yaml, lowercase, replace _ with - local job_name=$(basename "${yaml_file}" .yaml | tr '[:upper:]' '[:lower:]' | tr '_' '-') export JOB_NAME="${job_name}" + local GCS_PATH="${GCS_BUCKET_ROOT_DIR}/${job_name}" echo "Launching job: ${filepath} (name: ${JOB_NAME})" envsubst '${JOB_NAME} ${GCS_PATH}' < "${filepath}" | kubectl apply -f - @@ -87,9 +97,9 @@ apply_and_wait() { local yaml_file="${yaml_files[$i]}" local filepath="${SCRIPT_DIR}/${yaml_file}" local job_name="${job_names_in_batch[$i]}" - + local GCS_PATH="${GCS_BUCKET_ROOT_DIR}/${job_name}" ( - wait_for_job_completion "${job_name}" 1800 + wait_for_job_completion "${job_name}" ${TIMEOUT_SECOND} wait_status=$? 
export JOB_NAME="${job_name}" @@ -110,7 +120,6 @@ apply_and_wait() { # Retry loop current_batch=("${yaml_names[@]}") -MAX_RETRIES=3 for (( retry=1; retry<=MAX_RETRIES; retry++ )); do apply_and_wait "${current_batch[@]}" @@ -148,6 +157,7 @@ if [[ ${#FAILED_JOBS[@]} -gt 0 ]]; then echo -e "\nTo retry manually, run:" for yaml_file in "${FAILED_JOBS[@]}"; do job_name=$(basename "${yaml_file}" .yaml | tr '[:upper:]' '[:lower:]' | tr '_' '-') + GCS_PATH="${GCS_BUCKET_ROOT_DIR}/${job_name}" echo "JOB_NAME=\"${job_name}\" GCS_PATH=\"${GCS_PATH}\" envsubst '\${JOB_NAME} \${GCS_PATH}' < \"${SCRIPT_DIR}/${yaml_file}\" | kubectl apply -f -" done else From a86ab39f289faf1a7959a9bffd27da617ff8369d Mon Sep 17 00:00:00 2001 From: Hong-Yi Lin Date: Thu, 29 Jan 2026 12:08:15 +0000 Subject: [PATCH 12/88] [Automation] Enable kueue to prevent deadlock from race condition --- .../guides/automation/automation_launch.sh | 10 ++++- Ironwood/guides/automation/job-queue.yaml | 41 +++++++++++++++++++ .../automation/tpu7x-2x2x1-collectives.yaml | 3 ++ .../guides/automation/tpu7x-2x2x1-gemm.yaml | 3 ++ .../guides/automation/tpu7x-2x2x1-h2d.yaml | 3 ++ .../guides/automation/tpu7x-2x2x1-hbm.yaml | 3 ++ .../automation/tpu7x-2x2x2-collectives.yaml | 3 ++ .../guides/automation/tpu7x-2x2x2-gemm.yaml | 3 ++ .../guides/automation/tpu7x-2x2x2-h2d.yaml | 3 ++ .../guides/automation/tpu7x-2x2x2-hbm.yaml | 3 ++ .../automation/tpu7x-2x2x4-collectives.yaml | 3 ++ .../guides/automation/tpu7x-2x2x4-gemm.yaml | 3 ++ .../guides/automation/tpu7x-2x2x4-h2d.yaml | 3 ++ .../guides/automation/tpu7x-2x2x4-hbm.yaml | 3 ++ .../automation/tpu7x-2x4x4-collectives.yaml | 3 ++ .../guides/automation/tpu7x-2x4x4-gemm.yaml | 3 ++ .../guides/automation/tpu7x-2x4x4-h2d.yaml | 3 ++ .../guides/automation/tpu7x-2x4x4-hbm.yaml | 3 ++ .../automation/tpu7x-4x4x4-collectives.yaml | 3 ++ .../guides/automation/tpu7x-4x4x4-gemm.yaml | 3 ++ .../guides/automation/tpu7x-4x4x4-h2d.yaml | 3 ++ .../guides/automation/tpu7x-4x4x4-hbm.yaml | 3 ++ 
.../automation/tpu7x-4x4x8-collectives.yaml | 3 ++ .../guides/automation/tpu7x-4x4x8-gemm.yaml | 3 ++ .../guides/automation/tpu7x-4x4x8-h2d.yaml | 3 ++ .../guides/automation/tpu7x-4x4x8-hbm.yaml | 3 ++ 26 files changed, 121 insertions(+), 2 deletions(-) create mode 100644 Ironwood/guides/automation/job-queue.yaml diff --git a/Ironwood/guides/automation/automation_launch.sh b/Ironwood/guides/automation/automation_launch.sh index 678285cf..3079da66 100644 --- a/Ironwood/guides/automation/automation_launch.sh +++ b/Ironwood/guides/automation/automation_launch.sh @@ -34,6 +34,12 @@ if ! bash "${SCRIPT_DIR}/check_node_pool_setup.sh" "${required_topologies[@]}"; exit 1 fi +for topology in "${required_topologies[@]}"; do + export TOPOLOGY="${topology}" + export TPUS=$(echo "${TOPOLOGY}" | sed 's/x/*/g' | bc) + envsubst '${TOPOLOGY} ${TPUS}' < ${SCRIPT_DIR}/job-queue.yaml | kubectl apply -f - +done + ###################################################################### # LAUNCH JOBS & WAIT FOR COMPLETION ###################################################################### @@ -85,7 +91,7 @@ apply_and_wait() { # Derive job name: remove .yaml, lowercase, replace _ with - local job_name=$(basename "${yaml_file}" .yaml | tr '[:upper:]' '[:lower:]' | tr '_' '-') export JOB_NAME="${job_name}" - local GCS_PATH="${GCS_BUCKET_ROOT_DIR}/${job_name}" + export GCS_PATH="${GCS_BUCKET_ROOT_DIR}/${job_name}" echo "Launching job: ${filepath} (name: ${JOB_NAME})" envsubst '${JOB_NAME} ${GCS_PATH}' < "${filepath}" | kubectl apply -f - @@ -97,7 +103,7 @@ apply_and_wait() { local yaml_file="${yaml_files[$i]}" local filepath="${SCRIPT_DIR}/${yaml_file}" local job_name="${job_names_in_batch[$i]}" - local GCS_PATH="${GCS_BUCKET_ROOT_DIR}/${job_name}" + export GCS_PATH="${GCS_BUCKET_ROOT_DIR}/${job_name}" ( wait_for_job_completion "${job_name}" ${TIMEOUT_SECOND} wait_status=$? 
diff --git a/Ironwood/guides/automation/job-queue.yaml b/Ironwood/guides/automation/job-queue.yaml new file mode 100644 index 00000000..b13f5573 --- /dev/null +++ b/Ironwood/guides/automation/job-queue.yaml @@ -0,0 +1,41 @@ +apiVersion: kueue.x-k8s.io/v1beta2 +kind: ResourceFlavor +metadata: + name: "flavor-${TOPOLOGY}" +spec: + nodeLabels: + cloud.google.com/gke-tpu-accelerator: tpu7x + cloud.google.com/gke-tpu-topology: ${TOPOLOGY} +--- +apiVersion: kueue.x-k8s.io/v1beta2 +kind: ClusterQueue +metadata: + name: cluster-queue-${TOPOLOGY} +spec: + flavorFungibility: + whenCanBorrow: MayStopSearch + whenCanPreempt: TryNextFlavor + namespaceSelector: {} + preemption: + borrowWithinCohort: + policy: Never + reclaimWithinCohort: Never + withinClusterQueue: LowerPriority + queueingStrategy: BestEffortFIFO + resourceGroups: + - coveredResources: + - google.com/tpu + flavors: + - name: flavor-${TOPOLOGY} + resources: + - name: google.com/tpu + nominalQuota: ${TPUS} + stopPolicy: None +--- +apiVersion: kueue.x-k8s.io/v1beta1 +kind: LocalQueue +metadata: + namespace: default + name: "user-queue-${TOPOLOGY}" +spec: + clusterQueue: "cluster-queue-${TOPOLOGY}" \ No newline at end of file diff --git a/Ironwood/guides/automation/tpu7x-2x2x1-collectives.yaml b/Ironwood/guides/automation/tpu7x-2x2x1-collectives.yaml index 1671f2ec..fc878ea0 100644 --- a/Ironwood/guides/automation/tpu7x-2x2x1-collectives.yaml +++ b/Ironwood/guides/automation/tpu7x-2x2x1-collectives.yaml @@ -11,8 +11,11 @@ apiVersion: batch/v1 kind: Job metadata: name: ${JOB_NAME} + labels: + kueue.x-k8s.io/queue-name: user-queue-2x2x1 spec: completionMode: Indexed + suspend: true parallelism: 1 completions: 1 backoffLimit: 0 diff --git a/Ironwood/guides/automation/tpu7x-2x2x1-gemm.yaml b/Ironwood/guides/automation/tpu7x-2x2x1-gemm.yaml index 1929d886..1c9fa143 100644 --- a/Ironwood/guides/automation/tpu7x-2x2x1-gemm.yaml +++ b/Ironwood/guides/automation/tpu7x-2x2x1-gemm.yaml @@ -11,8 +11,11 @@ apiVersion: batch/v1 
kind: Job metadata: name: ${JOB_NAME} + labels: + kueue.x-k8s.io/queue-name: user-queue-2x2x1 spec: completionMode: Indexed + suspend: true parallelism: 1 completions: 1 backoffLimit: 0 diff --git a/Ironwood/guides/automation/tpu7x-2x2x1-h2d.yaml b/Ironwood/guides/automation/tpu7x-2x2x1-h2d.yaml index 3f36f54e..3f662be5 100644 --- a/Ironwood/guides/automation/tpu7x-2x2x1-h2d.yaml +++ b/Ironwood/guides/automation/tpu7x-2x2x1-h2d.yaml @@ -11,8 +11,11 @@ apiVersion: batch/v1 kind: Job metadata: name: ${JOB_NAME} + labels: + kueue.x-k8s.io/queue-name: user-queue-2x2x1 spec: completionMode: Indexed + suspend: true parallelism: 1 completions: 1 backoffLimit: 0 diff --git a/Ironwood/guides/automation/tpu7x-2x2x1-hbm.yaml b/Ironwood/guides/automation/tpu7x-2x2x1-hbm.yaml index 7455961d..e84e13ea 100644 --- a/Ironwood/guides/automation/tpu7x-2x2x1-hbm.yaml +++ b/Ironwood/guides/automation/tpu7x-2x2x1-hbm.yaml @@ -11,8 +11,11 @@ apiVersion: batch/v1 kind: Job metadata: name: ${JOB_NAME} + labels: + kueue.x-k8s.io/queue-name: user-queue-2x2x1 spec: completionMode: Indexed + suspend: true parallelism: 1 completions: 1 backoffLimit: 0 diff --git a/Ironwood/guides/automation/tpu7x-2x2x2-collectives.yaml b/Ironwood/guides/automation/tpu7x-2x2x2-collectives.yaml index 7758e449..c2efba03 100644 --- a/Ironwood/guides/automation/tpu7x-2x2x2-collectives.yaml +++ b/Ironwood/guides/automation/tpu7x-2x2x2-collectives.yaml @@ -11,8 +11,11 @@ apiVersion: batch/v1 kind: Job metadata: name: ${JOB_NAME} + labels: + kueue.x-k8s.io/queue-name: user-queue-2x2x2 spec: completionMode: Indexed + suspend: true parallelism: 2 completions: 2 backoffLimit: 0 diff --git a/Ironwood/guides/automation/tpu7x-2x2x2-gemm.yaml b/Ironwood/guides/automation/tpu7x-2x2x2-gemm.yaml index c869fbdb..3f6004bd 100644 --- a/Ironwood/guides/automation/tpu7x-2x2x2-gemm.yaml +++ b/Ironwood/guides/automation/tpu7x-2x2x2-gemm.yaml @@ -11,8 +11,11 @@ apiVersion: batch/v1 kind: Job metadata: name: ${JOB_NAME} + labels: + 
kueue.x-k8s.io/queue-name: user-queue-2x2x2 spec: completionMode: Indexed + suspend: true parallelism: 2 completions: 2 backoffLimit: 0 diff --git a/Ironwood/guides/automation/tpu7x-2x2x2-h2d.yaml b/Ironwood/guides/automation/tpu7x-2x2x2-h2d.yaml index 34d19110..fbcd556d 100644 --- a/Ironwood/guides/automation/tpu7x-2x2x2-h2d.yaml +++ b/Ironwood/guides/automation/tpu7x-2x2x2-h2d.yaml @@ -11,8 +11,11 @@ apiVersion: batch/v1 kind: Job metadata: name: ${JOB_NAME} + labels: + kueue.x-k8s.io/queue-name: user-queue-2x2x2 spec: completionMode: Indexed + suspend: true parallelism: 2 completions: 2 backoffLimit: 0 diff --git a/Ironwood/guides/automation/tpu7x-2x2x2-hbm.yaml b/Ironwood/guides/automation/tpu7x-2x2x2-hbm.yaml index 3f3f7929..71793e3f 100644 --- a/Ironwood/guides/automation/tpu7x-2x2x2-hbm.yaml +++ b/Ironwood/guides/automation/tpu7x-2x2x2-hbm.yaml @@ -11,8 +11,11 @@ apiVersion: batch/v1 kind: Job metadata: name: ${JOB_NAME} + labels: + kueue.x-k8s.io/queue-name: user-queue-2x2x2 spec: completionMode: Indexed + suspend: true parallelism: 2 completions: 2 backoffLimit: 0 diff --git a/Ironwood/guides/automation/tpu7x-2x2x4-collectives.yaml b/Ironwood/guides/automation/tpu7x-2x2x4-collectives.yaml index dce47004..a42b04ae 100644 --- a/Ironwood/guides/automation/tpu7x-2x2x4-collectives.yaml +++ b/Ironwood/guides/automation/tpu7x-2x2x4-collectives.yaml @@ -11,8 +11,11 @@ apiVersion: batch/v1 kind: Job metadata: name: ${JOB_NAME} + labels: + kueue.x-k8s.io/queue-name: user-queue-2x2x4 spec: completionMode: Indexed + suspend: true parallelism: 4 completions: 4 backoffLimit: 0 diff --git a/Ironwood/guides/automation/tpu7x-2x2x4-gemm.yaml b/Ironwood/guides/automation/tpu7x-2x2x4-gemm.yaml index e514f1f6..fee014e9 100644 --- a/Ironwood/guides/automation/tpu7x-2x2x4-gemm.yaml +++ b/Ironwood/guides/automation/tpu7x-2x2x4-gemm.yaml @@ -11,8 +11,11 @@ apiVersion: batch/v1 kind: Job metadata: name: ${JOB_NAME} + labels: + kueue.x-k8s.io/queue-name: user-queue-2x2x4 spec: 
completionMode: Indexed + suspend: true parallelism: 4 completions: 4 backoffLimit: 0 diff --git a/Ironwood/guides/automation/tpu7x-2x2x4-h2d.yaml b/Ironwood/guides/automation/tpu7x-2x2x4-h2d.yaml index f48d047f..799c5b35 100644 --- a/Ironwood/guides/automation/tpu7x-2x2x4-h2d.yaml +++ b/Ironwood/guides/automation/tpu7x-2x2x4-h2d.yaml @@ -11,8 +11,11 @@ apiVersion: batch/v1 kind: Job metadata: name: ${JOB_NAME} + labels: + kueue.x-k8s.io/queue-name: user-queue-2x2x4 spec: completionMode: Indexed + suspend: true parallelism: 4 completions: 4 backoffLimit: 0 diff --git a/Ironwood/guides/automation/tpu7x-2x2x4-hbm.yaml b/Ironwood/guides/automation/tpu7x-2x2x4-hbm.yaml index ed87aaf8..100e1f7f 100644 --- a/Ironwood/guides/automation/tpu7x-2x2x4-hbm.yaml +++ b/Ironwood/guides/automation/tpu7x-2x2x4-hbm.yaml @@ -11,8 +11,11 @@ apiVersion: batch/v1 kind: Job metadata: name: ${JOB_NAME} + labels: + kueue.x-k8s.io/queue-name: user-queue-2x2x4 spec: completionMode: Indexed + suspend: true parallelism: 4 completions: 4 backoffLimit: 0 diff --git a/Ironwood/guides/automation/tpu7x-2x4x4-collectives.yaml b/Ironwood/guides/automation/tpu7x-2x4x4-collectives.yaml index 4b0617c5..de6f7106 100644 --- a/Ironwood/guides/automation/tpu7x-2x4x4-collectives.yaml +++ b/Ironwood/guides/automation/tpu7x-2x4x4-collectives.yaml @@ -11,8 +11,11 @@ apiVersion: batch/v1 kind: Job metadata: name: ${JOB_NAME} + labels: + kueue.x-k8s.io/queue-name: user-queue-2x4x4 spec: completionMode: Indexed + suspend: true parallelism: 8 completions: 8 backoffLimit: 0 diff --git a/Ironwood/guides/automation/tpu7x-2x4x4-gemm.yaml b/Ironwood/guides/automation/tpu7x-2x4x4-gemm.yaml index e07a1072..d51c9c31 100644 --- a/Ironwood/guides/automation/tpu7x-2x4x4-gemm.yaml +++ b/Ironwood/guides/automation/tpu7x-2x4x4-gemm.yaml @@ -11,8 +11,11 @@ apiVersion: batch/v1 kind: Job metadata: name: ${JOB_NAME} + labels: + kueue.x-k8s.io/queue-name: user-queue-2x4x4 spec: completionMode: Indexed + suspend: true parallelism: 8 
completions: 8 backoffLimit: 0 diff --git a/Ironwood/guides/automation/tpu7x-2x4x4-h2d.yaml b/Ironwood/guides/automation/tpu7x-2x4x4-h2d.yaml index 258aa8ee..30a25c4c 100644 --- a/Ironwood/guides/automation/tpu7x-2x4x4-h2d.yaml +++ b/Ironwood/guides/automation/tpu7x-2x4x4-h2d.yaml @@ -11,8 +11,11 @@ apiVersion: batch/v1 kind: Job metadata: name: ${JOB_NAME} + labels: + kueue.x-k8s.io/queue-name: user-queue-2x4x4 spec: completionMode: Indexed + suspend: true parallelism: 8 completions: 8 backoffLimit: 0 diff --git a/Ironwood/guides/automation/tpu7x-2x4x4-hbm.yaml b/Ironwood/guides/automation/tpu7x-2x4x4-hbm.yaml index 57182026..a438b93d 100644 --- a/Ironwood/guides/automation/tpu7x-2x4x4-hbm.yaml +++ b/Ironwood/guides/automation/tpu7x-2x4x4-hbm.yaml @@ -11,8 +11,11 @@ apiVersion: batch/v1 kind: Job metadata: name: ${JOB_NAME} + labels: + kueue.x-k8s.io/queue-name: user-queue-2x4x4 spec: completionMode: Indexed + suspend: true parallelism: 8 completions: 8 backoffLimit: 0 diff --git a/Ironwood/guides/automation/tpu7x-4x4x4-collectives.yaml b/Ironwood/guides/automation/tpu7x-4x4x4-collectives.yaml index 036bbfeb..828d1352 100644 --- a/Ironwood/guides/automation/tpu7x-4x4x4-collectives.yaml +++ b/Ironwood/guides/automation/tpu7x-4x4x4-collectives.yaml @@ -11,8 +11,11 @@ apiVersion: batch/v1 kind: Job metadata: name: ${JOB_NAME} + labels: + kueue.x-k8s.io/queue-name: user-queue-4x4x4 spec: completionMode: Indexed + suspend: true parallelism: 16 completions: 16 backoffLimit: 0 diff --git a/Ironwood/guides/automation/tpu7x-4x4x4-gemm.yaml b/Ironwood/guides/automation/tpu7x-4x4x4-gemm.yaml index 00f09803..ef9220f6 100644 --- a/Ironwood/guides/automation/tpu7x-4x4x4-gemm.yaml +++ b/Ironwood/guides/automation/tpu7x-4x4x4-gemm.yaml @@ -11,8 +11,11 @@ apiVersion: batch/v1 kind: Job metadata: name: ${JOB_NAME} + labels: + kueue.x-k8s.io/queue-name: user-queue-4x4x4 spec: completionMode: Indexed + suspend: true parallelism: 16 completions: 16 backoffLimit: 0 diff --git 
a/Ironwood/guides/automation/tpu7x-4x4x4-h2d.yaml b/Ironwood/guides/automation/tpu7x-4x4x4-h2d.yaml index 21ddef16..da7a4e7c 100644 --- a/Ironwood/guides/automation/tpu7x-4x4x4-h2d.yaml +++ b/Ironwood/guides/automation/tpu7x-4x4x4-h2d.yaml @@ -11,8 +11,11 @@ apiVersion: batch/v1 kind: Job metadata: name: ${JOB_NAME} + labels: + kueue.x-k8s.io/queue-name: user-queue-4x4x4 spec: completionMode: Indexed + suspend: true parallelism: 16 completions: 16 backoffLimit: 0 diff --git a/Ironwood/guides/automation/tpu7x-4x4x4-hbm.yaml b/Ironwood/guides/automation/tpu7x-4x4x4-hbm.yaml index ccce9dc0..0819edad 100644 --- a/Ironwood/guides/automation/tpu7x-4x4x4-hbm.yaml +++ b/Ironwood/guides/automation/tpu7x-4x4x4-hbm.yaml @@ -11,8 +11,11 @@ apiVersion: batch/v1 kind: Job metadata: name: ${JOB_NAME} + labels: + kueue.x-k8s.io/queue-name: user-queue-4x4x4 spec: completionMode: Indexed + suspend: true parallelism: 16 completions: 16 backoffLimit: 0 diff --git a/Ironwood/guides/automation/tpu7x-4x4x8-collectives.yaml b/Ironwood/guides/automation/tpu7x-4x4x8-collectives.yaml index 97adb283..f3bf721d 100644 --- a/Ironwood/guides/automation/tpu7x-4x4x8-collectives.yaml +++ b/Ironwood/guides/automation/tpu7x-4x4x8-collectives.yaml @@ -11,8 +11,11 @@ apiVersion: batch/v1 kind: Job metadata: name: ${JOB_NAME} + labels: + kueue.x-k8s.io/queue-name: user-queue-4x4x8 spec: completionMode: Indexed + suspend: true parallelism: 32 completions: 32 backoffLimit: 0 diff --git a/Ironwood/guides/automation/tpu7x-4x4x8-gemm.yaml b/Ironwood/guides/automation/tpu7x-4x4x8-gemm.yaml index e186c51c..65f93467 100644 --- a/Ironwood/guides/automation/tpu7x-4x4x8-gemm.yaml +++ b/Ironwood/guides/automation/tpu7x-4x4x8-gemm.yaml @@ -11,8 +11,11 @@ apiVersion: batch/v1 kind: Job metadata: name: ${JOB_NAME} + labels: + kueue.x-k8s.io/queue-name: user-queue-4x4x8 spec: completionMode: Indexed + suspend: true parallelism: 32 completions: 32 backoffLimit: 0 diff --git 
a/Ironwood/guides/automation/tpu7x-4x4x8-h2d.yaml b/Ironwood/guides/automation/tpu7x-4x4x8-h2d.yaml index 712d94b7..09d2a5ab 100644 --- a/Ironwood/guides/automation/tpu7x-4x4x8-h2d.yaml +++ b/Ironwood/guides/automation/tpu7x-4x4x8-h2d.yaml @@ -11,8 +11,11 @@ apiVersion: batch/v1 kind: Job metadata: name: ${JOB_NAME} + labels: + kueue.x-k8s.io/queue-name: user-queue-4x4x8 spec: completionMode: Indexed + suspend: true parallelism: 32 completions: 32 backoffLimit: 0 diff --git a/Ironwood/guides/automation/tpu7x-4x4x8-hbm.yaml b/Ironwood/guides/automation/tpu7x-4x4x8-hbm.yaml index 8d8e960e..e6445b54 100644 --- a/Ironwood/guides/automation/tpu7x-4x4x8-hbm.yaml +++ b/Ironwood/guides/automation/tpu7x-4x4x8-hbm.yaml @@ -11,8 +11,11 @@ apiVersion: batch/v1 kind: Job metadata: name: ${JOB_NAME} + labels: + kueue.x-k8s.io/queue-name: user-queue-4x4x8 spec: completionMode: Indexed + suspend: true parallelism: 32 completions: 32 backoffLimit: 0 From f53bf5e336d99ef8c1fb59da06adaeb3daba42cd Mon Sep 17 00:00:00 2001 From: Hong-Yi Lin Date: Thu, 29 Jan 2026 16:07:16 +0000 Subject: [PATCH 13/88] [Automation] Update aggregator --- Ironwood/guides/automation/aggregator.py | 24 +++++++++++++++++++ Ironwood/guides/automation/aggregator.yaml | 19 ++++++++++----- .../guides/automation/automation_launch.sh | 2 +- 3 files changed, 38 insertions(+), 7 deletions(-) create mode 100644 Ironwood/guides/automation/aggregator.py diff --git a/Ironwood/guides/automation/aggregator.py b/Ironwood/guides/automation/aggregator.py new file mode 100644 index 00000000..44889890 --- /dev/null +++ b/Ironwood/guides/automation/aggregator.py @@ -0,0 +1,24 @@ +import argparse +import os +import glob +import pandas as pd +import gcsfs + +def download_from_gcs(bucket_path: str, local_dir: str): + """ + Downloads the content of the GCS bucket path to a local directory. 
+ """ + fs = gcsfs.GCSFileSystem() + gcs_path = bucket_path.replace("gs://", "") + + print(f"Downloading from gs://{gcs_path} to {local_dir}...") + os.makedirs(local_dir, exist_ok=True) + fs.get(gcs_path, local_dir, recursive=True) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Download from GCS and aggregate results locally.") + parser.add_argument("--bucket_path", type=str, required=True, help="The GCS bucket path (gs://...)") + parser.add_argument("--local_dir", type=str, default="./results", help="Local directory to download and aggregate results.") + args = parser.parse_args() + + download_from_gcs(args.bucket_path, args.local_dir) diff --git a/Ironwood/guides/automation/aggregator.yaml b/Ironwood/guides/automation/aggregator.yaml index 2c0ecbd0..04cef00e 100644 --- a/Ironwood/guides/automation/aggregator.yaml +++ b/Ironwood/guides/automation/aggregator.yaml @@ -7,10 +7,17 @@ spec: spec: containers: - name: main-app - image: ubuntu - command: ["/bin/sh", "-c"] - args: + image: python:3.12 + command: + - bash + - -c - | - echo "Collectives job is finally done." - echo "Everything is all fine!" - restartPolicy: Never \ No newline at end of file + set -ex + git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git + cd accelerator-microbenchmarks + git checkout tpu7x-auto + + GCS_BUCKET_DIR=${GCS_PATH} + python Ironwood/guides/automation/aggregator.py --bucket_path=${GCS_BUCKET_DIR} + sleep infinity + restartPolicy: Never diff --git a/Ironwood/guides/automation/automation_launch.sh b/Ironwood/guides/automation/automation_launch.sh index 3079da66..c832f8ac 100644 --- a/Ironwood/guides/automation/automation_launch.sh +++ b/Ironwood/guides/automation/automation_launch.sh @@ -152,7 +152,7 @@ echo "" echo "Jobs completed. Aggregating results..." 
echo "" -kubectl apply -f aggregator.yaml +envsubst '${GCS_BUCKET_ROOT_DIR}' < ${SCRIPT_DIR}/aggregator.yaml | kubectl apply -f - # Print the failed jobs at the end for better visibility. From a06944d11580a2196b8848631bee145c59065e5f Mon Sep 17 00:00:00 2001 From: Hong-Yi Lin Date: Thu, 29 Jan 2026 17:13:01 +0000 Subject: [PATCH 14/88] [Automation] Update aggregator and rename host to device yaml files --- Ironwood/guides/automation/aggregator.py | 9 +++++++++ Ironwood/guides/automation/aggregator.yaml | 5 +++-- Ironwood/guides/automation/automation_launch.sh | 8 ++++---- ...tpu7x-2x2x1-h2d.yaml => tpu7x-2x2x1-host_device.yaml} | 0 ...tpu7x-2x2x2-h2d.yaml => tpu7x-2x2x2-host_device.yaml} | 0 ...tpu7x-2x2x4-h2d.yaml => tpu7x-2x2x4-host_device.yaml} | 0 ...tpu7x-2x4x4-h2d.yaml => tpu7x-2x4x4-host_device.yaml} | 0 ...tpu7x-4x4x4-h2d.yaml => tpu7x-4x4x4-host_device.yaml} | 0 ...tpu7x-4x4x8-h2d.yaml => tpu7x-4x4x8-host_device.yaml} | 0 9 files changed, 16 insertions(+), 6 deletions(-) rename Ironwood/guides/automation/{tpu7x-2x2x1-h2d.yaml => tpu7x-2x2x1-host_device.yaml} (100%) rename Ironwood/guides/automation/{tpu7x-2x2x2-h2d.yaml => tpu7x-2x2x2-host_device.yaml} (100%) rename Ironwood/guides/automation/{tpu7x-2x2x4-h2d.yaml => tpu7x-2x2x4-host_device.yaml} (100%) rename Ironwood/guides/automation/{tpu7x-2x4x4-h2d.yaml => tpu7x-2x4x4-host_device.yaml} (100%) rename Ironwood/guides/automation/{tpu7x-4x4x4-h2d.yaml => tpu7x-4x4x4-host_device.yaml} (100%) rename Ironwood/guides/automation/{tpu7x-4x4x8-h2d.yaml => tpu7x-4x4x8-host_device.yaml} (100%) diff --git a/Ironwood/guides/automation/aggregator.py b/Ironwood/guides/automation/aggregator.py index 44889890..1bd18973 100644 --- a/Ironwood/guides/automation/aggregator.py +++ b/Ironwood/guides/automation/aggregator.py @@ -15,6 +15,14 @@ def download_from_gcs(bucket_path: str, local_dir: str): os.makedirs(local_dir, exist_ok=True) fs.get(gcs_path, local_dir, recursive=True) +def aggregate_results(local_dir: str): + 
categories = ["collectives", "hbm", "host_device"] + directories = {} + for category in categories: + directories[category] = glob.glob(f"{local_dir}/*/{category}/*", recursive=True) + + + if __name__ == "__main__": parser = argparse.ArgumentParser(description="Download from GCS and aggregate results locally.") parser.add_argument("--bucket_path", type=str, required=True, help="The GCS bucket path (gs://...)") @@ -22,3 +30,4 @@ def download_from_gcs(bucket_path: str, local_dir: str): args = parser.parse_args() download_from_gcs(args.bucket_path, args.local_dir) + aggregate_results(args.local_dir) diff --git a/Ironwood/guides/automation/aggregator.yaml b/Ironwood/guides/automation/aggregator.yaml index 04cef00e..bc81bed6 100644 --- a/Ironwood/guides/automation/aggregator.yaml +++ b/Ironwood/guides/automation/aggregator.yaml @@ -1,7 +1,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: job-waiter + name: aggregator spec: template: spec: @@ -16,8 +16,9 @@ spec: git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git cd accelerator-microbenchmarks git checkout tpu7x-auto + pip install -r requirements.txt - GCS_BUCKET_DIR=${GCS_PATH} + GCS_BUCKET_DIR=${GCS_BUCKET_ROOT_DIR} python Ironwood/guides/automation/aggregator.py --bucket_path=${GCS_BUCKET_DIR} sleep infinity restartPolicy: Never diff --git a/Ironwood/guides/automation/automation_launch.sh b/Ironwood/guides/automation/automation_launch.sh index c832f8ac..198ad91c 100644 --- a/Ironwood/guides/automation/automation_launch.sh +++ b/Ironwood/guides/automation/automation_launch.sh @@ -10,10 +10,10 @@ MAX_RETRIES=3 TIMEOUT_SECOND=3600 yaml_names=( - "tpu7x-2x2x1-hbm.yaml" "tpu7x-2x2x1-h2d.yaml" "tpu7x-2x2x1-gemm.yaml" "tpu7x-2x2x1-collectives.yaml" - "tpu7x-2x2x2-hbm.yaml" "tpu7x-2x2x2-h2d.yaml" "tpu7x-2x2x2-gemm.yaml" "tpu7x-2x2x2-collectives.yaml" - "tpu7x-2x2x4-hbm.yaml" "tpu7x-2x2x4-h2d.yaml" "tpu7x-2x2x4-gemm.yaml" "tpu7x-2x2x4-collectives.yaml" - "tpu7x-2x4x4-hbm.yaml" 
"tpu7x-2x4x4-h2d.yaml" "tpu7x-2x4x4-gemm.yaml" "tpu7x-2x4x4-collectives.yaml" + "tpu7x-2x2x1-hbm.yaml" "tpu7x-2x2x1-host_device.yaml" "tpu7x-2x2x1-gemm.yaml" "tpu7x-2x2x1-collectives.yaml" + "tpu7x-2x2x2-hbm.yaml" "tpu7x-2x2x2-host_device.yaml" "tpu7x-2x2x2-gemm.yaml" "tpu7x-2x2x2-collectives.yaml" + "tpu7x-2x2x4-hbm.yaml" "tpu7x-2x2x4-host_device.yaml" "tpu7x-2x2x4-gemm.yaml" "tpu7x-2x2x4-collectives.yaml" + "tpu7x-2x4x4-hbm.yaml" "tpu7x-2x4x4-host_device.yaml" "tpu7x-2x4x4-gemm.yaml" "tpu7x-2x4x4-collectives.yaml" ) ###################################################################### diff --git a/Ironwood/guides/automation/tpu7x-2x2x1-h2d.yaml b/Ironwood/guides/automation/tpu7x-2x2x1-host_device.yaml similarity index 100% rename from Ironwood/guides/automation/tpu7x-2x2x1-h2d.yaml rename to Ironwood/guides/automation/tpu7x-2x2x1-host_device.yaml diff --git a/Ironwood/guides/automation/tpu7x-2x2x2-h2d.yaml b/Ironwood/guides/automation/tpu7x-2x2x2-host_device.yaml similarity index 100% rename from Ironwood/guides/automation/tpu7x-2x2x2-h2d.yaml rename to Ironwood/guides/automation/tpu7x-2x2x2-host_device.yaml diff --git a/Ironwood/guides/automation/tpu7x-2x2x4-h2d.yaml b/Ironwood/guides/automation/tpu7x-2x2x4-host_device.yaml similarity index 100% rename from Ironwood/guides/automation/tpu7x-2x2x4-h2d.yaml rename to Ironwood/guides/automation/tpu7x-2x2x4-host_device.yaml diff --git a/Ironwood/guides/automation/tpu7x-2x4x4-h2d.yaml b/Ironwood/guides/automation/tpu7x-2x4x4-host_device.yaml similarity index 100% rename from Ironwood/guides/automation/tpu7x-2x4x4-h2d.yaml rename to Ironwood/guides/automation/tpu7x-2x4x4-host_device.yaml diff --git a/Ironwood/guides/automation/tpu7x-4x4x4-h2d.yaml b/Ironwood/guides/automation/tpu7x-4x4x4-host_device.yaml similarity index 100% rename from Ironwood/guides/automation/tpu7x-4x4x4-h2d.yaml rename to Ironwood/guides/automation/tpu7x-4x4x4-host_device.yaml diff --git a/Ironwood/guides/automation/tpu7x-4x4x8-h2d.yaml 
b/Ironwood/guides/automation/tpu7x-4x4x8-host_device.yaml similarity index 100% rename from Ironwood/guides/automation/tpu7x-4x4x8-h2d.yaml rename to Ironwood/guides/automation/tpu7x-4x4x8-host_device.yaml From ddd34730c432c9c9a366eb2566f61a1fd16bb14e Mon Sep 17 00:00:00 2001 From: Hong-Yi Lin Date: Thu, 29 Jan 2026 18:44:45 +0000 Subject: [PATCH 15/88] [Automation] Delete unused yaml file and update aggregator file --- Ironwood/guides/automation/aggregator.py | 102 ++++++++++++++++-- Ironwood/guides/automation/aggregator.yaml | 3 +- .../guides/automation/automation_launch.sh | 4 +- .../guides/automation/tpu7x-2x2x2-gemm.yaml | 60 ----------- .../guides/automation/tpu7x-2x2x2-hbm.yaml | 60 ----------- .../automation/tpu7x-2x2x2-host_device.yaml | 60 ----------- .../guides/automation/tpu7x-2x2x4-gemm.yaml | 60 ----------- .../guides/automation/tpu7x-2x2x4-hbm.yaml | 60 ----------- .../automation/tpu7x-2x2x4-host_device.yaml | 60 ----------- .../guides/automation/tpu7x-2x4x4-gemm.yaml | 60 ----------- .../guides/automation/tpu7x-2x4x4-hbm.yaml | 60 ----------- .../automation/tpu7x-2x4x4-host_device.yaml | 60 ----------- .../guides/automation/tpu7x-4x4x4-gemm.yaml | 60 ----------- .../guides/automation/tpu7x-4x4x4-hbm.yaml | 60 ----------- .../automation/tpu7x-4x4x4-host_device.yaml | 60 ----------- .../guides/automation/tpu7x-4x4x8-gemm.yaml | 60 ----------- .../guides/automation/tpu7x-4x4x8-hbm.yaml | 60 ----------- .../automation/tpu7x-4x4x8-host_device.yaml | 60 ----------- 18 files changed, 96 insertions(+), 913 deletions(-) delete mode 100644 Ironwood/guides/automation/tpu7x-2x2x2-gemm.yaml delete mode 100644 Ironwood/guides/automation/tpu7x-2x2x2-hbm.yaml delete mode 100644 Ironwood/guides/automation/tpu7x-2x2x2-host_device.yaml delete mode 100644 Ironwood/guides/automation/tpu7x-2x2x4-gemm.yaml delete mode 100644 Ironwood/guides/automation/tpu7x-2x2x4-hbm.yaml delete mode 100644 Ironwood/guides/automation/tpu7x-2x2x4-host_device.yaml delete mode 100644 
Ironwood/guides/automation/tpu7x-2x4x4-gemm.yaml delete mode 100644 Ironwood/guides/automation/tpu7x-2x4x4-hbm.yaml delete mode 100644 Ironwood/guides/automation/tpu7x-2x4x4-host_device.yaml delete mode 100644 Ironwood/guides/automation/tpu7x-4x4x4-gemm.yaml delete mode 100644 Ironwood/guides/automation/tpu7x-4x4x4-hbm.yaml delete mode 100644 Ironwood/guides/automation/tpu7x-4x4x4-host_device.yaml delete mode 100644 Ironwood/guides/automation/tpu7x-4x4x8-gemm.yaml delete mode 100644 Ironwood/guides/automation/tpu7x-4x4x8-hbm.yaml delete mode 100644 Ironwood/guides/automation/tpu7x-4x4x8-host_device.yaml diff --git a/Ironwood/guides/automation/aggregator.py b/Ironwood/guides/automation/aggregator.py index 1bd18973..471a4ca8 100644 --- a/Ironwood/guides/automation/aggregator.py +++ b/Ironwood/guides/automation/aggregator.py @@ -4,24 +4,110 @@ import pandas as pd import gcsfs +columns_mapping = { + "collectives": [ + "topology", "input_num_elements", "transferred_data (GB)", "dtype_bytes", "hlo_input_shape", "hlo_output_shape", "test_name", + 'step_time_ms_p50', 'step_time_ms_p90', 'step_time_ms_p95', 'step_time_ms_p99', 'step_time_ms_avg', 'step_time_ms_max', 'step_time_ms_num_runs', 'step_time_ms_min', + 'achieved_bw (GB/s)_p50', 'achieved_bw (GB/s)_p90', 'achieved_bw (GB/s)_p95', 'achieved_bw (GB/s)_p99', 'achieved_bw (GB/s)_avg', 'achieved_bw (GB/s)_max', 'achieved_bw (GB/s)_num_runs', 'achieved_bw (GB/s)_min', + ], + "hbm": [ + 'num_elements', 'dtype', 'tensor_size_gbytes', 'test_name', + 'time_ms_p50', 'time_ms_p90', 'time_ms_p95', 'time_ms_p99', 'time_ms_avg', 'time_ms_max', 'time_ms_num_runs', 'time_ms_min', + 'bw_gbyte_sec_p50', 'bw_gbyte_sec_p90', 'bw_gbyte_sec_p95', 'bw_gbyte_sec_p99', 'bw_gbyte_sec_avg', 'bw_gbyte_sec_max', 'bw_gbyte_sec_num_runs', 'bw_gbyte_sec_min', + ], + "host_device": [ + 'data_size_mib', 'H2D_bw (GiB/s)_p50', 'H2D_bw (GiB/s)_p90', + 'H2D_bw (GiB/s)_p95', 'H2D_bw (GiB/s)_p99', 'H2D_bw (GiB/s)_avg', + 'H2D_bw (GiB/s)_max', 'H2D_bw 
(GiB/s)_num_runs', 'H2D_bw (GiB/s)_min', + 'D2H_bw (GiB/s)_p50', 'D2H_bw (GiB/s)_p90', 'D2H_bw (GiB/s)_p95', + 'D2H_bw (GiB/s)_p99', 'D2H_bw (GiB/s)_avg', 'D2H_bw (GiB/s)_max', + 'D2H_bw (GiB/s)_num_runs', 'D2H_bw (GiB/s)_min'], + "training": ['m', 'n', 'k', 'dtype', 'StepTime(median,ms)', + 'Throughput(median,TFLOP/s/device)', 'TotalThroughput(median,TFLOP/s)', + 'MFU', 'total_flops', 'test_name', 'step_time_ms_p50', 'step_time_ms_p90', + 'step_time_ms_p95', 'step_time_ms_p99', 'step_time_ms_avg', + 'step_time_ms_max', 'step_time_ms_num_runs', 'step_time_ms_min', + 'tflops_per_sec_pre_device_p50', 'tflops_per_sec_pre_device_p90', + 'tflops_per_sec_pre_device_p95', 'tflops_per_sec_pre_device_p99', + 'tflops_per_sec_pre_device_avg', 'tflops_per_sec_pre_device_max', + 'tflops_per_sec_pre_device_num_runs', 'tflops_per_sec_pre_device_min', + 'tflops_per_sec_p50', 'tflops_per_sec_p90', 'tflops_per_sec_p95', + 'tflops_per_sec_p99', 'tflops_per_sec_avg', 'tflops_per_sec_max', + 'tflops_per_sec_num_runs', 'tflops_per_sec_min', 'MFU_p50', 'MFU_p90', + 'MFU_p95', 'MFU_p99', 'MFU_avg', 'MFU_max', 'MFU_num_runs', 'MFU_min'] +} + def download_from_gcs(bucket_path: str, local_dir: str): """ Downloads the content of the GCS bucket path to a local directory. 
""" fs = gcsfs.GCSFileSystem() - gcs_path = bucket_path.replace("gs://", "") - + gcs_path = bucket_path.replace("gs://", "").rstrip("/") + "/" + print(f"Downloading from gs://{gcs_path} to {local_dir}...") os.makedirs(local_dir, exist_ok=True) fs.get(gcs_path, local_dir, recursive=True) -def aggregate_results(local_dir: str): - categories = ["collectives", "hbm", "host_device"] +def aggregate_collectives(directories: list[str], picked_columns: list[str]) -> pd.DataFrame: + if len(directories) == 0: + return None + aggregated_df = pd.DataFrame() + for directory in directories: + file = glob.glob(f"{directory}/*.tsv")[0] + df = pd.read_csv(file, sep='\t') + df["topology"] = [file.split('/')[-4].split('-')[1] for _ in range(df.shape[0])] + aggregated_df = pd.concat([aggregated_df, df[picked_columns]], ignore_index=True) + return aggregated_df + +def aggregate_hbm(directories: list[str], picked_columns: list[str]) -> pd.DataFrame: + if len(directories) == 0: + return None + aggregated_df = pd.DataFrame() + for directory in directories: + file = glob.glob(f"{directory}/*.tsv")[0] + df = pd.read_csv(file, sep='\t') + aggregated_df = pd.concat([aggregated_df, df[picked_columns]], ignore_index=True) + return aggregated_df + +def aggregate_host_device(directories: list[str], picked_columns: list[str]) -> pd.DataFrame: + if len(directories) == 0: + return None + aggregated_df = pd.DataFrame() + for directory in directories: + file = glob.glob(f"{directory}/*.tsv")[0] + df = pd.read_csv(file, sep='\t') + aggregated_df = pd.concat([aggregated_df, df[picked_columns]], ignore_index=True) + return aggregated_df + +def aggregate_training(directories: list[str], picked_columns: list[str]) -> pd.DataFrame: + if len(directories) == 0: + return None + aggregated_df = pd.DataFrame() + for directory in directories: + files = glob.glob(f"{directory}/*.tsv") + for file in files: + df = pd.read_csv(file, sep='\t') + if "topology" in picked_columns: + df["topology"] = 
[file.split('/')[-4].split('-')[1] for _ in range(df.shape[0])] + aggregated_df = pd.concat([aggregated_df, df[picked_columns]], ignore_index=True) + return aggregated_df + +aggregate_function = { + "collectives": aggregate_collectives, + "hbm": aggregate_hbm, + "host_device": aggregate_host_device, + "training": aggregate_training +} + +def aggregate_results(bucket_path: str, local_dir: str): + categories = ["collectives", "hbm", "host_device", "training"] directories = {} + results = {} for category in categories: - directories[category] = glob.glob(f"{local_dir}/*/{category}/*", recursive=True) - - + directories[category] = sorted(glob.glob(f"{local_dir}/*/{category}/*", recursive=True)) + results[category] = aggregate_function[category](directories[category], columns_mapping[category]) + if results[category] is not None: + results[category].to_csv(f"{bucket_path}/aggregated_results/{category}.csv", index=False) if __name__ == "__main__": parser = argparse.ArgumentParser(description="Download from GCS and aggregate results locally.") @@ -30,4 +116,4 @@ def aggregate_results(local_dir: str): args = parser.parse_args() download_from_gcs(args.bucket_path, args.local_dir) - aggregate_results(args.local_dir) + aggregate_results(args.bucket_path, args.local_dir) diff --git a/Ironwood/guides/automation/aggregator.yaml b/Ironwood/guides/automation/aggregator.yaml index bc81bed6..12a0832f 100644 --- a/Ironwood/guides/automation/aggregator.yaml +++ b/Ironwood/guides/automation/aggregator.yaml @@ -1,7 +1,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: aggregator + name: wait spec: template: spec: @@ -20,5 +20,4 @@ spec: GCS_BUCKET_DIR=${GCS_BUCKET_ROOT_DIR} python Ironwood/guides/automation/aggregator.py --bucket_path=${GCS_BUCKET_DIR} - sleep infinity restartPolicy: Never diff --git a/Ironwood/guides/automation/automation_launch.sh b/Ironwood/guides/automation/automation_launch.sh index 198ad91c..262e06f2 100644 --- a/Ironwood/guides/automation/automation_launch.sh 
+++ b/Ironwood/guides/automation/automation_launch.sh @@ -11,9 +11,7 @@ TIMEOUT_SECOND=3600 yaml_names=( "tpu7x-2x2x1-hbm.yaml" "tpu7x-2x2x1-host_device.yaml" "tpu7x-2x2x1-gemm.yaml" "tpu7x-2x2x1-collectives.yaml" - "tpu7x-2x2x2-hbm.yaml" "tpu7x-2x2x2-host_device.yaml" "tpu7x-2x2x2-gemm.yaml" "tpu7x-2x2x2-collectives.yaml" - "tpu7x-2x2x4-hbm.yaml" "tpu7x-2x2x4-host_device.yaml" "tpu7x-2x2x4-gemm.yaml" "tpu7x-2x2x4-collectives.yaml" - "tpu7x-2x4x4-hbm.yaml" "tpu7x-2x4x4-host_device.yaml" "tpu7x-2x4x4-gemm.yaml" "tpu7x-2x4x4-collectives.yaml" + "tpu7x-2x2x2-collectives.yaml" "tpu7x-2x2x4-collectives.yaml" "tpu7x-2x4x4-collectives.yaml" "tpu7x-4x4x4-collectives.yaml" ) ###################################################################### diff --git a/Ironwood/guides/automation/tpu7x-2x2x2-gemm.yaml b/Ironwood/guides/automation/tpu7x-2x2x2-gemm.yaml deleted file mode 100644 index 3f6004bd..00000000 --- a/Ironwood/guides/automation/tpu7x-2x2x2-gemm.yaml +++ /dev/null @@ -1,60 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: headless-svc-${JOB_NAME} -spec: - clusterIP: None - selector: - job-name: ${JOB_NAME} ---- -apiVersion: batch/v1 -kind: Job -metadata: - name: ${JOB_NAME} - labels: - kueue.x-k8s.io/queue-name: user-queue-2x2x2 -spec: - completionMode: Indexed - suspend: true - parallelism: 2 - completions: 2 - backoffLimit: 0 - template: - spec: - subdomain: headless-svc-${JOB_NAME} - restartPolicy: Never - nodeSelector: - cloud.google.com/gke-tpu-accelerator: tpu7x - cloud.google.com/gke-tpu-topology: 2x2x2 - containers: - - name: jax-tpu - image: python:3.12 - securityContext: - privileged: false - env: - - name: JAX_PLATFORMS - value: "tpu,cpu" - - name: TPU_VMODULE - value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" - - name: XLA_IR_DEBUG - value: "1" - - name: XLA_HLO_DEBUG - value: "1" - command: - - bash - - -c - - | - set -ex - - git clone 
https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git - cd accelerator-microbenchmarks - git checkout tpu7x-auto - pip install -r requirements.txt - - GCS_BUCKET_DIR=${GCS_PATH} - python Ironwood/src/run_benchmark.py --config="Ironwood/configs/training/gemm_multiple_run_more.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} - resources: - requests: - google.com/tpu: 4 - limits: - google.com/tpu: 4 \ No newline at end of file diff --git a/Ironwood/guides/automation/tpu7x-2x2x2-hbm.yaml b/Ironwood/guides/automation/tpu7x-2x2x2-hbm.yaml deleted file mode 100644 index 71793e3f..00000000 --- a/Ironwood/guides/automation/tpu7x-2x2x2-hbm.yaml +++ /dev/null @@ -1,60 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: headless-svc-${JOB_NAME} -spec: - clusterIP: None - selector: - job-name: ${JOB_NAME} ---- -apiVersion: batch/v1 -kind: Job -metadata: - name: ${JOB_NAME} - labels: - kueue.x-k8s.io/queue-name: user-queue-2x2x2 -spec: - completionMode: Indexed - suspend: true - parallelism: 2 - completions: 2 - backoffLimit: 0 - template: - spec: - subdomain: headless-svc-${JOB_NAME} - restartPolicy: Never - nodeSelector: - cloud.google.com/gke-tpu-accelerator: tpu7x - cloud.google.com/gke-tpu-topology: 2x2x2 - containers: - - name: jax-tpu - image: python:3.12 - securityContext: - privileged: false - env: - - name: JAX_PLATFORMS - value: "tpu,cpu" - - name: TPU_VMODULE - value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" - - name: XLA_IR_DEBUG - value: "1" - - name: XLA_HLO_DEBUG - value: "1" - command: - - bash - - -c - - | - set -ex - - git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git - cd accelerator-microbenchmarks - git checkout tpu7x-auto - pip install -r requirements.txt - - GCS_BUCKET_DIR=${GCS_PATH} - python Ironwood/src/run_benchmark.py --config="Ironwood/configs/hbm/hbm.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} - resources: - requests: - 
google.com/tpu: 4 - limits: - google.com/tpu: 4 diff --git a/Ironwood/guides/automation/tpu7x-2x2x2-host_device.yaml b/Ironwood/guides/automation/tpu7x-2x2x2-host_device.yaml deleted file mode 100644 index fbcd556d..00000000 --- a/Ironwood/guides/automation/tpu7x-2x2x2-host_device.yaml +++ /dev/null @@ -1,60 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: headless-svc-${JOB_NAME} -spec: - clusterIP: None - selector: - job-name: ${JOB_NAME} ---- -apiVersion: batch/v1 -kind: Job -metadata: - name: ${JOB_NAME} - labels: - kueue.x-k8s.io/queue-name: user-queue-2x2x2 -spec: - completionMode: Indexed - suspend: true - parallelism: 2 - completions: 2 - backoffLimit: 0 - template: - spec: - subdomain: headless-svc-${JOB_NAME} - restartPolicy: Never - nodeSelector: - cloud.google.com/gke-tpu-accelerator: tpu7x - cloud.google.com/gke-tpu-topology: 2x2x2 - containers: - - name: jax-tpu - image: python:3.12 - securityContext: - privileged: false - env: - - name: JAX_PLATFORMS - value: "tpu,cpu" - - name: TPU_VMODULE - value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" - - name: XLA_IR_DEBUG - value: "1" - - name: XLA_HLO_DEBUG - value: "1" - command: - - bash - - -c - - | - set -ex - - git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git - cd accelerator-microbenchmarks - git checkout tpu7x-auto - pip install -r requirements.txt - - GCS_BUCKET_DIR=${GCS_PATH} - python Ironwood/src/run_benchmark.py --config="Ironwood/configs/host_device/host_device.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} - resources: - requests: - google.com/tpu: 4 - limits: - google.com/tpu: 4 \ No newline at end of file diff --git a/Ironwood/guides/automation/tpu7x-2x2x4-gemm.yaml b/Ironwood/guides/automation/tpu7x-2x2x4-gemm.yaml deleted file mode 100644 index fee014e9..00000000 --- a/Ironwood/guides/automation/tpu7x-2x2x4-gemm.yaml +++ /dev/null @@ -1,60 +0,0 @@ -apiVersion: v1 -kind: Service 
-metadata: - name: headless-svc-${JOB_NAME} -spec: - clusterIP: None - selector: - job-name: ${JOB_NAME} ---- -apiVersion: batch/v1 -kind: Job -metadata: - name: ${JOB_NAME} - labels: - kueue.x-k8s.io/queue-name: user-queue-2x2x4 -spec: - completionMode: Indexed - suspend: true - parallelism: 4 - completions: 4 - backoffLimit: 0 - template: - spec: - subdomain: headless-svc-${JOB_NAME} - restartPolicy: Never - nodeSelector: - cloud.google.com/gke-tpu-accelerator: tpu7x - cloud.google.com/gke-tpu-topology: 2x2x4 - containers: - - name: jax-tpu - image: python:3.12 - securityContext: - privileged: false - env: - - name: JAX_PLATFORMS - value: "tpu,cpu" - - name: TPU_VMODULE - value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" - - name: XLA_IR_DEBUG - value: "1" - - name: XLA_HLO_DEBUG - value: "1" - command: - - bash - - -c - - | - set -ex - - git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git - cd accelerator-microbenchmarks - git checkout tpu7x-auto - pip install -r requirements.txt - - GCS_BUCKET_DIR=${GCS_PATH} - python Ironwood/src/run_benchmark.py --config="Ironwood/configs/training/gemm_multiple_run_more.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} - resources: - requests: - google.com/tpu: 4 - limits: - google.com/tpu: 4 \ No newline at end of file diff --git a/Ironwood/guides/automation/tpu7x-2x2x4-hbm.yaml b/Ironwood/guides/automation/tpu7x-2x2x4-hbm.yaml deleted file mode 100644 index 100e1f7f..00000000 --- a/Ironwood/guides/automation/tpu7x-2x2x4-hbm.yaml +++ /dev/null @@ -1,60 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: headless-svc-${JOB_NAME} -spec: - clusterIP: None - selector: - job-name: ${JOB_NAME} ---- -apiVersion: batch/v1 -kind: Job -metadata: - name: ${JOB_NAME} - labels: - kueue.x-k8s.io/queue-name: user-queue-2x2x4 -spec: - completionMode: Indexed - suspend: true - parallelism: 4 - completions: 4 - backoffLimit: 0 - template: - spec: 
- subdomain: headless-svc-${JOB_NAME} - restartPolicy: Never - nodeSelector: - cloud.google.com/gke-tpu-accelerator: tpu7x - cloud.google.com/gke-tpu-topology: 2x2x4 - containers: - - name: jax-tpu - image: python:3.12 - securityContext: - privileged: false - env: - - name: JAX_PLATFORMS - value: "tpu,cpu" - - name: TPU_VMODULE - value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" - - name: XLA_IR_DEBUG - value: "1" - - name: XLA_HLO_DEBUG - value: "1" - command: - - bash - - -c - - | - set -ex - - git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git - cd accelerator-microbenchmarks - git checkout tpu7x-auto - pip install -r requirements.txt - - GCS_BUCKET_DIR=${GCS_PATH} - python Ironwood/src/run_benchmark.py --config="Ironwood/configs/hbm/hbm.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} - resources: - requests: - google.com/tpu: 4 - limits: - google.com/tpu: 4 \ No newline at end of file diff --git a/Ironwood/guides/automation/tpu7x-2x2x4-host_device.yaml b/Ironwood/guides/automation/tpu7x-2x2x4-host_device.yaml deleted file mode 100644 index 799c5b35..00000000 --- a/Ironwood/guides/automation/tpu7x-2x2x4-host_device.yaml +++ /dev/null @@ -1,60 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: headless-svc-${JOB_NAME} -spec: - clusterIP: None - selector: - job-name: ${JOB_NAME} ---- -apiVersion: batch/v1 -kind: Job -metadata: - name: ${JOB_NAME} - labels: - kueue.x-k8s.io/queue-name: user-queue-2x2x4 -spec: - completionMode: Indexed - suspend: true - parallelism: 4 - completions: 4 - backoffLimit: 0 - template: - spec: - subdomain: headless-svc-${JOB_NAME} - restartPolicy: Never - nodeSelector: - cloud.google.com/gke-tpu-accelerator: tpu7x - cloud.google.com/gke-tpu-topology: 2x2x4 - containers: - - name: jax-tpu - image: python:3.12 - securityContext: - privileged: false - env: - - name: JAX_PLATFORMS - value: "tpu,cpu" - - name: TPU_VMODULE - value: 
"singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" - - name: XLA_IR_DEBUG - value: "1" - - name: XLA_HLO_DEBUG - value: "1" - command: - - bash - - -c - - | - set -ex - - git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git - cd accelerator-microbenchmarks - git checkout tpu7x-auto - pip install -r requirements.txt - - GCS_BUCKET_DIR=${GCS_PATH} - python Ironwood/src/run_benchmark.py --config="Ironwood/configs/host_device/host_device.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} - resources: - requests: - google.com/tpu: 4 - limits: - google.com/tpu: 4 \ No newline at end of file diff --git a/Ironwood/guides/automation/tpu7x-2x4x4-gemm.yaml b/Ironwood/guides/automation/tpu7x-2x4x4-gemm.yaml deleted file mode 100644 index d51c9c31..00000000 --- a/Ironwood/guides/automation/tpu7x-2x4x4-gemm.yaml +++ /dev/null @@ -1,60 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: headless-svc-${JOB_NAME} -spec: - clusterIP: None - selector: - job-name: ${JOB_NAME} ---- -apiVersion: batch/v1 -kind: Job -metadata: - name: ${JOB_NAME} - labels: - kueue.x-k8s.io/queue-name: user-queue-2x4x4 -spec: - completionMode: Indexed - suspend: true - parallelism: 8 - completions: 8 - backoffLimit: 0 - template: - spec: - subdomain: headless-svc-${JOB_NAME} - restartPolicy: Never - nodeSelector: - cloud.google.com/gke-tpu-accelerator: tpu7x - cloud.google.com/gke-tpu-topology: 2x4x4 - containers: - - name: jax-tpu - image: python:3.12 - securityContext: - privileged: false - env: - - name: JAX_PLATFORMS - value: "tpu,cpu" - - name: TPU_VMODULE - value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" - - name: XLA_IR_DEBUG - value: "1" - - name: XLA_HLO_DEBUG - value: "1" - command: - - bash - - -c - - | - set -ex - - git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git - cd accelerator-microbenchmarks - git 
checkout tpu7x-auto - pip install -r requirements.txt - - GCS_BUCKET_DIR=${GCS_PATH} - python Ironwood/src/run_benchmark.py --config="Ironwood/configs/training/gemm_multiple_run_more.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} - resources: - requests: - google.com/tpu: 4 - limits: - google.com/tpu: 4 \ No newline at end of file diff --git a/Ironwood/guides/automation/tpu7x-2x4x4-hbm.yaml b/Ironwood/guides/automation/tpu7x-2x4x4-hbm.yaml deleted file mode 100644 index a438b93d..00000000 --- a/Ironwood/guides/automation/tpu7x-2x4x4-hbm.yaml +++ /dev/null @@ -1,60 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: headless-svc-${JOB_NAME} -spec: - clusterIP: None - selector: - job-name: ${JOB_NAME} ---- -apiVersion: batch/v1 -kind: Job -metadata: - name: ${JOB_NAME} - labels: - kueue.x-k8s.io/queue-name: user-queue-2x4x4 -spec: - completionMode: Indexed - suspend: true - parallelism: 8 - completions: 8 - backoffLimit: 0 - template: - spec: - subdomain: headless-svc-${JOB_NAME} - restartPolicy: Never - nodeSelector: - cloud.google.com/gke-tpu-accelerator: tpu7x - cloud.google.com/gke-tpu-topology: 2x4x4 - containers: - - name: jax-tpu - image: python:3.12 - securityContext: - privileged: false - env: - - name: JAX_PLATFORMS - value: "tpu,cpu" - - name: TPU_VMODULE - value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" - - name: XLA_IR_DEBUG - value: "1" - - name: XLA_HLO_DEBUG - value: "1" - command: - - bash - - -c - - | - set -ex - - git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git - cd accelerator-microbenchmarks - git checkout tpu7x-auto - pip install -r requirements.txt - - GCS_BUCKET_DIR=${GCS_PATH} - python Ironwood/src/run_benchmark.py --config="Ironwood/configs/hbm/hbm.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} - resources: - requests: - google.com/tpu: 4 - limits: - google.com/tpu: 4 \ No newline at end of file diff --git 
a/Ironwood/guides/automation/tpu7x-2x4x4-host_device.yaml b/Ironwood/guides/automation/tpu7x-2x4x4-host_device.yaml deleted file mode 100644 index 30a25c4c..00000000 --- a/Ironwood/guides/automation/tpu7x-2x4x4-host_device.yaml +++ /dev/null @@ -1,60 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: headless-svc-${JOB_NAME} -spec: - clusterIP: None - selector: - job-name: ${JOB_NAME} ---- -apiVersion: batch/v1 -kind: Job -metadata: - name: ${JOB_NAME} - labels: - kueue.x-k8s.io/queue-name: user-queue-2x4x4 -spec: - completionMode: Indexed - suspend: true - parallelism: 8 - completions: 8 - backoffLimit: 0 - template: - spec: - subdomain: headless-svc-${JOB_NAME} - restartPolicy: Never - nodeSelector: - cloud.google.com/gke-tpu-accelerator: tpu7x - cloud.google.com/gke-tpu-topology: 2x4x4 - containers: - - name: jax-tpu - image: python:3.12 - securityContext: - privileged: false - env: - - name: JAX_PLATFORMS - value: "tpu,cpu" - - name: TPU_VMODULE - value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" - - name: XLA_IR_DEBUG - value: "1" - - name: XLA_HLO_DEBUG - value: "1" - command: - - bash - - -c - - | - set -ex - - git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git - cd accelerator-microbenchmarks - git checkout tpu7x-auto - pip install -r requirements.txt - - GCS_BUCKET_DIR=${GCS_PATH} - python Ironwood/src/run_benchmark.py --config="Ironwood/configs/host_device/host_device.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} - resources: - requests: - google.com/tpu: 4 - limits: - google.com/tpu: 4 \ No newline at end of file diff --git a/Ironwood/guides/automation/tpu7x-4x4x4-gemm.yaml b/Ironwood/guides/automation/tpu7x-4x4x4-gemm.yaml deleted file mode 100644 index ef9220f6..00000000 --- a/Ironwood/guides/automation/tpu7x-4x4x4-gemm.yaml +++ /dev/null @@ -1,60 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: headless-svc-${JOB_NAME} -spec: - clusterIP: 
None - selector: - job-name: ${JOB_NAME} ---- -apiVersion: batch/v1 -kind: Job -metadata: - name: ${JOB_NAME} - labels: - kueue.x-k8s.io/queue-name: user-queue-4x4x4 -spec: - completionMode: Indexed - suspend: true - parallelism: 16 - completions: 16 - backoffLimit: 0 - template: - spec: - subdomain: headless-svc-${JOB_NAME} - restartPolicy: Never - nodeSelector: - cloud.google.com/gke-tpu-accelerator: tpu7x - cloud.google.com/gke-tpu-topology: 4x4x4 - containers: - - name: jax-tpu - image: python:3.12 - securityContext: - privileged: false - env: - - name: JAX_PLATFORMS - value: "tpu,cpu" - - name: TPU_VMODULE - value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" - - name: XLA_IR_DEBUG - value: "1" - - name: XLA_HLO_DEBUG - value: "1" - command: - - bash - - -c - - | - set -ex - - git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git - cd accelerator-microbenchmarks - git checkout tpu7x-auto - pip install -r requirements.txt - - GCS_BUCKET_DIR=${GCS_PATH} - python Ironwood/src/run_benchmark.py --config="Ironwood/configs/training/gemm_multiple_run_more.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} - resources: - requests: - google.com/tpu: 4 - limits: - google.com/tpu: 4 \ No newline at end of file diff --git a/Ironwood/guides/automation/tpu7x-4x4x4-hbm.yaml b/Ironwood/guides/automation/tpu7x-4x4x4-hbm.yaml deleted file mode 100644 index 0819edad..00000000 --- a/Ironwood/guides/automation/tpu7x-4x4x4-hbm.yaml +++ /dev/null @@ -1,60 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: headless-svc-${JOB_NAME} -spec: - clusterIP: None - selector: - job-name: ${JOB_NAME} ---- -apiVersion: batch/v1 -kind: Job -metadata: - name: ${JOB_NAME} - labels: - kueue.x-k8s.io/queue-name: user-queue-4x4x4 -spec: - completionMode: Indexed - suspend: true - parallelism: 16 - completions: 16 - backoffLimit: 0 - template: - spec: - subdomain: headless-svc-${JOB_NAME} - restartPolicy: Never 
- nodeSelector: - cloud.google.com/gke-tpu-accelerator: tpu7x - cloud.google.com/gke-tpu-topology: 4x4x4 - containers: - - name: jax-tpu - image: python:3.12 - securityContext: - privileged: false - env: - - name: JAX_PLATFORMS - value: "tpu,cpu" - - name: TPU_VMODULE - value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" - - name: XLA_IR_DEBUG - value: "1" - - name: XLA_HLO_DEBUG - value: "1" - command: - - bash - - -c - - | - set -ex - - git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git - cd accelerator-microbenchmarks - git checkout tpu7x-auto - pip install -r requirements.txt - - GCS_BUCKET_DIR=${GCS_PATH} - python Ironwood/src/run_benchmark.py --config="Ironwood/configs/hbm/hbm.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} - resources: - requests: - google.com/tpu: 4 - limits: - google.com/tpu: 4 \ No newline at end of file diff --git a/Ironwood/guides/automation/tpu7x-4x4x4-host_device.yaml b/Ironwood/guides/automation/tpu7x-4x4x4-host_device.yaml deleted file mode 100644 index da7a4e7c..00000000 --- a/Ironwood/guides/automation/tpu7x-4x4x4-host_device.yaml +++ /dev/null @@ -1,60 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: headless-svc-${JOB_NAME} -spec: - clusterIP: None - selector: - job-name: ${JOB_NAME} ---- -apiVersion: batch/v1 -kind: Job -metadata: - name: ${JOB_NAME} - labels: - kueue.x-k8s.io/queue-name: user-queue-4x4x4 -spec: - completionMode: Indexed - suspend: true - parallelism: 16 - completions: 16 - backoffLimit: 0 - template: - spec: - subdomain: headless-svc-${JOB_NAME} - restartPolicy: Never - nodeSelector: - cloud.google.com/gke-tpu-accelerator: tpu7x - cloud.google.com/gke-tpu-topology: 4x4x4 - containers: - - name: jax-tpu - image: python:3.12 - securityContext: - privileged: false - env: - - name: JAX_PLATFORMS - value: "tpu,cpu" - - name: TPU_VMODULE - value: 
"singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" - - name: XLA_IR_DEBUG - value: "1" - - name: XLA_HLO_DEBUG - value: "1" - command: - - bash - - -c - - | - set -ex - - git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git - cd accelerator-microbenchmarks - git checkout tpu7x-auto - pip install -r requirements.txt - - GCS_BUCKET_DIR=${GCS_PATH} - python Ironwood/src/run_benchmark.py --config="Ironwood/configs/host_device/host_device.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} - resources: - requests: - google.com/tpu: 4 - limits: - google.com/tpu: 4 \ No newline at end of file diff --git a/Ironwood/guides/automation/tpu7x-4x4x8-gemm.yaml b/Ironwood/guides/automation/tpu7x-4x4x8-gemm.yaml deleted file mode 100644 index 65f93467..00000000 --- a/Ironwood/guides/automation/tpu7x-4x4x8-gemm.yaml +++ /dev/null @@ -1,60 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: headless-svc-${JOB_NAME} -spec: - clusterIP: None - selector: - job-name: ${JOB_NAME} ---- -apiVersion: batch/v1 -kind: Job -metadata: - name: ${JOB_NAME} - labels: - kueue.x-k8s.io/queue-name: user-queue-4x4x8 -spec: - completionMode: Indexed - suspend: true - parallelism: 32 - completions: 32 - backoffLimit: 0 - template: - spec: - subdomain: headless-svc-${JOB_NAME} - restartPolicy: Never - nodeSelector: - cloud.google.com/gke-tpu-accelerator: tpu7x - cloud.google.com/gke-tpu-topology: 4x4x8 - containers: - - name: jax-tpu - image: python:3.12 - securityContext: - privileged: false - env: - - name: JAX_PLATFORMS - value: "tpu,cpu" - - name: TPU_VMODULE - value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" - - name: XLA_IR_DEBUG - value: "1" - - name: XLA_HLO_DEBUG - value: "1" - command: - - bash - - -c - - | - set -ex - - git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git - cd accelerator-microbenchmarks - git 
checkout tpu7x-auto - pip install -r requirements.txt - - GCS_BUCKET_DIR=${GCS_PATH} - python Ironwood/src/run_benchmark.py --config="Ironwood/configs/training/gemm_multiple_run_more.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} - resources: - requests: - google.com/tpu: 4 - limits: - google.com/tpu: 4 \ No newline at end of file diff --git a/Ironwood/guides/automation/tpu7x-4x4x8-hbm.yaml b/Ironwood/guides/automation/tpu7x-4x4x8-hbm.yaml deleted file mode 100644 index e6445b54..00000000 --- a/Ironwood/guides/automation/tpu7x-4x4x8-hbm.yaml +++ /dev/null @@ -1,60 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: headless-svc-${JOB_NAME} -spec: - clusterIP: None - selector: - job-name: ${JOB_NAME} ---- -apiVersion: batch/v1 -kind: Job -metadata: - name: ${JOB_NAME} - labels: - kueue.x-k8s.io/queue-name: user-queue-4x4x8 -spec: - completionMode: Indexed - suspend: true - parallelism: 32 - completions: 32 - backoffLimit: 0 - template: - spec: - subdomain: headless-svc-${JOB_NAME} - restartPolicy: Never - nodeSelector: - cloud.google.com/gke-tpu-accelerator: tpu7x - cloud.google.com/gke-tpu-topology: 4x4x8 - containers: - - name: jax-tpu - image: python:3.12 - securityContext: - privileged: false - env: - - name: JAX_PLATFORMS - value: "tpu,cpu" - - name: TPU_VMODULE - value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" - - name: XLA_IR_DEBUG - value: "1" - - name: XLA_HLO_DEBUG - value: "1" - command: - - bash - - -c - - | - set -ex - - git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git - cd accelerator-microbenchmarks - git checkout tpu7x-auto - pip install -r requirements.txt - - GCS_BUCKET_DIR=${GCS_PATH} - python Ironwood/src/run_benchmark.py --config="Ironwood/configs/hbm/hbm.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} - resources: - requests: - google.com/tpu: 4 - limits: - google.com/tpu: 4 \ No newline at end of file diff --git 
a/Ironwood/guides/automation/tpu7x-4x4x8-host_device.yaml b/Ironwood/guides/automation/tpu7x-4x4x8-host_device.yaml deleted file mode 100644 index 09d2a5ab..00000000 --- a/Ironwood/guides/automation/tpu7x-4x4x8-host_device.yaml +++ /dev/null @@ -1,60 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: headless-svc-${JOB_NAME} -spec: - clusterIP: None - selector: - job-name: ${JOB_NAME} ---- -apiVersion: batch/v1 -kind: Job -metadata: - name: ${JOB_NAME} - labels: - kueue.x-k8s.io/queue-name: user-queue-4x4x8 -spec: - completionMode: Indexed - suspend: true - parallelism: 32 - completions: 32 - backoffLimit: 0 - template: - spec: - subdomain: headless-svc-${JOB_NAME} - restartPolicy: Never - nodeSelector: - cloud.google.com/gke-tpu-accelerator: tpu7x - cloud.google.com/gke-tpu-topology: 4x4x8 - containers: - - name: jax-tpu - image: python:3.12 - securityContext: - privileged: false - env: - - name: JAX_PLATFORMS - value: "tpu,cpu" - - name: TPU_VMODULE - value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" - - name: XLA_IR_DEBUG - value: "1" - - name: XLA_HLO_DEBUG - value: "1" - command: - - bash - - -c - - | - set -ex - - git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git - cd accelerator-microbenchmarks - git checkout tpu7x-auto - pip install -r requirements.txt - - GCS_BUCKET_DIR=${GCS_PATH} - python Ironwood/src/run_benchmark.py --config="Ironwood/configs/host_device/host_device.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} - resources: - requests: - google.com/tpu: 4 - limits: - google.com/tpu: 4 \ No newline at end of file From 9ea7110fb14e7dd2aa52b68a30b97eb420c9b029 Mon Sep 17 00:00:00 2001 From: Hong-Yi Lin Date: Thu, 29 Jan 2026 19:02:00 +0000 Subject: [PATCH 16/88] [Automation] Update aggregator --- Ironwood/guides/automation/aggregator.py | 55 ++++++++++++---------- Ironwood/guides/automation/aggregator.yaml | 4 +- 2 files changed, 31 insertions(+), 28 
deletions(-) diff --git a/Ironwood/guides/automation/aggregator.py b/Ironwood/guides/automation/aggregator.py index 471a4ca8..655debe6 100644 --- a/Ironwood/guides/automation/aggregator.py +++ b/Ironwood/guides/automation/aggregator.py @@ -6,35 +6,38 @@ columns_mapping = { "collectives": [ - "topology", "input_num_elements", "transferred_data (GB)", "dtype_bytes", "hlo_input_shape", "hlo_output_shape", "test_name", - 'step_time_ms_p50', 'step_time_ms_p90', 'step_time_ms_p95', 'step_time_ms_p99', 'step_time_ms_avg', 'step_time_ms_max', 'step_time_ms_num_runs', 'step_time_ms_min', - 'achieved_bw (GB/s)_p50', 'achieved_bw (GB/s)_p90', 'achieved_bw (GB/s)_p95', 'achieved_bw (GB/s)_p99', 'achieved_bw (GB/s)_avg', 'achieved_bw (GB/s)_max', 'achieved_bw (GB/s)_num_runs', 'achieved_bw (GB/s)_min', + "topology", "op_type", "input_num_elements", "transferred_data (GB)", "dtype_bytes", "hlo_input_shape", "hlo_output_shape", + "step_time_ms_p50", "step_time_ms_p90", "step_time_ms_p95", "step_time_ms_p99", "step_time_ms_avg", "step_time_ms_max", "step_time_ms_num_runs", "step_time_ms_min", + "achieved_bw (GB/s)_p50", "achieved_bw (GB/s)_p90", "achieved_bw (GB/s)_p95", "achieved_bw (GB/s)_p99", "achieved_bw (GB/s)_avg", "achieved_bw (GB/s)_max", "achieved_bw (GB/s)_num_runs", "achieved_bw (GB/s)_min", ], "hbm": [ - 'num_elements', 'dtype', 'tensor_size_gbytes', 'test_name', - 'time_ms_p50', 'time_ms_p90', 'time_ms_p95', 'time_ms_p99', 'time_ms_avg', 'time_ms_max', 'time_ms_num_runs', 'time_ms_min', - 'bw_gbyte_sec_p50', 'bw_gbyte_sec_p90', 'bw_gbyte_sec_p95', 'bw_gbyte_sec_p99', 'bw_gbyte_sec_avg', 'bw_gbyte_sec_max', 'bw_gbyte_sec_num_runs', 'bw_gbyte_sec_min', + "num_elements", "dtype", "tensor_size_gbytes", + "time_ms_p50", "time_ms_p90", "time_ms_p95", "time_ms_p99", "time_ms_avg", "time_ms_max", "time_ms_num_runs", "time_ms_min", + "bw_gbyte_sec_p50", "bw_gbyte_sec_p90", "bw_gbyte_sec_p95", "bw_gbyte_sec_p99", "bw_gbyte_sec_avg", "bw_gbyte_sec_max", "bw_gbyte_sec_num_runs", 
"bw_gbyte_sec_min", ], "host_device": [ - 'data_size_mib', 'H2D_bw (GiB/s)_p50', 'H2D_bw (GiB/s)_p90', - 'H2D_bw (GiB/s)_p95', 'H2D_bw (GiB/s)_p99', 'H2D_bw (GiB/s)_avg', - 'H2D_bw (GiB/s)_max', 'H2D_bw (GiB/s)_num_runs', 'H2D_bw (GiB/s)_min', - 'D2H_bw (GiB/s)_p50', 'D2H_bw (GiB/s)_p90', 'D2H_bw (GiB/s)_p95', - 'D2H_bw (GiB/s)_p99', 'D2H_bw (GiB/s)_avg', 'D2H_bw (GiB/s)_max', - 'D2H_bw (GiB/s)_num_runs', 'D2H_bw (GiB/s)_min'], - "training": ['m', 'n', 'k', 'dtype', 'StepTime(median,ms)', - 'Throughput(median,TFLOP/s/device)', 'TotalThroughput(median,TFLOP/s)', - 'MFU', 'total_flops', 'test_name', 'step_time_ms_p50', 'step_time_ms_p90', - 'step_time_ms_p95', 'step_time_ms_p99', 'step_time_ms_avg', - 'step_time_ms_max', 'step_time_ms_num_runs', 'step_time_ms_min', - 'tflops_per_sec_pre_device_p50', 'tflops_per_sec_pre_device_p90', - 'tflops_per_sec_pre_device_p95', 'tflops_per_sec_pre_device_p99', - 'tflops_per_sec_pre_device_avg', 'tflops_per_sec_pre_device_max', - 'tflops_per_sec_pre_device_num_runs', 'tflops_per_sec_pre_device_min', - 'tflops_per_sec_p50', 'tflops_per_sec_p90', 'tflops_per_sec_p95', - 'tflops_per_sec_p99', 'tflops_per_sec_avg', 'tflops_per_sec_max', - 'tflops_per_sec_num_runs', 'tflops_per_sec_min', 'MFU_p50', 'MFU_p90', - 'MFU_p95', 'MFU_p99', 'MFU_avg', 'MFU_max', 'MFU_num_runs', 'MFU_min'] + "data_size_mib", "H2D_bw (GiB/s)_p50", "H2D_bw (GiB/s)_p90", + "H2D_bw (GiB/s)_p95", "H2D_bw (GiB/s)_p99", "H2D_bw (GiB/s)_avg", + "H2D_bw (GiB/s)_max", "H2D_bw (GiB/s)_num_runs", "H2D_bw (GiB/s)_min", + "D2H_bw (GiB/s)_p50", "D2H_bw (GiB/s)_p90", "D2H_bw (GiB/s)_p95", + "D2H_bw (GiB/s)_p99", "D2H_bw (GiB/s)_avg", "D2H_bw (GiB/s)_max", + "D2H_bw (GiB/s)_num_runs", "D2H_bw (GiB/s)_min" + ], + "training": [ + "m", "n", "k", "dtype", "StepTime(median,ms)", + "Throughput(median,TFLOP/s/device)", "TotalThroughput(median,TFLOP/s)", + "MFU", "total_flops", "step_time_ms_p50", "step_time_ms_p90", + "step_time_ms_p95", "step_time_ms_p99", "step_time_ms_avg", + 
"step_time_ms_max", "step_time_ms_num_runs", "step_time_ms_min", + "tflops_per_sec_pre_device_p50", "tflops_per_sec_pre_device_p90", + "tflops_per_sec_pre_device_p95", "tflops_per_sec_pre_device_p99", + "tflops_per_sec_pre_device_avg", "tflops_per_sec_pre_device_max", + "tflops_per_sec_pre_device_num_runs", "tflops_per_sec_pre_device_min", + "tflops_per_sec_p50", "tflops_per_sec_p90", "tflops_per_sec_p95", + "tflops_per_sec_p99", "tflops_per_sec_avg", "tflops_per_sec_max", + "tflops_per_sec_num_runs", "tflops_per_sec_min", "MFU_p50", "MFU_p90", + "MFU_p95", "MFU_p99", "MFU_avg", "MFU_max", "MFU_num_runs", "MFU_min" + ], } def download_from_gcs(bucket_path: str, local_dir: str): @@ -107,7 +110,7 @@ def aggregate_results(bucket_path: str, local_dir: str): directories[category] = sorted(glob.glob(f"{local_dir}/*/{category}/*", recursive=True)) results[category] = aggregate_function[category](directories[category], columns_mapping[category]) if results[category] is not None: - results[category].to_csv(f"{bucket_path}/aggregated_results/{category}.csv", index=False) + results[category].to_csv(f"{bucket_path}/aggregated_results/{category}.tsv", index=False, sep='\t') if __name__ == "__main__": parser = argparse.ArgumentParser(description="Download from GCS and aggregate results locally.") diff --git a/Ironwood/guides/automation/aggregator.yaml b/Ironwood/guides/automation/aggregator.yaml index 12a0832f..5a2c89a3 100644 --- a/Ironwood/guides/automation/aggregator.yaml +++ b/Ironwood/guides/automation/aggregator.yaml @@ -1,7 +1,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: wait + name: aggregator spec: template: spec: @@ -16,7 +16,7 @@ spec: git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git cd accelerator-microbenchmarks git checkout tpu7x-auto - pip install -r requirements.txt + pip install gcsfs pandas GCS_BUCKET_DIR=${GCS_BUCKET_ROOT_DIR} python Ironwood/guides/automation/aggregator.py --bucket_path=${GCS_BUCKET_DIR} From 
bb5fc2f455f2b1edf7ba291727b3c0352ce4d0cf Mon Sep 17 00:00:00 2001 From: "Yu-Hsuan (Amy) Lin" Date: Fri, 30 Jan 2026 06:46:58 +0000 Subject: [PATCH 17/88] Add dtype to H2D/D2H --- Ironwood/guides/automation/automation_launch.sh | 10 ++++++++-- Ironwood/src/benchmark_host_device.py | 1 + 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/Ironwood/guides/automation/automation_launch.sh b/Ironwood/guides/automation/automation_launch.sh index 262e06f2..386370de 100644 --- a/Ironwood/guides/automation/automation_launch.sh +++ b/Ironwood/guides/automation/automation_launch.sh @@ -10,8 +10,14 @@ MAX_RETRIES=3 TIMEOUT_SECOND=3600 yaml_names=( - "tpu7x-2x2x1-hbm.yaml" "tpu7x-2x2x1-host_device.yaml" "tpu7x-2x2x1-gemm.yaml" "tpu7x-2x2x1-collectives.yaml" - "tpu7x-2x2x2-collectives.yaml" "tpu7x-2x2x4-collectives.yaml" "tpu7x-2x4x4-collectives.yaml" "tpu7x-4x4x4-collectives.yaml" + "tpu7x-2x2x1-hbm.yaml" + "tpu7x-2x2x1-host_device.yaml" + "tpu7x-2x2x1-gemm.yaml" + "tpu7x-2x2x1-collectives.yaml" + "tpu7x-2x2x2-collectives.yaml" + "tpu7x-2x2x4-collectives.yaml" + "tpu7x-2x4x4-collectives.yaml" + "tpu7x-4x4x4-collectives.yaml" ) ###################################################################### diff --git a/Ironwood/src/benchmark_host_device.py b/Ironwood/src/benchmark_host_device.py index f745eb48..16352e2a 100644 --- a/Ironwood/src/benchmark_host_device.py +++ b/Ironwood/src/benchmark_host_device.py @@ -107,6 +107,7 @@ def benchmark_host_device_calculate_metrics( "data_size_mib", } metadata = {k: v for k, v in params if k in metadata_keys} + metadata["dtype"] = "float32" metrics = {} From 5315132d526743e61a021530dad8c4edc203fbf3 Mon Sep 17 00:00:00 2001 From: Hong-Yi Lin Date: Fri, 30 Jan 2026 07:34:01 +0000 Subject: [PATCH 18/88] [Automation] Automatically delete aggregator after completion --- Ironwood/guides/automation/automation_launch.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Ironwood/guides/automation/automation_launch.sh 
b/Ironwood/guides/automation/automation_launch.sh index 386370de..1f544499 100644 --- a/Ironwood/guides/automation/automation_launch.sh +++ b/Ironwood/guides/automation/automation_launch.sh @@ -157,6 +157,8 @@ echo "Jobs completed. Aggregating results..." echo "" envsubst '${GCS_BUCKET_ROOT_DIR}' < ${SCRIPT_DIR}/aggregator.yaml | kubectl apply -f - +wait_for_job_completion "aggregator" ${TIMEOUT_SECOND} +envsubst '${GCS_BUCKET_ROOT_DIR}' < ${SCRIPT_DIR}/aggregator.yaml | kubectl delete -f - # Print the failed jobs at the end for better visibility. From 4bdf77fb4345e8198057dda30d0b5470e7dd6df8 Mon Sep 17 00:00:00 2001 From: "Yu-Hsuan (Amy) Lin" Date: Fri, 30 Jan 2026 07:34:45 +0000 Subject: [PATCH 19/88] Update README with kueue and reformat --- Ironwood/guides/automation/README.md | 109 +++++++++++++++++++++------ 1 file changed, 88 insertions(+), 21 deletions(-) diff --git a/Ironwood/guides/automation/README.md b/Ironwood/guides/automation/README.md index 05e2e2d1..3c463c67 100644 --- a/Ironwood/guides/automation/README.md +++ b/Ironwood/guides/automation/README.md @@ -1,40 +1,107 @@ -# Ironwood Automation Tool +# Ironwood Benchmark Automation -This directory contains the automation scripts for running TPU microbenchmarks. The tool simplifies the process of launching multiple benchmark jobs, waiting for their completion, and aggregating the results into a unified format. +This directory contains the automation framework for running TPU microbenchmarks (HBM, Host-Device, Collectives, etc.) on GKE clusters. The tool simplifies the workflow of launching multiple benchmark jobs via [Kueue](https://kueue.sigs.k8s.io/), monitoring their status, handling retries, and aggregating the final results into a unified format. + +## Overview + +The automation workflow consists of three main stages: +1. **Launch**: Submits Kubernetes Jobs for various benchmark configurations (e.g., different topologies like 2x2x1, 2x2x2) using Kueue for queue management. +2. 
**Monitor & Retry**: Watches the jobs until completion. If any job fails, it automatically retries them (up to 3 times by default). +3. **Aggregate**: Once all jobs succeed, an aggregator job is launched to collect all intermediate results from GCS and consolidate them into summary TSV files. ## Prerequisites Before running the automation script, ensure the following requirements are met: -1. **Node Pool Topology**: The script expects specific TPU node pools to be available in your cluster. - * The `check_node_pool_setup.sh` script validates this. -2. **GCS Bucket**: You must have a Google Cloud Storage (GCS) bucket for the intermediate and final results. - * This can be setup by `gcloud storage buckets create gs://my-unique-bucket-name --location=us-central1` -3. **Kubectl**: Ensure `kubectl` is configured and connected to your GKE cluster. +### 1. Environment Setup +* **GKE Cluster**: You must have a GKE cluster with TPU node pools configured. +* **Kubectl**: Ensure `kubectl` is installed and authenticated to your cluster. +* **GCS Bucket**: A Google Cloud Storage bucket is required to store intermediate and final aggregated results. + ```bash + gcloud storage buckets create gs://my-unique-bucket-name --location=us-central1 + ``` + +### 2. Install Kueue +The automation relies on Kueue for job queuing. Check if it's already installed: + +```bash +kubectl get namespace kueue-system +``` + +If you see `Error from server (NotFound)`, install it with: + +```bash +kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.16.0/manifests.yaml +``` + +### 3. Verify Node Pool Topology +The script expects specific TPU node pools (e.g., `tpu7x-2x2x1`, `tpu7x-2x2x2`) to be available. The `check_node_pool_setup.sh` utility will automatically validate this before launching jobs. + +## Directory Structure + +* `automation_launch.sh`: The main entry point script. Manages the full lifecycle of the benchmark run. 
+* `check_node_pool_setup.sh`: Validation script to ensure required node pools exist in the cluster. +* `aggregator.py`: Python script that downloads results from GCS and produces summary tables. +* `aggregator.yaml`: Kubernetes Job definition for running the aggregator. +* `job-queue.yaml`: Kueue resource definitions (ClusterQueue, LocalQueue). +* `*.yaml`: Benchmark job configurations (e.g., `tpu7x-2x2x1-hbm.yaml`). + +## Configuration + +You can configure the behavior using the following environment variable: -## User Journey +| Variable | Description | Required | Default | +| :--- | :--- | :--- | :--- | +| `GCS_BUCKET_ROOT_DIR` | The root GCS path where results will be stored. Must start with `gs://`. | **Yes** | `gs://amylin-microbenchmark` (Change this!) | -1. **Clone & Checkout Branch**. +## Usage Guide + +1. **Clone the Repository**: ```bash git clone https://github.com/google/accelerator-microbenchmarks.git cd accelerator-microbenchmarks + # Switch to the correct branch if necessary git checkout tpu7x-auto ``` -2. **Setup Environment**: Ensure your node pools are set up and you have prepared a GCS bucket. - -3. **Run Automation Script**: - The main script is `automation_launch.sh`. You need to set the `GCS_BUCKET_ROOT_DIR` environment variable before running it. +2. **Set the GCS Bucket**: + Export the path to your GCS bucket. This is where all results will be saved. + ```bash + export GCS_BUCKET_ROOT_DIR="gs://your-unique-bucket-name/benchmark_runs/$(date +%Y%m%d_%H%M%S)" + ``` +3. **Run the Automation Script**: + Execute the launch script from the root of the repository. ```bash - # Replace with your actual bucket path (must start with gs://) - export GCS_BUCKET_ROOT_DIR="gs://your-bucket-name/automation_results" - - # Run the launch script bash Ironwood/guides/automation/automation_launch.sh ``` -4. **Retrieve Results**: - After the script completes, the final aggregated TSV files will be available in your GCS bucket. 
The script generates a timestamped directory for each run. - * Check the script output for the exact path: `The intermediate result will be written to gs://...` - * Look for the `final` directory under that path (e.g., `gs://your-bucket/automation_results//final`). + **What happens next?** + * The script validates your node pools. + * It applies the Kueue job queues. + * It submits the benchmark jobs defined in the script (e.g., HBM tests). + * It waits for jobs to finish, retrying any failures up to 3 times. + * Finally, it launches the `aggregator` job. + +## Output + +After the automation completes, check your GCS bucket (`GCS_BUCKET_ROOT_DIR`). You will find: + +* **`aggregated_results/`**: Contains the final summary CSV/TSV files (e.g., `hbm.tsv`, `collectives.tsv`). +* **`${JOB_NAME}/`**: Directories for each individual job containing intermediate results. + +## Troubleshooting + +### Job Failures +If jobs fail even after retries: +1. Check the script output to see which specific jobs failed. +2. Inspect the logs of a failed job using `kubectl logs job/${JOB_NAME}`. +3. Manually retry a specific job if needed using the command printed by the script at the end of the run. + +### Missing Results +If the `aggregated_results` folder is empty: +1. Check the logs of the aggregator job: + ```bash + kubectl logs job/aggregator + ``` +2. Ensure the `GCS_BUCKET_ROOT_DIR` is accessible by the pods (check Workload Identity or service account permissions if running in a restricted project). 
From 2de55f4dc44e35830dfa04a7bf7a29e247574331 Mon Sep 17 00:00:00 2001 From: "Yu-Hsuan (Amy) Lin" Date: Fri, 30 Jan 2026 07:45:52 +0000 Subject: [PATCH 20/88] Add dtype to aggregator H2D method --- Ironwood/guides/automation/aggregator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Ironwood/guides/automation/aggregator.py b/Ironwood/guides/automation/aggregator.py index 655debe6..c4e3cc20 100644 --- a/Ironwood/guides/automation/aggregator.py +++ b/Ironwood/guides/automation/aggregator.py @@ -16,7 +16,7 @@ "bw_gbyte_sec_p50", "bw_gbyte_sec_p90", "bw_gbyte_sec_p95", "bw_gbyte_sec_p99", "bw_gbyte_sec_avg", "bw_gbyte_sec_max", "bw_gbyte_sec_num_runs", "bw_gbyte_sec_min", ], "host_device": [ - "data_size_mib", "H2D_bw (GiB/s)_p50", "H2D_bw (GiB/s)_p90", + "data_size_mib", "dtype", "H2D_bw (GiB/s)_p50", "H2D_bw (GiB/s)_p90", "H2D_bw (GiB/s)_p95", "H2D_bw (GiB/s)_p99", "H2D_bw (GiB/s)_avg", "H2D_bw (GiB/s)_max", "H2D_bw (GiB/s)_num_runs", "H2D_bw (GiB/s)_min", "D2H_bw (GiB/s)_p50", "D2H_bw (GiB/s)_p90", "D2H_bw (GiB/s)_p95", From ed9f6ef017896f6aa14fc759d18fc15386263900 Mon Sep 17 00:00:00 2001 From: Hong-Yi Lin Date: Fri, 30 Jan 2026 07:51:59 +0000 Subject: [PATCH 21/88] Remove unnecessary columns when aggregating and fix a typo of per_device --- Ironwood/guides/automation/aggregator.py | 53 ++++++++++-------------- Ironwood/src/benchmark_utils.py | 2 +- 2 files changed, 23 insertions(+), 32 deletions(-) diff --git a/Ironwood/guides/automation/aggregator.py b/Ironwood/guides/automation/aggregator.py index c4e3cc20..d0d5f8a3 100644 --- a/Ironwood/guides/automation/aggregator.py +++ b/Ironwood/guides/automation/aggregator.py @@ -6,37 +6,28 @@ columns_mapping = { "collectives": [ - "topology", "op_type", "input_num_elements", "transferred_data (GB)", "dtype_bytes", "hlo_input_shape", "hlo_output_shape", - "step_time_ms_p50", "step_time_ms_p90", "step_time_ms_p95", "step_time_ms_p99", "step_time_ms_avg", "step_time_ms_max", "step_time_ms_num_runs", 
"step_time_ms_min", - "achieved_bw (GB/s)_p50", "achieved_bw (GB/s)_p90", "achieved_bw (GB/s)_p95", "achieved_bw (GB/s)_p99", "achieved_bw (GB/s)_avg", "achieved_bw (GB/s)_max", "achieved_bw (GB/s)_num_runs", "achieved_bw (GB/s)_min", + "topology", "op_type", "input_num_elements", "transferred_data (GB)", "dtype_bytes", "step_time_ms_num_runs", + "achieved_bw (GB/s)_p50", "achieved_bw (GB/s)_p90", "achieved_bw (GB/s)_p95", "achieved_bw (GB/s)_p99", "achieved_bw (GB/s)_avg", "achieved_bw (GB/s)_min", "achieved_bw (GB/s)_max", + "step_time_ms_p50", "step_time_ms_p90", "step_time_ms_p95", "step_time_ms_p99", "step_time_ms_avg", "step_time_ms_min", "step_time_ms_max", ], "hbm": [ - "num_elements", "dtype", "tensor_size_gbytes", - "time_ms_p50", "time_ms_p90", "time_ms_p95", "time_ms_p99", "time_ms_avg", "time_ms_max", "time_ms_num_runs", "time_ms_min", - "bw_gbyte_sec_p50", "bw_gbyte_sec_p90", "bw_gbyte_sec_p95", "bw_gbyte_sec_p99", "bw_gbyte_sec_avg", "bw_gbyte_sec_max", "bw_gbyte_sec_num_runs", "bw_gbyte_sec_min", + "dtype", "tensor_size_gbytes", "time_ms_num_runs", + "bw_gbyte_sec_p50", "bw_gbyte_sec_p90", "bw_gbyte_sec_p95", "bw_gbyte_sec_p99", "bw_gbyte_sec_avg", "bw_gbyte_sec_min", "bw_gbyte_sec_max", + "time_ms_p50", "time_ms_p90", "time_ms_p95", "time_ms_p99", "time_ms_avg", "time_ms_min", "time_ms_max", ], "host_device": [ - "data_size_mib", "dtype", "H2D_bw (GiB/s)_p50", "H2D_bw (GiB/s)_p90", - "H2D_bw (GiB/s)_p95", "H2D_bw (GiB/s)_p99", "H2D_bw (GiB/s)_avg", - "H2D_bw (GiB/s)_max", "H2D_bw (GiB/s)_num_runs", "H2D_bw (GiB/s)_min", - "D2H_bw (GiB/s)_p50", "D2H_bw (GiB/s)_p90", "D2H_bw (GiB/s)_p95", - "D2H_bw (GiB/s)_p99", "D2H_bw (GiB/s)_avg", "D2H_bw (GiB/s)_max", - "D2H_bw (GiB/s)_num_runs", "D2H_bw (GiB/s)_min" + "data_size_mib", "H2D_bw (GiB/s)_num_runs", + "H2D_bw (GiB/s)_p50", "H2D_bw (GiB/s)_p90", "H2D_bw (GiB/s)_p95", "H2D_bw (GiB/s)_p99", + "H2D_bw (GiB/s)_avg", "H2D_bw (GiB/s)_min", "H2D_bw (GiB/s)_max", + "D2H_bw (GiB/s)_p50", "D2H_bw (GiB/s)_p90", 
"D2H_bw (GiB/s)_p95", "D2H_bw (GiB/s)_p99", + "D2H_bw (GiB/s)_avg", "D2H_bw (GiB/s)_min", "D2H_bw (GiB/s)_max", ], "training": [ - "m", "n", "k", "dtype", "StepTime(median,ms)", - "Throughput(median,TFLOP/s/device)", "TotalThroughput(median,TFLOP/s)", - "MFU", "total_flops", "step_time_ms_p50", "step_time_ms_p90", - "step_time_ms_p95", "step_time_ms_p99", "step_time_ms_avg", - "step_time_ms_max", "step_time_ms_num_runs", "step_time_ms_min", - "tflops_per_sec_pre_device_p50", "tflops_per_sec_pre_device_p90", - "tflops_per_sec_pre_device_p95", "tflops_per_sec_pre_device_p99", - "tflops_per_sec_pre_device_avg", "tflops_per_sec_pre_device_max", - "tflops_per_sec_pre_device_num_runs", "tflops_per_sec_pre_device_min", - "tflops_per_sec_p50", "tflops_per_sec_p90", "tflops_per_sec_p95", - "tflops_per_sec_p99", "tflops_per_sec_avg", "tflops_per_sec_max", - "tflops_per_sec_num_runs", "tflops_per_sec_min", "MFU_p50", "MFU_p90", - "MFU_p95", "MFU_p99", "MFU_avg", "MFU_max", "MFU_num_runs", "MFU_min" + "m", "n", "k", "dtype", "step_time_ms_num_runs", + "tflops_per_sec_per_device_p50", "tflops_per_sec_per_device_p90", + "tflops_per_sec_per_device_p95", "tflops_per_sec_per_device_p99", + "tflops_per_sec_per_device_avg", "tflops_per_sec_per_device_min", + "tflops_per_sec_per_device_max", ], } @@ -59,7 +50,7 @@ def aggregate_collectives(directories: list[str], picked_columns: list[str]) -> file = glob.glob(f"{directory}/*.tsv")[0] df = pd.read_csv(file, sep='\t') df["topology"] = [file.split('/')[-4].split('-')[1] for _ in range(df.shape[0])] - aggregated_df = pd.concat([aggregated_df, df[picked_columns]], ignore_index=True) + aggregated_df = pd.concat([aggregated_df, df[picked_columns].rename(columns={"step_time_ms_num_runs": "num_runs"})], ignore_index=True) return aggregated_df def aggregate_hbm(directories: list[str], picked_columns: list[str]) -> pd.DataFrame: @@ -69,7 +60,7 @@ def aggregate_hbm(directories: list[str], picked_columns: list[str]) -> pd.DataF for directory in 
directories: file = glob.glob(f"{directory}/*.tsv")[0] df = pd.read_csv(file, sep='\t') - aggregated_df = pd.concat([aggregated_df, df[picked_columns]], ignore_index=True) + aggregated_df = pd.concat([aggregated_df, df[picked_columns].rename(columns={"time_ms_num_runs": "num_runs"})], ignore_index=True) return aggregated_df def aggregate_host_device(directories: list[str], picked_columns: list[str]) -> pd.DataFrame: @@ -79,10 +70,10 @@ def aggregate_host_device(directories: list[str], picked_columns: list[str]) -> for directory in directories: file = glob.glob(f"{directory}/*.tsv")[0] df = pd.read_csv(file, sep='\t') - aggregated_df = pd.concat([aggregated_df, df[picked_columns]], ignore_index=True) + aggregated_df = pd.concat([aggregated_df, df[picked_columns].rename(columns={"H2D_bw (GiB/s)_num_runs": "num_runs"})], ignore_index=True) return aggregated_df -def aggregate_training(directories: list[str], picked_columns: list[str]) -> pd.DataFrame: +def aggregate_gemm(directories: list[str], picked_columns: list[str]) -> pd.DataFrame: if len(directories) == 0: return None aggregated_df = pd.DataFrame() @@ -92,14 +83,14 @@ def aggregate_training(directories: list[str], picked_columns: list[str]) -> pd. 
df = pd.read_csv(file, sep='\t') if "topology" in picked_columns: df["topology"] = [file.split('/')[-4].split('-')[1] for _ in range(df.shape[0])] - aggregated_df = pd.concat([aggregated_df, df[picked_columns]], ignore_index=True) + aggregated_df = pd.concat([aggregated_df, df[picked_columns].rename(columns={"step_time_ms_num_runs": "num_runs"})], ignore_index=True) return aggregated_df aggregate_function = { "collectives": aggregate_collectives, "hbm": aggregate_hbm, "host_device": aggregate_host_device, - "training": aggregate_training + "training": aggregate_gemm, } def aggregate_results(bucket_path: str, local_dir: str): diff --git a/Ironwood/src/benchmark_utils.py b/Ironwood/src/benchmark_utils.py index e28f39e4..fa1fb81c 100644 --- a/Ironwood/src/benchmark_utils.py +++ b/Ironwood/src/benchmark_utils.py @@ -1134,7 +1134,7 @@ def unified_flops_metrics( metrics_list=time_ms_list, metrics_name="step_time_ms" ) tflops_per_sec_statistics = MetricsStatistics( - metrics_list=tflops_per_sec_list, metrics_name="tflops_per_sec_pre_device" + metrics_list=tflops_per_sec_list, metrics_name="tflops_per_sec_per_device" ) tflops_per_sec_all_devices_statistics = MetricsStatistics( metrics_list=tflops_per_sec_all_devices, metrics_name="tflops_per_sec" From aad5d9db02dd3bb016e64bddcdabd6693c0444e2 Mon Sep 17 00:00:00 2001 From: Hong-Yi Lin Date: Fri, 30 Jan 2026 08:01:11 +0000 Subject: [PATCH 22/88] Create config folder and modify kubenetes yaml for gemm test --- .../configs/gemm/gemm_multiple_run_more.yaml | 75 +++++++++++++++++++ .../guides/automation/tpu7x-2x2x1-gemm.yaml | 2 +- 2 files changed, 76 insertions(+), 1 deletion(-) create mode 100644 Ironwood/configs/gemm/gemm_multiple_run_more.yaml diff --git a/Ironwood/configs/gemm/gemm_multiple_run_more.yaml b/Ironwood/configs/gemm/gemm_multiple_run_more.yaml new file mode 100644 index 00000000..ea89f98b --- /dev/null +++ b/Ironwood/configs/gemm/gemm_multiple_run_more.yaml @@ -0,0 +1,75 @@ +benchmarks: +- benchmark_name: 
"gemm_multiple_run" + trace_dir: "../microbenchmarks/gemm_multiple_run_bf16" + csv_path: "../microbenchmarks/gemm_multiple_run_bf16" + xlml_metrics_dir: "../microbenchmarks/gemm_multiple_run_bf16" + xla_dump_dir: "../microbenchmarks/gemm_multiple_run_bf16/hlo_graphs" + benchmark_sweep_params: + - {m: 128, k: 128, n: 128, num_runs: 100, dtype: 'bfloat16'} + - {m: 256, k: 256, n: 256, num_runs: 100, dtype: 'bfloat16'} + - {m: 512, k: 512, n: 512, num_runs: 100, dtype: 'bfloat16'} + - {m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'bfloat16'} + - {m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'bfloat16'} + - {m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'bfloat16'} + - {m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'bfloat16'} + - {m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'bfloat16'} + +- benchmark_name: "gemm_multiple_run" + trace_dir: "../microbenchmarks/gemm_multiple_run_f32" + csv_path: "../microbenchmarks/gemm_multiple_run_f32" + xlml_metrics_dir: "../microbenchmarks/gemm_multiple_run_f32" + xla_dump_dir: "../microbenchmarks/gemm_multiple_run_f32/hlo_graphs" + benchmark_sweep_params: + - {m: 128, k: 128, n: 128, num_runs: 100, dtype: 'float32'} + - {m: 256, k: 256, n: 256, num_runs: 100, dtype: 'float32'} + - {m: 512, k: 512, n: 512, num_runs: 100, dtype: 'float32'} + - {m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'float32'} + - {m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'float32'} + - {m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'float32'} + - {m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float32'} + - {m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float32'} + +- benchmark_name: "gemm_multiple_run" + trace_dir: "../microbenchmarks/gemm_multiple_run_fp16" + csv_path: "../microbenchmarks/gemm_multiple_run_fp16" + xlml_metrics_dir: "../microbenchmarks/gemm_multiple_run_fp16" + xla_dump_dir: "../microbenchmarks/gemm_multiple_run_fp16/hlo_graphs" + benchmark_sweep_params: + - {m: 128, k: 128, n: 128, 
num_runs: 100, dtype: 'float16'} + - {m: 256, k: 256, n: 256, num_runs: 100, dtype: 'float16'} + - {m: 512, k: 512, n: 512, num_runs: 100, dtype: 'float16'} + - {m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'float16'} + - {m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'float16'} + - {m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'float16'} + - {m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float16'} + - {m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float16'} + +- benchmark_name: "gemm_multiple_run" + trace_dir: "../microbenchmarks/gemm_multiple_run_fp8" + csv_path: "../microbenchmarks/gemm_multiple_run_fp8" + xlml_metrics_dir: "../microbenchmarks/gemm_multiple_run_fp8" + xla_dump_dir: "../microbenchmarks/gemm_multiple_run_fp8/hlo_graphs" + benchmark_sweep_params: + - {m: 128, k: 128, n: 128, num_runs: 100, dtype: 'float8'} + - {m: 256, k: 256, n: 256, num_runs: 100, dtype: 'float8'} + - {m: 512, k: 512, n: 512, num_runs: 100, dtype: 'float8'} + - {m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'float8'} + - {m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'float8'} + - {m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'float8'} + - {m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float8'} + - {m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float8'} + +- benchmark_name: "gemm_multiple_run" + trace_dir: "../microbenchmarks/gemm_multiple_run_fp4" + csv_path: "../microbenchmarks/gemm_multiple_run_fp4" + xlml_metrics_dir: "../microbenchmarks/gemm_multiple_run_fp4" + xla_dump_dir: "../microbenchmarks/gemm_multiple_run_fp4/hlo_graphs" + benchmark_sweep_params: + - {m: 128, k: 128, n: 128, num_runs: 100, dtype: 'float4'} + - {m: 256, k: 256, n: 256, num_runs: 100, dtype: 'float4'} + - {m: 512, k: 512, n: 512, num_runs: 100, dtype: 'float4'} + - {m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'float4'} + - {m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'float4'} + - {m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'float4'} + - 
{m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float4'} + - {m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float4'} \ No newline at end of file diff --git a/Ironwood/guides/automation/tpu7x-2x2x1-gemm.yaml b/Ironwood/guides/automation/tpu7x-2x2x1-gemm.yaml index 1c9fa143..8181941b 100644 --- a/Ironwood/guides/automation/tpu7x-2x2x1-gemm.yaml +++ b/Ironwood/guides/automation/tpu7x-2x2x1-gemm.yaml @@ -52,7 +52,7 @@ spec: pip install -r requirements.txt GCS_BUCKET_DIR=${GCS_PATH} - python Ironwood/src/run_benchmark.py --config="Ironwood/configs/training/gemm_multiple_run_more.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/gemm/gemm_multiple_run_more.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} resources: requests: google.com/tpu: 4 From cb79abbba193efa7e12de32e792271b21e0dad5e Mon Sep 17 00:00:00 2001 From: Hong-Yi Lin Date: Fri, 30 Jan 2026 08:32:10 +0000 Subject: [PATCH 23/88] Update aggregator for gemm test --- Ironwood/guides/automation/aggregator.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/Ironwood/guides/automation/aggregator.py b/Ironwood/guides/automation/aggregator.py index d0d5f8a3..931d5ab3 100644 --- a/Ironwood/guides/automation/aggregator.py +++ b/Ironwood/guides/automation/aggregator.py @@ -22,7 +22,7 @@ "D2H_bw (GiB/s)_p50", "D2H_bw (GiB/s)_p90", "D2H_bw (GiB/s)_p95", "D2H_bw (GiB/s)_p99", "D2H_bw (GiB/s)_avg", "D2H_bw (GiB/s)_min", "D2H_bw (GiB/s)_max", ], - "training": [ + "gemm": [ "m", "n", "k", "dtype", "step_time_ms_num_runs", "tflops_per_sec_per_device_p50", "tflops_per_sec_per_device_p90", "tflops_per_sec_per_device_p95", "tflops_per_sec_per_device_p99", @@ -81,8 +81,6 @@ def aggregate_gemm(directories: list[str], picked_columns: list[str]) -> pd.Data files = glob.glob(f"{directory}/*.tsv") for file in files: df = pd.read_csv(file, sep='\t') - if "topology" in picked_columns: - df["topology"] = [file.split('/')[-4].split('-')[1] for _ 
in range(df.shape[0])] aggregated_df = pd.concat([aggregated_df, df[picked_columns].rename(columns={"step_time_ms_num_runs": "num_runs"})], ignore_index=True) return aggregated_df @@ -90,11 +88,11 @@ def aggregate_gemm(directories: list[str], picked_columns: list[str]) -> pd.Data "collectives": aggregate_collectives, "hbm": aggregate_hbm, "host_device": aggregate_host_device, - "training": aggregate_gemm, + "gemm": aggregate_gemm, } def aggregate_results(bucket_path: str, local_dir: str): - categories = ["collectives", "hbm", "host_device", "training"] + categories = ["collectives", "hbm", "host_device", "gemm"] directories = {} results = {} for category in categories: From b10e6bb275e83519c056faa320cb57325cc5d7a9 Mon Sep 17 00:00:00 2001 From: Hong-Yi Lin Date: Fri, 30 Jan 2026 09:11:43 +0000 Subject: [PATCH 24/88] Add dtype string in aggregated TSV file --- Ironwood/guides/automation/aggregator.py | 2 +- Ironwood/src/benchmark_collectives.py | 2 ++ Ironwood/src/benchmark_hbm.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/Ironwood/guides/automation/aggregator.py b/Ironwood/guides/automation/aggregator.py index 931d5ab3..26f53d87 100644 --- a/Ironwood/guides/automation/aggregator.py +++ b/Ironwood/guides/automation/aggregator.py @@ -6,7 +6,7 @@ columns_mapping = { "collectives": [ - "topology", "op_type", "input_num_elements", "transferred_data (GB)", "dtype_bytes", "step_time_ms_num_runs", + "topology", "op_type", "input_num_elements", "transferred_data (GB)", "dtype", "step_time_ms_num_runs", "achieved_bw (GB/s)_p50", "achieved_bw (GB/s)_p90", "achieved_bw (GB/s)_p95", "achieved_bw (GB/s)_p99", "achieved_bw (GB/s)_avg", "achieved_bw (GB/s)_min", "achieved_bw (GB/s)_max", "step_time_ms_p50", "step_time_ms_p90", "step_time_ms_p95", "step_time_ms_p99", "step_time_ms_avg", "step_time_ms_min", "step_time_ms_max", ], diff --git a/Ironwood/src/benchmark_collectives.py b/Ironwood/src/benchmark_collectives.py index 69b4d212..e142f59b 100644 --- 
a/Ironwood/src/benchmark_collectives.py +++ b/Ironwood/src/benchmark_collectives.py @@ -98,6 +98,7 @@ def unified_ici_collectives_metrics( hlo_first_replica_group = [] input_num_elements = matrix_shape[0] * matrix_shape[1] * matrix_shape[2] + dtype_name = dtype.dtype.name dtype_bytes = dtype.dtype.itemsize if xla_output: xla_output_json = json.loads(xla_output) @@ -169,6 +170,7 @@ def unified_ici_collectives_metrics( "input_num_elements": input_num_elements, "matrix_shape": json.dumps(f"({matrix_shape})"), "transferred_data (GB)": transferred_data, + "dtype": dtype_name, "dtype_bytes": dtype_bytes, "hlo_input_shape": json.dumps(hlo_input_shape), "hlo_output_shape": json.dumps(hlo_output_shape), diff --git a/Ironwood/src/benchmark_hbm.py b/Ironwood/src/benchmark_hbm.py index bb279f42..53744b5d 100644 --- a/Ironwood/src/benchmark_hbm.py +++ b/Ironwood/src/benchmark_hbm.py @@ -29,7 +29,7 @@ def get_metrics_helper( for key, value in params if value is not None and key not in exclude_keys } - metadata["dtype"] = metadata["dtype"].dtype.itemsize + metadata["dtype"] = metadata["dtype"].dtype.name return metadata From 6f525bf9eb34ba79df9132e8c70a716b308b6658 Mon Sep 17 00:00:00 2001 From: Hong-Yi Lin Date: Fri, 30 Jan 2026 09:23:55 +0000 Subject: [PATCH 25/88] Add multiple precisions for HBM test --- Ironwood/configs/hbm/hbm.yaml | 29 ++++++++++++++++++++---- Ironwood/guides/automation/aggregator.py | 14 +++++++----- 2 files changed, 33 insertions(+), 10 deletions(-) diff --git a/Ironwood/configs/hbm/hbm.yaml b/Ironwood/configs/hbm/hbm.yaml index d2c76e29..1912cb17 100644 --- a/Ironwood/configs/hbm/hbm.yaml +++ b/Ironwood/configs/hbm/hbm.yaml @@ -2,7 +2,28 @@ benchmarks: - benchmark_name: "single_device_hbm_copy" benchmark_sweep_params: - {num_elements_range: {start: 1048576, end: 4294967296, multiplier: 2}, dtype: "bfloat16", num_runs: 1} - trace_dir: "../microbenchmarks/hbm" - csv_path: "../microbenchmarks/hbm" - xlml_metrics_dir: "../microbenchmarks/hbm" - xla_dump_dir: 
"../microbenchmarks/hbm/hlo_graphs" \ No newline at end of file + trace_dir: "../microbenchmarks/hbm_bfloat16" + csv_path: "../microbenchmarks/hbm_bfloat16" + xlml_metrics_dir: "../microbenchmarks/hbm_bfloat16" + xla_dump_dir: "../microbenchmarks/hbm_bfloat16/hlo_graphs" +- benchmark_name: "single_device_hbm_copy" + benchmark_sweep_params: + - {num_elements_range: {start: 1048576, end: 4294967296, multiplier: 2}, dtype: "float32", num_runs: 1} + trace_dir: "../microbenchmarks/hbm_float32" + csv_path: "../microbenchmarks/hbm_float32" + xlml_metrics_dir: "../microbenchmarks/hbm_float32" + xla_dump_dir: "../microbenchmarks/hbm_float32/hlo_graphs" +- benchmark_name: "single_device_hbm_copy" + benchmark_sweep_params: + - {num_elements_range: {start: 1048576, end: 4294967296, multiplier: 2}, dtype: "float8", num_runs: 1} + trace_dir: "../microbenchmarks/hbm_float8" + csv_path: "../microbenchmarks/hbm_float8" + xlml_metrics_dir: "../microbenchmarks/hbm_float8" + xla_dump_dir: "../microbenchmarks/hbm_float8/hlo_graphs" +- benchmark_name: "single_device_hbm_copy" + benchmark_sweep_params: + - {num_elements_range: {start: 1048576, end: 4294967296, multiplier: 2}, dtype: "float16", num_runs: 1} + trace_dir: "../microbenchmarks/hbm_float16" + csv_path: "../microbenchmarks/hbm_float16" + xlml_metrics_dir: "../microbenchmarks/hbm_float16" + xla_dump_dir: "../microbenchmarks/hbm_float16/hlo_graphs" \ No newline at end of file diff --git a/Ironwood/guides/automation/aggregator.py b/Ironwood/guides/automation/aggregator.py index 26f53d87..bc5e5e24 100644 --- a/Ironwood/guides/automation/aggregator.py +++ b/Ironwood/guides/automation/aggregator.py @@ -58,9 +58,10 @@ def aggregate_hbm(directories: list[str], picked_columns: list[str]) -> pd.DataF return None aggregated_df = pd.DataFrame() for directory in directories: - file = glob.glob(f"{directory}/*.tsv")[0] - df = pd.read_csv(file, sep='\t') - aggregated_df = pd.concat([aggregated_df, 
df[picked_columns].rename(columns={"time_ms_num_runs": "num_runs"})], ignore_index=True) + files = glob.glob(f"{directory}/*.tsv") + for file in files: + df = pd.read_csv(file, sep='\t') + aggregated_df = pd.concat([aggregated_df, df[picked_columns].rename(columns={"time_ms_num_runs": "num_runs"})], ignore_index=True) return aggregated_df def aggregate_host_device(directories: list[str], picked_columns: list[str]) -> pd.DataFrame: @@ -68,9 +69,10 @@ def aggregate_host_device(directories: list[str], picked_columns: list[str]) -> return None aggregated_df = pd.DataFrame() for directory in directories: - file = glob.glob(f"{directory}/*.tsv")[0] - df = pd.read_csv(file, sep='\t') - aggregated_df = pd.concat([aggregated_df, df[picked_columns].rename(columns={"H2D_bw (GiB/s)_num_runs": "num_runs"})], ignore_index=True) + files = glob.glob(f"{directory}/*.tsv") + for file in files: + df = pd.read_csv(file, sep='\t') + aggregated_df = pd.concat([aggregated_df, df[picked_columns].rename(columns={"H2D_bw (GiB/s)_num_runs": "num_runs"})], ignore_index=True) return aggregated_df def aggregate_gemm(directories: list[str], picked_columns: list[str]) -> pd.DataFrame: From 69661f97afa44e434898c97226bfecab7f7a015a Mon Sep 17 00:00:00 2001 From: "Yu-Hsuan (Amy) Lin" Date: Fri, 30 Jan 2026 13:44:00 +0000 Subject: [PATCH 26/88] Print pending process status every minute --- .../guides/automation/automation_launch.sh | 85 +++++++++++++------ 1 file changed, 61 insertions(+), 24 deletions(-) diff --git a/Ironwood/guides/automation/automation_launch.sh b/Ironwood/guides/automation/automation_launch.sh index 1f544499..9b65bbf7 100644 --- a/Ironwood/guides/automation/automation_launch.sh +++ b/Ironwood/guides/automation/automation_launch.sh @@ -83,7 +83,6 @@ wait_for_job_completion() { # Returns a list of failed yaml files in the variable FAILED_JOBS apply_and_wait() { local yaml_files=("$@") - local pids=() local job_names_in_batch=() FAILED_JOBS=() @@ -102,29 +101,67 @@ apply_and_wait() 
{ job_names_in_batch+=("${JOB_NAME}") done - # Wait for completion in background + # Monitor jobs + local start_time=$(date +%s) + local end_time=$((start_time + TIMEOUT_SECOND)) + local last_print_time=0 + + while true; do + local current_time=$(date +%s) + if [[ $current_time -gt $end_time ]]; then + echo "Timeout waiting for batch completion" + break + fi + + # Identify active jobs + local active_jobs=() + for job_name in "${job_names_in_batch[@]}"; do + # Check for Complete + if kubectl get job "${job_name}" -o jsonpath='{.status.conditions[?(@.type=="Complete")].status}' 2>/dev/null | grep -q "True"; then + continue + fi + + # Check for Failed + if kubectl get job "${job_name}" -o jsonpath='{.status.conditions[?(@.type=="Failed")].status}' 2>/dev/null | grep -q "True"; then + continue + fi + + # If neither, it's pending/running + active_jobs+=("${job_name}") + done + + if [[ ${#active_jobs[@]} -eq 0 ]]; then + break + fi + + # Dashboard View - Print every 60 seconds + if [[ $((current_time - last_print_time)) -ge 60 ]]; then + echo "======================================================================" + date "+%Y-%m-%d %H:%M:%S" + echo "----------------------------------------------------------------------" + kubectl get jobs "${active_jobs[@]}" + echo "======================================================================" + last_print_time=$current_time + fi + + sleep 10 + done + + # Collect results and cleanup + FAILED_JOBS=() for i in "${!yaml_files[@]}"; do local yaml_file="${yaml_files[$i]}" - local filepath="${SCRIPT_DIR}/${yaml_file}" local job_name="${job_names_in_batch[$i]}" - export GCS_PATH="${GCS_BUCKET_ROOT_DIR}/${job_name}" - ( - wait_for_job_completion "${job_name}" ${TIMEOUT_SECOND} - wait_status=$? - - export JOB_NAME="${job_name}" - envsubst '${JOB_NAME} ${GCS_PATH}' < "${filepath}" | kubectl delete -f - &> /dev/null - exit $wait_status - ) & - pids+=($!) 
- done - - # Collect results - for i in "${!pids[@]}"; do - wait "${pids[$i]}" - if [[ $? -ne 0 ]]; then - FAILED_JOBS+=("${yaml_files[$i]}") + local filepath="${SCRIPT_DIR}/${yaml_file}" + + # Check if failed or still running (timeout) + if ! kubectl get job "${job_name}" -o jsonpath='{.status.conditions[?(@.type=="Complete")].status}' 2>/dev/null | grep -q "True"; then + FAILED_JOBS+=("${yaml_files[$i]}") fi + + export JOB_NAME="${job_name}" + export GCS_PATH="${GCS_BUCKET_ROOT_DIR}/${job_name}" + envsubst '${JOB_NAME} ${GCS_PATH}' < "${filepath}" | kubectl delete -f - &> /dev/null done } @@ -133,15 +170,15 @@ current_batch=("${yaml_names[@]}") for (( retry=1; retry<=MAX_RETRIES; retry++ )); do apply_and_wait "${current_batch[@]}" - + if [[ ${#FAILED_JOBS[@]} -eq 0 ]]; then echo "All jobs completed successfully in Round ${retry}!" break fi - + echo "Round ${retry} finished. ${#FAILED_JOBS[@]} jobs failed." current_batch=("${FAILED_JOBS[@]}") - + if [[ ${retry} -lt ${MAX_RETRIES} ]]; then echo "Retrying failed jobs..." echo "========================================" @@ -174,4 +211,4 @@ if [[ ${#FAILED_JOBS[@]} -gt 0 ]]; then done else echo "Success! All jobs finished." 
-fi \ No newline at end of file +fi From 3e4b59a0d8a5e87bea51f8c934895a4c9fda3ac5 Mon Sep 17 00:00:00 2001 From: Chi Shuen Lee Date: Tue, 27 Jan 2026 15:38:39 +0800 Subject: [PATCH 27/88] Revert the changes that were made for an urgent demo (#90) --- Ironwood/configs/collectives/all_gather_1d.yaml | 3 ++- Ironwood/configs/collectives/all_gather_2d.yaml | 3 ++- Ironwood/configs/collectives/all_gather_3d.yaml | 3 ++- Ironwood/configs/collectives/all_gather_demo.yaml | 10 +++++++--- .../configs/collectives/all_gather_tpu7x_2x2x1.yaml | 3 ++- .../configs/collectives/all_gather_tpu7x_2x2x2.yaml | 3 ++- .../configs/collectives/all_gather_tpu7x_2x2x4.yaml | 3 ++- .../configs/collectives/all_gather_tpu7x_2x4x4.yaml | 3 ++- .../configs/collectives/all_gather_tpu7x_4x4x4.yaml | 5 +++-- .../configs/collectives/all_gather_tpu7x_4x4x8.yaml | 3 ++- Ironwood/configs/collectives/all_reduce_1d.yaml | 3 ++- Ironwood/configs/collectives/all_reduce_2d.yaml | 3 ++- Ironwood/configs/collectives/all_reduce_3d.yaml | 3 ++- .../configs/collectives/all_reduce_tpu7x_2x2x1.yaml | 3 ++- .../configs/collectives/all_reduce_tpu7x_2x2x2.yaml | 3 ++- .../configs/collectives/all_reduce_tpu7x_2x2x4.yaml | 3 ++- .../configs/collectives/all_reduce_tpu7x_2x4x4.yaml | 3 ++- .../configs/collectives/all_reduce_tpu7x_4x4x4.yaml | 3 ++- .../configs/collectives/all_reduce_tpu7x_4x4x8.yaml | 3 ++- Ironwood/configs/collectives/all_to_all_1d.yaml | 2 +- Ironwood/configs/collectives/all_to_all_2d.yaml | 2 +- Ironwood/configs/collectives/all_to_all_3d.yaml | 2 +- .../configs/collectives/all_to_all_tpu7x_2x2x1.yaml | 3 ++- .../configs/collectives/all_to_all_tpu7x_2x2x2.yaml | 3 ++- .../configs/collectives/all_to_all_tpu7x_2x2x4.yaml | 3 ++- .../configs/collectives/all_to_all_tpu7x_2x4x4.yaml | 3 ++- .../configs/collectives/all_to_all_tpu7x_4x4x4.yaml | 3 ++- .../configs/collectives/all_to_all_tpu7x_4x4x8.yaml | 3 ++- Ironwood/configs/collectives/reduce_scatter_1d.yaml | 3 ++- 
Ironwood/configs/collectives/reduce_scatter_2d.yaml | 3 ++- 30 files changed, 63 insertions(+), 33 deletions(-) diff --git a/Ironwood/configs/collectives/all_gather_1d.yaml b/Ironwood/configs/collectives/all_gather_1d.yaml index 0b1313dc..85d8fc3e 100644 --- a/Ironwood/configs/collectives/all_gather_1d.yaml +++ b/Ironwood/configs/collectives/all_gather_1d.yaml @@ -1,7 +1,8 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "1x1x4", op_dimension: 1, num_runs: 5} # Non-Parallel Replica trace_dir: "../microbenchmarks/all_gather_1d" csv_path: "../microbenchmarks/all_gather_1d" xlml_metrics_dir: "../microbenchmarks/all_gather_1d" diff --git a/Ironwood/configs/collectives/all_gather_2d.yaml b/Ironwood/configs/collectives/all_gather_2d.yaml index c45f3e70..2d7a0e7a 100644 --- a/Ironwood/configs/collectives/all_gather_2d.yaml +++ b/Ironwood/configs/collectives/all_gather_2d.yaml @@ -1,7 +1,8 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2, num_runs: 5} + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2, num_runs: 5} # Parallel Replica Groups + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x32", 
ici_size_range: 128, sharding_strategy: "1x32", op_dimension: 2, num_runs: 5} # Non Parallel Replica Groups trace_dir: "../microbenchmarks/all_gather_2d" csv_path: "../microbenchmarks/all_gather_2d" xlml_metrics_dir: "../microbenchmarks/all_gather_2d" diff --git a/Ironwood/configs/collectives/all_gather_3d.yaml b/Ironwood/configs/collectives/all_gather_3d.yaml index e159adfd..cc876a08 100644 --- a/Ironwood/configs/collectives/all_gather_3d.yaml +++ b/Ironwood/configs/collectives/all_gather_3d.yaml @@ -1,7 +1,8 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "64x2", ici_size_range: 128, sharding_strategy: "64x1", op_dimension: 3, num_runs: 5} + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "64x2", ici_size_range: 128, sharding_strategy: "64x1", op_dimension: 3, num_runs: 5} # Parallel Replica Groups + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "4x4x8", op_dimension: 3, num_runs: 5} # Non Parallel Replica Groups trace_dir: "../microbenchmarks/all_gather_3d" csv_path: "../microbenchmarks/all_gather_3d" xlml_metrics_dir: "../microbenchmarks/all_gather_3d" diff --git a/Ironwood/configs/collectives/all_gather_demo.yaml b/Ironwood/configs/collectives/all_gather_demo.yaml index a9d776cd..6fb5a757 100644 --- a/Ironwood/configs/collectives/all_gather_demo.yaml +++ b/Ironwood/configs/collectives/all_gather_demo.yaml @@ -1,9 +1,13 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1} - - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32", mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2} - - 
{matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32", mesh_shape: "64x2", ici_size_range: 128, sharding_strategy: "64x1", op_dimension: 3} + - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1} # Parallel Replica + - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "1x1x4", op_dimension: 1} # Non-Parallel Replica + - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32", mesh_shape: "4x32", ici_size_range: 128, sharding_strategy: "1x32", op_dimension: 2} # Non Parallel Replica Groups + - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32", mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2} # Parallel Replica Groups + - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32", mesh_shape: "64x2", ici_size_range: 128, sharding_strategy: "64x1", op_dimension: 3} # Parallel Replica Groups + - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "4x4x8", op_dimension: 3} # Non Parallel Replica Groups + warmup_tries: 10 trace_dir: "../microbenchmarks/all_gather_demo" diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml index 0338aef1..9bc586a1 100644 --- a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml +++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml @@ -1,7 +1,8 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", 
mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5} # Parallel Replica + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica trace_dir: "../microbenchmarks/all_gather_tpu7x_2x2x1" csv_path: "../microbenchmarks/all_gather_tpu7x_2x2x1" xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x2x1" diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml index 9253bac5..b5be0c8d 100644 --- a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml +++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml @@ -1,7 +1,8 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5} # Parallel Replica + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica trace_dir: "../microbenchmarks/all_gather_tpu7x_2x2x2" csv_path: "../microbenchmarks/all_gather_tpu7x_2x2x2" xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x2x2" diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml index 9f8af67f..09b02979 100644 --- a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml +++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml @@ -1,7 +1,8 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: 
{start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica + - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica trace_dir: "../microbenchmarks/all_gather_tpu7x_2x2x4" csv_path: "../microbenchmarks/all_gather_tpu7x_2x2x4" xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x2x4" diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml index 724fff00..4f6cf11a 100644 --- a/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml +++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml @@ -1,7 +1,8 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica + - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica trace_dir: "../microbenchmarks/all_gather_tpu7x_2x4x4" csv_path: "../microbenchmarks/all_gather_tpu7x_2x4x4" xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x4x4" diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml index 
65189cc9..77f3ed13 100644 --- a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml +++ b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml @@ -1,8 +1,9 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 10} + - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica + - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica trace_dir: "../microbenchmarks/all_gather_tpu7x_4x4x4" csv_path: "../microbenchmarks/all_gather_tpu7x_4x4x4" xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_4x4x4" - xla_dump_dir: "../microbenchmarks/all_gather_tpu7x_4x4x4/hlo_graphs" + xla_dump_dir: "../microbenchmarks/all_gather_tpu7x_4x4x4/hlo_graphs" \ No newline at end of file diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml index 77c4da6f..12743d61 100644 --- a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml +++ b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml @@ -1,7 +1,8 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica + - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: 
"float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica trace_dir: "../microbenchmarks/all_gather_tpu7x_4x4x8" csv_path: "../microbenchmarks/all_gather_tpu7x_4x4x8" xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_4x4x8" diff --git a/Ironwood/configs/collectives/all_reduce_1d.yaml b/Ironwood/configs/collectives/all_reduce_1d.yaml index d12d4221..7b1d3068 100644 --- a/Ironwood/configs/collectives/all_reduce_1d.yaml +++ b/Ironwood/configs/collectives/all_reduce_1d.yaml @@ -1,7 +1,8 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1" , op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1" , op_dimension: 1, num_runs: 5} # Parallel Replica + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "1x1x4", op_dimension: 1, num_runs: 5} # Non Parallel Replica trace_dir: "../microbenchmarks/all_reduce_1d" csv_path: "../microbenchmarks/all_reduce_1d" xlml_metrics_dir: "../microbenchmarks/all_reduce_1d" diff --git a/Ironwood/configs/collectives/all_reduce_2d.yaml b/Ironwood/configs/collectives/all_reduce_2d.yaml index 5aa9654e..93e1a7c9 100644 --- a/Ironwood/configs/collectives/all_reduce_2d.yaml +++ b/Ironwood/configs/collectives/all_reduce_2d.yaml @@ -1,7 +1,8 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2, num_runs: 5} + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x16x2", ici_size_range: 128, 
sharding_strategy: "1x16x1", op_dimension: 2, num_runs: 5} # Parallel Replica Groups + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x32", ici_size_range: 128, sharding_strategy: "1x32", op_dimension: 2, num_runs: 5} # Non Parallel Replica Groups trace_dir: "../microbenchmarks/all_reduce_2d" csv_path: "../microbenchmarks/all_reduce_2d" xlml_metrics_dir: "../microbenchmarks/all_reduce_2d" diff --git a/Ironwood/configs/collectives/all_reduce_3d.yaml b/Ironwood/configs/collectives/all_reduce_3d.yaml index 4e76b55f..f6a4ad9d 100644 --- a/Ironwood/configs/collectives/all_reduce_3d.yaml +++ b/Ironwood/configs/collectives/all_reduce_3d.yaml @@ -1,7 +1,8 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "64x2", ici_size_range: 128, sharding_strategy: "64x1", op_dimension: 3, num_runs: 5} + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "64x2", ici_size_range: 128, sharding_strategy: "64x1", op_dimension: 3, num_runs: 5} # Parallel Replica Groups + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "4x4x8", op_dimension: 3, num_runs: 5} # Non Parallel Replica Groups trace_dir: "../microbenchmarks/all_reduce_3d" csv_path: "../microbenchmarks/all_reduce_3d" xlml_metrics_dir: "../microbenchmarks/all_reduce_3d" diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml index 6d2d506c..f7389925 100644 --- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml +++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml @@ -1,7 +1,8 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: 
"2x2x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5} # Parallel Replica + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica trace_dir: "../microbenchmarks/psum_tpu7x_2x2x1" csv_path: "../microbenchmarks/psum_tpu7x_2x2x1" xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x2x1" diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml index d11981b0..b2cb202c 100644 --- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml +++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml @@ -1,7 +1,8 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5} # Parallel Replica + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica trace_dir: "../microbenchmarks/psum_tpu7x_2x2x2" csv_path: "../microbenchmarks/psum_tpu7x_2x2x2" xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x2x2" diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml index ab243b6f..946fd5ed 100644 --- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml +++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml @@ -1,7 +1,8 @@ benchmarks: 
- benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica + - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica trace_dir: "../microbenchmarks/psum_tpu7x_2x2x4" csv_path: "../microbenchmarks/psum_tpu7x_2x2x4" xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x2x4" diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml index c731c622..613717cf 100644 --- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml +++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml @@ -1,7 +1,8 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica + - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica trace_dir: "../microbenchmarks/psum_tpu7x_2x4x4" csv_path: "../microbenchmarks/psum_tpu7x_2x4x4" xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x4x4" diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml 
b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml index 53d8dd3d..3f4822c0 100644 --- a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml +++ b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml @@ -1,7 +1,8 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica + - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica trace_dir: "../microbenchmarks/psum_tpu7x_4x4x4" csv_path: "../microbenchmarks/psum_tpu7x_4x4x4" xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_4x4x4" diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml index f87878a4..a14bbfe8 100644 --- a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml +++ b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml @@ -1,7 +1,8 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica + - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica 
trace_dir: "../microbenchmarks/psum_tpu7x_4x4x8" csv_path: "../microbenchmarks/psum_tpu7x_4x4x8" xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_4x4x8" diff --git a/Ironwood/configs/collectives/all_to_all_1d.yaml b/Ironwood/configs/collectives/all_to_all_1d.yaml index 8d222613..3c28194d 100644 --- a/Ironwood/configs/collectives/all_to_all_1d.yaml +++ b/Ironwood/configs/collectives/all_to_all_1d.yaml @@ -1,7 +1,7 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "1x1x4", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "1x1x4", op_dimension: 1, num_runs: 5} # Non Parallel Replica trace_dir: "../microbenchmarks/all_to_all_1d" csv_path: "../microbenchmarks/all_to_all_1d" xlml_metrics_dir: "../microbenchmarks/all_to_all_1d" diff --git a/Ironwood/configs/collectives/all_to_all_2d.yaml b/Ironwood/configs/collectives/all_to_all_2d.yaml index d23115fe..b4a1bc0e 100644 --- a/Ironwood/configs/collectives/all_to_all_2d.yaml +++ b/Ironwood/configs/collectives/all_to_all_2d.yaml @@ -1,7 +1,7 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x32", ici_size_range: 128, sharding_strategy: "1x32", op_dimension: 2, num_runs: 5} + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x32", ici_size_range: 128, sharding_strategy: "1x32", op_dimension: 2, num_runs: 5} # Non Parallel Replica trace_dir: "../microbenchmarks/all_to_all_2d" csv_path: "../microbenchmarks/all_to_all_2d" xlml_metrics_dir: "../microbenchmarks/all_to_all_2d" diff --git a/Ironwood/configs/collectives/all_to_all_3d.yaml b/Ironwood/configs/collectives/all_to_all_3d.yaml index c705754c..3aa0e2a7 
100644 --- a/Ironwood/configs/collectives/all_to_all_3d.yaml +++ b/Ironwood/configs/collectives/all_to_all_3d.yaml @@ -1,7 +1,7 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "4x4x8", op_dimension: 3, num_runs: 5} + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "4x4x8", op_dimension: 3, num_runs: 5} # Non Parallel Replica Groups trace_dir: "../microbenchmarks/all_to_all_3d" csv_path: "../microbenchmarks/all_to_all_3d" xlml_metrics_dir: "../microbenchmarks/all_to_all_3d" diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml index f9786b29..96da2c38 100644 --- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml +++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml @@ -1,7 +1,8 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5} # Parallel Replica + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x1" csv_path: "../microbenchmarks/all_to_all_tpu7x_2x2x1" xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x1" diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml index b530a698..388a4468 
100644 --- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml +++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml @@ -1,7 +1,8 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5} # Parallel Replica + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x2" csv_path: "../microbenchmarks/all_to_all_tpu7x_2x2x2" xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x2" diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml index 86e3dbbc..e0cc48c9 100644 --- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml +++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml @@ -1,7 +1,8 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica + - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x4" csv_path: 
"../microbenchmarks/all_to_all_tpu7x_2x2x4" xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x4" diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml index 6d4b79fb..5ae19b6e 100644 --- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml +++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml @@ -1,7 +1,8 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica + - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x4x4" csv_path: "../microbenchmarks/all_to_all_tpu7x_2x4x4" xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x4x4" diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml index 3460ddb6..4cc8f6bb 100644 --- a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml +++ b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml @@ -1,7 +1,8 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica + - 
{matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica trace_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x4" csv_path: "../microbenchmarks/all_to_all_tpu7x_4x4x4" xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x4" diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml index 93ef7cb7..212cd92d 100644 --- a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml +++ b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml @@ -1,7 +1,8 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica + - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica trace_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x8" csv_path: "../microbenchmarks/all_to_all_tpu7x_4x4x8" xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x8" diff --git a/Ironwood/configs/collectives/reduce_scatter_1d.yaml b/Ironwood/configs/collectives/reduce_scatter_1d.yaml index 063d73fc..9c2c0dea 100644 --- a/Ironwood/configs/collectives/reduce_scatter_1d.yaml +++ b/Ironwood/configs/collectives/reduce_scatter_1d.yaml @@ -1,7 +1,8 @@ benchmarks: - benchmark_name: psum_scatter benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: 
"1x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "1x1x4", op_dimension: 1, num_runs: 5} # Non-Parallel Replica trace_dir: "../microbenchmarks/reduce_scatter_1d" csv_path: "../microbenchmarks/reduce_scatter_1d" xlml_metrics_dir: "../microbenchmarks/reduce_scatter_1d" diff --git a/Ironwood/configs/collectives/reduce_scatter_2d.yaml b/Ironwood/configs/collectives/reduce_scatter_2d.yaml index 027ac991..f329b571 100644 --- a/Ironwood/configs/collectives/reduce_scatter_2d.yaml +++ b/Ironwood/configs/collectives/reduce_scatter_2d.yaml @@ -1,7 +1,8 @@ benchmarks: - benchmark_name: psum_scatter benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2, num_runs: 5} + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2, num_runs: 5} # Parallel Replica Groups + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x32", ici_size_range: 128, sharding_strategy: "1x32", op_dimension: 2, num_runs: 5} # Non Parallel Replica Groups trace_dir: "../microbenchmarks/reduce_scatter_2d" csv_path: "../microbenchmarks/reduce_scatter_2d" xlml_metrics_dir: "../microbenchmarks/reduce_scatter_2d" From 0b24e562e9fc3442ff487069828aa7035f4dc3a9 Mon Sep 17 00:00:00 2001 From: Leonard Chan Date: Fri, 30 Jan 2026 09:51:14 +0000 Subject: [PATCH 28/88] [Ironwood] Add pipelined H2D mode to H2D benchmark --- Ironwood/configs/host_device/host_device.yaml | 3 +- Ironwood/src/benchmark_host_device.py | 82 
+++++++++++++------ 2 files changed, 60 insertions(+), 25 deletions(-) diff --git a/Ironwood/configs/host_device/host_device.yaml b/Ironwood/configs/host_device/host_device.yaml index 0b48800c..8d572ed7 100644 --- a/Ironwood/configs/host_device/host_device.yaml +++ b/Ironwood/configs/host_device/host_device.yaml @@ -3,7 +3,8 @@ benchmarks: num_runs: 20 benchmark_sweep_params: - { - data_size_mib_list: [1, 16, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768] + data_size_mib_list: [1, 16, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768], + h2d_type: ["simple", "pipelined"] } csv_path: "../microbenchmarks/host_device" trace_dir: "../microbenchmarks/host_device/trace" diff --git a/Ironwood/src/benchmark_host_device.py b/Ironwood/src/benchmark_host_device.py index 16352e2a..ec50619a 100644 --- a/Ironwood/src/benchmark_host_device.py +++ b/Ironwood/src/benchmark_host_device.py @@ -5,7 +5,6 @@ from typing import Any, Dict, Tuple, List import jax -from jax import sharding import numpy as np from benchmark_utils import MetricsStatistics @@ -23,6 +22,7 @@ def benchmark_host_device( data_size_mib: int, num_runs: int = 100, trace_dir: str = None, + h2d_type: str = "simple", ) -> Dict[str, Any]: """Benchmarks H2D/D2H transfer using simple device_put/device_get.""" @@ -32,8 +32,18 @@ def benchmark_host_device( column = 128 host_data = np.random.normal(size=(num_elements // column, column)).astype(np.float32) + # Used in pipelined flow + num_devices_to_perform_h2d = 1 + tensor_size = 4 * 1024 * 1024 + target_device = jax.devices()[:num_devices_to_perform_h2d] + mesh = jax.sharding.Mesh(np.array(target_device), axis_names=["x"]) + sharding = jax.sharding.NamedSharding(mesh, jax.sharding.PartitionSpec("x")) + pipelined_array = None + if h2d_type == "pipelined": + pipelined_array = np.random.normal(size=(tensor_size,)).astype(np.float32) + print( - f"Benchmarking Transfer with Data Size: {data_size_mib} MB for {num_runs} iterations", + f"Benchmarking Transfer with Data Size: 
{data_size_mib} MB for {num_runs} iterations with {h2d_type=}", flush=True ) @@ -65,29 +75,52 @@ def benchmark_host_device( with step_context: # H2D - t0 = time.perf_counter() - - # Simple device_put - device_array = jax.device_put(host_data) - device_array.block_until_ready() - - t1 = time.perf_counter() - h2d_perf.append((t1 - t0) * 1000) - - # Verify H2D shape - assert device_array.shape == host_data.shape - - # D2H - t2 = time.perf_counter() - - # Simple device_get - # Note: device_get returns a numpy array (copy) - _ = jax.device_get(device_array) - - t3 = time.perf_counter() - d2h_perf.append((t3 - t2) * 1000) + if h2d_type == "simple": + t0 = time.perf_counter() + # Simple device_put + device_array = jax.device_put(host_data) + device_array.block_until_ready() + t1 = time.perf_counter() + + # Verify H2D shape + assert device_array.shape == host_data.shape + + h2d_perf.append((t1 - t0) * 1000) - device_array.delete() + # D2H + t2 = time.perf_counter() + + # Simple device_get + # Note: device_get returns a numpy array (copy) + _ = jax.device_get(device_array) + + t3 = time.perf_counter() + d2h_perf.append((t3 - t2) * 1000) + + device_array.delete() + elif h2d_type == "pipelined": + tensors_on_device = [] + if data_size_mib * 1024 * 1024 < pipelined_array.nbytes: + print(f"Warning: {data_size_mib=} is smaller than pipeline unit, no data will be transferred.") + t0 = time.perf_counter() + # Assume data_size_mib is total across devices for now + bytes_left = 1024 * 1024 * data_size_mib + while bytes_left >= pipelined_array.nbytes: + with jax.profiler.StepTraceAnnotation("device_put", step_num=1): + x_device = jax.device_put(pipelined_array, sharding) + tensors_on_device.append(x_device) + bytes_left -= pipelined_array.nbytes + + total_bytes_transferred = 0 + for tensor in tensors_on_device: + tensor.block_until_ready() + total_bytes_transferred += tensor.nbytes + tensor.delete() + t1 = time.perf_counter() + + h2d_perf.append((t1 - t0) * 1000) + # Implement D2H at 
a later time after we establish H2D + d2h_perf.append(0) return { "H2D_Bandwidth_ms": h2d_perf, @@ -98,6 +131,7 @@ def benchmark_host_device_calculate_metrics( data_size_mib: int, H2D_Bandwidth_ms: List[float], D2H_Bandwidth_ms: List[float], + h2d_type: str = "simple", ) -> Tuple[Dict[str, Any], Dict[str, Any]]: """Calculates metrics for Host-Device transfer.""" params = locals().items() From a70b70151923a8f346db42597ee59e1d931473e1 Mon Sep 17 00:00:00 2001 From: Junjie Qian Date: Sat, 31 Jan 2026 22:20:26 -0800 Subject: [PATCH 29/88] add extra datatypes in configs (#94) --- Ironwood/configs/collectives/all_gather_1d.yaml | 4 ++++ Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml | 4 ++++ Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml | 4 ++++ Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml | 4 ++++ Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml | 4 ++++ Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml | 4 ++++ Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml | 4 ++++ Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml | 4 ++++ Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml | 4 ++++ Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml | 4 ++++ Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml | 4 ++++ Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml | 4 ++++ Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml | 4 ++++ Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml | 4 ++++ Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml | 4 ++++ Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml | 4 ++++ Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml | 4 ++++ Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml | 4 ++++ Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml | 4 ++++ Ironwood/configs/hbm/hbm.yaml | 9 ++++++++- 20 files changed, 84 insertions(+), 1 deletion(-) diff --git a/Ironwood/configs/collectives/all_gather_1d.yaml 
b/Ironwood/configs/collectives/all_gather_1d.yaml index 0b1313dc..8ab9faeb 100644 --- a/Ironwood/configs/collectives/all_gather_1d.yaml +++ b/Ironwood/configs/collectives/all_gather_1d.yaml @@ -2,6 +2,10 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float16", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float8", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float4", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "bfloat16", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/all_gather_1d" csv_path: "../microbenchmarks/all_gather_1d" xlml_metrics_dir: "../microbenchmarks/all_gather_1d" diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml index 0338aef1..d4ef5f2c 100644 --- a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml +++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml @@ -2,6 +2,10 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float16", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: 
"2x2x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float8", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float4", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "bfloat16", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/all_gather_tpu7x_2x2x1" csv_path: "../microbenchmarks/all_gather_tpu7x_2x2x1" xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x2x1" diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml index 9253bac5..0048163e 100644 --- a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml +++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml @@ -2,6 +2,10 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float16", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float8", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float4", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "bfloat16", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, 
num_runs: 5} trace_dir: "../microbenchmarks/all_gather_tpu7x_2x2x2" csv_path: "../microbenchmarks/all_gather_tpu7x_2x2x2" xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x2x2" diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml index 9f8af67f..3e18b0fa 100644 --- a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml +++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml @@ -2,6 +2,10 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float16", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float8", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float4", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "bfloat16", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/all_gather_tpu7x_2x2x4" csv_path: "../microbenchmarks/all_gather_tpu7x_2x2x4" xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x2x4" diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml index 724fff00..97a6ec2b 100644 --- a/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml +++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml @@ -2,6 +2,10 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - 
{matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float16", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float8", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float4", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "bfloat16", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/all_gather_tpu7x_2x4x4" csv_path: "../microbenchmarks/all_gather_tpu7x_2x4x4" xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x4x4" diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml index 65189cc9..b51ebbe5 100644 --- a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml +++ b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml @@ -2,6 +2,10 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 10} + - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float16", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 10} + - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float8", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 10} + - 
{matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float4", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 10} + - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "bfloat16", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 10} trace_dir: "../microbenchmarks/all_gather_tpu7x_4x4x4" csv_path: "../microbenchmarks/all_gather_tpu7x_4x4x4" xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_4x4x4" diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml index 77c4da6f..f6f151a8 100644 --- a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml +++ b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml @@ -2,6 +2,10 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float16", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float8", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float4", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "bfloat16", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/all_gather_tpu7x_4x4x8" csv_path: "../microbenchmarks/all_gather_tpu7x_4x4x8" xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_4x4x8" diff 
--git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml index 6d2d506c..08139afb 100644 --- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml +++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml @@ -2,6 +2,10 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float16", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float8", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float4", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "bfloat16", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/psum_tpu7x_2x2x1" csv_path: "../microbenchmarks/psum_tpu7x_2x2x1" xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x2x1" diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml index d11981b0..e510a975 100644 --- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml +++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml @@ -2,6 +2,10 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, 
dtype: "float16", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float8", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float4", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "bfloat16", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/psum_tpu7x_2x2x2" csv_path: "../microbenchmarks/psum_tpu7x_2x2x2" xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x2x2" diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml index ab243b6f..5f53e826 100644 --- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml +++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml @@ -2,6 +2,10 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float16", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float8", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float4", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "bfloat16", mesh_shape: "4x4x2", 
ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/psum_tpu7x_2x2x4" csv_path: "../microbenchmarks/psum_tpu7x_2x2x4" xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x2x4" diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml index c731c622..5ea9c71b 100644 --- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml +++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml @@ -2,6 +2,10 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float16", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float8", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float4", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "bfloat16", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/psum_tpu7x_2x4x4" csv_path: "../microbenchmarks/psum_tpu7x_2x4x4" xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x4x4" diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml index 53d8dd3d..9ddbe192 100644 --- a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml +++ b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml @@ -2,6 +2,10 @@ benchmarks: - benchmark_name: psum 
benchmark_sweep_params: - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float16", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float8", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float4", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "bfloat16", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/psum_tpu7x_4x4x4" csv_path: "../microbenchmarks/psum_tpu7x_4x4x4" xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_4x4x4" diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml index f87878a4..997bb480 100644 --- a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml +++ b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml @@ -2,6 +2,10 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float16", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float8", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 
5} + - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float4", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "bfloat16", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/psum_tpu7x_4x4x8" csv_path: "../microbenchmarks/psum_tpu7x_4x4x8" xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_4x4x8" diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml index f9786b29..0e753645 100644 --- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml +++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml @@ -2,6 +2,10 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float16", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float8", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float4", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "bfloat16", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x1" csv_path: "../microbenchmarks/all_to_all_tpu7x_2x2x1" xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x1" diff --git 
a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml index b530a698..c6410449 100644 --- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml +++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml @@ -2,6 +2,10 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float16", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float8", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float4", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "bfloat16", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x2" csv_path: "../microbenchmarks/all_to_all_tpu7x_2x2x2" xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x2" diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml index 86e3dbbc..8dc83348 100644 --- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml +++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml @@ -2,6 +2,10 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 
16, end: 16384, multiplier: 2}, dtype: "float16", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float8", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float4", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "bfloat16", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x4" csv_path: "../microbenchmarks/all_to_all_tpu7x_2x2x4" xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x4" diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml index 6d4b79fb..ccd5ab14 100644 --- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml +++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml @@ -2,6 +2,10 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float16", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float8", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float4", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 32, end: 16384, 
multiplier: 2}, dtype: "bfloat16", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x4x4" csv_path: "../microbenchmarks/all_to_all_tpu7x_2x4x4" xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x4x4" diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml index 3460ddb6..54a82e5d 100644 --- a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml +++ b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml @@ -2,6 +2,10 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float16", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float8", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float4", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "bfloat16", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x4" csv_path: "../microbenchmarks/all_to_all_tpu7x_4x4x4" xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x4" diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml index 93ef7cb7..b7120534 100644 --- a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml +++ 
b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml @@ -2,6 +2,10 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float16", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float8", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float4", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "bfloat16", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x8" csv_path: "../microbenchmarks/all_to_all_tpu7x_4x4x8" xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x8" diff --git a/Ironwood/configs/hbm/hbm.yaml b/Ironwood/configs/hbm/hbm.yaml index 1912cb17..0e42b2f0 100644 --- a/Ironwood/configs/hbm/hbm.yaml +++ b/Ironwood/configs/hbm/hbm.yaml @@ -26,4 +26,11 @@ benchmarks: trace_dir: "../microbenchmarks/hbm_float16" csv_path: "../microbenchmarks/hbm_float16" xlml_metrics_dir: "../microbenchmarks/hbm_float16" - xla_dump_dir: "../microbenchmarks/hbm_float16/hlo_graphs" \ No newline at end of file + xla_dump_dir: "../microbenchmarks/hbm_float16/hlo_graphs" +- benchmark_name: "single_device_hbm_copy" + benchmark_sweep_params: + - {num_elements_range: {start: 1048576, end: 4294967296, multiplier: 2}, dtype: "float4", num_runs: 1} + trace_dir: "../microbenchmarks/hbm_float4" + csv_path: "../microbenchmarks/hbm_float4" + 
xlml_metrics_dir: "../microbenchmarks/hbm_float4" + xla_dump_dir: "../microbenchmarks/hbm_float4/hlo_graphs" \ No newline at end of file From 94ddadabb9ea054f2b3f49003a60b4e19b476c5e Mon Sep 17 00:00:00 2001 From: Junjie Qian Date: Sun, 1 Feb 2026 18:37:09 -0800 Subject: [PATCH 30/88] add GCS service account name to job yamls (#95) --- Ironwood/guides/automation/automation_launch.sh | 6 ++++-- Ironwood/guides/automation/tpu7x-2x2x1-collectives.yaml | 1 + Ironwood/guides/automation/tpu7x-2x2x1-gemm.yaml | 1 + Ironwood/guides/automation/tpu7x-2x2x1-hbm.yaml | 1 + Ironwood/guides/automation/tpu7x-2x2x1-host_device.yaml | 1 + Ironwood/guides/automation/tpu7x-2x2x2-collectives.yaml | 1 + Ironwood/guides/automation/tpu7x-2x2x4-collectives.yaml | 1 + Ironwood/guides/automation/tpu7x-2x4x4-collectives.yaml | 1 + Ironwood/guides/automation/tpu7x-4x4x4-collectives.yaml | 1 + Ironwood/guides/automation/tpu7x-4x4x8-collectives.yaml | 1 + 10 files changed, 13 insertions(+), 2 deletions(-) diff --git a/Ironwood/guides/automation/automation_launch.sh b/Ironwood/guides/automation/automation_launch.sh index 9b65bbf7..56cee946 100644 --- a/Ironwood/guides/automation/automation_launch.sh +++ b/Ironwood/guides/automation/automation_launch.sh @@ -5,6 +5,7 @@ ###################################################################### TIMESTAMP=$(date +%Y-%m-%d_%H-%M-%S) export GCS_BUCKET_ROOT_DIR="" +export GCS_SA_NAME="gcs-writer" # Service account with write access to GCS_BUCKET_ROOT_DIR MAX_RETRIES=3 TIMEOUT_SECOND=3600 @@ -93,11 +94,12 @@ apply_and_wait() { local filepath="${SCRIPT_DIR}/${yaml_file}" # Derive job name: remove .yaml, lowercase, replace _ with - local job_name=$(basename "${yaml_file}" .yaml | tr '[:upper:]' '[:lower:]' | tr '_' '-') - export JOB_NAME="${job_name}" + random_suffix=$(head /dev/urandom | tr -dc a-z0-9 | head -c 5) + export JOB_NAME="${job_name}-${random_suffix}" export GCS_PATH="${GCS_BUCKET_ROOT_DIR}/${job_name}" echo "Launching job: ${filepath} (name: 
${JOB_NAME})" - envsubst '${JOB_NAME} ${GCS_PATH}' < "${filepath}" | kubectl apply -f - + envsubst '${JOB_NAME} ${GCS_PATH} ${GCS_SA_NAME}' < "${filepath}" | kubectl apply -f - job_names_in_batch+=("${JOB_NAME}") done diff --git a/Ironwood/guides/automation/tpu7x-2x2x1-collectives.yaml b/Ironwood/guides/automation/tpu7x-2x2x1-collectives.yaml index fc878ea0..a99704d0 100644 --- a/Ironwood/guides/automation/tpu7x-2x2x1-collectives.yaml +++ b/Ironwood/guides/automation/tpu7x-2x2x1-collectives.yaml @@ -22,6 +22,7 @@ spec: template: spec: subdomain: headless-svc-${JOB_NAME} + serviceAccountName: ${GCS_SA_NAME} restartPolicy: Never nodeSelector: cloud.google.com/gke-tpu-accelerator: tpu7x diff --git a/Ironwood/guides/automation/tpu7x-2x2x1-gemm.yaml b/Ironwood/guides/automation/tpu7x-2x2x1-gemm.yaml index 8181941b..1a119d2e 100644 --- a/Ironwood/guides/automation/tpu7x-2x2x1-gemm.yaml +++ b/Ironwood/guides/automation/tpu7x-2x2x1-gemm.yaml @@ -22,6 +22,7 @@ spec: template: spec: subdomain: headless-svc-${JOB_NAME} + serviceAccountName: ${GCS_SA_NAME} restartPolicy: Never nodeSelector: cloud.google.com/gke-tpu-accelerator: tpu7x diff --git a/Ironwood/guides/automation/tpu7x-2x2x1-hbm.yaml b/Ironwood/guides/automation/tpu7x-2x2x1-hbm.yaml index e84e13ea..85efe69a 100644 --- a/Ironwood/guides/automation/tpu7x-2x2x1-hbm.yaml +++ b/Ironwood/guides/automation/tpu7x-2x2x1-hbm.yaml @@ -22,6 +22,7 @@ spec: template: spec: subdomain: headless-svc-${JOB_NAME} + serviceAccountName: ${GCS_SA_NAME} restartPolicy: Never nodeSelector: cloud.google.com/gke-tpu-accelerator: tpu7x diff --git a/Ironwood/guides/automation/tpu7x-2x2x1-host_device.yaml b/Ironwood/guides/automation/tpu7x-2x2x1-host_device.yaml index 3f662be5..a6b8febd 100644 --- a/Ironwood/guides/automation/tpu7x-2x2x1-host_device.yaml +++ b/Ironwood/guides/automation/tpu7x-2x2x1-host_device.yaml @@ -22,6 +22,7 @@ spec: template: spec: subdomain: headless-svc-${JOB_NAME} + serviceAccountName: ${GCS_SA_NAME} restartPolicy: Never 
nodeSelector: cloud.google.com/gke-tpu-accelerator: tpu7x diff --git a/Ironwood/guides/automation/tpu7x-2x2x2-collectives.yaml b/Ironwood/guides/automation/tpu7x-2x2x2-collectives.yaml index c2efba03..937d572c 100644 --- a/Ironwood/guides/automation/tpu7x-2x2x2-collectives.yaml +++ b/Ironwood/guides/automation/tpu7x-2x2x2-collectives.yaml @@ -22,6 +22,7 @@ spec: template: spec: subdomain: headless-svc-${JOB_NAME} + serviceAccountName: ${GCS_SA_NAME} restartPolicy: Never nodeSelector: cloud.google.com/gke-tpu-accelerator: tpu7x diff --git a/Ironwood/guides/automation/tpu7x-2x2x4-collectives.yaml b/Ironwood/guides/automation/tpu7x-2x2x4-collectives.yaml index a42b04ae..6502170a 100644 --- a/Ironwood/guides/automation/tpu7x-2x2x4-collectives.yaml +++ b/Ironwood/guides/automation/tpu7x-2x2x4-collectives.yaml @@ -22,6 +22,7 @@ spec: template: spec: subdomain: headless-svc-${JOB_NAME} + serviceAccountName: ${GCS_SA_NAME} restartPolicy: Never nodeSelector: cloud.google.com/gke-tpu-accelerator: tpu7x diff --git a/Ironwood/guides/automation/tpu7x-2x4x4-collectives.yaml b/Ironwood/guides/automation/tpu7x-2x4x4-collectives.yaml index de6f7106..2de39037 100644 --- a/Ironwood/guides/automation/tpu7x-2x4x4-collectives.yaml +++ b/Ironwood/guides/automation/tpu7x-2x4x4-collectives.yaml @@ -22,6 +22,7 @@ spec: template: spec: subdomain: headless-svc-${JOB_NAME} + serviceAccountName: ${GCS_SA_NAME} restartPolicy: Never nodeSelector: cloud.google.com/gke-tpu-accelerator: tpu7x diff --git a/Ironwood/guides/automation/tpu7x-4x4x4-collectives.yaml b/Ironwood/guides/automation/tpu7x-4x4x4-collectives.yaml index 828d1352..7afdac9d 100644 --- a/Ironwood/guides/automation/tpu7x-4x4x4-collectives.yaml +++ b/Ironwood/guides/automation/tpu7x-4x4x4-collectives.yaml @@ -22,6 +22,7 @@ spec: template: spec: subdomain: headless-svc-${JOB_NAME} + serviceAccountName: ${GCS_SA_NAME} restartPolicy: Never nodeSelector: cloud.google.com/gke-tpu-accelerator: tpu7x diff --git 
a/Ironwood/guides/automation/tpu7x-4x4x8-collectives.yaml b/Ironwood/guides/automation/tpu7x-4x4x8-collectives.yaml index f3bf721d..14735274 100644 --- a/Ironwood/guides/automation/tpu7x-4x4x8-collectives.yaml +++ b/Ironwood/guides/automation/tpu7x-4x4x8-collectives.yaml @@ -22,6 +22,7 @@ spec: template: spec: subdomain: headless-svc-${JOB_NAME} + serviceAccountName: ${GCS_SA_NAME} restartPolicy: Never nodeSelector: cloud.google.com/gke-tpu-accelerator: tpu7x From e7e10f9c5350d16966eaaecdab95dc19099ebb28 Mon Sep 17 00:00:00 2001 From: "Yu-Hsuan (Amy) Lin" Date: Mon, 2 Feb 2026 02:40:23 +0000 Subject: [PATCH 31/88] [Automation] GCS Permission check and fix --- Ironwood/guides/automation/README.md | 2 +- .../guides/automation/automation_launch.sh | 14 +++ .../automation/check_gcs_permissions.sh | 116 ++++++++++++++++++ Ironwood/guides/automation/gcs-write.yaml | 41 +++++++ 4 files changed, 172 insertions(+), 1 deletion(-) mode change 100644 => 100755 Ironwood/guides/automation/automation_launch.sh create mode 100755 Ironwood/guides/automation/check_gcs_permissions.sh create mode 100644 Ironwood/guides/automation/gcs-write.yaml diff --git a/Ironwood/guides/automation/README.md b/Ironwood/guides/automation/README.md index 3c463c67..81d63c93 100644 --- a/Ironwood/guides/automation/README.md +++ b/Ironwood/guides/automation/README.md @@ -52,7 +52,7 @@ You can configure the behavior using the following environment variable: | Variable | Description | Required | Default | | :--- | :--- | :--- | :--- | -| `GCS_BUCKET_ROOT_DIR` | The root GCS path where results will be stored. Must start with `gs://`. | **Yes** | `gs://amylin-microbenchmark` (Change this!) | +| `GCS_BUCKET_ROOT_DIR` | The root GCS path where results will be stored. Must start with `gs://`. | **Yes** | `gs://example-microbenchmark` (Change this!) 
| ## Usage Guide diff --git a/Ironwood/guides/automation/automation_launch.sh b/Ironwood/guides/automation/automation_launch.sh old mode 100644 new mode 100755 index 56cee946..745a3f20 --- a/Ironwood/guides/automation/automation_launch.sh +++ b/Ironwood/guides/automation/automation_launch.sh @@ -6,6 +6,7 @@ TIMESTAMP=$(date +%Y-%m-%d_%H-%M-%S) export GCS_BUCKET_ROOT_DIR="" export GCS_SA_NAME="gcs-writer" # Service account with write access to GCS_BUCKET_ROOT_DIR +export PROJECT_ID=$(gcloud config get-value project 2>/dev/null) MAX_RETRIES=3 TIMEOUT_SECOND=3600 @@ -45,6 +46,19 @@ for topology in "${required_topologies[@]}"; do envsubst '${TOPOLOGY} ${TPUS}' < ${SCRIPT_DIR}/job-queue.yaml | kubectl apply -f - done +###################################################################### +# GCS PERMISSION CHECK +###################################################################### + +# Run the GCS permission check +export SA_NAME="${GCS_SA_NAME}" +export PROJECT_ID="${PROJECT_ID}" +if ! bash "${SCRIPT_DIR}/check_gcs_permissions.sh"; then + echo "GCS Permission Check Failed. Exiting." + exit 1 +fi + + ###################################################################### # LAUNCH JOBS & WAIT FOR COMPLETION ###################################################################### diff --git a/Ironwood/guides/automation/check_gcs_permissions.sh b/Ironwood/guides/automation/check_gcs_permissions.sh new file mode 100755 index 00000000..0cca1968 --- /dev/null +++ b/Ironwood/guides/automation/check_gcs_permissions.sh @@ -0,0 +1,116 @@ +#!/usr/bin/env bash + +# This script checks if the configured Service Account has write permissions to the specified GCS bucket. +# If permissions are missing, it attempts to fix them by creating the SA and granting roles/storage.admin. 
+# +# Expected Environment Variables: +# GCS_BUCKET_ROOT_DIR: The GCS path (must start with gs://) +# SA_NAME: The Service Account name (default: gcs-writer) +# PROJECT_ID: The GCP Project ID (optional, will try to detect if not set) + +SCRIPT_DIR="$(dirname "$(realpath "$0")")" +SA_NAME="${SA_NAME:-gcs-writer}" +PROJECT_ID="${PROJECT_ID:-$(gcloud config get-value project 2>/dev/null)}" + +if [[ -z "${GCS_BUCKET_ROOT_DIR}" || "${GCS_BUCKET_ROOT_DIR}" != "gs://"* ]]; then + echo "Error: GCS_BUCKET_ROOT_DIR must be set and start with gs://" + exit 1 +fi + +fix_gcs_permissions() { + # See more context in https://docs.cloud.google.com/kubernetes-engine/docs/how-to/workload-identity#authenticating_to + echo "Attempting to fix GCS permissions..." + + if [[ -z "${PROJECT_ID}" ]]; then + echo "Error: PROJECT_ID is not set and could not be detected." + echo "Please export PROJECT_ID= and rerun." + exit 1 + fi + + local bucket_name=$(echo "${GCS_BUCKET_ROOT_DIR}" | sed 's|^gs://||' | cut -d/ -f1) + local ns_name="default" + + echo "Ensuring ServiceAccount ${SA_NAME} exists in namespace ${ns_name}..." + kubectl create serviceaccount "${SA_NAME}" --namespace "${ns_name}" --dry-run=client -o yaml | kubectl apply -f - + + local project_number=$(gcloud projects describe "${PROJECT_ID}" --format="value(projectNumber)") + + echo "Granting roles/storage.admin to ${SA_NAME} on gs://${bucket_name}..." + gcloud storage buckets add-iam-policy-binding "gs://${bucket_name}" \ + --role=roles/storage.admin \ + --member="principal://iam.googleapis.com/projects/${project_number}/locations/global/workloadIdentityPools/${PROJECT_ID}.svc.id.goog/subject/ns/${ns_name}/sa/${SA_NAME}" + + echo "Permission fix command executed." +} + +check_gcs_permission() { + echo "Checking GCS write permissions..." + export GCS_CHECK_PATH="${GCS_BUCKET_ROOT_DIR}/permission-check-$(date +%s).txt" + export SA_NAME="${SA_NAME}" + + # Check if ServiceAccount exists first to fail fast + if ! 
kubectl get serviceaccount "${SA_NAME}" &> /dev/null; then + echo "ServiceAccount '${SA_NAME}' not found." + return 1 + fi + + # Launch check pod + # We capture the pod name from the output of kubectl create + local apply_output=$(envsubst '${SA_NAME} ${GCS_CHECK_PATH}' < "${SCRIPT_DIR}/gcs-write.yaml" | kubectl create -f -) + # output example: pod/gcs-writer-test-abcde created + local pod_name=$(echo "${apply_output}" | awk -F'/' '{print $2}' | awk '{print $1}') + + echo "Launched GCS check pod: ${pod_name}" + + # Wait for completion + local check_status="FAILED" + for i in {1..20}; do + sleep 5 + if kubectl get pod "${pod_name}" -o jsonpath='{.status.phase}' 2>/dev/null | grep -q "Succeeded"; then + check_status="SUCCESS" + break + fi + if kubectl get pod "${pod_name}" -o jsonpath='{.status.phase}' 2>/dev/null | grep -q "Failed"; then + check_status="FAILED" + break + fi + done + + # Check logs + if kubectl logs "${pod_name}" 2>/dev/null | grep -q "GCS test complete!"; then + echo "GCS permission check PASSED." + check_status="SUCCESS" + else + echo "GCS permission check FAILED." + check_status="FAILED" + echo "Logs from ${pod_name}:" + kubectl logs "${pod_name}" 2>/dev/null | tail -n 10 + fi + + # Cleanup + kubectl delete pod "${pod_name}" --grace-period=0 --force &> /dev/null + + if [[ "${check_status}" != "SUCCESS" ]]; then + return 1 + fi + return 0 +} + +# Main Logic +echo "======================================================================" +echo "Starting GCS Permission Check (SA: ${SA_NAME}, Bucket: ${GCS_BUCKET_ROOT_DIR})" +echo "======================================================================" + +if ! check_gcs_permission; then + echo "GCS check failed. Attempting to fix..." + fix_gcs_permissions + + echo "Retrying GCS check..." + if ! check_gcs_permission; then + echo "GCS permissions check failed even after attempted fix." 
+ echo "Please verify your Service Account '${SA_NAME}' has proper permissions on ${GCS_BUCKET_ROOT_DIR}" + exit 1 + fi +fi + +echo "GCS Check Verified Successfully." diff --git a/Ironwood/guides/automation/gcs-write.yaml b/Ironwood/guides/automation/gcs-write.yaml new file mode 100644 index 00000000..8d27586c --- /dev/null +++ b/Ironwood/guides/automation/gcs-write.yaml @@ -0,0 +1,41 @@ +apiVersion: v1 +kind: Pod +metadata: + generateName: gcs-writer-test- + namespace: default +spec: + serviceAccountName: ${SA_NAME} + containers: + - name: gcs-test-container + image: google/cloud-sdk:slim + command: + - bash + - -c + - | + set -ex + TIMESTAMP=$(date +%s) + LOCAL_FILE="/tmp/test-file-${TIMESTAMP}.txt" + + # GCS_CHECK_PATH is substituted by envsubst + echo "Using GCS Path: ${GCS_CHECK_PATH}" + + echo "Testing GCS write from pod at $(date)" > "${LOCAL_FILE}" + + echo "--- Configuration ---" + gcloud auth list + gcloud config list + # Try to get service account email, but don't fail if metadata server is slow/unreachable (though it should be reachable) + curl -s -H "Metadata-Flavor: Google" "http://metadata.google.internal/computeMetadata/v1/instance/service-accounts/default/email" || echo "Could not fetch SA email" + echo + + echo "--- Writing to GCS ---" + gsutil cp "${LOCAL_FILE}" "${GCS_CHECK_PATH}" + + echo "--- Verifying from GCS ---" + gsutil cat "${GCS_CHECK_PATH}" + + echo "--- Cleaning up GCS object ---" + gsutil rm "${GCS_CHECK_PATH}" + + echo "GCS test complete!" + restartPolicy: Never From 30db8d01cfe39506a0507eeba8aa6fef43233ae7 Mon Sep 17 00:00:00 2001 From: "Yu-Hsuan (Amy) Lin" Date: Mon, 2 Feb 2026 02:57:14 +0000 Subject: [PATCH 32/88] Inject service account spec to Aggregator ``` Caller does not have storage.objects.list access to the Google Cloud Storage bucket. Permission 'storage.objects.list' denied on resource (or it may not exist). 
``` --- Ironwood/guides/automation/aggregator.yaml | 2 ++ Ironwood/guides/automation/automation_launch.sh | 7 +++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/Ironwood/guides/automation/aggregator.yaml b/Ironwood/guides/automation/aggregator.yaml index 5a2c89a3..120e94b7 100644 --- a/Ironwood/guides/automation/aggregator.yaml +++ b/Ironwood/guides/automation/aggregator.yaml @@ -3,8 +3,10 @@ kind: Job metadata: name: aggregator spec: + backoffLimit: 0 template: spec: + serviceAccountName: ${GCS_SA_NAME} containers: - name: main-app image: python:3.12 diff --git a/Ironwood/guides/automation/automation_launch.sh b/Ironwood/guides/automation/automation_launch.sh index 745a3f20..21af18ee 100755 --- a/Ironwood/guides/automation/automation_launch.sh +++ b/Ironwood/guides/automation/automation_launch.sh @@ -209,9 +209,12 @@ echo "" echo "Jobs completed. Aggregating results..." echo "" -envsubst '${GCS_BUCKET_ROOT_DIR}' < ${SCRIPT_DIR}/aggregator.yaml | kubectl apply -f - +# Ensure cleanup of any previous aggregator job to avoid immutable field errors +kubectl delete job aggregator --ignore-not-found=true + +envsubst '${GCS_BUCKET_ROOT_DIR} ${GCS_SA_NAME}' < ${SCRIPT_DIR}/aggregator.yaml | kubectl apply -f - wait_for_job_completion "aggregator" ${TIMEOUT_SECOND} -envsubst '${GCS_BUCKET_ROOT_DIR}' < ${SCRIPT_DIR}/aggregator.yaml | kubectl delete -f - +envsubst '${GCS_BUCKET_ROOT_DIR} ${GCS_SA_NAME}' < ${SCRIPT_DIR}/aggregator.yaml | kubectl delete -f - # Print the failed jobs at the end for better visibility. From 5dd6f85d158a624ae4169868e50c778b633a57c1 Mon Sep 17 00:00:00 2001 From: Hong-Yi Lin Date: Wed, 4 Feb 2026 16:19:53 +0800 Subject: [PATCH 33/88] Add bmm microbenchmark. (#97) * [BMM] Add bmm microbenchmark * Update hook in benchmark entry. 
* Update BMM config * Update timeit logic --- Ironwood/configs/bmm/single_device_bmm.yaml | 75 +++++++++++ Ironwood/src/benchmark_bmm.py | 134 ++++++++++++++++++++ Ironwood/src/run_benchmark.py | 5 + 3 files changed, 214 insertions(+) create mode 100644 Ironwood/configs/bmm/single_device_bmm.yaml create mode 100644 Ironwood/src/benchmark_bmm.py diff --git a/Ironwood/configs/bmm/single_device_bmm.yaml b/Ironwood/configs/bmm/single_device_bmm.yaml new file mode 100644 index 00000000..f0a4156c --- /dev/null +++ b/Ironwood/configs/bmm/single_device_bmm.yaml @@ -0,0 +1,75 @@ +benchmarks: +- benchmark_name: "single_device_bmm" + trace_dir: "../microbenchmarks/single_device_bmm_bf16" + csv_path: "../microbenchmarks/single_device_bmm_bf16" + xlml_metrics_dir: "../microbenchmarks/single_device_bmm_bf16" + xla_dump_dir: "../microbenchmarks/single_device_bmm_bf16/hlo_graphs" + benchmark_sweep_params: + - {b: 1, m: 128, k: 128, n: 128, num_runs: 100, dtype: 'bfloat16'} + - {b: 1, m: 256, k: 256, n: 256, num_runs: 100, dtype: 'bfloat16'} + - {b: 1, m: 512, k: 512, n: 512, num_runs: 100, dtype: 'bfloat16'} + - {b: 1, m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'bfloat16'} + - {b: 1, m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'bfloat16'} + - {b: 1, m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'bfloat16'} + - {b: 1, m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'bfloat16'} + - {b: 1, m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'bfloat16'} + +- benchmark_name: "single_device_bmm" + trace_dir: "../microbenchmarks/single_device_bmm_f32" + csv_path: "../microbenchmarks/single_device_bmm_f32" + xlml_metrics_dir: "../microbenchmarks/single_device_bmm_f32" + xla_dump_dir: "../microbenchmarks/single_device_bmm_f32/hlo_graphs" + benchmark_sweep_params: + - {b: 1, m: 128, k: 128, n: 128, num_runs: 100, dtype: 'float32'} + - {b: 1, m: 256, k: 256, n: 256, num_runs: 100, dtype: 'float32'} + - {b: 1, m: 512, k: 512, n: 512, num_runs: 100, dtype: 'float32'} + - {b: 
1, m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'float32'} + - {b: 1, m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'float32'} + - {b: 1, m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'float32'} + - {b: 1, m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float32'} + - {b: 1, m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float32'} + +- benchmark_name: "single_device_bmm" + trace_dir: "../microbenchmarks/single_device_bmm_fp16" + csv_path: "../microbenchmarks/single_device_bmm_fp16" + xlml_metrics_dir: "../microbenchmarks/single_device_bmm_fp16" + xla_dump_dir: "../microbenchmarks/single_device_bmm_fp16/hlo_graphs" + benchmark_sweep_params: + - {b: 1, m: 128, k: 128, n: 128, num_runs: 100, dtype: 'float16'} + - {b: 1, m: 256, k: 256, n: 256, num_runs: 100, dtype: 'float16'} + - {b: 1, m: 512, k: 512, n: 512, num_runs: 100, dtype: 'float16'} + - {b: 1, m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'float16'} + - {b: 1, m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'float16'} + - {b: 1, m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'float16'} + - {b: 1, m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float16'} + - {b: 1, m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float16'} + +- benchmark_name: "single_device_bmm" + trace_dir: "../microbenchmarks/single_device_bmm_fp8" + csv_path: "../microbenchmarks/single_device_bmm_fp8" + xlml_metrics_dir: "../microbenchmarks/single_device_bmm_fp8" + xla_dump_dir: "../microbenchmarks/single_device_bmm_fp8/hlo_graphs" + benchmark_sweep_params: + - {b: 1, m: 128, k: 128, n: 128, num_runs: 100, dtype: 'float8'} + - {b: 1, m: 256, k: 256, n: 256, num_runs: 100, dtype: 'float8'} + - {b: 1, m: 512, k: 512, n: 512, num_runs: 100, dtype: 'float8'} + - {b: 1, m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'float8'} + - {b: 1, m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'float8'} + - {b: 1, m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'float8'} + - {b: 1, m: 16384, k: 16384, n: 16384, 
num_runs: 100, dtype: 'float8'} + - {b: 1, m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float8'} + +- benchmark_name: "single_device_bmm" + trace_dir: "../microbenchmarks/single_device_bmm_fp4" + csv_path: "../microbenchmarks/single_device_bmm_fp4" + xlml_metrics_dir: "../microbenchmarks/single_device_bmm_fp4" + xla_dump_dir: "../microbenchmarks/single_device_bmm_fp4/hlo_graphs" + benchmark_sweep_params: + - {b: 1, m: 128, k: 128, n: 128, num_runs: 100, dtype: 'float4'} + - {b: 1, m: 256, k: 256, n: 256, num_runs: 100, dtype: 'float4'} + - {b: 1, m: 512, k: 512, n: 512, num_runs: 100, dtype: 'float4'} + - {b: 1, m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'float4'} + - {b: 1, m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'float4'} + - {b: 1, m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'float4'} + - {b: 1, m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float4'} + - {b: 1, m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float4'} \ No newline at end of file diff --git a/Ironwood/src/benchmark_bmm.py b/Ironwood/src/benchmark_bmm.py new file mode 100644 index 00000000..40046821 --- /dev/null +++ b/Ironwood/src/benchmark_bmm.py @@ -0,0 +1,134 @@ +""" +Benchmarks bmm in various flavors. +Considered ops: +1. 
bmm +""" + +import os +from typing import Any, Dict + +# pylint: disable=g-importing-member +from benchmark_utils import ( + iteration_timeit, + multiple_iteration_timeit_from_trace, + ShardingStrategy, + get_lhs_named_shading, + get_rhs_named_shading, + get_output_named_shading, + get_out_sharding, + create_mesh, + handle_based_on_sharding, + unified_flops_metrics, + str_to_dtype, + get_peak_flops_multiplier +) +from common import MARKER +import jax +from jax.experimental.shard_map import shard_map +import jax.numpy as jnp +from jax.sharding import NamedSharding +from jax.sharding import PartitionSpec as P + + +# pylint: disable=g-importing-member + +os.environ["LIBTPU_INIT_ARGS"] = ( + "--xla_tpu_enable_async_collective_fusion=true " + "--xla_tpu_enable_async_collective_fusion_fuse_all_gather=true " + "--xla_tpu_enable_async_collective_fusion_multiple_steps=true " + "--xla_tpu_overlap_compute_collective_tc=true " + "--xla_enable_async_all_gather=true " + "--xla_enable_async_collective_permute=true " + "--xla_tpu_enable_all_experimental_scheduler_features=true " + "--xla_tpu_accumulate_into_mrb=true " + "--xla_tpu_scoped_vmem_limit_kib=65536 " + "--xla_tpu_vmem_scavenging_mode=NONE " + "--xla_tpu_dvfs_p_state=7" +) + +TRACE_BASE_DIR = None +METRICS_JSONL_DIR = None +SHARDING_STRATEGY = ShardingStrategy.NO_SHARDING +SEED = 0 +PEAK_FLOPS_PER_DEVICE = 2307 # TFLOP/s for single core(device) of FP8 + +def single_device_bmm( + b: int, + m: int, + k: int, + n: int, + dtype: jnp.dtype = jax.numpy.float8_e4m3fn, + num_runs: int = 1, + trace_dir: str = None, +) -> Dict[str, Any]: + """Benchmarks the OUT:BF16 = IN0:FP8 x IN1:FP8. 
Accumulation is FP32.""" + + def f(x, y): + with jax.named_scope(MARKER): + acc = jax.numpy.einsum( + "bij,bjk->bik", x, y, preferred_element_type=jnp.float32 + ) + return acc.astype(jnp.bfloat16) + + jit_sharded_f = jax.jit(f) + + lhs_shape = (b, m, k) + rhs_shape = (b, k, n) + + lhs_dtype = dtype + rhs_dtype = dtype + + key = jax.random.key(SEED) + + def data_generator(): + """Creates new random data on host and puts it on device.""" + nonlocal key # Use and update the outer 'key' + key, key_lhs, key_rhs = jax.random.split(key, 3) + + # Create random data on host + lhs_host = jax.random.normal(key_lhs, lhs_shape).astype(lhs_dtype) + rhs_host = jax.random.normal(key_rhs, rhs_shape).astype(rhs_dtype) + + # Put on device (HBM) + + return (lhs_host, rhs_host) + + # Run the benchmark + + # num_runs = 1 + + dtype_str = dtype.dtype.name + time_ms_list = multiple_iteration_timeit_from_trace( + jit_sharded_f, + data_generator, + matrix_dim=f"{dtype_str}_{b}x{m}x{n}x{k}", + tries=num_runs, + task="single_device_bmm", + trace_dir=trace_dir, + ) + + return {"time_ms_list": time_ms_list} + + +def single_device_bmm_calculate_metrics( + b: int, + m: int, + k: int, + n: int, + dtype: jnp.dtype, + time_ms_list: list[float], +) -> Dict[str, Any]: + # Calculate FLOPs + total_flops = 2 * b * m * k * n # Total floating-point operations + total_flops, total_flops_all_devices = handle_based_on_sharding( + total_flops, SHARDING_STRATEGY + ) + return unified_flops_metrics( + m, + n, + k, + time_ms_list, + total_flops, + total_flops_all_devices, + PEAK_FLOPS_PER_DEVICE, + ) diff --git a/Ironwood/src/run_benchmark.py b/Ironwood/src/run_benchmark.py index 5b3c3b7f..efef1dcc 100644 --- a/Ironwood/src/run_benchmark.py +++ b/Ironwood/src/run_benchmark.py @@ -32,6 +32,10 @@ "send_recv": "benchmark_send_recv.send_recv_benchmark", } +BMM_BENCHMARK_MAP = { + "single_device_bmm": "benchmark_bmm.single_device_bmm", +} + MATMUL_BENCHMARK_MAP = { "naive_matmul": "benchmark_matmul.naive_matmul", 
"single_host_naive_matmul": "benchmark_matmul.single_host_naive_matmul", @@ -99,6 +103,7 @@ "host_device": "benchmark_host_device.benchmark_host_device", } BENCHMARK_MAP = {} +BENCHMARK_MAP.update(BMM_BENCHMARK_MAP) BENCHMARK_MAP.update(COLLECTIVE_BENCHMARK_MAP) BENCHMARK_MAP.update(MATMUL_BENCHMARK_MAP) BENCHMARK_MAP.update(CONVOLUTION_BENCHMARK_MAP) From ef5ad1c41585ac570fa7ab199bdb840e6aa9c80c Mon Sep 17 00:00:00 2001 From: Leonard Chan Date: Wed, 4 Feb 2026 09:46:14 +0000 Subject: [PATCH 34/88] Add --numactl_bind flag to H2D benchmark script --- Ironwood/scripts/run_host_device_benchmark.sh | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/Ironwood/scripts/run_host_device_benchmark.sh b/Ironwood/scripts/run_host_device_benchmark.sh index 1275160b..607244c2 100755 --- a/Ironwood/scripts/run_host_device_benchmark.sh +++ b/Ironwood/scripts/run_host_device_benchmark.sh @@ -4,12 +4,14 @@ CONFIG_DIR="Ironwood/configs/host_device" SPECIFIC_CONFIG="" INTERLEAVED=false +NUMACTL_BIND=false # Helper function for usage usage() { echo "Usage: $0 [OPTIONS]" echo "Options:" echo " --config Path to specific config file (optional)" + echo " --numactl_bind Run with numactl --cpunodebind=0 --membind=0" echo " --interleaved Run with numactl --interleave=all" echo " --help Show this help message" exit 1 @@ -20,6 +22,7 @@ while [[ "$#" -gt 0 ]]; do case $1 in --config) SPECIFIC_CONFIG="$2"; shift ;; --interleaved) INTERLEAVED=true ;; + --numactl_bind) NUMACTL_BIND=true ;; --help) usage ;; *) echo "Unknown parameter passed: $1"; usage ;; esac @@ -33,6 +36,7 @@ echo "********************************************************" echo "" echo "Configuration:" echo " Interleaved: $INTERLEAVED" +echo " numactl Bound: $NUMACTL_BIND" echo "" if [ -n "$SPECIFIC_CONFIG" ]; then @@ -44,6 +48,11 @@ else shopt -u nullglob fi +if [[ "$INTERLEAVED" = true && "$NUMACTL_BIND" = true ]]; then + echo "Only one of --interleaved and --numactl_bind is allowed to be set at once." 
+ exit 1 +fi + if [ ${#CONFIGS[@]} -eq 0 ]; then echo "No configuration files found!" exit 1 @@ -61,6 +70,14 @@ for CONFIG_FILE in "${CONFIGS[@]}"; do echo "Warning: numactl not found. Running without interleaving." $CMD fi + elif [ "$NUMACTL_BIND" = true ]; then + if command -v numactl &> /dev/null; then + echo "Running with numactl --cpunodebind=0 --membind=0" + numactl --cpunodebind=0 --membind=0 $CMD + else + echo "Warning: numactl not found. Running without binding." + $CMD + fi else $CMD fi From be12de015f3ba0014f0c5d7e9d928d06a655fcc4 Mon Sep 17 00:00:00 2001 From: Leonard Chan Date: Wed, 4 Feb 2026 09:46:14 +0000 Subject: [PATCH 35/88] Add --numactl_bind flag to H2D benchmark script --- Ironwood/scripts/run_host_device_benchmark.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/Ironwood/scripts/run_host_device_benchmark.sh b/Ironwood/scripts/run_host_device_benchmark.sh index 607244c2..77abb52d 100755 --- a/Ironwood/scripts/run_host_device_benchmark.sh +++ b/Ironwood/scripts/run_host_device_benchmark.sh @@ -13,6 +13,7 @@ usage() { echo " --config Path to specific config file (optional)" echo " --numactl_bind Run with numactl --cpunodebind=0 --membind=0" echo " --interleaved Run with numactl --interleave=all" + echo " --numactl_bind Run with numactl --cpunodebind=0 --membind=0" echo " --help Show this help message" exit 1 } From 21c694061a8e48ca5d3e1baa75ab91d8e3ca4572 Mon Sep 17 00:00:00 2001 From: Leonard Chan Date: Wed, 4 Feb 2026 09:46:14 +0000 Subject: [PATCH 36/88] Add --numactl_bind flag to H2D benchmark script --- Ironwood/scripts/run_host_device_benchmark.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/Ironwood/scripts/run_host_device_benchmark.sh b/Ironwood/scripts/run_host_device_benchmark.sh index 77abb52d..13345ec6 100755 --- a/Ironwood/scripts/run_host_device_benchmark.sh +++ b/Ironwood/scripts/run_host_device_benchmark.sh @@ -11,7 +11,6 @@ usage() { echo "Usage: $0 [OPTIONS]" echo "Options:" echo " --config Path to specific config file 
(optional)" - echo " --numactl_bind Run with numactl --cpunodebind=0 --membind=0" echo " --interleaved Run with numactl --interleave=all" echo " --numactl_bind Run with numactl --cpunodebind=0 --membind=0" echo " --help Show this help message" From 7b090f1d96761ddd0e6d32efa3a49de25bed2649 Mon Sep 17 00:00:00 2001 From: Hong-Yi Lin Date: Wed, 4 Feb 2026 17:12:47 +0000 Subject: [PATCH 37/88] [Automation] Add BMM into automation script --- Ironwood/guides/automation/aggregator.py | 21 ++++++- .../guides/automation/automation_launch.sh | 1 + .../guides/automation/tpu7x-2x2x1-bmm.yaml | 61 +++++++++++++++++++ Ironwood/src/benchmark_bmm.py | 2 + Ironwood/src/benchmark_utils.py | 1 + 5 files changed, 85 insertions(+), 1 deletion(-) create mode 100644 Ironwood/guides/automation/tpu7x-2x2x1-bmm.yaml diff --git a/Ironwood/guides/automation/aggregator.py b/Ironwood/guides/automation/aggregator.py index bc5e5e24..5382ce30 100644 --- a/Ironwood/guides/automation/aggregator.py +++ b/Ironwood/guides/automation/aggregator.py @@ -29,6 +29,13 @@ "tflops_per_sec_per_device_avg", "tflops_per_sec_per_device_min", "tflops_per_sec_per_device_max", ], + "bmm": [ + "b", "m", "n", "k", "dtype", "step_time_ms_num_runs", + "tflops_per_sec_per_device_p50", "tflops_per_sec_per_device_p90", + "tflops_per_sec_per_device_p95", "tflops_per_sec_per_device_p99", + "tflops_per_sec_per_device_avg", "tflops_per_sec_per_device_min", + "tflops_per_sec_per_device_max", + ], } def download_from_gcs(bucket_path: str, local_dir: str): @@ -86,15 +93,27 @@ def aggregate_gemm(directories: list[str], picked_columns: list[str]) -> pd.Data aggregated_df = pd.concat([aggregated_df, df[picked_columns].rename(columns={"step_time_ms_num_runs": "num_runs"})], ignore_index=True) return aggregated_df +def aggregate_bmm(directories: list[str], picked_columns: list[str]) -> pd.DataFrame: + if len(directories) == 0: + return None + aggregated_df = pd.DataFrame() + for directory in directories: + files = 
glob.glob(f"{directory}/*.tsv") + for file in files: + df = pd.read_csv(file, sep='\t') + aggregated_df = pd.concat([aggregated_df, df[picked_columns].rename(columns={"step_time_ms_num_runs": "num_runs"})], ignore_index=True) + return aggregated_df + aggregate_function = { "collectives": aggregate_collectives, "hbm": aggregate_hbm, "host_device": aggregate_host_device, "gemm": aggregate_gemm, + "bmm": aggregate_bmm, } def aggregate_results(bucket_path: str, local_dir: str): - categories = ["collectives", "hbm", "host_device", "gemm"] + categories = ["collectives", "hbm", "host_device", "gemm", "bmm"] directories = {} results = {} for category in categories: diff --git a/Ironwood/guides/automation/automation_launch.sh b/Ironwood/guides/automation/automation_launch.sh index 21af18ee..0ca76210 100755 --- a/Ironwood/guides/automation/automation_launch.sh +++ b/Ironwood/guides/automation/automation_launch.sh @@ -15,6 +15,7 @@ yaml_names=( "tpu7x-2x2x1-hbm.yaml" "tpu7x-2x2x1-host_device.yaml" "tpu7x-2x2x1-gemm.yaml" + "tpu7x-2x2x1-bmm.yaml" "tpu7x-2x2x1-collectives.yaml" "tpu7x-2x2x2-collectives.yaml" "tpu7x-2x2x4-collectives.yaml" diff --git a/Ironwood/guides/automation/tpu7x-2x2x1-bmm.yaml b/Ironwood/guides/automation/tpu7x-2x2x1-bmm.yaml new file mode 100644 index 00000000..1b5b9774 --- /dev/null +++ b/Ironwood/guides/automation/tpu7x-2x2x1-bmm.yaml @@ -0,0 +1,61 @@ +apiVersion: v1 +kind: Service +metadata: + name: headless-svc-${JOB_NAME} +spec: + clusterIP: None + selector: + job-name: ${JOB_NAME} +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: ${JOB_NAME} + labels: + kueue.x-k8s.io/queue-name: user-queue-2x2x1 +spec: + completionMode: Indexed + suspend: true + parallelism: 1 + completions: 1 + backoffLimit: 0 + template: + spec: + subdomain: headless-svc-${JOB_NAME} + serviceAccountName: ${GCS_SA_NAME} + restartPolicy: Never + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu7x + cloud.google.com/gke-tpu-topology: 2x2x1 + containers: + - name: 
jax-tpu + image: python:3.12 + securityContext: + privileged: false + env: + - name: JAX_PLATFORMS + value: "tpu,cpu" + - name: TPU_VMODULE + value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" + - name: XLA_IR_DEBUG + value: "1" + - name: XLA_HLO_DEBUG + value: "1" + command: + - bash + - -c + - | + set -ex + + git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git + cd accelerator-microbenchmarks + git checkout tpu7x-auto + pip install -r requirements.txt + + GCS_BUCKET_DIR=${GCS_PATH} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/bmm/single_device_bmm.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + resources: + requests: + google.com/tpu: 4 + limits: + google.com/tpu: 4 \ No newline at end of file diff --git a/Ironwood/src/benchmark_bmm.py b/Ironwood/src/benchmark_bmm.py index 40046821..264daf2b 100644 --- a/Ironwood/src/benchmark_bmm.py +++ b/Ironwood/src/benchmark_bmm.py @@ -131,4 +131,6 @@ def single_device_bmm_calculate_metrics( total_flops, total_flops_all_devices, PEAK_FLOPS_PER_DEVICE, + dtype=dtype.dtype.name, + b=b, ) diff --git a/Ironwood/src/benchmark_utils.py b/Ironwood/src/benchmark_utils.py index fa1fb81c..60fc2d14 100644 --- a/Ironwood/src/benchmark_utils.py +++ b/Ironwood/src/benchmark_utils.py @@ -1111,6 +1111,7 @@ def unified_flops_metrics( total_flops_all_devices: int, peak_TFLOPS_per_device: float, dtype: str = None, + b: int = None, ) -> Dict[str, Any]: """Calculates the metrics for the naive matmul benchmark.""" # Build dictionary of all the parameters in the function From a86475d6a50bb644617d6ee9e63427072526fc61 Mon Sep 17 00:00:00 2001 From: Leonard Chan Date: Thu, 5 Feb 2026 00:15:58 +0000 Subject: [PATCH 38/88] Add baseline pipelined flow to H2D benchmark --- Ironwood/configs/host_device/host_device.yaml | 3 +- Ironwood/src/benchmark_host_device.py | 126 ++++++++++++++---- 2 files changed, 103 insertions(+), 26 deletions(-) diff 
--git a/Ironwood/configs/host_device/host_device.yaml b/Ironwood/configs/host_device/host_device.yaml index 0b48800c..ff97df1b 100644 --- a/Ironwood/configs/host_device/host_device.yaml +++ b/Ironwood/configs/host_device/host_device.yaml @@ -3,7 +3,8 @@ benchmarks: num_runs: 20 benchmark_sweep_params: - { - data_size_mib_list: [1, 16, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768] + h2d_type: ["simple", "pipelined"], + data_size_mib_list: [1, 16, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768], } csv_path: "../microbenchmarks/host_device" trace_dir: "../microbenchmarks/host_device/trace" diff --git a/Ironwood/src/benchmark_host_device.py b/Ironwood/src/benchmark_host_device.py index 16352e2a..1d72b5eb 100644 --- a/Ironwood/src/benchmark_host_device.py +++ b/Ironwood/src/benchmark_host_device.py @@ -5,7 +5,7 @@ from typing import Any, Dict, Tuple, List import jax -from jax import sharding +from jax import numpy as jnp import numpy as np from benchmark_utils import MetricsStatistics @@ -23,8 +23,9 @@ def benchmark_host_device( data_size_mib: int, num_runs: int = 100, trace_dir: str = None, + h2d_type: str = "simple", ) -> Dict[str, Any]: - """Benchmarks H2D/D2H transfer using simple device_put/device_get.""" + """Benchmarks H2D/D2H transfer using device_put/device_get.""" num_elements = 1024 * 1024 * data_size_mib // np.dtype(np.float32).itemsize @@ -32,8 +33,13 @@ def benchmark_host_device( column = 128 host_data = np.random.normal(size=(num_elements // column, column)).astype(np.float32) + # Used in pipelined flow + # TODO: turn into a param + num_devices_to_perform_h2d = 1 + target_devices = jax.devices()[:num_devices_to_perform_h2d] + print( - f"Benchmarking Transfer with Data Size: {data_size_mib} MB for {num_runs} iterations", + f"Benchmarking Transfer with Data Size: {data_size_mib} MB for {num_runs} iterations with {h2d_type=}", flush=True ) @@ -65,29 +71,98 @@ def benchmark_host_device( with step_context: # H2D - t0 = time.perf_counter() - - # 
Simple device_put - device_array = jax.device_put(host_data) - device_array.block_until_ready() - - t1 = time.perf_counter() - h2d_perf.append((t1 - t0) * 1000) - - # Verify H2D shape - assert device_array.shape == host_data.shape - - # D2H - t2 = time.perf_counter() - - # Simple device_get - # Note: device_get returns a numpy array (copy) - _ = jax.device_get(device_array) - - t3 = time.perf_counter() - d2h_perf.append((t3 - t2) * 1000) + if h2d_type == "simple": + t0 = time.perf_counter() + # Simple device_put + device_array = jax.device_put(host_data) + device_array.block_until_ready() + t1 = time.perf_counter() + + # Verify H2D shape + assert device_array.shape == host_data.shape + + h2d_perf.append((t1 - t0) * 1000) - device_array.delete() + # D2H + t2 = time.perf_counter() + + # Simple device_get + # Note: device_get returns a numpy array (copy) + _ = jax.device_get(device_array) + + t3 = time.perf_counter() + d2h_perf.append((t3 - t2) * 1000) + + device_array.delete() + elif h2d_type == "pipelined": + target_chunk_size_mib = 16 # Sweet spot from profiling + num_devices = len(target_devices) + + tensors_on_device = [] + + # Calculate chunks per device + data_per_dev = data_size_mib / num_devices + chunks_per_dev = int(data_per_dev / target_chunk_size_mib) + chunks_per_dev = max(1, chunks_per_dev) + + chunks = np.array_split(host_data, chunks_per_dev * num_devices, axis=0) + + t0 = time.perf_counter() + if chunks_per_dev > 1: + # We need to map chunks to the correct device + # This simple example assumes chunks are perfectly divisible and ordered + # In production, use `jax.sharding` mesh logic for complex layouts + + # approach 1: simple for loop + for idx, chunk in enumerate(chunks): + if num_devices > 1: + dev = target_devices[idx % num_devices] + else: + dev = target_devices[0] + tensors_on_device.append(jax.device_put(chunk, dev)) + # Re-assemble array + result = jnp.vstack(tensors_on_device) + # Wait for all chunks to be transferred + 
result.block_until_ready() + + # approach 2: generator (slightly less overhead) + # def chunk_generator(num_devices, chunks_per_dev): + # for n in range(chunks_per_dev): + # for d in range(num_devices): + # # 1. Get the specific small chunk + # chunk = chunks[d*chunks_per_dev+n] + + # # 2. Trigger an individual DMA transfer for this specific chunk + # # This is where NUMA-local memory access matters + # yield jax.device_put(chunk, target_devices[d]) + + # # Re-assemble array + # result = jnp.vstack(list(chunk_generator(num_devices, chunks_per_dev))) + # # Wait for all chunks to be transferred + # result.block_until_ready() + else: + print(f"Warning: {data_size_mib=} is not larger than {target_chunk_size_mib=}, falling back to standard JAX put.") + # Fallback to standard JAX put for small data + result = jax.device_put(host_data, target_devices[0]) + result.block_until_ready() + + t1 = time.perf_counter() + h2d_perf.append((t1 - t0) * 1000) + + # D2H + t2 = time.perf_counter() + # Simple device_get + # Note: device_get returns a numpy array (copy) + _ = jax.device_get(result) + + t3 = time.perf_counter() + if not np.allclose(result, host_data): + print("pipelined result not equal to host_data") + d2h_perf.append((t3 - t2) * 1000) + + for r in tensors_on_device: + r.delete() + del tensors_on_device return { "H2D_Bandwidth_ms": h2d_perf, @@ -98,6 +173,7 @@ def benchmark_host_device_calculate_metrics( data_size_mib: int, H2D_Bandwidth_ms: List[float], D2H_Bandwidth_ms: List[float], + h2d_type: str = "simple", ) -> Tuple[Dict[str, Any], Dict[str, Any]]: """Calculates metrics for Host-Device transfer.""" params = locals().items() From 0fe95cce4fbd2cf9256dca509754823604aef1a3 Mon Sep 17 00:00:00 2001 From: "Amy (Yu-Hsuan) Lin" Date: Thu, 5 Feb 2026 11:05:03 +0800 Subject: [PATCH 39/88] Correct fp4 tensor size calculation (#99) The new utility will use jnp.finfo and jnp.iinfo to determine the accurate bit width of any dtype, ensuring correct bandwidth metrics for current 
and future sub-byte types (like int4 or float4). --- Ironwood/src/benchmark_collectives.py | 5 +++-- Ironwood/src/benchmark_hbm.py | 3 ++- Ironwood/src/benchmark_send_recv.py | 7 ++++--- Ironwood/src/benchmark_utils.py | 12 ++++++++++++ 4 files changed, 21 insertions(+), 6 deletions(-) diff --git a/Ironwood/src/benchmark_collectives.py b/Ironwood/src/benchmark_collectives.py index e142f59b..8e8399d2 100644 --- a/Ironwood/src/benchmark_collectives.py +++ b/Ironwood/src/benchmark_collectives.py @@ -11,6 +11,7 @@ from benchmark_utils import MetricsStatistics from benchmark_utils import multiple_iteration_timeit_from_trace from benchmark_utils import ShardingStrategy +from benchmark_utils import get_real_dtype_bytes from common import MARKER import jax from jax import core @@ -72,7 +73,7 @@ def get_metrics_helper( for key, value in params if value is not None and key not in exclude_keys } - metadata["dtype"] = metadata["dtype"].dtype.itemsize + metadata["dtype"] = get_real_dtype_bytes(metadata["dtype"].dtype) return metadata @@ -99,7 +100,7 @@ def unified_ici_collectives_metrics( input_num_elements = matrix_shape[0] * matrix_shape[1] * matrix_shape[2] dtype_name = dtype.dtype.name - dtype_bytes = dtype.dtype.itemsize + dtype_bytes = get_real_dtype_bytes(dtype.dtype) if xla_output: xla_output_json = json.loads(xla_output) hlo_input_shape = xla_output_json.get("hlo_input_shape") diff --git a/Ironwood/src/benchmark_hbm.py b/Ironwood/src/benchmark_hbm.py index 53744b5d..5e6574ac 100644 --- a/Ironwood/src/benchmark_hbm.py +++ b/Ironwood/src/benchmark_hbm.py @@ -6,6 +6,7 @@ from benchmark_utils import ( MetricsStatistics, multiple_iteration_timeit_from_trace, + get_real_dtype_bytes, ) from common import MARKER import jax @@ -76,7 +77,7 @@ def single_device_hbm_copy_calculate_metrics( metrics = {} # Calculate throughput. 
- tensor_size_bytes = num_elements * dtype.dtype.itemsize + tensor_size_bytes = num_elements * get_real_dtype_bytes(dtype.dtype) tensor_size_gbytes = (tensor_size_bytes * 2) / 10**9 time_statistics = MetricsStatistics( diff --git a/Ironwood/src/benchmark_send_recv.py b/Ironwood/src/benchmark_send_recv.py index 90950007..c7dd5db3 100644 --- a/Ironwood/src/benchmark_send_recv.py +++ b/Ironwood/src/benchmark_send_recv.py @@ -8,6 +8,7 @@ import jax.sharding from benchmark_utils import ( get_trace, + get_real_dtype_bytes, ) from common import MARKER import tempfile @@ -68,7 +69,7 @@ def get_metrics_helper( for key, value in params if value is not None and key not in exclude_keys } - metadata['dtype'] = metadata['dtype'].dtype.itemsize + metadata['dtype'] = get_real_dtype_bytes(metadata['dtype'].dtype) return metadata @@ -84,7 +85,7 @@ def send_recv_benchmark( device_count = jax.local_device_count() devices = mesh_utils.create_device_mesh((device_count,)) mesh = jax.sharding.Mesh(devices, 'x') - item_size = jnp.dtype(dtype).itemsize + item_size = get_real_dtype_bytes(jnp.dtype(dtype)) tensor_size_bytes = num_elements * item_size last_dim = tensor_size_bytes // (1 * 8 * item_size) @@ -161,7 +162,7 @@ def send_recv_benchmark_calculate_metrics( metadata = get_metrics_helper(params) metrics = {} - tensor_size_bytes = num_elements * jnp.dtype(dtype).itemsize + tensor_size_bytes = num_elements * get_real_dtype_bytes(jnp.dtype(dtype)) tensor_size_gbytes = tensor_size_bytes / 10**9 metrics['runtime_ms (ms)'] = runtime_ms diff --git a/Ironwood/src/benchmark_utils.py b/Ironwood/src/benchmark_utils.py index 60fc2d14..4091aefb 100644 --- a/Ironwood/src/benchmark_utils.py +++ b/Ironwood/src/benchmark_utils.py @@ -28,6 +28,18 @@ import jax.extend from tensorflow.tsl.profiler.protobuf import xplane_pb2 + +def get_real_dtype_bytes(dtype) -> float: + """Returns the real byte size of a dtype, handling sub-byte types.""" + try: + return jnp.finfo(dtype).bits / 8 + except Exception: + try: 
+ return jnp.iinfo(dtype).bits / 8 + except Exception: + return dtype.itemsize + + # The dictionary to map a JAX (collective) function to its main HLO. TARGET_TASK_NAME_COLLECTIVES_MAP = { "all_to_all_ici_op": r"all-to-all.[0-9]+", From 9a7c4afb29571d682733046ca0b2d51caf546437 Mon Sep 17 00:00:00 2001 From: Leonard Chan Date: Thu, 5 Feb 2026 00:15:58 +0000 Subject: [PATCH 40/88] Add baseline pipelined flow to H2D benchmark --- Ironwood/configs/host_device/host_device.yaml | 2 +- Ironwood/src/benchmark_host_device.py | 92 ++++++++++++++----- 2 files changed, 68 insertions(+), 26 deletions(-) diff --git a/Ironwood/configs/host_device/host_device.yaml b/Ironwood/configs/host_device/host_device.yaml index 8d572ed7..ff97df1b 100644 --- a/Ironwood/configs/host_device/host_device.yaml +++ b/Ironwood/configs/host_device/host_device.yaml @@ -3,8 +3,8 @@ benchmarks: num_runs: 20 benchmark_sweep_params: - { + h2d_type: ["simple", "pipelined"], data_size_mib_list: [1, 16, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768], - h2d_type: ["simple", "pipelined"] } csv_path: "../microbenchmarks/host_device" trace_dir: "../microbenchmarks/host_device/trace" diff --git a/Ironwood/src/benchmark_host_device.py b/Ironwood/src/benchmark_host_device.py index 8a36a2c7..1d72b5eb 100644 --- a/Ironwood/src/benchmark_host_device.py +++ b/Ironwood/src/benchmark_host_device.py @@ -5,6 +5,7 @@ from typing import Any, Dict, Tuple, List import jax +from jax import numpy as jnp import numpy as np from benchmark_utils import MetricsStatistics @@ -33,14 +34,9 @@ def benchmark_host_device( host_data = np.random.normal(size=(num_elements // column, column)).astype(np.float32) # Used in pipelined flow + # TODO: turn into a param num_devices_to_perform_h2d = 1 - tensor_size = 4 * 1024 * 1024 - target_device = jax.devices()[:num_devices_to_perform_h2d] - mesh = jax.sharding.Mesh(np.array(target_device), axis_names=["x"]) - sharding = jax.sharding.NamedSharding(mesh, jax.sharding.PartitionSpec("x")) - 
pipelined_array = None - if h2d_type == "pipelined": - pipelined_array = np.random.normal(size=(tensor_size,)).astype(np.float32) + target_devices = jax.devices()[:num_devices_to_perform_h2d] print( f"Benchmarking Transfer with Data Size: {data_size_mib} MB for {num_runs} iterations with {h2d_type=}", @@ -99,28 +95,74 @@ def benchmark_host_device( device_array.delete() elif h2d_type == "pipelined": + target_chunk_size_mib = 16 # Sweet spot from profiling + num_devices = len(target_devices) + tensors_on_device = [] - if data_size_mib * 1024 * 1024 < pipelined_array.nbytes: - print(f"Warning: {data_size_mib=} is smaller than pipeline unit, no data will be transferred.") - t0 = time.perf_counter() - # Assume data_size_mib is total across devices for now - bytes_left = 1024 * 1024 * data_size_mib - while bytes_left >= pipelined_array.nbytes: - with jax.profiler.StepTraceAnnotation("device_put", step_num=1): - x_device = jax.device_put(pipelined_array, sharding) - tensors_on_device.append(x_device) - bytes_left -= pipelined_array.nbytes - total_bytes_transferred = 0 - for tensor in tensors_on_device: - tensor.block_until_ready() - total_bytes_transferred += tensor.nbytes - tensor.delete() - t1 = time.perf_counter() + # Calculate chunks per device + data_per_dev = data_size_mib / num_devices + chunks_per_dev = int(data_per_dev / target_chunk_size_mib) + chunks_per_dev = max(1, chunks_per_dev) + + chunks = np.array_split(host_data, chunks_per_dev * num_devices, axis=0) + t0 = time.perf_counter() + if chunks_per_dev > 1: + # We need to map chunks to the correct device + # This simple example assumes chunks are perfectly divisible and ordered + # In production, use `jax.sharding` mesh logic for complex layouts + + # approach 1: simple for loop + for idx, chunk in enumerate(chunks): + if num_devices > 1: + dev = target_devices[idx % num_devices] + else: + dev = target_devices[0] + tensors_on_device.append(jax.device_put(chunk, dev)) + # Re-assemble array + result = 
jnp.vstack(tensors_on_device) + # Wait for all chunks to be transferred + result.block_until_ready() + + # approach 2: generator (slightly less overhead) + # def chunk_generator(num_devices, chunks_per_dev): + # for n in range(chunks_per_dev): + # for d in range(num_devices): + # # 1. Get the specific small chunk + # chunk = chunks[d*chunks_per_dev+n] + + # # 2. Trigger an individual DMA transfer for this specific chunk + # # This is where NUMA-local memory access matters + # yield jax.device_put(chunk, target_devices[d]) + + # # Re-assemble array + # result = jnp.vstack(list(chunk_generator(num_devices, chunks_per_dev))) + # # Wait for all chunks to be transferred + # result.block_until_ready() + else: + print(f"Warning: {data_size_mib=} is not larger than {target_chunk_size_mib=}, falling back to standard JAX put.") + # Fallback to standard JAX put for small data + result = jax.device_put(host_data, target_devices[0]) + result.block_until_ready() + + t1 = time.perf_counter() h2d_perf.append((t1 - t0) * 1000) - # Implement D2H at a later time after we establish H2D - d2h_perf.append(0) + + # D2H + t2 = time.perf_counter() + # Simple device_get + # Note: device_get returns a numpy array (copy) + _ = jax.device_get(result) + + t3 = time.perf_counter() + if not np.allclose(result, host_data): + print("pipelined result not equal to host_data") + d2h_perf.append((t3 - t2) * 1000) + + for r in tensors_on_device: + r.delete() + del tensors_on_device return { "H2D_Bandwidth_ms": h2d_perf, From 90eb07a8b5f7b02eb71cdd76cfae799a6cce9428 Mon Sep 17 00:00:00 2001 From: Leonard Chan Date: Thu, 5 Feb 2026 02:52:30 +0000 Subject: [PATCH 41/88] Add --numactl_binding flag to host_device YAMLs --- Ironwood/guides/automation/tpu7x-2x2x1-host_device.yaml | 2 +- Ironwood/guides/host_device/tpu7x-host-device-benchmark.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Ironwood/guides/automation/tpu7x-2x2x1-host_device.yaml 
b/Ironwood/guides/automation/tpu7x-2x2x1-host_device.yaml index a6b8febd..1084fdf9 100644 --- a/Ironwood/guides/automation/tpu7x-2x2x1-host_device.yaml +++ b/Ironwood/guides/automation/tpu7x-2x2x1-host_device.yaml @@ -53,7 +53,7 @@ spec: pip install -r requirements.txt GCS_BUCKET_DIR=${GCS_PATH} - python Ironwood/src/run_benchmark.py --config="Ironwood/configs/host_device/host_device.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/host_device/host_device.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} --numactl_binding resources: requests: google.com/tpu: 4 diff --git a/Ironwood/guides/host_device/tpu7x-host-device-benchmark.yaml b/Ironwood/guides/host_device/tpu7x-host-device-benchmark.yaml index 8c027c01..83d06065 100644 --- a/Ironwood/guides/host_device/tpu7x-host-device-benchmark.yaml +++ b/Ironwood/guides/host_device/tpu7x-host-device-benchmark.yaml @@ -24,7 +24,7 @@ spec: cd accelerator-microbenchmarks pip install -r requirements.txt - bash ./Ironwood/scripts/run_host_device_benchmark.sh --config Ironwood/configs/host_device/host_device.yaml + bash ./Ironwood/scripts/run_host_device_benchmark.sh --config Ironwood/configs/host_device/host_device.yaml --numactl_binding resources: requests: From 040002fdcd635bc5a350d1567af716feb04149f5 Mon Sep 17 00:00:00 2001 From: Leonard Chan Date: Thu, 5 Feb 2026 06:07:41 +0000 Subject: [PATCH 42/88] Add h2d_type column to H2D/D2H output --- Ironwood/src/benchmark_host_device.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Ironwood/src/benchmark_host_device.py b/Ironwood/src/benchmark_host_device.py index 1d72b5eb..d80c2819 100644 --- a/Ironwood/src/benchmark_host_device.py +++ b/Ironwood/src/benchmark_host_device.py @@ -184,6 +184,7 @@ def benchmark_host_device_calculate_metrics( } metadata = {k: v for k, v in params if k in metadata_keys} metadata["dtype"] = "float32" + metadata["h2d_type"] = h2d_type metrics = {} From 38e8038d3ecb4652fe6d4f2f0baa15f9d58e7159 
Mon Sep 17 00:00:00 2001 From: Leonard Chan Date: Thu, 5 Feb 2026 06:21:56 +0000 Subject: [PATCH 43/88] Revert "Add baseline pipelined flow to H2D benchmark" This reverts commit a86475d6a50bb644617d6ee9e63427072526fc61. --- Ironwood/configs/host_device/host_device.yaml | 3 +- Ironwood/src/benchmark_host_device.py | 126 ++++-------------- 2 files changed, 26 insertions(+), 103 deletions(-) diff --git a/Ironwood/configs/host_device/host_device.yaml b/Ironwood/configs/host_device/host_device.yaml index ff97df1b..0b48800c 100644 --- a/Ironwood/configs/host_device/host_device.yaml +++ b/Ironwood/configs/host_device/host_device.yaml @@ -3,8 +3,7 @@ benchmarks: num_runs: 20 benchmark_sweep_params: - { - h2d_type: ["simple", "pipelined"], - data_size_mib_list: [1, 16, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768], + data_size_mib_list: [1, 16, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768] } csv_path: "../microbenchmarks/host_device" trace_dir: "../microbenchmarks/host_device/trace" diff --git a/Ironwood/src/benchmark_host_device.py b/Ironwood/src/benchmark_host_device.py index d80c2819..0c7eacc5 100644 --- a/Ironwood/src/benchmark_host_device.py +++ b/Ironwood/src/benchmark_host_device.py @@ -5,7 +5,7 @@ from typing import Any, Dict, Tuple, List import jax -from jax import numpy as jnp +from jax import sharding import numpy as np from benchmark_utils import MetricsStatistics @@ -23,9 +23,8 @@ def benchmark_host_device( data_size_mib: int, num_runs: int = 100, trace_dir: str = None, - h2d_type: str = "simple", ) -> Dict[str, Any]: - """Benchmarks H2D/D2H transfer using device_put/device_get.""" + """Benchmarks H2D/D2H transfer using simple device_put/device_get.""" num_elements = 1024 * 1024 * data_size_mib // np.dtype(np.float32).itemsize @@ -33,13 +32,8 @@ def benchmark_host_device( column = 128 host_data = np.random.normal(size=(num_elements // column, column)).astype(np.float32) - # Used in pipelined flow - # TODO: turn into a param - 
num_devices_to_perform_h2d = 1 - target_devices = jax.devices()[:num_devices_to_perform_h2d] - print( - f"Benchmarking Transfer with Data Size: {data_size_mib} MB for {num_runs} iterations with {h2d_type=}", + f"Benchmarking Transfer with Data Size: {data_size_mib} MB for {num_runs} iterations", flush=True ) @@ -71,98 +65,29 @@ def benchmark_host_device( with step_context: # H2D - if h2d_type == "simple": - t0 = time.perf_counter() - # Simple device_put - device_array = jax.device_put(host_data) - device_array.block_until_ready() - t1 = time.perf_counter() - - # Verify H2D shape - assert device_array.shape == host_data.shape - - h2d_perf.append((t1 - t0) * 1000) + t0 = time.perf_counter() - # D2H - t2 = time.perf_counter() - - # Simple device_get - # Note: device_get returns a numpy array (copy) - _ = jax.device_get(device_array) - - t3 = time.perf_counter() - d2h_perf.append((t3 - t2) * 1000) - - device_array.delete() - elif h2d_type == "pipelined": - target_chunk_size_mib = 16 # Sweet spot from profiling - num_devices = len(target_devices) - - tensors_on_device = [] - - # Calculate chunks per device - data_per_dev = data_size_mib / num_devices - chunks_per_dev = int(data_per_dev / target_chunk_size_mib) - chunks_per_dev = max(1, chunks_per_dev) - - chunks = np.array_split(host_data, chunks_per_dev * num_devices, axis=0) - - t0 = time.perf_counter() - if chunks_per_dev > 1: - # We need to map chunks to the correct device - # This simple example assumes chunks are perfectly divisible and ordered - # In production, use `jax.sharding` mesh logic for complex layouts - - # approach 1: simple for loop - for idx, chunk in enumerate(chunks): - if num_devices > 1: - dev = target_devices[idx % num_devices] - else: - dev = target_devices[0] - tensors_on_device.append(jax.device_put(chunk, dev)) - # Re-assemble array - result = jnp.vstack(tensors_on_device) - # Wait for all chunks to be transferred - result.block_until_ready() - - # approach 2: generator (slightly less 
overhead) - # def chunk_generator(num_devices, chunks_per_dev): - # for n in range(chunks_per_dev): - # for d in range(num_devices): - # # 1. Get the specific small chunk - # chunk = chunks[d*chunks_per_dev+n] - - # # 2. Trigger an individual DMA transfer for this specific chunk - # # This is where NUMA-local memory access matters - # yield jax.device_put(chunk, target_devices[d]) - - # # Re-assemble array - # result = jnp.vstack(list(chunk_generator(num_devices, chunks_per_dev))) - # # Wait for all chunks to be transferred - # result.block_until_ready() - else: - print(f"Warning: {data_size_mib=} is not larger than {target_chunk_size_mib=}, falling back to standard JAX put.") - # Fallback to standard JAX put for small data - result = jax.device_put(host_data, target_devices[0]) - result.block_until_ready() - - t1 = time.perf_counter() - h2d_perf.append((t1 - t0) * 1000) - - # D2H - t2 = time.perf_counter() - # Simple device_get - # Note: device_get returns a numpy array (copy) - _ = jax.device_get(result) - - t3 = time.perf_counter() - if not np.allclose(result, host_data): - print("pipelined result not equal to host_data") - d2h_perf.append((t3 - t2) * 1000) - - for r in tensors_on_device: - r.delete() - del tensors_on_device + # Simple device_put + device_array = jax.device_put(host_data) + device_array.block_until_ready() + + t1 = time.perf_counter() + h2d_perf.append((t1 - t0) * 1000) + + # Verify H2D shape + assert device_array.shape == host_data.shape + + # D2H + t2 = time.perf_counter() + + # Simple device_get + # Note: device_get returns a numpy array (copy) + _ = jax.device_get(device_array) + + t3 = time.perf_counter() + d2h_perf.append((t3 - t2) * 1000) + + device_array.delete() return { "H2D_Bandwidth_ms": h2d_perf, @@ -173,7 +98,6 @@ def benchmark_host_device_calculate_metrics( data_size_mib: int, H2D_Bandwidth_ms: List[float], D2H_Bandwidth_ms: List[float], - h2d_type: str = "simple", ) -> Tuple[Dict[str, Any], Dict[str, Any]]: """Calculates metrics 
for Host-Device transfer.""" params = locals().items() From 09b3331da8cf3cc6b166f300379e85baf3f68a44 Mon Sep 17 00:00:00 2001 From: Leonard Chan Date: Thu, 5 Feb 2026 06:31:07 +0000 Subject: [PATCH 44/88] Revert "Add --numactl_binding flag to host_device YAMLs" This reverts commit bbb316347a5277bcf2af5a829b0c4b25653f236d. --- Ironwood/guides/automation/tpu7x-2x2x1-host_device.yaml | 2 +- Ironwood/guides/host_device/tpu7x-host-device-benchmark.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Ironwood/guides/automation/tpu7x-2x2x1-host_device.yaml b/Ironwood/guides/automation/tpu7x-2x2x1-host_device.yaml index 1084fdf9..a6b8febd 100644 --- a/Ironwood/guides/automation/tpu7x-2x2x1-host_device.yaml +++ b/Ironwood/guides/automation/tpu7x-2x2x1-host_device.yaml @@ -53,7 +53,7 @@ spec: pip install -r requirements.txt GCS_BUCKET_DIR=${GCS_PATH} - python Ironwood/src/run_benchmark.py --config="Ironwood/configs/host_device/host_device.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} --numactl_binding + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/host_device/host_device.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} resources: requests: google.com/tpu: 4 diff --git a/Ironwood/guides/host_device/tpu7x-host-device-benchmark.yaml b/Ironwood/guides/host_device/tpu7x-host-device-benchmark.yaml index 83d06065..8c027c01 100644 --- a/Ironwood/guides/host_device/tpu7x-host-device-benchmark.yaml +++ b/Ironwood/guides/host_device/tpu7x-host-device-benchmark.yaml @@ -24,7 +24,7 @@ spec: cd accelerator-microbenchmarks pip install -r requirements.txt - bash ./Ironwood/scripts/run_host_device_benchmark.sh --config Ironwood/configs/host_device/host_device.yaml --numactl_binding + bash ./Ironwood/scripts/run_host_device_benchmark.sh --config Ironwood/configs/host_device/host_device.yaml resources: requests: From 462d771b657844ed689cd8133e5bc754403650f4 Mon Sep 17 00:00:00 2001 From: Leonard Chan Date: Thu, 5 Feb 2026 06:32:58 +0000 Subject: [PATCH 45/88] 
Revert "Add h2d_type column to H2D/D2H output" This reverts commit bf5c79cf574359c52227938bcef47bc4d2392386. --- Ironwood/src/benchmark_host_device.py | 1 - 1 file changed, 1 deletion(-) diff --git a/Ironwood/src/benchmark_host_device.py b/Ironwood/src/benchmark_host_device.py index 0c7eacc5..16352e2a 100644 --- a/Ironwood/src/benchmark_host_device.py +++ b/Ironwood/src/benchmark_host_device.py @@ -108,7 +108,6 @@ def benchmark_host_device_calculate_metrics( } metadata = {k: v for k, v in params if k in metadata_keys} metadata["dtype"] = "float32" - metadata["h2d_type"] = h2d_type metrics = {} From 75c47aad66d8a556a3eef0595b9aa92a5b210907 Mon Sep 17 00:00:00 2001 From: Hong-Yi Lin Date: Thu, 5 Feb 2026 09:11:48 +0000 Subject: [PATCH 46/88] Add upload log for aggregated results --- Ironwood/guides/automation/aggregator.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Ironwood/guides/automation/aggregator.py b/Ironwood/guides/automation/aggregator.py index 5382ce30..069a13a1 100644 --- a/Ironwood/guides/automation/aggregator.py +++ b/Ironwood/guides/automation/aggregator.py @@ -120,6 +120,7 @@ def aggregate_results(bucket_path: str, local_dir: str): directories[category] = sorted(glob.glob(f"{local_dir}/*/{category}/*", recursive=True)) results[category] = aggregate_function[category](directories[category], columns_mapping[category]) if results[category] is not None: + print(f"Writing {category} results to {bucket_path}/aggregated_results/{category}.tsv") results[category].to_csv(f"{bucket_path}/aggregated_results/{category}.tsv", index=False, sep='\t') if __name__ == "__main__": From 6eb8ac60c3626dfe620b20538166c78e2a4cde24 Mon Sep 17 00:00:00 2001 From: Hong-Yi Lin Date: Thu, 5 Feb 2026 10:01:05 +0000 Subject: [PATCH 47/88] Update num_runs for collectives and matmul configuration --- .../configs/collectives/all_gather_tpu7x_2x2x1.yaml | 4 ++-- .../configs/collectives/all_gather_tpu7x_2x2x2.yaml | 4 ++-- .../configs/collectives/all_gather_tpu7x_2x2x4.yaml | 4 
++-- .../configs/collectives/all_gather_tpu7x_2x4x4.yaml | 4 ++-- .../configs/collectives/all_gather_tpu7x_4x4x4.yaml | 4 ++-- .../configs/collectives/all_gather_tpu7x_4x4x8.yaml | 4 ++-- .../configs/collectives/all_reduce_tpu7x_2x2x1.yaml | 4 ++-- .../configs/collectives/all_reduce_tpu7x_2x2x2.yaml | 4 ++-- .../configs/collectives/all_reduce_tpu7x_2x2x4.yaml | 4 ++-- .../configs/collectives/all_reduce_tpu7x_2x4x4.yaml | 4 ++-- .../configs/collectives/all_reduce_tpu7x_4x4x4.yaml | 4 ++-- .../configs/collectives/all_reduce_tpu7x_4x4x8.yaml | 4 ++-- .../configs/collectives/all_to_all_tpu7x_2x2x1.yaml | 4 ++-- .../configs/collectives/all_to_all_tpu7x_2x2x2.yaml | 4 ++-- .../configs/collectives/all_to_all_tpu7x_2x2x4.yaml | 4 ++-- .../configs/collectives/all_to_all_tpu7x_2x4x4.yaml | 4 ++-- .../configs/collectives/all_to_all_tpu7x_4x4x4.yaml | 4 ++-- .../configs/collectives/all_to_all_tpu7x_4x4x8.yaml | 4 ++-- Ironwood/configs/hbm/hbm.yaml | 10 +++++----- 19 files changed, 41 insertions(+), 41 deletions(-) diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml index 9bc586a1..b0858716 100644 --- a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml +++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml @@ -1,8 +1,8 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 20} # Parallel Replica + - 
{matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x2", op_dimension: 1, num_runs: 20} # Non-Parallel Replica trace_dir: "../microbenchmarks/all_gather_tpu7x_2x2x1" csv_path: "../microbenchmarks/all_gather_tpu7x_2x2x1" xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x2x1" diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml index b5be0c8d..ab282dec 100644 --- a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml +++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml @@ -1,8 +1,8 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 20} # Parallel Replica + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x2", op_dimension: 1, num_runs: 20} # Non-Parallel Replica trace_dir: "../microbenchmarks/all_gather_tpu7x_2x2x2" csv_path: "../microbenchmarks/all_gather_tpu7x_2x2x2" xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x2x2" diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml index 09b02979..d2f65afe 100644 --- a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml +++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml @@ 
-1,8 +1,8 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 20} # Parallel Replica + - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x2", op_dimension: 1, num_runs: 20} # Non-Parallel Replica trace_dir: "../microbenchmarks/all_gather_tpu7x_2x2x4" csv_path: "../microbenchmarks/all_gather_tpu7x_2x2x4" xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x2x4" diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml index 4f6cf11a..35414ff3 100644 --- a/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml +++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml @@ -1,8 +1,8 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, 
num_runs: 20} # Parallel Replica + - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x2", op_dimension: 1, num_runs: 20} # Non-Parallel Replica trace_dir: "../microbenchmarks/all_gather_tpu7x_2x4x4" csv_path: "../microbenchmarks/all_gather_tpu7x_2x4x4" xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x4x4" diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml index 77f3ed13..b561942d 100644 --- a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml +++ b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml @@ -1,8 +1,8 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 20} # Parallel Replica + - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x2", op_dimension: 1, num_runs: 20} # Non-Parallel Replica trace_dir: "../microbenchmarks/all_gather_tpu7x_4x4x4" csv_path: "../microbenchmarks/all_gather_tpu7x_4x4x4" xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_4x4x4" diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml index 12743d61..5838cafd 100644 --- a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml +++ 
b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml @@ -1,8 +1,8 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 20} # Parallel Replica + - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x2", op_dimension: 1, num_runs: 20} # Non-Parallel Replica trace_dir: "../microbenchmarks/all_gather_tpu7x_4x4x8" csv_path: "../microbenchmarks/all_gather_tpu7x_4x4x8" xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_4x4x8" diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml index f7389925..b713c549 100644 --- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml +++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml @@ -1,8 +1,8 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", 
ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 20} # Parallel Replica + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x2", op_dimension: 1, num_runs: 20} # Non-Parallel Replica trace_dir: "../microbenchmarks/psum_tpu7x_2x2x1" csv_path: "../microbenchmarks/psum_tpu7x_2x2x1" xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x2x1" diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml index b2cb202c..165e0e72 100644 --- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml +++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml @@ -1,8 +1,8 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 20} # Parallel Replica + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x2", op_dimension: 1, num_runs: 20} # Non-Parallel Replica trace_dir: "../microbenchmarks/psum_tpu7x_2x2x2" csv_path: "../microbenchmarks/psum_tpu7x_2x2x2" xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x2x2" diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml index 946fd5ed..0002ae3a 100644 --- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml +++ 
b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml @@ -1,8 +1,8 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 20} # Parallel Replica + - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x2", op_dimension: 1, num_runs: 20} # Non-Parallel Replica trace_dir: "../microbenchmarks/psum_tpu7x_2x2x4" csv_path: "../microbenchmarks/psum_tpu7x_2x2x4" xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x2x4" diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml index 613717cf..e5652a92 100644 --- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml +++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml @@ -1,8 +1,8 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: 
"8x4x1", op_dimension: 1, num_runs: 20} # Parallel Replica + - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x2", op_dimension: 1, num_runs: 20} # Non-Parallel Replica trace_dir: "../microbenchmarks/psum_tpu7x_2x4x4" csv_path: "../microbenchmarks/psum_tpu7x_2x4x4" xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x4x4" diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml index 3f4822c0..137bae19 100644 --- a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml +++ b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml @@ -1,8 +1,8 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 20} # Parallel Replica + - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x2", op_dimension: 1, num_runs: 20} # Non-Parallel Replica trace_dir: "../microbenchmarks/psum_tpu7x_4x4x4" csv_path: "../microbenchmarks/psum_tpu7x_4x4x4" xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_4x4x4" diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml index a14bbfe8..25758453 100644 --- a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml +++ 
b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml @@ -1,8 +1,8 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 20} # Parallel Replica + - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x2", op_dimension: 1, num_runs: 20} # Non-Parallel Replica trace_dir: "../microbenchmarks/psum_tpu7x_4x4x8" csv_path: "../microbenchmarks/psum_tpu7x_4x4x8" xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_4x4x8" diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml index 96da2c38..a30a17cf 100644 --- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml +++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml @@ -1,8 +1,8 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, 
sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 20} # Parallel Replica + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x2", op_dimension: 1, num_runs: 20} # Non-Parallel Replica trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x1" csv_path: "../microbenchmarks/all_to_all_tpu7x_2x2x1" xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x1" diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml index 388a4468..01fb9b80 100644 --- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml +++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml @@ -1,8 +1,8 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 20} # Parallel Replica + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x2", op_dimension: 1, num_runs: 20} # Non-Parallel Replica trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x2" csv_path: "../microbenchmarks/all_to_all_tpu7x_2x2x2" xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x2" diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml index e0cc48c9..43beeb27 100644 --- 
a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml +++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml @@ -1,8 +1,8 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 20} # Parallel Replica + - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x2", op_dimension: 1, num_runs: 20} # Non-Parallel Replica trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x4" csv_path: "../microbenchmarks/all_to_all_tpu7x_2x2x4" xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x4" diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml index 5ae19b6e..614caa6b 100644 --- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml +++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml @@ -1,8 +1,8 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 32, end: 16384, 
multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 20} # Parallel Replica + - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x2", op_dimension: 1, num_runs: 20} # Non-Parallel Replica trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x4x4" csv_path: "../microbenchmarks/all_to_all_tpu7x_2x4x4" xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x4x4" diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml index 4cc8f6bb..12dd149d 100644 --- a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml +++ b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml @@ -1,8 +1,8 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 20} # Parallel Replica + - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x2", op_dimension: 1, num_runs: 20} # Non-Parallel Replica trace_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x4" csv_path: "../microbenchmarks/all_to_all_tpu7x_4x4x4" xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x4" diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml 
b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml index 212cd92d..32c63d74 100644 --- a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml +++ b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml @@ -1,8 +1,8 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 20} # Parallel Replica + - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x2", op_dimension: 1, num_runs: 20} # Non-Parallel Replica trace_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x8" csv_path: "../microbenchmarks/all_to_all_tpu7x_4x4x8" xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x8" diff --git a/Ironwood/configs/hbm/hbm.yaml b/Ironwood/configs/hbm/hbm.yaml index 0e42b2f0..02cbaf59 100644 --- a/Ironwood/configs/hbm/hbm.yaml +++ b/Ironwood/configs/hbm/hbm.yaml @@ -1,35 +1,35 @@ benchmarks: - benchmark_name: "single_device_hbm_copy" benchmark_sweep_params: - - {num_elements_range: {start: 1048576, end: 4294967296, multiplier: 2}, dtype: "bfloat16", num_runs: 1} + - {num_elements_range: {start: 1048576, end: 4294967296, multiplier: 2}, dtype: "bfloat16", num_runs: 20} trace_dir: "../microbenchmarks/hbm_bfloat16" csv_path: "../microbenchmarks/hbm_bfloat16" xlml_metrics_dir: "../microbenchmarks/hbm_bfloat16" xla_dump_dir: "../microbenchmarks/hbm_bfloat16/hlo_graphs" - benchmark_name: 
"single_device_hbm_copy" benchmark_sweep_params: - - {num_elements_range: {start: 1048576, end: 4294967296, multiplier: 2}, dtype: "float32", num_runs: 1} + - {num_elements_range: {start: 1048576, end: 4294967296, multiplier: 2}, dtype: "float32", num_runs: 20} trace_dir: "../microbenchmarks/hbm_float32" csv_path: "../microbenchmarks/hbm_float32" xlml_metrics_dir: "../microbenchmarks/hbm_float32" xla_dump_dir: "../microbenchmarks/hbm_float32/hlo_graphs" - benchmark_name: "single_device_hbm_copy" benchmark_sweep_params: - - {num_elements_range: {start: 1048576, end: 4294967296, multiplier: 2}, dtype: "float8", num_runs: 1} + - {num_elements_range: {start: 1048576, end: 4294967296, multiplier: 2}, dtype: "float8", num_runs: 20} trace_dir: "../microbenchmarks/hbm_float8" csv_path: "../microbenchmarks/hbm_float8" xlml_metrics_dir: "../microbenchmarks/hbm_float8" xla_dump_dir: "../microbenchmarks/hbm_float8/hlo_graphs" - benchmark_name: "single_device_hbm_copy" benchmark_sweep_params: - - {num_elements_range: {start: 1048576, end: 4294967296, multiplier: 2}, dtype: "float16", num_runs: 1} + - {num_elements_range: {start: 1048576, end: 4294967296, multiplier: 2}, dtype: "float16", num_runs: 20} trace_dir: "../microbenchmarks/hbm_float16" csv_path: "../microbenchmarks/hbm_float16" xlml_metrics_dir: "../microbenchmarks/hbm_float16" xla_dump_dir: "../microbenchmarks/hbm_float16/hlo_graphs" - benchmark_name: "single_device_hbm_copy" benchmark_sweep_params: - - {num_elements_range: {start: 1048576, end: 4294967296, multiplier: 2}, dtype: "float4", num_runs: 1} + - {num_elements_range: {start: 1048576, end: 4294967296, multiplier: 2}, dtype: "float4", num_runs: 20} trace_dir: "../microbenchmarks/hbm_float4" csv_path: "../microbenchmarks/hbm_float4" xlml_metrics_dir: "../microbenchmarks/hbm_float4" From b17d35b6553032221cb7aa15d46c5273e97dc0e3 Mon Sep 17 00:00:00 2001 From: Hong-Yi Lin Date: Thu, 5 Feb 2026 10:58:02 +0000 Subject: [PATCH 48/88] Set batch size in bmm 
configuration to be 8 --- Ironwood/configs/bmm/single_device_bmm.yaml | 80 ++++++++++----------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/Ironwood/configs/bmm/single_device_bmm.yaml b/Ironwood/configs/bmm/single_device_bmm.yaml index f0a4156c..f4f946b5 100644 --- a/Ironwood/configs/bmm/single_device_bmm.yaml +++ b/Ironwood/configs/bmm/single_device_bmm.yaml @@ -5,14 +5,14 @@ benchmarks: xlml_metrics_dir: "../microbenchmarks/single_device_bmm_bf16" xla_dump_dir: "../microbenchmarks/single_device_bmm_bf16/hlo_graphs" benchmark_sweep_params: - - {b: 1, m: 128, k: 128, n: 128, num_runs: 100, dtype: 'bfloat16'} - - {b: 1, m: 256, k: 256, n: 256, num_runs: 100, dtype: 'bfloat16'} - - {b: 1, m: 512, k: 512, n: 512, num_runs: 100, dtype: 'bfloat16'} - - {b: 1, m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'bfloat16'} - - {b: 1, m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'bfloat16'} - - {b: 1, m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'bfloat16'} - - {b: 1, m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'bfloat16'} - - {b: 1, m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'bfloat16'} + - {b: 8, m: 128, k: 128, n: 128, num_runs: 100, dtype: 'bfloat16'} + - {b: 8, m: 256, k: 256, n: 256, num_runs: 100, dtype: 'bfloat16'} + - {b: 8, m: 512, k: 512, n: 512, num_runs: 100, dtype: 'bfloat16'} + - {b: 8, m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'bfloat16'} + - {b: 8, m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'bfloat16'} + - {b: 8, m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'bfloat16'} + - {b: 8, m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'bfloat16'} + - {b: 8, m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'bfloat16'} - benchmark_name: "single_device_bmm" trace_dir: "../microbenchmarks/single_device_bmm_f32" @@ -20,14 +20,14 @@ benchmarks: xlml_metrics_dir: "../microbenchmarks/single_device_bmm_f32" xla_dump_dir: "../microbenchmarks/single_device_bmm_f32/hlo_graphs" benchmark_sweep_params: - - {b: 
1, m: 128, k: 128, n: 128, num_runs: 100, dtype: 'float32'} - - {b: 1, m: 256, k: 256, n: 256, num_runs: 100, dtype: 'float32'} - - {b: 1, m: 512, k: 512, n: 512, num_runs: 100, dtype: 'float32'} - - {b: 1, m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'float32'} - - {b: 1, m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'float32'} - - {b: 1, m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'float32'} - - {b: 1, m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float32'} - - {b: 1, m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float32'} + - {b: 8, m: 128, k: 128, n: 128, num_runs: 100, dtype: 'float32'} + - {b: 8, m: 256, k: 256, n: 256, num_runs: 100, dtype: 'float32'} + - {b: 8, m: 512, k: 512, n: 512, num_runs: 100, dtype: 'float32'} + - {b: 8, m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'float32'} + - {b: 8, m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'float32'} + - {b: 8, m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'float32'} + - {b: 8, m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float32'} + - {b: 8, m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float32'} - benchmark_name: "single_device_bmm" trace_dir: "../microbenchmarks/single_device_bmm_fp16" @@ -35,14 +35,14 @@ benchmarks: xlml_metrics_dir: "../microbenchmarks/single_device_bmm_fp16" xla_dump_dir: "../microbenchmarks/single_device_bmm_fp16/hlo_graphs" benchmark_sweep_params: - - {b: 1, m: 128, k: 128, n: 128, num_runs: 100, dtype: 'float16'} - - {b: 1, m: 256, k: 256, n: 256, num_runs: 100, dtype: 'float16'} - - {b: 1, m: 512, k: 512, n: 512, num_runs: 100, dtype: 'float16'} - - {b: 1, m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'float16'} - - {b: 1, m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'float16'} - - {b: 1, m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'float16'} - - {b: 1, m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float16'} - - {b: 1, m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float16'} + - {b: 8, m: 128, k: 128, n: 
128, num_runs: 100, dtype: 'float16'} + - {b: 8, m: 256, k: 256, n: 256, num_runs: 100, dtype: 'float16'} + - {b: 8, m: 512, k: 512, n: 512, num_runs: 100, dtype: 'float16'} + - {b: 8, m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'float16'} + - {b: 8, m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'float16'} + - {b: 8, m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'float16'} + - {b: 8, m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float16'} + - {b: 8, m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float16'} - benchmark_name: "single_device_bmm" trace_dir: "../microbenchmarks/single_device_bmm_fp8" @@ -50,14 +50,14 @@ benchmarks: xlml_metrics_dir: "../microbenchmarks/single_device_bmm_fp8" xla_dump_dir: "../microbenchmarks/single_device_bmm_fp8/hlo_graphs" benchmark_sweep_params: - - {b: 1, m: 128, k: 128, n: 128, num_runs: 100, dtype: 'float8'} - - {b: 1, m: 256, k: 256, n: 256, num_runs: 100, dtype: 'float8'} - - {b: 1, m: 512, k: 512, n: 512, num_runs: 100, dtype: 'float8'} - - {b: 1, m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'float8'} - - {b: 1, m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'float8'} - - {b: 1, m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'float8'} - - {b: 1, m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float8'} - - {b: 1, m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float8'} + - {b: 8, m: 128, k: 128, n: 128, num_runs: 100, dtype: 'float8'} + - {b: 8, m: 256, k: 256, n: 256, num_runs: 100, dtype: 'float8'} + - {b: 8, m: 512, k: 512, n: 512, num_runs: 100, dtype: 'float8'} + - {b: 8, m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'float8'} + - {b: 8, m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'float8'} + - {b: 8, m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'float8'} + - {b: 8, m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float8'} + - {b: 8, m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float8'} - benchmark_name: "single_device_bmm" trace_dir: 
"../microbenchmarks/single_device_bmm_fp4" @@ -65,11 +65,11 @@ benchmarks: xlml_metrics_dir: "../microbenchmarks/single_device_bmm_fp4" xla_dump_dir: "../microbenchmarks/single_device_bmm_fp4/hlo_graphs" benchmark_sweep_params: - - {b: 1, m: 128, k: 128, n: 128, num_runs: 100, dtype: 'float4'} - - {b: 1, m: 256, k: 256, n: 256, num_runs: 100, dtype: 'float4'} - - {b: 1, m: 512, k: 512, n: 512, num_runs: 100, dtype: 'float4'} - - {b: 1, m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'float4'} - - {b: 1, m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'float4'} - - {b: 1, m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'float4'} - - {b: 1, m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float4'} - - {b: 1, m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float4'} \ No newline at end of file + - {b: 8, m: 128, k: 128, n: 128, num_runs: 100, dtype: 'float4'} + - {b: 8, m: 256, k: 256, n: 256, num_runs: 100, dtype: 'float4'} + - {b: 8, m: 512, k: 512, n: 512, num_runs: 100, dtype: 'float4'} + - {b: 8, m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'float4'} + - {b: 8, m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'float4'} + - {b: 8, m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'float4'} + - {b: 8, m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float4'} + - {b: 8, m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float4'} \ No newline at end of file From de75930b2960e42a1f7a19a7eb89d07e68d75b9a Mon Sep 17 00:00:00 2001 From: "Yu-Hsuan (Amy) Lin" Date: Fri, 6 Feb 2026 03:31:55 +0000 Subject: [PATCH 49/88] Implement gemm_all_reduce benchmark (Single Chip) --- .../gemm_all_reduce/gemm_all_reduce.yaml | 60 +++++ Ironwood/guides/automation/aggregator.py | 10 +- .../guides/automation/automation_launch.sh | 1 + .../tpu7x-2x2x1-gemm_all_reduce.yaml | 61 +++++ Ironwood/src/benchmark_gemm.py | 2 +- Ironwood/src/benchmark_gemm_all_reduce.py | 227 ++++++++++++++++++ Ironwood/src/benchmark_utils.py | 42 +++- Ironwood/src/run_benchmark.py | 2 + 8 files 
changed, 393 insertions(+), 12 deletions(-) create mode 100644 Ironwood/configs/gemm_all_reduce/gemm_all_reduce.yaml create mode 100644 Ironwood/guides/automation/tpu7x-2x2x1-gemm_all_reduce.yaml create mode 100644 Ironwood/src/benchmark_gemm_all_reduce.py diff --git a/Ironwood/configs/gemm_all_reduce/gemm_all_reduce.yaml b/Ironwood/configs/gemm_all_reduce/gemm_all_reduce.yaml new file mode 100644 index 00000000..93466840 --- /dev/null +++ b/Ironwood/configs/gemm_all_reduce/gemm_all_reduce.yaml @@ -0,0 +1,60 @@ +benchmarks: +- benchmark_name: "gemm_all_reduce" + trace_dir: "../microbenchmarks/gemm_all_reduce_bf16" + csv_path: "../microbenchmarks/gemm_all_reduce_bf16" + xlml_metrics_dir: "../microbenchmarks/gemm_all_reduce_bf16" + xla_dump_dir: "../microbenchmarks/gemm_all_reduce_bf16/hlo_graphs" + benchmark_sweep_params: + - {m: 2048, k: 2048, n: 2048, num_runs: 20, dtype: 'bfloat16'} + - {m: 4096, k: 4096, n: 4096, num_runs: 20, dtype: 'bfloat16'} + - {m: 8192, k: 8192, n: 8192, num_runs: 20, dtype: 'bfloat16'} + - {m: 16384, k: 16384, n: 16384, num_runs: 20, dtype: 'bfloat16'} + - {m: 32768, k: 32768, n: 32768, num_runs: 20, dtype: 'bfloat16'} + +- benchmark_name: "gemm_all_reduce" + trace_dir: "../microbenchmarks/gemm_all_reduce_f32" + csv_path: "../microbenchmarks/gemm_all_reduce_f32" + xlml_metrics_dir: "../microbenchmarks/gemm_all_reduce_f32" + xla_dump_dir: "../microbenchmarks/gemm_all_reduce_f32/hlo_graphs" + benchmark_sweep_params: + - {m: 2048, k: 2048, n: 2048, num_runs: 20, dtype: 'float32'} + - {m: 4096, k: 4096, n: 4096, num_runs: 20, dtype: 'float32'} + - {m: 8192, k: 8192, n: 8192, num_runs: 20, dtype: 'float32'} + - {m: 16384, k: 16384, n: 16384, num_runs: 20, dtype: 'float32'} + - {m: 32768, k: 32768, n: 32768, num_runs: 20, dtype: 'float32'} + +- benchmark_name: "gemm_all_reduce" + trace_dir: "../microbenchmarks/gemm_all_reduce_fp16" + csv_path: "../microbenchmarks/gemm_all_reduce_fp16" + xlml_metrics_dir: 
"../microbenchmarks/gemm_all_reduce_fp16" + xla_dump_dir: "../microbenchmarks/gemm_all_reduce_fp16/hlo_graphs" + benchmark_sweep_params: + - {m: 2048, k: 2048, n: 2048, num_runs: 20, dtype: 'float16'} + - {m: 4096, k: 4096, n: 4096, num_runs: 20, dtype: 'float16'} + - {m: 8192, k: 8192, n: 8192, num_runs: 20, dtype: 'float16'} + - {m: 16384, k: 16384, n: 16384, num_runs: 20, dtype: 'float16'} + - {m: 32768, k: 32768, n: 32768, num_runs: 20, dtype: 'float16'} + +- benchmark_name: "gemm_all_reduce" + trace_dir: "../microbenchmarks/gemm_all_reduce_fp8" + csv_path: "../microbenchmarks/gemm_all_reduce_fp8" + xlml_metrics_dir: "../microbenchmarks/gemm_all_reduce_fp8" + xla_dump_dir: "../microbenchmarks/gemm_all_reduce_fp8/hlo_graphs" + benchmark_sweep_params: + - {m: 2048, k: 2048, n: 2048, num_runs: 20, dtype: 'float8'} + - {m: 4096, k: 4096, n: 4096, num_runs: 20, dtype: 'float8'} + - {m: 8192, k: 8192, n: 8192, num_runs: 20, dtype: 'float8'} + - {m: 16384, k: 16384, n: 16384, num_runs: 20, dtype: 'float8'} + - {m: 32768, k: 32768, n: 32768, num_runs: 20, dtype: 'float8'} + +- benchmark_name: "gemm_all_reduce" + trace_dir: "../microbenchmarks/gemm_all_reduce_fp4" + csv_path: "../microbenchmarks/gemm_all_reduce_fp4" + xlml_metrics_dir: "../microbenchmarks/gemm_all_reduce_fp4" + xla_dump_dir: "../microbenchmarks/gemm_all_reduce_fp4/hlo_graphs" + benchmark_sweep_params: + - {m: 2048, k: 2048, n: 2048, num_runs: 20, dtype: 'float4'} + - {m: 4096, k: 4096, n: 4096, num_runs: 20, dtype: 'float4'} + - {m: 8192, k: 8192, n: 8192, num_runs: 20, dtype: 'float4'} + - {m: 16384, k: 16384, n: 16384, num_runs: 20, dtype: 'float4'} + - {m: 32768, k: 32768, n: 32768, num_runs: 20, dtype: 'float4'} diff --git a/Ironwood/guides/automation/aggregator.py b/Ironwood/guides/automation/aggregator.py index 069a13a1..bd291cfc 100644 --- a/Ironwood/guides/automation/aggregator.py +++ b/Ironwood/guides/automation/aggregator.py @@ -36,6 +36,13 @@ "tflops_per_sec_per_device_avg", 
"tflops_per_sec_per_device_min", "tflops_per_sec_per_device_max", ], + "gemm_all_reduce": [ + "m", "n", "k", "dtype", "step_time_ms_num_runs", + "tflops_per_sec_per_device_p50", "tflops_per_sec_per_device_p90", + "tflops_per_sec_per_device_p95", "tflops_per_sec_per_device_p99", + "tflops_per_sec_per_device_avg", "tflops_per_sec_per_device_min", + "tflops_per_sec_per_device_max", + ], } def download_from_gcs(bucket_path: str, local_dir: str): @@ -110,10 +117,11 @@ def aggregate_bmm(directories: list[str], picked_columns: list[str]) -> pd.DataF "host_device": aggregate_host_device, "gemm": aggregate_gemm, "bmm": aggregate_bmm, + "gemm_all_reduce": aggregate_gemm, } def aggregate_results(bucket_path: str, local_dir: str): - categories = ["collectives", "hbm", "host_device", "gemm", "bmm"] + categories = ["collectives", "hbm", "host_device", "gemm", "bmm", "gemm_all_reduce"] directories = {} results = {} for category in categories: diff --git a/Ironwood/guides/automation/automation_launch.sh b/Ironwood/guides/automation/automation_launch.sh index 0ca76210..0e49fb12 100755 --- a/Ironwood/guides/automation/automation_launch.sh +++ b/Ironwood/guides/automation/automation_launch.sh @@ -14,6 +14,7 @@ TIMEOUT_SECOND=3600 yaml_names=( "tpu7x-2x2x1-hbm.yaml" "tpu7x-2x2x1-host_device.yaml" + "tpu7x-2x2x1-gemm_all_reduce.yaml" "tpu7x-2x2x1-gemm.yaml" "tpu7x-2x2x1-bmm.yaml" "tpu7x-2x2x1-collectives.yaml" diff --git a/Ironwood/guides/automation/tpu7x-2x2x1-gemm_all_reduce.yaml b/Ironwood/guides/automation/tpu7x-2x2x1-gemm_all_reduce.yaml new file mode 100644 index 00000000..186c63e2 --- /dev/null +++ b/Ironwood/guides/automation/tpu7x-2x2x1-gemm_all_reduce.yaml @@ -0,0 +1,61 @@ +apiVersion: v1 +kind: Service +metadata: + name: headless-svc-${JOB_NAME} +spec: + clusterIP: None + selector: + job-name: ${JOB_NAME} +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: ${JOB_NAME} + labels: + kueue.x-k8s.io/queue-name: user-queue-2x2x1 +spec: + completionMode: Indexed + suspend: 
true + parallelism: 1 + completions: 1 + backoffLimit: 0 + template: + spec: + subdomain: headless-svc-${JOB_NAME} + serviceAccountName: ${GCS_SA_NAME} + restartPolicy: Never + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu7x + cloud.google.com/gke-tpu-topology: 2x2x1 + containers: + - name: jax-tpu + image: python:3.12 + securityContext: + privileged: false + env: + - name: JAX_PLATFORMS + value: "tpu,cpu" + - name: TPU_VMODULE + value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" + - name: XLA_IR_DEBUG + value: "1" + - name: XLA_HLO_DEBUG + value: "1" + command: + - bash + - -c + - | + set -ex + + git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git + cd accelerator-microbenchmarks + git checkout tpu7x-auto + pip install -r requirements.txt + + GCS_BUCKET_DIR=${GCS_PATH} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/gemm_all_reduce/gemm_all_reduce.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + resources: + requests: + google.com/tpu: 4 + limits: + google.com/tpu: 4 diff --git a/Ironwood/src/benchmark_gemm.py b/Ironwood/src/benchmark_gemm.py index b802ddc0..0b19637d 100644 --- a/Ironwood/src/benchmark_gemm.py +++ b/Ironwood/src/benchmark_gemm.py @@ -146,7 +146,7 @@ def gemm_multiple_run_calculate_metrics( total_flops, total_flops_all_devices = handle_based_on_sharding( total_flops, SHARDING_STRATEGY ) - peak_flops = PEAK_FLOPS_PER_DEVICE if dtype==jax.numpy.float8_e4m3fn else PEAK_FLOPS_PER_DEVICE/2 + peak_flops_multiplier = get_peak_flops_multiplier(dtype.dtype.name) return unified_flops_metrics( m, n, diff --git a/Ironwood/src/benchmark_gemm_all_reduce.py b/Ironwood/src/benchmark_gemm_all_reduce.py new file mode 100644 index 00000000..851849ca --- /dev/null +++ b/Ironwood/src/benchmark_gemm_all_reduce.py @@ -0,0 +1,227 @@ +"""Benchmarks gemm + all_reduce for DP gradient sync simulation.""" + +import os +from typing import Any, Dict, 
Optional, Callable + +# pylint: disable=g-importing-member +from benchmark_utils import ( + iteration_timeit, + multiple_iteration_timeit_from_trace, + ShardingStrategy, + get_lhs_named_shading, + get_rhs_named_shading, + get_out_sharding, + create_mesh, + handle_based_on_sharding, + unified_flops_metrics, + MetricsStatistics, + get_metrics_helper, + str_to_dtype, + get_peak_flops_multiplier, + unified_bytes_metrics, +) +from common import MARKER +import jax +from jax.experimental.shard_map import shard_map +from jax.sharding import PartitionSpec as P +import jax.numpy as jnp + + +# pylint: disable=g-importing-member + + +# Matmul shapes: A(M,K) x B(K,N) = C(M,N) +# Then AllReduce(C) +SHARDING_STRATEGY = ShardingStrategy.NO_SHARDING +SEED = 0 +PEAK_FLOPS_PER_DEVICE = 2307 # TFLOP/s for single core(device) of FP8 + +_INITIALIZED = False + +def setup_tpu_env(): + global _INITIALIZED + if _INITIALIZED: + return + + print("Setting LIBTPU_INIT_ARGS...", flush=True) + os.environ["LIBTPU_INIT_ARGS"] = ( + "--xla_tpu_enable_async_collective_fusion=true " + "--xla_tpu_enable_async_collective_fusion_fuse_all_reduce=true " + "--xla_tpu_enable_async_collective_fusion_multiple_steps=true " + "--xla_tpu_overlap_compute_collective_tc=true " + "--xla_enable_async_all_reduce=true " + "--xla_enable_async_collective_permute=true " + "--xla_tpu_enable_all_experimental_scheduler_features=true " + "--xla_tpu_should_accumulate_into_mrb=true " + "--xla_tpu_scoped_vmem_limit_kib=65536 " + "--xla_tpu_vmem_scavenging_mode=NONE " + "--xla_tpu_dvfs_p_state=7 " + + "--xla_tpu_impure_enable_packed_bf16_math_ops=true " + "--xla_tpu_enable_pincer_short_fusion_emitter=true " + "--xla_tpu_enable_sparse_core_hierarchical_all_reduce=true " + "--xla_tpu_use_single_sparse_core_for_all_reduce_offload=true " # Test effect on SC + + "--xla_jf_debug_level=1 " + "--xla_sc_disable_megacore_partitioning=true " + "--xla_tpu_disable_sparse_core_collective_offload_remover=true " + 
"--xla_tpu_enable_all_reduce_scatter_fusion=false " + "--xla_tpu_enable_sparse_core_collective_offload_all_reduce=true " + "--xla_tpu_pad_operations_input_tiles=true " + "--xla_tpu_sparse_core_all_reduce_offload_min_size_in_bytes=0 " + "--xla_tpu_use_tc_device_shape_on_sc=true " + ) + + print("Step 1: Calling jax.distributed.initialize(initialization_timeout=300)...", flush=True) + jax.distributed.initialize(initialization_timeout=300) + print("Step 1: jax.distributed.initialize() completed.", flush=True) + _INITIALIZED = True + + +def _run_gemm_base( + m: int, + k: int, + n: int, + dtype: jnp.dtype, + num_runs: int, + trace_dir: str, + sharding_strategy: ShardingStrategy, + task_name_suffix: str, +) -> Dict[str, Any]: + """Shared base function for running GEMM benchmarks.""" + setup_tpu_env() + dtype_str = dtype.dtype.name + task_name = f"{task_name_suffix}_{dtype_str}" + print(f"Running {task_name} benchmark with m={m}, k={k}, n={n}, dtype={dtype_str}, runs={num_runs}", flush=True) + + def f(x, y): + with jax.named_scope(MARKER): + # Matmul + acc = jax.numpy.einsum( + "ij,jk->ik", x, y, preferred_element_type=jnp.float32 + ) + c = acc.astype(dtype) + + # AllReduce (psum) + out = jax.lax.psum(c, axis_name="device") + return out + + print("Step 2: Creating Mesh and Shardings...", flush=True) + mesh = create_mesh(sharding_strategy) + lhs_sharding = get_lhs_named_shading(mesh, sharding_strategy) + rhs_sharding = get_rhs_named_shading(mesh, sharding_strategy) + out_sharding = get_out_sharding(sharding_strategy) + + jit_sharded_f = jax.jit( + shard_map( + f, + mesh, + in_specs=( + lhs_sharding.spec, + rhs_sharding.spec, + ), + out_specs=out_sharding, + check_rep=False, + ) + ) + + lhs_shape = (m, k) + rhs_shape = (k, n) + lhs_dtype = dtype + rhs_dtype = dtype + key = jax.random.key(SEED) + + def data_generator(): + """Creates new random data on host and puts it on device.""" + nonlocal key + key, key_lhs, key_rhs = jax.random.split(key, 3) + + # Create random data on 
host + lhs_host = jax.random.normal(key_lhs, lhs_shape).astype(lhs_dtype) + rhs_host = jax.random.normal(key_rhs, rhs_shape).astype(rhs_dtype) + + # Put on device (HBM) + lhs_device = jax.device_put(lhs_host, lhs_sharding) + rhs_device = jax.device_put(rhs_host, rhs_sharding) + + return (lhs_device, rhs_device) + + print("Step 3: Starting Execution Loop (includes JIT)...", flush=True) + time_ms_list = multiple_iteration_timeit_from_trace( + jit_sharded_f, + data_generator, + matrix_dim=f"{dtype_str}_{m}x{n}x{k}", + tries=num_runs, + task=task_name, + trace_dir=trace_dir, + multi_op=True, + ) + print("Step 4: Execution Loop Completed.", flush=True) + + return { + "time_ms_list": time_ms_list, + } + + +def gemm_all_reduce( + m: int, + k: int, + n: int, + dtype: jnp.dtype = jnp.bfloat16, + num_runs: int = 1, + trace_dir: str = None, +) -> Dict[str, Any]: + """Benchmarks the Matmul(A, B) + AllReduce(C).""" + return _run_gemm_base( + m, k, n, dtype, num_runs, trace_dir, + sharding_strategy=ShardingStrategy.NO_SHARDING, + task_name_suffix="gemm_all_reduce" + ) + + + + + +def _calculate_metrics_base( + m: int, + k: int, + n: int, + dtype: jnp.dtype, + time_ms_list: list[float], + sharding_strategy: ShardingStrategy, +) -> tuple[Dict[str, Any], Dict[str, Any]]: + """Shared metrics calculation for GEMM benchmarks.""" + total_flops = 2 * m * k * n + total_flops_per_device, total_flops_all_devices = handle_based_on_sharding( + total_flops, sharding_strategy + ) + + dtype_str = dtype.dtype.name + peak_flops_multiplier = get_peak_flops_multiplier(dtype_str) + peak_flops = PEAK_FLOPS_PER_DEVICE * peak_flops_multiplier + + return unified_flops_metrics( + m, n, k, time_ms_list, total_flops_per_device, total_flops_all_devices, peak_flops, dtype=dtype_str, + ) + + +def gemm_all_reduce_calculate_metrics( + m: int, + k: int, + n: int, + dtype: jnp.dtype, + time_ms_list: list[float], +) -> Dict[str, Any]: + # Calculate Bandwidth for Collective (AllReduce) + # Effective bandwidth for 
AllReduce is 2 * (N-1)/N * Size. + # We use Size * 2 as a proxy for total bytes moved (assuming large N). + + metadata, metrics = _calculate_metrics_base( + m, k, n, dtype, time_ms_list, ShardingStrategy.NO_SHARDING + ) + + metadata["type"] = "gemm_all_reduce" + return metadata, metrics + + + diff --git a/Ironwood/src/benchmark_utils.py b/Ironwood/src/benchmark_utils.py index 4091aefb..0a45678d 100644 --- a/Ironwood/src/benchmark_utils.py +++ b/Ironwood/src/benchmark_utils.py @@ -62,6 +62,7 @@ class ShardingStrategy(Enum): SHARDING_ON_SINGLE_CHIP_WITH_N = auto() + def multiple_iteration_timeit_from_trace_throttling( compute_func: Callable, data_generator: Callable, @@ -150,6 +151,7 @@ def multiple_iteration_timeit_from_trace( tries: int = 17, task: str = None, trace_dir: str = None, + multi_op: bool = False, ) -> list[float]: """ Time a function with jax.profiler and get the run time from the trace. @@ -189,10 +191,11 @@ def multiple_iteration_timeit_from_trace( if trace_full_dir != tmp_trace_dir: # Upload the traces to desired location upload_to_storage(trace_dir=trace_full_dir, local_file=tmp_trace_dir) - return multiple_iteration_get_metrics_from_trace(trace, task) + return multiple_iteration_get_metrics_from_trace(trace, task, tries, multi_op) -def multiple_iteration_get_metrics_from_trace(trace: dict[str, Any], task: str = None) -> list[float]: +def multiple_iteration_get_metrics_from_trace( + trace: dict[str, Any], task: str = None, tries = 17, multi_op: bool = False) -> list[float]: marker_done_events = [] for event in trace["traceEvents"]: args = event.get("args", {}) @@ -203,7 +206,7 @@ def multiple_iteration_get_metrics_from_trace(trace: dict[str, Any], task: str = marker_call_done_events = [ e for e in marker_done_events if e.get("name", "").endswith("call-done") ] - if marker_call_done_events: + if not multi_op and marker_call_done_events: marker_done_events = marker_call_done_events unique_pids = set([e["pid"] for e in marker_done_events]) 
print(f"Unique PIDs: {unique_pids}") @@ -232,9 +235,19 @@ def multiple_iteration_get_metrics_from_trace(trace: dict[str, Any], task: str = min_pid = min([e["pid"] for e in marker_done_events]) events_from_min_pid = [e for e in marker_done_events if e["pid"] == min_pid] - durations_ms = [ - float(e["args"]["device_duration_ps"]) / 1e9 for e in events_from_min_pid - ] + + if multi_op and len(events_from_min_pid) > tries: + if len(events_from_min_pid) % tries != 0: + raise ValueError(f"Number of events {len(events_from_min_pid)} is not a multiple of tries {tries}.") + events_from_min_pid.sort(key=lambda t: t["ts"]) + durations_ms = [] + num_ops = len(events_from_min_pid) // tries + for i in range(0, len(events_from_min_pid), num_ops): + durations_ms.append(sum([float(e["args"]["device_duration_ps"]) / 1e9 for e in events_from_min_pid[i:i+num_ops]])) + else: + durations_ms = [ + float(e["args"]["device_duration_ps"]) / 1e9 for e in events_from_min_pid + ] print(f"Collected {len(durations_ms)} events from trace for pid {min_pid}.") print(durations_ms) @@ -984,6 +997,8 @@ def get_lhs_named_shading(mesh, strategy: ShardingStrategy): return NamedSharding(mesh, P(None, None)) case ShardingStrategy.SHARDING_ON_SINGLE_CHIP_WITH_N: return NamedSharding(mesh, P(None, None)) + case ShardingStrategy.SHARDING_ON_ALL_DEVICES_WITH_K: + return NamedSharding(mesh, P(None, "device")) def get_rhs_named_shading(mesh, strategy: ShardingStrategy): @@ -1000,6 +1015,7 @@ def get_rhs_named_shading(mesh, strategy: ShardingStrategy): return NamedSharding(mesh, P(None, "device")) + def get_out_sharding(strategy: ShardingStrategy): match strategy: case ShardingStrategy.NO_SHARDING: @@ -1014,6 +1030,7 @@ def get_out_sharding(strategy: ShardingStrategy): return P(None, "device") + def get_rowwise_named_shading(mesh, strategy: ShardingStrategy): match strategy: case ShardingStrategy.NO_SHARDING: @@ -1056,6 +1073,7 @@ def handle_per_device_based_on_sharding(value, strategy: ShardingStrategy): return 
value // 2 + def handle_all_devices_based_on_sharding(value: int, strategy: ShardingStrategy): match strategy: case ShardingStrategy.NO_SHARDING: @@ -1070,6 +1088,7 @@ def handle_all_devices_based_on_sharding(value: int, strategy: ShardingStrategy) return value * jax.device_count() // 2 + def handle_based_on_sharding(value: int, strategy: ShardingStrategy): total_value = value value = handle_per_device_based_on_sharding(value, strategy) @@ -1273,16 +1292,19 @@ def get_peak_flops_multiplier(in_dtype_str: str) -> float: (PEAK_FLOPS_PER_DEVICE) based on the input data type. """ in_dtype_lower = in_dtype_str.lower() - if in_dtype_lower == "fp8": + if in_dtype_lower in ("fp8", "float8_e4m3fn"): # FP8 is 2x faster than BF16 # The baseline PEAK_FLOPS_PER_DEVICE is 1153.5 * 2 = 2307, which is FP8 peak. # So the multiplier should be 1.0 return 1.0 - elif in_dtype_lower == "bf16" or in_dtype_lower == "fp16": + elif in_dtype_lower in ("bf16", "bfloat16", "fp16", "float16"): # BF16/FP16 is 2x slower than FP8 peak return 0.5 - elif in_dtype_lower == "fp32": + elif in_dtype_lower in ("fp32", "float32"): # FP32 is 4x slower than FP8 peak return 0.25 + elif in_dtype_lower in ("fp4", "float4_e2m1fn"): + # FP4/INT4 is treated the same as FP8 + return 1.0 else: - raise RuntimeError(f"{in_dtype_lower} is not supported for setting peak_flops_multiplier.") + raise RuntimeError(f"{in_dtype_lower} is not supported for setting peak_flops_multiplier.") \ No newline at end of file diff --git a/Ironwood/src/run_benchmark.py b/Ironwood/src/run_benchmark.py index efef1dcc..9c3f41bb 100644 --- a/Ironwood/src/run_benchmark.py +++ b/Ironwood/src/run_benchmark.py @@ -63,6 +63,8 @@ "gemm_simple": "benchmark_gemm.gemm_simple", "gemm_simple_with_dtype": "benchmark_gemm.gemm_simple_with_dtype", "gemm_multiple_run": "benchmark_gemm.gemm_multiple_run", + "gemm_all_reduce": "benchmark_gemm_all_reduce.gemm_all_reduce", + "gemm_throttling": "benchmark_gemm_throttling.gemm_throttling", "gemm": 
"benchmark_gemm.gemm", "gemm_accum": "benchmark_gemm.gemm_accum", From 9a0b8aecd2fb0cea6d4757b4b621e27d22ccc5b7 Mon Sep 17 00:00:00 2001 From: Hong-Yi Lin Date: Fri, 6 Feb 2026 16:06:49 +0800 Subject: [PATCH 50/88] Add multi-host BMM into automation (#105) * Add multi-host BMM into automation --- Ironwood/configs/bmm/multi_host_bmm.yaml | 75 +++++++++++++++ .../guides/automation/tpu7x-2x2x1-bmm.yaml | 1 + Ironwood/src/benchmark_bmm.py | 93 +++++++++++++++++++ Ironwood/src/run_benchmark.py | 1 + 4 files changed, 170 insertions(+) create mode 100644 Ironwood/configs/bmm/multi_host_bmm.yaml diff --git a/Ironwood/configs/bmm/multi_host_bmm.yaml b/Ironwood/configs/bmm/multi_host_bmm.yaml new file mode 100644 index 00000000..eb332383 --- /dev/null +++ b/Ironwood/configs/bmm/multi_host_bmm.yaml @@ -0,0 +1,75 @@ +benchmarks: +- benchmark_name: "multi_host_bmm" + trace_dir: "../microbenchmarks/multi_host_bmm_bf16" + csv_path: "../microbenchmarks/multi_host_bmm_bf16" + xlml_metrics_dir: "../microbenchmarks/multi_host_bmm_bf16" + xla_dump_dir: "../microbenchmarks/multi_host_bmm_bf16/hlo_graphs" + benchmark_sweep_params: + - {b: 8, m: 128, k: 128, n: 128, num_runs: 100, dtype: 'bfloat16'} + - {b: 8, m: 256, k: 256, n: 256, num_runs: 100, dtype: 'bfloat16'} + - {b: 8, m: 512, k: 512, n: 512, num_runs: 100, dtype: 'bfloat16'} + - {b: 8, m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'bfloat16'} + - {b: 8, m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'bfloat16'} + - {b: 8, m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'bfloat16'} + - {b: 8, m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'bfloat16'} + - {b: 8, m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'bfloat16'} + +- benchmark_name: "multi_host_bmm" + trace_dir: "../microbenchmarks/multi_host_bmm_f32" + csv_path: "../microbenchmarks/multi_host_bmm_f32" + xlml_metrics_dir: "../microbenchmarks/multi_host_bmm_f32" + xla_dump_dir: "../microbenchmarks/multi_host_bmm_f32/hlo_graphs" + benchmark_sweep_params: + - 
{b: 8, m: 128, k: 128, n: 128, num_runs: 100, dtype: 'float32'} + - {b: 8, m: 256, k: 256, n: 256, num_runs: 100, dtype: 'float32'} + - {b: 8, m: 512, k: 512, n: 512, num_runs: 100, dtype: 'float32'} + - {b: 8, m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'float32'} + - {b: 8, m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'float32'} + - {b: 8, m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'float32'} + - {b: 8, m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float32'} + - {b: 8, m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float32'} + +- benchmark_name: "multi_host_bmm" + trace_dir: "../microbenchmarks/multi_host_bmm_fp16" + csv_path: "../microbenchmarks/multi_host_bmm_fp16" + xlml_metrics_dir: "../microbenchmarks/multi_host_bmm_fp16" + xla_dump_dir: "../microbenchmarks/multi_host_bmm_fp16/hlo_graphs" + benchmark_sweep_params: + - {b: 8, m: 128, k: 128, n: 128, num_runs: 100, dtype: 'float16'} + - {b: 8, m: 256, k: 256, n: 256, num_runs: 100, dtype: 'float16'} + - {b: 8, m: 512, k: 512, n: 512, num_runs: 100, dtype: 'float16'} + - {b: 8, m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'float16'} + - {b: 8, m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'float16'} + - {b: 8, m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'float16'} + - {b: 8, m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float16'} + - {b: 8, m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float16'} + +- benchmark_name: "multi_host_bmm" + trace_dir: "../microbenchmarks/multi_host_bmm_fp8" + csv_path: "../microbenchmarks/multi_host_bmm_fp8" + xlml_metrics_dir: "../microbenchmarks/multi_host_bmm_fp8" + xla_dump_dir: "../microbenchmarks/multi_host_bmm_fp8/hlo_graphs" + benchmark_sweep_params: + - {b: 8, m: 128, k: 128, n: 128, num_runs: 100, dtype: 'float8'} + - {b: 8, m: 256, k: 256, n: 256, num_runs: 100, dtype: 'float8'} + - {b: 8, m: 512, k: 512, n: 512, num_runs: 100, dtype: 'float8'} + - {b: 8, m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'float8'} + - 
{b: 8, m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'float8'} + - {b: 8, m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'float8'} + - {b: 8, m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float8'} + - {b: 8, m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float8'} + +- benchmark_name: "multi_host_bmm" + trace_dir: "../microbenchmarks/multi_host_bmm_fp4" + csv_path: "../microbenchmarks/multi_host_bmm_fp4" + xlml_metrics_dir: "../microbenchmarks/multi_host_bmm_fp4" + xla_dump_dir: "../microbenchmarks/multi_host_bmm_fp4/hlo_graphs" + benchmark_sweep_params: + - {b: 8, m: 128, k: 128, n: 128, num_runs: 100, dtype: 'float4'} + - {b: 8, m: 256, k: 256, n: 256, num_runs: 100, dtype: 'float4'} + - {b: 8, m: 512, k: 512, n: 512, num_runs: 100, dtype: 'float4'} + - {b: 8, m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'float4'} + - {b: 8, m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'float4'} + - {b: 8, m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'float4'} + - {b: 8, m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float4'} + - {b: 8, m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float4'} \ No newline at end of file diff --git a/Ironwood/guides/automation/tpu7x-2x2x1-bmm.yaml b/Ironwood/guides/automation/tpu7x-2x2x1-bmm.yaml index 1b5b9774..6257acfc 100644 --- a/Ironwood/guides/automation/tpu7x-2x2x1-bmm.yaml +++ b/Ironwood/guides/automation/tpu7x-2x2x1-bmm.yaml @@ -54,6 +54,7 @@ spec: GCS_BUCKET_DIR=${GCS_PATH} python Ironwood/src/run_benchmark.py --config="Ironwood/configs/bmm/single_device_bmm.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/bmm/multi_host_bmm.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} resources: requests: google.com/tpu: 4 diff --git a/Ironwood/src/benchmark_bmm.py b/Ironwood/src/benchmark_bmm.py index 264daf2b..f988008e 100644 --- a/Ironwood/src/benchmark_bmm.py +++ b/Ironwood/src/benchmark_bmm.py @@ -134,3 +134,96 @@ def 
single_device_bmm_calculate_metrics( dtype=dtype.dtype.name, b=b, ) + + +def multi_host_bmm( + b: int, + m: int, + k: int, + n: int, + dtype: jnp.dtype = jax.numpy.float8_e4m3fn, + num_runs: int = 1, + trace_dir: str = None, + sharding_strategy: ShardingStrategy = ShardingStrategy.NO_SHARDING, +) -> Dict[str, Any]: + """Benchmarks multi-host bmm.""" + mesh = create_mesh(sharding_strategy) + + lhs_sharding = get_lhs_named_shading(mesh, sharding_strategy) + rhs_sharding = get_rhs_named_shading(mesh, sharding_strategy) + output_sharding = get_output_named_shading(mesh, sharding_strategy) + + def f(x, y): + with jax.named_scope(MARKER): + acc = jax.numpy.einsum( + "bij,bjk->bik", x, y, preferred_element_type=jnp.float32 + ) + return acc.astype(jnp.bfloat16) + + jit_sharded_f = jax.jit( + f, + in_shardings=(lhs_sharding, rhs_sharding), + out_shardings=output_sharding, + ) + + lhs_shape = (b, m, k) + rhs_shape = (b, k, n) + + lhs_dtype = dtype + rhs_dtype = dtype + + key = jax.random.key(SEED) + + def data_generator(): + """Creates new random data on host and puts it on device.""" + nonlocal key # Use and update the outer 'key' + key, key_lhs, key_rhs = jax.random.split(key, 3) + + # Create random data on host + lhs_host = jax.random.normal(key_lhs, lhs_shape).astype(lhs_dtype) + rhs_host = jax.random.normal(key_rhs, rhs_shape).astype(rhs_dtype) + + # Put on device (HBM) with sharding + lhs = jax.device_put(lhs_host, lhs_sharding) + rhs = jax.device_put(rhs_host, rhs_sharding) + + return (lhs, rhs) + + dtype_str = dtype.dtype.name + time_ms_list = multiple_iteration_timeit_from_trace( + jit_sharded_f, + data_generator, + matrix_dim=f"{dtype_str}_{b}x{m}x{n}x{k}", + tries=num_runs, + task="multi_host_bmm", + trace_dir=trace_dir, + ) + + return {"time_ms_list": time_ms_list} + + +def multi_host_bmm_calculate_metrics( + b: int, + m: int, + k: int, + n: int, + dtype: jnp.dtype, + time_ms_list: list[float], + sharding_strategy: ShardingStrategy = ShardingStrategy.NO_SHARDING 
+) -> Dict[str, Any]: + # Calculate FLOPs + total_flops = 2 * b * m * k * n # Total floating-point operations + total_flops, total_flops_all_devices = handle_based_on_sharding( + total_flops, sharding_strategy + ) + return unified_flops_metrics( + m, + n, + k, + time_ms_list, + total_flops, + total_flops_all_devices, + PEAK_FLOPS_PER_DEVICE, + dtype=dtype.dtype.name, + b=b, + ) diff --git a/Ironwood/src/run_benchmark.py b/Ironwood/src/run_benchmark.py index 9c3f41bb..31338d21 100644 --- a/Ironwood/src/run_benchmark.py +++ b/Ironwood/src/run_benchmark.py @@ -34,6 +34,7 @@ BMM_BENCHMARK_MAP = { "single_device_bmm": "benchmark_bmm.single_device_bmm", + "multi_host_bmm": "benchmark_bmm.multi_host_bmm" } MATMUL_BENCHMARK_MAP = { From 2d45945df4bef612334c0c69f99739fc995bbe57 Mon Sep 17 00:00:00 2001 From: Leonard Chan Date: Fri, 6 Feb 2026 02:25:11 +0000 Subject: [PATCH 51/88] Update pipelined flow with optimized approach --- Ironwood/src/benchmark_host_device.py | 106 ++++++++++++++++++++------ 1 file changed, 84 insertions(+), 22 deletions(-) diff --git a/Ironwood/src/benchmark_host_device.py b/Ironwood/src/benchmark_host_device.py index 16352e2a..ba1be121 100644 --- a/Ironwood/src/benchmark_host_device.py +++ b/Ironwood/src/benchmark_host_device.py @@ -21,6 +21,7 @@ def benchmark_host_device( data_size_mib: int, + h2d_type: str = "simple", num_runs: int = 100, trace_dir: str = None, ) -> Dict[str, Any]: @@ -65,29 +66,89 @@ def benchmark_host_device( with step_context: # H2D - t0 = time.perf_counter() + if h2d_type == "simple": + t0 = time.perf_counter() + # Simple device_put + device_array = jax.device_put(host_data) + device_array.block_until_ready() + t1 = time.perf_counter() + + # Verify H2D shape + assert device_array.shape == host_data.shape + h2d_perf.append((t1 - t0) * 1000) - # Simple device_put - device_array = jax.device_put(host_data) - device_array.block_until_ready() - - t1 = time.perf_counter() - h2d_perf.append((t1 - t0) * 1000) - - # Verify H2D shape - 
assert device_array.shape == host_data.shape - - # D2H - t2 = time.perf_counter() - - # Simple device_get - # Note: device_get returns a numpy array (copy) - _ = jax.device_get(device_array) - - t3 = time.perf_counter() - d2h_perf.append((t3 - t2) * 1000) - - device_array.delete() + # D2H + t2 = time.perf_counter() + # Simple device_get + # Note: device_get returns a numpy array (copy) + _ = jax.device_get(device_array) + t3 = time.perf_counter() + d2h_perf.append((t3 - t2) * 1000) + + device_array.delete() + elif h2d_type == "pipelined": + target_chunk_size_mib = 16 # Sweet spot from profiling + num_devices = len(target_devices) + + tensors_on_device = [] + + # Calculate chunks per device + data_per_dev = data_size_mib / num_devices + chunks_per_dev = int(data_per_dev / target_chunk_size_mib) + chunks_per_dev = max(1, chunks_per_dev) + + chunks = np.array_split(host_data, chunks_per_dev * num_devices, axis=0) + if chunks_per_dev > 1: + t0 = time.perf_counter() + # We need to map chunks to the correct device + # This simple example assumes chunks are perfectly divisible and ordered + # In production, use `jax.sharding` mesh logic for complex layouts + for idx, chunk in enumerate(chunks): + if num_devices > 1: + dev = target_devices[idx % num_devices] + else: + dev = target_devices[0] + tensors_on_device.append(jax.device_put(chunk, dev)) + for device_tensor in tensors_on_device: + device_tensor.block_until_ready() + t1 = time.perf_counter() + h2d_perf.append((t1 - t0) * 1000) + del chunks + + # D2H + tensor_stack = jnp.vstack(tensors_on_device) + + t2 = time.perf_counter() + _ = jax.device_get(tensor_stack) + t3 = time.perf_counter() + + d2h_perf.append((t3 - t2) * 1000) + tensor_stack.delete() + for device_tensor in tensors_on_device: + device_tensor.delete() + del tensors_on_device + else: + t0 = time.perf_counter() + + print(f"Warning: {data_size_mib=} is not larger than {target_chunk_size_mib=}, falling back to standard JAX put.") + # Fallback to standard JAX 
put for small data + result = jax.device_put(host_data, target_devices[0]) + result.block_until_ready() + + t1 = time.perf_counter() + h2d_perf.append((t1 - t0) * 1000) + + # D2H + t2 = time.perf_counter() + # Simple device_get + # Note: device_get returns a numpy array (copy) + _ = jax.device_get(result) + + t3 = time.perf_counter() + d2h_perf.append((t3 - t2) * 1000) + result.delete() + + jax.clear_caches() return { "H2D_Bandwidth_ms": h2d_perf, @@ -108,6 +169,7 @@ def benchmark_host_device_calculate_metrics( } metadata = {k: v for k, v in params if k in metadata_keys} metadata["dtype"] = "float32" + metadata["h2d_type"] = h2d_type metrics = {} From 3d82d5b11940840720ffbf3208cac708604f34e5 Mon Sep 17 00:00:00 2001 From: Leonard Chan Date: Fri, 6 Feb 2026 08:09:39 +0000 Subject: [PATCH 52/88] Add missing h2d_type to H2D metrics --- Ironwood/guides/automation/aggregator.py | 2 +- Ironwood/src/benchmark_host_device.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Ironwood/guides/automation/aggregator.py b/Ironwood/guides/automation/aggregator.py index bd291cfc..97235665 100644 --- a/Ironwood/guides/automation/aggregator.py +++ b/Ironwood/guides/automation/aggregator.py @@ -16,7 +16,7 @@ "time_ms_p50", "time_ms_p90", "time_ms_p95", "time_ms_p99", "time_ms_avg", "time_ms_min", "time_ms_max", ], "host_device": [ - "data_size_mib", "H2D_bw (GiB/s)_num_runs", + "data_size_mib", "h2d_type", "H2D_bw (GiB/s)_num_runs", "H2D_bw (GiB/s)_p50", "H2D_bw (GiB/s)_p90", "H2D_bw (GiB/s)_p95", "H2D_bw (GiB/s)_p99", "H2D_bw (GiB/s)_avg", "H2D_bw (GiB/s)_min", "H2D_bw (GiB/s)_max", "D2H_bw (GiB/s)_p50", "D2H_bw (GiB/s)_p90", "D2H_bw (GiB/s)_p95", "D2H_bw (GiB/s)_p99", diff --git a/Ironwood/src/benchmark_host_device.py b/Ironwood/src/benchmark_host_device.py index ba1be121..4e4e0a63 100644 --- a/Ironwood/src/benchmark_host_device.py +++ b/Ironwood/src/benchmark_host_device.py @@ -157,6 +157,7 @@ def benchmark_host_device( def benchmark_host_device_calculate_metrics( 
data_size_mib: int, + h2d_type: str, H2D_Bandwidth_ms: List[float], D2H_Bandwidth_ms: List[float], ) -> Tuple[Dict[str, Any], Dict[str, Any]]: From adc084d6462540a16c96531ca23113a36ed5e88c Mon Sep 17 00:00:00 2001 From: "Yu-Hsuan (Amy) Lin" Date: Fri, 6 Feb 2026 09:02:42 +0000 Subject: [PATCH 53/88] Revert unintended commit This reverts commit 3e4b59a0d8a5e87bea51f8c934895a4c9fda3ac5. --- Ironwood/configs/collectives/all_gather_1d.yaml | 3 +-- Ironwood/configs/collectives/all_gather_2d.yaml | 3 +-- Ironwood/configs/collectives/all_gather_3d.yaml | 3 +-- Ironwood/configs/collectives/all_gather_demo.yaml | 10 +++------- .../configs/collectives/all_gather_tpu7x_2x2x1.yaml | 3 +-- .../configs/collectives/all_gather_tpu7x_2x2x2.yaml | 3 +-- .../configs/collectives/all_gather_tpu7x_2x2x4.yaml | 3 +-- .../configs/collectives/all_gather_tpu7x_2x4x4.yaml | 3 +-- .../configs/collectives/all_gather_tpu7x_4x4x4.yaml | 5 ++--- .../configs/collectives/all_gather_tpu7x_4x4x8.yaml | 3 +-- Ironwood/configs/collectives/all_reduce_1d.yaml | 3 +-- Ironwood/configs/collectives/all_reduce_2d.yaml | 3 +-- Ironwood/configs/collectives/all_reduce_3d.yaml | 3 +-- .../configs/collectives/all_reduce_tpu7x_2x2x1.yaml | 3 +-- .../configs/collectives/all_reduce_tpu7x_2x2x2.yaml | 3 +-- .../configs/collectives/all_reduce_tpu7x_2x2x4.yaml | 3 +-- .../configs/collectives/all_reduce_tpu7x_2x4x4.yaml | 3 +-- .../configs/collectives/all_reduce_tpu7x_4x4x4.yaml | 3 +-- .../configs/collectives/all_reduce_tpu7x_4x4x8.yaml | 3 +-- Ironwood/configs/collectives/all_to_all_1d.yaml | 2 +- Ironwood/configs/collectives/all_to_all_2d.yaml | 2 +- Ironwood/configs/collectives/all_to_all_3d.yaml | 2 +- .../configs/collectives/all_to_all_tpu7x_2x2x1.yaml | 3 +-- .../configs/collectives/all_to_all_tpu7x_2x2x2.yaml | 3 +-- .../configs/collectives/all_to_all_tpu7x_2x2x4.yaml | 3 +-- .../configs/collectives/all_to_all_tpu7x_2x4x4.yaml | 3 +-- .../configs/collectives/all_to_all_tpu7x_4x4x4.yaml | 3 +-- 
.../configs/collectives/all_to_all_tpu7x_4x4x8.yaml | 3 +-- Ironwood/configs/collectives/reduce_scatter_1d.yaml | 3 +-- Ironwood/configs/collectives/reduce_scatter_2d.yaml | 3 +-- 30 files changed, 33 insertions(+), 63 deletions(-) diff --git a/Ironwood/configs/collectives/all_gather_1d.yaml b/Ironwood/configs/collectives/all_gather_1d.yaml index 85d8fc3e..0b1313dc 100644 --- a/Ironwood/configs/collectives/all_gather_1d.yaml +++ b/Ironwood/configs/collectives/all_gather_1d.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "1x1x4", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/all_gather_1d" csv_path: "../microbenchmarks/all_gather_1d" xlml_metrics_dir: "../microbenchmarks/all_gather_1d" diff --git a/Ironwood/configs/collectives/all_gather_2d.yaml b/Ironwood/configs/collectives/all_gather_2d.yaml index 2d7a0e7a..c45f3e70 100644 --- a/Ironwood/configs/collectives/all_gather_2d.yaml +++ b/Ironwood/configs/collectives/all_gather_2d.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2, num_runs: 5} # Parallel Replica Groups - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x32", ici_size_range: 128, sharding_strategy: "1x32", op_dimension: 2, 
num_runs: 5} # Non Parallel Replica Groups + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2, num_runs: 5} trace_dir: "../microbenchmarks/all_gather_2d" csv_path: "../microbenchmarks/all_gather_2d" xlml_metrics_dir: "../microbenchmarks/all_gather_2d" diff --git a/Ironwood/configs/collectives/all_gather_3d.yaml b/Ironwood/configs/collectives/all_gather_3d.yaml index cc876a08..e159adfd 100644 --- a/Ironwood/configs/collectives/all_gather_3d.yaml +++ b/Ironwood/configs/collectives/all_gather_3d.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "64x2", ici_size_range: 128, sharding_strategy: "64x1", op_dimension: 3, num_runs: 5} # Parallel Replica Groups - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "4x4x8", op_dimension: 3, num_runs: 5} # Non Parallel Replica Groups + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "64x2", ici_size_range: 128, sharding_strategy: "64x1", op_dimension: 3, num_runs: 5} trace_dir: "../microbenchmarks/all_gather_3d" csv_path: "../microbenchmarks/all_gather_3d" xlml_metrics_dir: "../microbenchmarks/all_gather_3d" diff --git a/Ironwood/configs/collectives/all_gather_demo.yaml b/Ironwood/configs/collectives/all_gather_demo.yaml index 6fb5a757..a9d776cd 100644 --- a/Ironwood/configs/collectives/all_gather_demo.yaml +++ b/Ironwood/configs/collectives/all_gather_demo.yaml @@ -1,13 +1,9 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1} # Parallel Replica - - {matrix_dim_range: {start: 2, end: 
512, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "1x1x4", op_dimension: 1} # Non-Parallel Replica - - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32", mesh_shape: "4x32", ici_size_range: 128, sharding_strategy: "1x32", op_dimension: 2} # Non Parallel Replica Groups - - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32", mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2} # Parallel Replica Groups - - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32", mesh_shape: "64x2", ici_size_range: 128, sharding_strategy: "64x1", op_dimension: 3} # Parallel Replica Groups - - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "4x4x8", op_dimension: 3} # Non Parallel Replica Groups - + - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1} + - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32", mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2} + - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32", mesh_shape: "64x2", ici_size_range: 128, sharding_strategy: "64x1", op_dimension: 3} warmup_tries: 10 trace_dir: "../microbenchmarks/all_gather_demo" diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml index b0858716..5b11ac8d 100644 --- a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml +++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", 
op_dimension: 1, num_runs: 20} # Parallel Replica - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x2", op_dimension: 1, num_runs: 20} # Non-Parallel Replica + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 20} trace_dir: "../microbenchmarks/all_gather_tpu7x_2x2x1" csv_path: "../microbenchmarks/all_gather_tpu7x_2x2x1" xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x2x1" diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml index ab282dec..3747b754 100644 --- a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml +++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 20} # Parallel Replica - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x2", op_dimension: 1, num_runs: 20} # Non-Parallel Replica + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 20} trace_dir: "../microbenchmarks/all_gather_tpu7x_2x2x2" csv_path: "../microbenchmarks/all_gather_tpu7x_2x2x2" xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x2x2" diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml index d2f65afe..9c25eb6b 100644 --- a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml +++ 
b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 20} # Parallel Replica - - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x2", op_dimension: 1, num_runs: 20} # Non-Parallel Replica + - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 20} trace_dir: "../microbenchmarks/all_gather_tpu7x_2x2x4" csv_path: "../microbenchmarks/all_gather_tpu7x_2x2x4" xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x2x4" diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml index 35414ff3..0ad03f56 100644 --- a/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml +++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 20} # Parallel Replica - - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x2", op_dimension: 1, num_runs: 20} # Non-Parallel Replica + - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 20} trace_dir: "../microbenchmarks/all_gather_tpu7x_2x4x4" csv_path: "../microbenchmarks/all_gather_tpu7x_2x4x4" xlml_metrics_dir: 
"../microbenchmarks/all_gather_tpu7x_2x4x4" diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml index b561942d..ceb7bb52 100644 --- a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml +++ b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml @@ -1,9 +1,8 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 20} # Parallel Replica - - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x2", op_dimension: 1, num_runs: 20} # Non-Parallel Replica + - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 20} trace_dir: "../microbenchmarks/all_gather_tpu7x_4x4x4" csv_path: "../microbenchmarks/all_gather_tpu7x_4x4x4" xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_4x4x4" - xla_dump_dir: "../microbenchmarks/all_gather_tpu7x_4x4x4/hlo_graphs" \ No newline at end of file + xla_dump_dir: "../microbenchmarks/all_gather_tpu7x_4x4x4/hlo_graphs" diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml index 5838cafd..0218d6b0 100644 --- a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml +++ b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 20} # Parallel Replica - - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: 
"float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x2", op_dimension: 1, num_runs: 20} # Non-Parallel Replica + - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 20} trace_dir: "../microbenchmarks/all_gather_tpu7x_4x4x8" csv_path: "../microbenchmarks/all_gather_tpu7x_4x4x8" xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_4x4x8" diff --git a/Ironwood/configs/collectives/all_reduce_1d.yaml b/Ironwood/configs/collectives/all_reduce_1d.yaml index 7b1d3068..d12d4221 100644 --- a/Ironwood/configs/collectives/all_reduce_1d.yaml +++ b/Ironwood/configs/collectives/all_reduce_1d.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1" , op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "1x1x4", op_dimension: 1, num_runs: 5} # Non Parallel Replica + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1" , op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/all_reduce_1d" csv_path: "../microbenchmarks/all_reduce_1d" xlml_metrics_dir: "../microbenchmarks/all_reduce_1d" diff --git a/Ironwood/configs/collectives/all_reduce_2d.yaml b/Ironwood/configs/collectives/all_reduce_2d.yaml index 93e1a7c9..5aa9654e 100644 --- a/Ironwood/configs/collectives/all_reduce_2d.yaml +++ b/Ironwood/configs/collectives/all_reduce_2d.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x16x2", ici_size_range: 128, 
sharding_strategy: "1x16x1", op_dimension: 2, num_runs: 5} # Parallel Replica Groups - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x32", ici_size_range: 128, sharding_strategy: "1x32", op_dimension: 2, num_runs: 5} # Non Parallel Replica Groups + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2, num_runs: 5} trace_dir: "../microbenchmarks/all_reduce_2d" csv_path: "../microbenchmarks/all_reduce_2d" xlml_metrics_dir: "../microbenchmarks/all_reduce_2d" diff --git a/Ironwood/configs/collectives/all_reduce_3d.yaml b/Ironwood/configs/collectives/all_reduce_3d.yaml index f6a4ad9d..4e76b55f 100644 --- a/Ironwood/configs/collectives/all_reduce_3d.yaml +++ b/Ironwood/configs/collectives/all_reduce_3d.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "64x2", ici_size_range: 128, sharding_strategy: "64x1", op_dimension: 3, num_runs: 5} # Parallel Replica Groups - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "4x4x8", op_dimension: 3, num_runs: 5} # Non Parallel Replica Groups + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "64x2", ici_size_range: 128, sharding_strategy: "64x1", op_dimension: 3, num_runs: 5} trace_dir: "../microbenchmarks/all_reduce_3d" csv_path: "../microbenchmarks/all_reduce_3d" xlml_metrics_dir: "../microbenchmarks/all_reduce_3d" diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml index b713c549..dbeb0407 100644 --- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml +++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml @@ -1,8 +1,7 @@ benchmarks: - 
benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 20} # Parallel Replica - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x2", op_dimension: 1, num_runs: 20} # Non-Parallel Replica + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 20} trace_dir: "../microbenchmarks/psum_tpu7x_2x2x1" csv_path: "../microbenchmarks/psum_tpu7x_2x2x1" xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x2x1" diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml index 165e0e72..cca20bc2 100644 --- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml +++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 20} # Parallel Replica - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x2", op_dimension: 1, num_runs: 20} # Non-Parallel Replica + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 20} trace_dir: "../microbenchmarks/psum_tpu7x_2x2x2" csv_path: "../microbenchmarks/psum_tpu7x_2x2x2" xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x2x2" diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml 
b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml index 0002ae3a..1cb29b11 100644 --- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml +++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 20} # Parallel Replica - - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x2", op_dimension: 1, num_runs: 20} # Non-Parallel Replica + - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 20} trace_dir: "../microbenchmarks/psum_tpu7x_2x2x4" csv_path: "../microbenchmarks/psum_tpu7x_2x2x4" xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x2x4" diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml index e5652a92..8366350a 100644 --- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml +++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 20} # Parallel Replica - - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x2", op_dimension: 1, num_runs: 20} # Non-Parallel Replica + - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 20} trace_dir: 
"../microbenchmarks/psum_tpu7x_2x4x4" csv_path: "../microbenchmarks/psum_tpu7x_2x4x4" xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x4x4" diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml index 137bae19..93deef9f 100644 --- a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml +++ b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 20} # Parallel Replica - - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x2", op_dimension: 1, num_runs: 20} # Non-Parallel Replica + - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 20} trace_dir: "../microbenchmarks/psum_tpu7x_4x4x4" csv_path: "../microbenchmarks/psum_tpu7x_4x4x4" xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_4x4x4" diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml index 25758453..7b629828 100644 --- a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml +++ b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 20} # Parallel Replica - - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x2", op_dimension: 1, num_runs: 
20} # Non-Parallel Replica + - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 20} trace_dir: "../microbenchmarks/psum_tpu7x_4x4x8" csv_path: "../microbenchmarks/psum_tpu7x_4x4x8" xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_4x4x8" diff --git a/Ironwood/configs/collectives/all_to_all_1d.yaml b/Ironwood/configs/collectives/all_to_all_1d.yaml index 3c28194d..8d222613 100644 --- a/Ironwood/configs/collectives/all_to_all_1d.yaml +++ b/Ironwood/configs/collectives/all_to_all_1d.yaml @@ -1,7 +1,7 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "1x1x4", op_dimension: 1, num_runs: 5} # Non Parallel Replica + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "1x1x4", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/all_to_all_1d" csv_path: "../microbenchmarks/all_to_all_1d" xlml_metrics_dir: "../microbenchmarks/all_to_all_1d" diff --git a/Ironwood/configs/collectives/all_to_all_2d.yaml b/Ironwood/configs/collectives/all_to_all_2d.yaml index b4a1bc0e..d23115fe 100644 --- a/Ironwood/configs/collectives/all_to_all_2d.yaml +++ b/Ironwood/configs/collectives/all_to_all_2d.yaml @@ -1,7 +1,7 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x32", ici_size_range: 128, sharding_strategy: "1x32", op_dimension: 2, num_runs: 5} # Non Parallel Replica + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x32", ici_size_range: 128, sharding_strategy: "1x32", op_dimension: 2, num_runs: 5} trace_dir: "../microbenchmarks/all_to_all_2d" csv_path: 
"../microbenchmarks/all_to_all_2d" xlml_metrics_dir: "../microbenchmarks/all_to_all_2d" diff --git a/Ironwood/configs/collectives/all_to_all_3d.yaml b/Ironwood/configs/collectives/all_to_all_3d.yaml index 3aa0e2a7..c705754c 100644 --- a/Ironwood/configs/collectives/all_to_all_3d.yaml +++ b/Ironwood/configs/collectives/all_to_all_3d.yaml @@ -1,7 +1,7 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "4x4x8", op_dimension: 3, num_runs: 5} # Non Parallel Replica Groups + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "4x4x8", op_dimension: 3, num_runs: 5} trace_dir: "../microbenchmarks/all_to_all_3d" csv_path: "../microbenchmarks/all_to_all_3d" xlml_metrics_dir: "../microbenchmarks/all_to_all_3d" diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml index a30a17cf..42dcf9e1 100644 --- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml +++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 20} # Parallel Replica - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x2", op_dimension: 1, num_runs: 20} # Non-Parallel Replica + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 20} trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x1" csv_path: 
"../microbenchmarks/all_to_all_tpu7x_2x2x1" xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x1" diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml index 01fb9b80..5b1bbb82 100644 --- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml +++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 20} # Parallel Replica - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x2", op_dimension: 1, num_runs: 20} # Non-Parallel Replica + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 20} trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x2" csv_path: "../microbenchmarks/all_to_all_tpu7x_2x2x2" xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x2" diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml index 43beeb27..f6004ce2 100644 --- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml +++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 20} # Parallel Replica - - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x2", op_dimension: 1, num_runs: 20} # Non-Parallel Replica 
+ - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 20} trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x4" csv_path: "../microbenchmarks/all_to_all_tpu7x_2x2x4" xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x4" diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml index 614caa6b..a0e16a92 100644 --- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml +++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 20} # Parallel Replica - - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x2", op_dimension: 1, num_runs: 20} # Non-Parallel Replica + - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 20} trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x4x4" csv_path: "../microbenchmarks/all_to_all_tpu7x_2x4x4" xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x4x4" diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml index 12dd149d..5b627a9a 100644 --- a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml +++ b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", 
op_dimension: 1, num_runs: 20} # Parallel Replica - - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x2", op_dimension: 1, num_runs: 20} # Non-Parallel Replica + - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 20} trace_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x4" csv_path: "../microbenchmarks/all_to_all_tpu7x_4x4x4" xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x4" diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml index 32c63d74..65742f12 100644 --- a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml +++ b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 20} # Parallel Replica - - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x2", op_dimension: 1, num_runs: 20} # Non-Parallel Replica + - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 20} trace_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x8" csv_path: "../microbenchmarks/all_to_all_tpu7x_4x4x8" xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x8" diff --git a/Ironwood/configs/collectives/reduce_scatter_1d.yaml b/Ironwood/configs/collectives/reduce_scatter_1d.yaml index 9c2c0dea..063d73fc 100644 --- a/Ironwood/configs/collectives/reduce_scatter_1d.yaml +++ 
b/Ironwood/configs/collectives/reduce_scatter_1d.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: psum_scatter benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "1x1x4", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/reduce_scatter_1d" csv_path: "../microbenchmarks/reduce_scatter_1d" xlml_metrics_dir: "../microbenchmarks/reduce_scatter_1d" diff --git a/Ironwood/configs/collectives/reduce_scatter_2d.yaml b/Ironwood/configs/collectives/reduce_scatter_2d.yaml index f329b571..027ac991 100644 --- a/Ironwood/configs/collectives/reduce_scatter_2d.yaml +++ b/Ironwood/configs/collectives/reduce_scatter_2d.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: psum_scatter benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2, num_runs: 5} # Parallel Replica Groups - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x32", ici_size_range: 128, sharding_strategy: "1x32", op_dimension: 2, num_runs: 5} # Non Parallel Replica Groups + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2, num_runs: 5} trace_dir: "../microbenchmarks/reduce_scatter_2d" csv_path: "../microbenchmarks/reduce_scatter_2d" xlml_metrics_dir: "../microbenchmarks/reduce_scatter_2d" From 
16c614714c12076d1fef29045bd81cccd622fa23 Mon Sep 17 00:00:00 2001 From: Leonard Chan Date: Fri, 6 Feb 2026 09:03:51 +0000 Subject: [PATCH 54/88] Remove 32768 data_size_mib from H2D YAML --- Ironwood/configs/host_device/host_device.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Ironwood/configs/host_device/host_device.yaml b/Ironwood/configs/host_device/host_device.yaml index 0b48800c..b703ea96 100644 --- a/Ironwood/configs/host_device/host_device.yaml +++ b/Ironwood/configs/host_device/host_device.yaml @@ -3,7 +3,7 @@ benchmarks: num_runs: 20 benchmark_sweep_params: - { - data_size_mib_list: [1, 16, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768] + data_size_mib_list: [1, 16, 128, 256, 512, 1024, 2048, 4096, 8192, 16384] } csv_path: "../microbenchmarks/host_device" trace_dir: "../microbenchmarks/host_device/trace" From 8c3be375dc6f090dd9492dcb206b333c2d454d3b Mon Sep 17 00:00:00 2001 From: Leonard Chan Date: Mon, 9 Feb 2026 17:32:43 +0800 Subject: [PATCH 55/88] Fix inadvertent removal of target_devices (#108) --- Ironwood/src/benchmark_host_device.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/Ironwood/src/benchmark_host_device.py b/Ironwood/src/benchmark_host_device.py index 4e4e0a63..67eb980b 100644 --- a/Ironwood/src/benchmark_host_device.py +++ b/Ironwood/src/benchmark_host_device.py @@ -5,7 +5,7 @@ from typing import Any, Dict, Tuple, List import jax -from jax import sharding +from jax import numpy as jnp import numpy as np from benchmark_utils import MetricsStatistics @@ -25,7 +25,7 @@ def benchmark_host_device( num_runs: int = 100, trace_dir: str = None, ) -> Dict[str, Any]: - """Benchmarks H2D/D2H transfer using simple device_put/device_get.""" + """Benchmarks H2D/D2H transfer using device_put/device_get.""" num_elements = 1024 * 1024 * data_size_mib // np.dtype(np.float32).itemsize @@ -33,8 +33,13 @@ def benchmark_host_device( column = 128 host_data = np.random.normal(size=(num_elements // 
column, column)).astype(np.float32) + # Used in pipelined flow + # TODO: turn into a param + num_devices_to_perform_h2d = 1 + target_devices = jax.devices()[:num_devices_to_perform_h2d] + print( - f"Benchmarking Transfer with Data Size: {data_size_mib} MB for {num_runs} iterations", + f"Benchmarking Transfer with Data Size: {data_size_mib} MB for {num_runs} iterations with {h2d_type=}", flush=True ) From b6bd6ae9dce456f1575daab674ff4bab10e46029 Mon Sep 17 00:00:00 2001 From: Hong-Yi Lin Date: Tue, 10 Feb 2026 07:05:21 +0000 Subject: [PATCH 56/88] Add log to show best hyperparameters after tuning --- Ironwood/configs/attention/attention.yaml | 12 ++++++++++++ Ironwood/src/benchmark_attention.py | 10 ++++++++++ 2 files changed, 22 insertions(+) create mode 100644 Ironwood/configs/attention/attention.yaml diff --git a/Ironwood/configs/attention/attention.yaml b/Ironwood/configs/attention/attention.yaml new file mode 100644 index 00000000..92c19dd8 --- /dev/null +++ b/Ironwood/configs/attention/attention.yaml @@ -0,0 +1,12 @@ +benchmarks: +- benchmark_name: "tokamax_splash_attention" + benchmark_sweep_params: + - {batch_size: 1, q_seq_len: 4096, kv_seq_len: 4096, q_heads: 128, kv_heads: 8, qk_head_dim: 256, v_head_dim: 256, mode: ["fwd", "bwd"], causal: [true, false], num_samples: 3500, tune_pallas_only: true} + - {batch_size: 1, q_seq_len: 8192, kv_seq_len: 8192, q_heads: 128, kv_heads: 8, qk_head_dim: 256, v_head_dim: 256, mode: ["fwd", "bwd"], causal: [true, false], num_samples: 3500, tune_pallas_only: true} + - {batch_size: 1, q_seq_len: 16384, kv_seq_len: 16384, q_heads: 128, kv_heads: 8, qk_head_dim: 256, v_head_dim: 256, mode: ["fwd", "bwd"], causal: [true, false], num_samples: 3500, tune_pallas_only: true} + - {batch_size: 1, q_seq_len: 32768, kv_seq_len: 32768, q_heads: 128, kv_heads: 8, qk_head_dim: 256, v_head_dim: 256, mode: ["fwd", "bwd"], causal: [true, false], num_samples: 3500, tune_pallas_only: true} + - {batch_size: 1, q_seq_len: 65536, kv_seq_len: 
65536, q_heads: 128, kv_heads: 8, qk_head_dim: 256, v_head_dim: 256, mode: ["fwd", "bwd"], causal: [true, false], num_samples: 3500, tune_pallas_only: true} + - {batch_size: 1, q_seq_len: 131072, kv_seq_len: 131072, q_heads: 128, kv_heads: 8, qk_head_dim: 256, v_head_dim: 256, mode: ["fwd", "bwd"], causal: [true, false], num_samples: 3500, tune_pallas_only: true} + trace_dir: "../microbenchmarks/attention" + csv_path: "../microbenchmarks/attention" + xlml_metrics_dir: "../microbenchmarks/attention" diff --git a/Ironwood/src/benchmark_attention.py b/Ironwood/src/benchmark_attention.py index 95ae5248..525c7757 100644 --- a/Ironwood/src/benchmark_attention.py +++ b/Ironwood/src/benchmark_attention.py @@ -252,6 +252,16 @@ def attention_fn( output = tuned_splash(q, k, v) jax.block_until_ready(output) + + print("-" * 50) + print( + f"batch_size={batch_size}, q_seq_len={q_seq_len}, kv_seq_len={kv_seq_len}, " + f"q_heads={q_heads}, kv_heads={kv_heads}, qk_head_dim={qk_head_dim}, " + f"v_head_dim={v_head_dim}, mode={mode}, causal={causal}" + ) + print(f"tuned_splash.optimal_hyperparams={tuned_splash.optimal_hyperparams}") + print("-" * 50) + # Run benchmark time_ms_list = timeit_from_trace( tuned_splash, From 5d73eaeb9b2708113f2671c06851e3c9447c5226 Mon Sep 17 00:00:00 2001 From: Hong-Yi Lin Date: Tue, 10 Feb 2026 09:33:30 +0000 Subject: [PATCH 57/88] Add attention into aggregator --- Ironwood/guides/automation/aggregator.py | 19 ++++++ .../automation/tpu7x-2x2x1-attention.yaml | 61 +++++++++++++++++++ 2 files changed, 80 insertions(+) create mode 100644 Ironwood/guides/automation/tpu7x-2x2x1-attention.yaml diff --git a/Ironwood/guides/automation/aggregator.py b/Ironwood/guides/automation/aggregator.py index 97235665..ed889d79 100644 --- a/Ironwood/guides/automation/aggregator.py +++ b/Ironwood/guides/automation/aggregator.py @@ -43,6 +43,13 @@ "tflops_per_sec_per_device_avg", "tflops_per_sec_per_device_min", "tflops_per_sec_per_device_max", ], + "attention": [ + 
"batch_size", "q_seq_len", "kv_seq_len", "q_heads", "kv_heads", "qk_head_dim", "v_head_dim", "mode", "causal", "step_time_ms_num_runs", "num_samples", + "time_ms_p50", "time_ms_p90", + "time_ms_p95", "time_ms_p99", + "time_ms_avg", "time_ms_min", + "time_ms_max", + ], } def download_from_gcs(bucket_path: str, local_dir: str): @@ -111,6 +118,17 @@ def aggregate_bmm(directories: list[str], picked_columns: list[str]) -> pd.DataF aggregated_df = pd.concat([aggregated_df, df[picked_columns].rename(columns={"step_time_ms_num_runs": "num_runs"})], ignore_index=True) return aggregated_df +def aggregate_attention(directories: list[str], picked_columns: list[str]) -> pd.DataFrame: + if len(directories) == 0: + return None + aggregated_df = pd.DataFrame() + for directory in directories: + files = glob.glob(f"{directory}/*.tsv") + for file in files: + df = pd.read_csv(file, sep='\t') + aggregated_df = pd.concat([aggregated_df, df[picked_columns].rename(columns={"step_time_ms_num_runs": "num_runs"})], ignore_index=True) + return aggregated_df + aggregate_function = { "collectives": aggregate_collectives, "hbm": aggregate_hbm, @@ -118,6 +136,7 @@ def aggregate_bmm(directories: list[str], picked_columns: list[str]) -> pd.DataF "gemm": aggregate_gemm, "bmm": aggregate_bmm, "gemm_all_reduce": aggregate_gemm, + "attention": aggregate_attention, } def aggregate_results(bucket_path: str, local_dir: str): diff --git a/Ironwood/guides/automation/tpu7x-2x2x1-attention.yaml b/Ironwood/guides/automation/tpu7x-2x2x1-attention.yaml new file mode 100644 index 00000000..d3aff3ed --- /dev/null +++ b/Ironwood/guides/automation/tpu7x-2x2x1-attention.yaml @@ -0,0 +1,61 @@ +apiVersion: v1 +kind: Service +metadata: + name: headless-svc-${JOB_NAME} +spec: + clusterIP: None + selector: + job-name: ${JOB_NAME} +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: ${JOB_NAME} + labels: + kueue.x-k8s.io/queue-name: user-queue-2x2x1 +spec: + completionMode: Indexed + suspend: true + parallelism: 1 + 
completions: 1 + backoffLimit: 0 + template: + spec: + subdomain: headless-svc-${JOB_NAME} + serviceAccountName: ${GCS_SA_NAME} + restartPolicy: Never + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu7x + cloud.google.com/gke-tpu-topology: 2x2x1 + containers: + - name: jax-tpu + image: python:3.12 + securityContext: + privileged: false + env: + - name: JAX_PLATFORMS + value: "tpu,cpu" + - name: TPU_VMODULE + value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" + - name: XLA_IR_DEBUG + value: "1" + - name: XLA_HLO_DEBUG + value: "1" + command: + - bash + - -c + - | + set -ex + + git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git + cd accelerator-microbenchmarks + git checkout tpu7x-auto + pip install -r requirements.txt + + GCS_BUCKET_DIR=${GCS_PATH} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/attention/attention.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + resources: + requests: + google.com/tpu: 4 + limits: + google.com/tpu: 4 \ No newline at end of file From d9d22465bc60504870c12da7c2d0b761102c4457 Mon Sep 17 00:00:00 2001 From: "Yu-Hsuan (Amy) Lin" Date: Fri, 6 Feb 2026 03:31:55 +0000 Subject: [PATCH 58/88] Gemm+All Reduce for 4x4x4 and fix minor bugs --- .../gemm_all_reduce/gemm_all_reduce.yaml | 10 +-- Ironwood/configs/host_device/host_device.yaml | 7 ++- Ironwood/guides/automation/aggregator.py | 16 ++++- .../tpu7x-4x4x4-gemm_all_reduce.yaml | 61 +++++++++++++++++++ Ironwood/src/benchmark_gemm.py | 1 + Ironwood/src/benchmark_gemm_all_reduce.py | 54 ++++++++-------- Ironwood/src/benchmark_host_device.py | 5 +- Ironwood/src/run_benchmark.py | 30 ++++++++- 8 files changed, 145 insertions(+), 39 deletions(-) create mode 100644 Ironwood/guides/automation/tpu7x-4x4x4-gemm_all_reduce.yaml diff --git a/Ironwood/configs/gemm_all_reduce/gemm_all_reduce.yaml b/Ironwood/configs/gemm_all_reduce/gemm_all_reduce.yaml index 93466840..cde1101f 100644 
--- a/Ironwood/configs/gemm_all_reduce/gemm_all_reduce.yaml +++ b/Ironwood/configs/gemm_all_reduce/gemm_all_reduce.yaml @@ -5,11 +5,11 @@ benchmarks: xlml_metrics_dir: "../microbenchmarks/gemm_all_reduce_bf16" xla_dump_dir: "../microbenchmarks/gemm_all_reduce_bf16/hlo_graphs" benchmark_sweep_params: + - {m: 1024, k: 1024, n: 1024, num_runs: 20, dtype: 'bfloat16'} - {m: 2048, k: 2048, n: 2048, num_runs: 20, dtype: 'bfloat16'} - {m: 4096, k: 4096, n: 4096, num_runs: 20, dtype: 'bfloat16'} - {m: 8192, k: 8192, n: 8192, num_runs: 20, dtype: 'bfloat16'} - {m: 16384, k: 16384, n: 16384, num_runs: 20, dtype: 'bfloat16'} - - {m: 32768, k: 32768, n: 32768, num_runs: 20, dtype: 'bfloat16'} - benchmark_name: "gemm_all_reduce" trace_dir: "../microbenchmarks/gemm_all_reduce_f32" @@ -17,11 +17,11 @@ benchmarks: xlml_metrics_dir: "../microbenchmarks/gemm_all_reduce_f32" xla_dump_dir: "../microbenchmarks/gemm_all_reduce_f32/hlo_graphs" benchmark_sweep_params: + - {m: 1024, k: 1024, n: 1024, num_runs: 20, dtype: 'float32'} - {m: 2048, k: 2048, n: 2048, num_runs: 20, dtype: 'float32'} - {m: 4096, k: 4096, n: 4096, num_runs: 20, dtype: 'float32'} - {m: 8192, k: 8192, n: 8192, num_runs: 20, dtype: 'float32'} - {m: 16384, k: 16384, n: 16384, num_runs: 20, dtype: 'float32'} - - {m: 32768, k: 32768, n: 32768, num_runs: 20, dtype: 'float32'} - benchmark_name: "gemm_all_reduce" trace_dir: "../microbenchmarks/gemm_all_reduce_fp16" @@ -29,11 +29,11 @@ benchmarks: xlml_metrics_dir: "../microbenchmarks/gemm_all_reduce_fp16" xla_dump_dir: "../microbenchmarks/gemm_all_reduce_fp16/hlo_graphs" benchmark_sweep_params: + - {m: 1024, k: 1024, n: 1024, num_runs: 20, dtype: 'float16'} - {m: 2048, k: 2048, n: 2048, num_runs: 20, dtype: 'float16'} - {m: 4096, k: 4096, n: 4096, num_runs: 20, dtype: 'float16'} - {m: 8192, k: 8192, n: 8192, num_runs: 20, dtype: 'float16'} - {m: 16384, k: 16384, n: 16384, num_runs: 20, dtype: 'float16'} - - {m: 32768, k: 32768, n: 32768, num_runs: 20, dtype: 'float16'} - 
benchmark_name: "gemm_all_reduce" trace_dir: "../microbenchmarks/gemm_all_reduce_fp8" @@ -41,11 +41,11 @@ benchmarks: xlml_metrics_dir: "../microbenchmarks/gemm_all_reduce_fp8" xla_dump_dir: "../microbenchmarks/gemm_all_reduce_fp8/hlo_graphs" benchmark_sweep_params: + - {m: 1024, k: 1024, n: 1024, num_runs: 20, dtype: 'float8'} - {m: 2048, k: 2048, n: 2048, num_runs: 20, dtype: 'float8'} - {m: 4096, k: 4096, n: 4096, num_runs: 20, dtype: 'float8'} - {m: 8192, k: 8192, n: 8192, num_runs: 20, dtype: 'float8'} - {m: 16384, k: 16384, n: 16384, num_runs: 20, dtype: 'float8'} - - {m: 32768, k: 32768, n: 32768, num_runs: 20, dtype: 'float8'} - benchmark_name: "gemm_all_reduce" trace_dir: "../microbenchmarks/gemm_all_reduce_fp4" @@ -53,8 +53,8 @@ benchmarks: xlml_metrics_dir: "../microbenchmarks/gemm_all_reduce_fp4" xla_dump_dir: "../microbenchmarks/gemm_all_reduce_fp4/hlo_graphs" benchmark_sweep_params: + - {m: 1024, k: 1024, n: 1024, num_runs: 20, dtype: 'float4'} - {m: 2048, k: 2048, n: 2048, num_runs: 20, dtype: 'float4'} - {m: 4096, k: 4096, n: 4096, num_runs: 20, dtype: 'float4'} - {m: 8192, k: 8192, n: 8192, num_runs: 20, dtype: 'float4'} - {m: 16384, k: 16384, n: 16384, num_runs: 20, dtype: 'float4'} - - {m: 32768, k: 32768, n: 32768, num_runs: 20, dtype: 'float4'} diff --git a/Ironwood/configs/host_device/host_device.yaml b/Ironwood/configs/host_device/host_device.yaml index b703ea96..c63075a0 100644 --- a/Ironwood/configs/host_device/host_device.yaml +++ b/Ironwood/configs/host_device/host_device.yaml @@ -2,8 +2,9 @@ benchmarks: - benchmark_name: host_device num_runs: 20 benchmark_sweep_params: - - { - data_size_mib_list: [1, 16, 128, 256, 512, 1024, 2048, 4096, 8192, 16384] - } + - h2d_type: "simple" + data_size_mib_list: [1, 16, 128, 256, 512, 1024, 2048, 4096, 8192, 16384] + - h2d_type: "pipelined" + data_size_mib_list: [1, 16, 128, 256, 512, 1024, 2048, 4096, 8192, 16384] csv_path: "../microbenchmarks/host_device" trace_dir: 
"../microbenchmarks/host_device/trace" diff --git a/Ironwood/guides/automation/aggregator.py b/Ironwood/guides/automation/aggregator.py index ed889d79..9e4e9648 100644 --- a/Ironwood/guides/automation/aggregator.py +++ b/Ironwood/guides/automation/aggregator.py @@ -37,7 +37,7 @@ "tflops_per_sec_per_device_max", ], "gemm_all_reduce": [ - "m", "n", "k", "dtype", "step_time_ms_num_runs", + "topology", "m", "n", "k", "dtype", "step_time_ms_num_runs", "tflops_per_sec_per_device_p50", "tflops_per_sec_per_device_p90", "tflops_per_sec_per_device_p95", "tflops_per_sec_per_device_p99", "tflops_per_sec_per_device_avg", "tflops_per_sec_per_device_min", @@ -107,6 +107,18 @@ def aggregate_gemm(directories: list[str], picked_columns: list[str]) -> pd.Data aggregated_df = pd.concat([aggregated_df, df[picked_columns].rename(columns={"step_time_ms_num_runs": "num_runs"})], ignore_index=True) return aggregated_df +def aggregate_gemm_all_reduce(directories: list[str], picked_columns: list[str]) -> pd.DataFrame: + if len(directories) == 0: + return None + aggregated_df = pd.DataFrame() + for directory in directories: + files = glob.glob(f"{directory}/*.tsv") + for file in files: + df = pd.read_csv(file, sep='\t') + df["topology"] = [file.split('/')[-4].split('-')[1] for _ in range(df.shape[0])] + aggregated_df = pd.concat([aggregated_df, df[picked_columns].rename(columns={"step_time_ms_num_runs": "num_runs"})], ignore_index=True) + return aggregated_df + def aggregate_bmm(directories: list[str], picked_columns: list[str]) -> pd.DataFrame: if len(directories) == 0: return None @@ -135,8 +147,8 @@ def aggregate_attention(directories: list[str], picked_columns: list[str]) -> pd "host_device": aggregate_host_device, "gemm": aggregate_gemm, "bmm": aggregate_bmm, - "gemm_all_reduce": aggregate_gemm, "attention": aggregate_attention, + "gemm_all_reduce": aggregate_gemm_all_reduce, } def aggregate_results(bucket_path: str, local_dir: str): diff --git 
a/Ironwood/guides/automation/tpu7x-4x4x4-gemm_all_reduce.yaml b/Ironwood/guides/automation/tpu7x-4x4x4-gemm_all_reduce.yaml new file mode 100644 index 00000000..05c98427 --- /dev/null +++ b/Ironwood/guides/automation/tpu7x-4x4x4-gemm_all_reduce.yaml @@ -0,0 +1,61 @@ +apiVersion: v1 +kind: Service +metadata: + name: headless-svc-${JOB_NAME} +spec: + clusterIP: None + selector: + job-name: ${JOB_NAME} +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: ${JOB_NAME} + labels: + kueue.x-k8s.io/queue-name: user-queue-4x4x4 +spec: + completionMode: Indexed + suspend: true + parallelism: 16 + completions: 16 + backoffLimit: 0 + template: + spec: + subdomain: headless-svc-${JOB_NAME} + serviceAccountName: ${GCS_SA_NAME} + restartPolicy: Never + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu7x + cloud.google.com/gke-tpu-topology: 4x4x4 + containers: + - name: jax-tpu + image: python:3.12 + securityContext: + privileged: false + env: + - name: JAX_PLATFORMS + value: "tpu,cpu" + - name: TPU_VMODULE + value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" + - name: XLA_IR_DEBUG + value: "1" + - name: XLA_HLO_DEBUG + value: "1" + command: + - bash + - -c + - | + set -ex + + git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git + cd accelerator-microbenchmarks + git checkout tpu7x-auto + pip install -r requirements.txt + + GCS_BUCKET_DIR=${GCS_PATH} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/gemm_all_reduce/gemm_all_reduce.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + resources: + requests: + google.com/tpu: 4 + limits: + google.com/tpu: 4 \ No newline at end of file diff --git a/Ironwood/src/benchmark_gemm.py b/Ironwood/src/benchmark_gemm.py index 0b19637d..422b1bc4 100644 --- a/Ironwood/src/benchmark_gemm.py +++ b/Ironwood/src/benchmark_gemm.py @@ -147,6 +147,7 @@ def gemm_multiple_run_calculate_metrics( total_flops, SHARDING_STRATEGY ) 
peak_flops_multiplier = get_peak_flops_multiplier(dtype.dtype.name) + peak_flops = PEAK_FLOPS_PER_DEVICE * peak_flops_multiplier return unified_flops_metrics( m, n, diff --git a/Ironwood/src/benchmark_gemm_all_reduce.py b/Ironwood/src/benchmark_gemm_all_reduce.py index 851849ca..55744593 100644 --- a/Ironwood/src/benchmark_gemm_all_reduce.py +++ b/Ironwood/src/benchmark_gemm_all_reduce.py @@ -1,6 +1,7 @@ """Benchmarks gemm + all_reduce for DP gradient sync simulation.""" import os +import time from typing import Any, Dict, Optional, Callable # pylint: disable=g-importing-member @@ -57,11 +58,6 @@ def setup_tpu_env(): "--xla_tpu_vmem_scavenging_mode=NONE " "--xla_tpu_dvfs_p_state=7 " - "--xla_tpu_impure_enable_packed_bf16_math_ops=true " - "--xla_tpu_enable_pincer_short_fusion_emitter=true " - "--xla_tpu_enable_sparse_core_hierarchical_all_reduce=true " - "--xla_tpu_use_single_sparse_core_for_all_reduce_offload=true " # Test effect on SC - "--xla_jf_debug_level=1 " "--xla_sc_disable_megacore_partitioning=true " "--xla_tpu_disable_sparse_core_collective_offload_remover=true " @@ -72,9 +68,7 @@ def setup_tpu_env(): "--xla_tpu_use_tc_device_shape_on_sc=true " ) - print("Step 1: Calling jax.distributed.initialize(initialization_timeout=300)...", flush=True) jax.distributed.initialize(initialization_timeout=300) - print("Step 1: jax.distributed.initialize() completed.", flush=True) _INITIALIZED = True @@ -106,7 +100,6 @@ def f(x, y): out = jax.lax.psum(c, axis_name="device") return out - print("Step 2: Creating Mesh and Shardings...", flush=True) mesh = create_mesh(sharding_strategy) lhs_sharding = get_lhs_named_shading(mesh, sharding_strategy) rhs_sharding = get_rhs_named_shading(mesh, sharding_strategy) @@ -131,22 +124,35 @@ def f(x, y): rhs_dtype = dtype key = jax.random.key(SEED) - def data_generator(): - """Creates new random data on host and puts it on device.""" - nonlocal key - key, key_lhs, key_rhs = jax.random.split(key, 3) - - # Create random data on host - 
lhs_host = jax.random.normal(key_lhs, lhs_shape).astype(lhs_dtype) - rhs_host = jax.random.normal(key_rhs, rhs_shape).astype(rhs_dtype) - - # Put on device (HBM) - lhs_device = jax.device_put(lhs_host, lhs_sharding) - rhs_device = jax.device_put(rhs_host, rhs_sharding) + # Create random data on host and put on device ONCE (Double Buffered) + key, key_lhs_1, key_lhs_2, key_rhs_1, key_rhs_2 = jax.random.split(key, 5) + + lhs_host_1 = jax.random.normal(key_lhs_1, lhs_shape).astype(lhs_dtype) + lhs_host_2 = jax.random.normal(key_lhs_2, lhs_shape).astype(lhs_dtype) + rhs_host_1 = jax.random.normal(key_rhs_1, rhs_shape).astype(rhs_dtype) + rhs_host_2 = jax.random.normal(key_rhs_2, rhs_shape).astype(rhs_dtype) + + lhs_device_1 = jax.device_put(lhs_host_1, lhs_sharding) + lhs_device_2 = jax.device_put(lhs_host_2, lhs_sharding) + rhs_device_1 = jax.device_put(rhs_host_1, rhs_sharding) + rhs_device_2 = jax.device_put(rhs_host_2, rhs_sharding) + + jax.block_until_ready(lhs_device_1) + jax.block_until_ready(lhs_device_2) + jax.block_until_ready(rhs_device_1) + jax.block_until_ready(rhs_device_2) - return (lhs_device, rhs_device) + step = 0 + def data_generator(): + """Returns pre-allocated device data, toggling between two sets of buffers to avoid caching.""" + nonlocal step + use_set_1 = (step % 2) == 0 + step += 1 + return ( + lhs_device_1 if use_set_1 else lhs_device_2, + rhs_device_1 if use_set_1 else rhs_device_2 + ) - print("Step 3: Starting Execution Loop (includes JIT)...", flush=True) time_ms_list = multiple_iteration_timeit_from_trace( jit_sharded_f, data_generator, @@ -156,7 +162,6 @@ def data_generator(): trace_dir=trace_dir, multi_op=True, ) - print("Step 4: Execution Loop Completed.", flush=True) return { "time_ms_list": time_ms_list, @@ -179,9 +184,6 @@ def gemm_all_reduce( ) - - - def _calculate_metrics_base( m: int, k: int, diff --git a/Ironwood/src/benchmark_host_device.py b/Ironwood/src/benchmark_host_device.py index 67eb980b..a1f2467c 100644 --- 
a/Ironwood/src/benchmark_host_device.py +++ b/Ironwood/src/benchmark_host_device.py @@ -21,7 +21,7 @@ def benchmark_host_device( data_size_mib: int, - h2d_type: str = "simple", + h2d_type: str, num_runs: int = 100, trace_dir: str = None, ) -> Dict[str, Any]: @@ -129,6 +129,7 @@ def benchmark_host_device( d2h_perf.append((t3 - t2) * 1000) tensor_stack.delete() + for device_tensor in tensors_on_device: device_tensor.delete() del tensors_on_device @@ -165,6 +166,7 @@ def benchmark_host_device_calculate_metrics( h2d_type: str, H2D_Bandwidth_ms: List[float], D2H_Bandwidth_ms: List[float], + d2h_pipelined_method: str = "device_put", ) -> Tuple[Dict[str, Any], Dict[str, Any]]: """Calculates metrics for Host-Device transfer.""" params = locals().items() @@ -176,6 +178,7 @@ def benchmark_host_device_calculate_metrics( metadata = {k: v for k, v in params if k in metadata_keys} metadata["dtype"] = "float32" metadata["h2d_type"] = h2d_type + metadata["d2h_pipelined_method"] = d2h_pipelined_method metrics = {} diff --git a/Ironwood/src/run_benchmark.py b/Ironwood/src/run_benchmark.py index 31338d21..2d10db5d 100644 --- a/Ironwood/src/run_benchmark.py +++ b/Ironwood/src/run_benchmark.py @@ -331,6 +331,17 @@ def write_metrics_to_gcs( calculate_metrics_results: List[Dict[str, Any]], ): """Writes metrics to GCS bucket defined by gcs_bucket_csv_dir.""" + # Only write metrics from one host. + try: + if jax.process_index() != 0: + return + except Exception: + # If jax is not initialized or we are in a single process setup where process_index might fail + # (though likely it returns 0), we can default to writing. + # But safest is to assume if this function is called, we want to write, + # unless explicitly on a non-zero rank. 
+ pass + if not gcs_bucket_csv_dir: return @@ -342,9 +353,24 @@ def write_metrics_to_gcs( config_category = os.path.basename(config_dir) config_stem = os.path.splitext(os.path.basename(config_path))[0] - gcs_path = os.path.join(gcs_bucket_csv_dir, config_category, config_stem) - write_to_csv(f"{gcs_path}/{test_name}.tsv", calculate_metrics_results) + + # Group results by dtype + results_by_dtype = {} + for result in calculate_metrics_results: + dtype = result.get("metadata", {}).get("dtype", "unknown_dtype") + if dtype not in results_by_dtype: + results_by_dtype[dtype] = [] + results_by_dtype[dtype].append(result) + + # Write separate files for each dtype + for dtype, results in results_by_dtype.items(): + # Append dtype to test_name if it's not already part of it (it usually isn't in a clean way) + # But to be safe and clear, we can just append it. + # test_name is like "t_gemm_all_reduce_XYZ..." + # We want "t_gemm_all_reduce_XYZ..._float16.tsv" + dtype_test_name = f"{test_name}_{dtype}" + write_to_csv(f"{gcs_path}/{dtype_test_name}.tsv", results) def run_single_benchmark(benchmark_config: Dict[str, Any], output_path: str, gcs_bucket_csv_dir: str = None, config_path: str = None): From 88229ec46146a38b6c8a4df503c230746a02373f Mon Sep 17 00:00:00 2001 From: "Amy (Yu-Hsuan) Lin" Date: Tue, 10 Feb 2026 18:24:46 +0800 Subject: [PATCH 59/88] Add 4x4 gemm_all_reduce.yaml to automation launch script --- Ironwood/guides/automation/automation_launch.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/Ironwood/guides/automation/automation_launch.sh b/Ironwood/guides/automation/automation_launch.sh index 0e49fb12..1d1fb498 100755 --- a/Ironwood/guides/automation/automation_launch.sh +++ b/Ironwood/guides/automation/automation_launch.sh @@ -22,6 +22,7 @@ yaml_names=( "tpu7x-2x2x4-collectives.yaml" "tpu7x-2x4x4-collectives.yaml" "tpu7x-4x4x4-collectives.yaml" + "tpu7x-4x4x4-gemm_all_reduce.yaml" ) ###################################################################### From 
a585ecba660599d4e7d612721282845e33589ed1 Mon Sep 17 00:00:00 2001 From: "Yu-Hsuan (Amy) Lin" Date: Wed, 11 Feb 2026 09:22:08 +0000 Subject: [PATCH 60/88] Add step time to matmul series --- Ironwood/guides/automation/aggregator.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Ironwood/guides/automation/aggregator.py b/Ironwood/guides/automation/aggregator.py index 9e4e9648..b221c7e8 100644 --- a/Ironwood/guides/automation/aggregator.py +++ b/Ironwood/guides/automation/aggregator.py @@ -28,6 +28,7 @@ "tflops_per_sec_per_device_p95", "tflops_per_sec_per_device_p99", "tflops_per_sec_per_device_avg", "tflops_per_sec_per_device_min", "tflops_per_sec_per_device_max", + "step_time_ms_p50", "step_time_ms_p90", "step_time_ms_p95", "step_time_ms_p99", "step_time_ms_avg", "step_time_ms_min", "step_time_ms_max", ], "bmm": [ "b", "m", "n", "k", "dtype", "step_time_ms_num_runs", @@ -35,6 +36,7 @@ "tflops_per_sec_per_device_p95", "tflops_per_sec_per_device_p99", "tflops_per_sec_per_device_avg", "tflops_per_sec_per_device_min", "tflops_per_sec_per_device_max", + "step_time_ms_p50", "step_time_ms_p90", "step_time_ms_p95", "step_time_ms_p99", "step_time_ms_avg", "step_time_ms_min", "step_time_ms_max", ], "gemm_all_reduce": [ "topology", "m", "n", "k", "dtype", "step_time_ms_num_runs", @@ -42,6 +44,7 @@ "tflops_per_sec_per_device_p95", "tflops_per_sec_per_device_p99", "tflops_per_sec_per_device_avg", "tflops_per_sec_per_device_min", "tflops_per_sec_per_device_max", + "step_time_ms_p50", "step_time_ms_p90", "step_time_ms_p95", "step_time_ms_p99", "step_time_ms_avg", "step_time_ms_min", "step_time_ms_max", ], "attention": [ "batch_size", "q_seq_len", "kv_seq_len", "q_heads", "kv_heads", "qk_head_dim", "v_head_dim", "mode", "causal", "step_time_ms_num_runs", "num_samples", From adb47d1a9498164709ee838e81838a9f3941d032 Mon Sep 17 00:00:00 2001 From: yuyanpeng-google Date: Thu, 12 Feb 2026 16:43:47 +0800 Subject: [PATCH 61/88] update benchmark_attention not sweep at the runtime 
(#111) We add a tuned table instead of running the sweep at each microbenchmark. For configs not tuned yet, we just use the default block sizes and report the flag has_optimized=false. --- Ironwood/configs/attention/attention.yaml | 7 +- Ironwood/src/benchmark_attention.py | 236 +++++++++++++++------- 2 files changed, 161 insertions(+), 82 deletions(-) diff --git a/Ironwood/configs/attention/attention.yaml b/Ironwood/configs/attention/attention.yaml index 92c19dd8..bc365cd9 100644 --- a/Ironwood/configs/attention/attention.yaml +++ b/Ironwood/configs/attention/attention.yaml @@ -1,12 +1,7 @@ benchmarks: - benchmark_name: "tokamax_splash_attention" benchmark_sweep_params: - - {batch_size: 1, q_seq_len: 4096, kv_seq_len: 4096, q_heads: 128, kv_heads: 8, qk_head_dim: 256, v_head_dim: 256, mode: ["fwd", "bwd"], causal: [true, false], num_samples: 3500, tune_pallas_only: true} - - {batch_size: 1, q_seq_len: 8192, kv_seq_len: 8192, q_heads: 128, kv_heads: 8, qk_head_dim: 256, v_head_dim: 256, mode: ["fwd", "bwd"], causal: [true, false], num_samples: 3500, tune_pallas_only: true} - - {batch_size: 1, q_seq_len: 16384, kv_seq_len: 16384, q_heads: 128, kv_heads: 8, qk_head_dim: 256, v_head_dim: 256, mode: ["fwd", "bwd"], causal: [true, false], num_samples: 3500, tune_pallas_only: true} - - {batch_size: 1, q_seq_len: 32768, kv_seq_len: 32768, q_heads: 128, kv_heads: 8, qk_head_dim: 256, v_head_dim: 256, mode: ["fwd", "bwd"], causal: [true, false], num_samples: 3500, tune_pallas_only: true} - - {batch_size: 1, q_seq_len: 65536, kv_seq_len: 65536, q_heads: 128, kv_heads: 8, qk_head_dim: 256, v_head_dim: 256, mode: ["fwd", "bwd"], causal: [true, false], num_samples: 3500, tune_pallas_only: true} - - {batch_size: 1, q_seq_len: 131072, kv_seq_len: 131072, q_heads: 128, kv_heads: 8, qk_head_dim: 256, v_head_dim: 256, mode: ["fwd", "bwd"], causal: [true, false], num_samples: 3500, tune_pallas_only: true} + - {batch_size: 1, q_seq_len: 4096, kv_seq_len: 4096, q_heads: 128, kv_heads: 
128, qk_head_dim: 256, v_head_dim: 256, mode: ["fwd", "bwd"], causal: [true, false]} trace_dir: "../microbenchmarks/attention" csv_path: "../microbenchmarks/attention" xlml_metrics_dir: "../microbenchmarks/attention" diff --git a/Ironwood/src/benchmark_attention.py b/Ironwood/src/benchmark_attention.py index 525c7757..64192266 100644 --- a/Ironwood/src/benchmark_attention.py +++ b/Ironwood/src/benchmark_attention.py @@ -6,27 +6,96 @@ # pylint: disable=g-importing-member,g-bad-import-order from functools import partial -from typing import Any, Callable, Dict, Tuple +from typing import Any, Callable, Dict, Literal, Optional, Tuple import dataclasses from benchmark_utils import timeit_from_trace, MetricsStatistics import jax -import logging +import jax.numpy as jnp + from tokamax._src.ops.experimental.tpu.splash_attention import ( splash_attention_kernel as splash, ) from tokamax._src.ops.experimental.tpu.splash_attention import ( splash_attention_mask as mask_lib, ) -import tune_jax -tune_jax.tune_logger.setLevel(logging.ERROR) # pylint: disable=g-importing-member,g-bad-import-order os.environ["LIBTPU_INIT_ARGS"] = ( - "--xla_tpu_dvfs_p_state=7" + "--xla_tpu_dvfs_p_state=7 --xla_tpu_scoped_vmem_limit_kib=65536" +) + +SplashAttentionLookupKey = tuple[ + int, # batch_size + int, # q_seq_len + int, # kv_seq_len + int, # q_heads + int, # kv_heads + int, # qk_head_dim + int, # v_head_dim + bool, # causal +] + +SplashAttentionLookupValue = tuple[ + int, # block_q + int, # block_kv + int, # block_kv_compute + int, # block_q_dkv + int, # block_kv_dkv + int, # block_kv_dkv_compute + splash.QKVLayout, # q_layout + splash.QKVLayout, # k_layout + splash.QKVLayout, # v_layout + bool, # use_experimental_scheduler +] + +# Merge the tuned block size of optimal fwd and bwd +# The optimal layout and use_experimental_scheduler may be different between fwd and bwd +# Use the layout and use_experimental_scheduler optimized for fwd +SPLASH_ATTENTION_HYPERPARAMS_LOOKUP_TABLE: Dict[ + 
SplashAttentionLookupKey, SplashAttentionLookupValue +] = { + (1, 4096, 4096, 128, 128, 256, 256, True): ( + 2048, + 2048, + 256, + 2048, + 2048, + 512, + splash.QKVLayout.HEAD_DIM_MINOR, + splash.QKVLayout.SEQ_MINOR, + splash.QKVLayout.HEAD_DIM_MINOR, + True, + ), + (1, 4096, 4096, 128, 128, 256, 256, False): ( + 4096, + 4096, + 512, + 4096, + 2048, + 512, + splash.QKVLayout.HEAD_DIM_MINOR, + splash.QKVLayout.SEQ_MINOR, + splash.QKVLayout.HEAD_DIM_MINOR, + True, + ), +} + +DEFAULT_SPLASH_ATTENTION_HYPERPARAMS: SplashAttentionLookupValue = ( + 2048, + 2048, + 256, + 2048, + 2048, + 256, + splash.QKVLayout.HEAD_DIM_MINOR, + splash.QKVLayout.SEQ_MINOR, + splash.QKVLayout.HEAD_DIM_MINOR, + True, ) + def generate_qkv_separate_dims( batch_size: int, q_seq_len: int, @@ -40,9 +109,9 @@ def generate_qkv_separate_dims( """Generates QKV with potentially different shapes for Q, K, and V.""" key = jax.random.PRNGKey(seed) key_q, key_k, key_v = jax.random.split(key, 3) - q = jax.random.normal(key_q, (batch_size, q_heads, q_seq_len, qk_head_dim)) - k = jax.random.normal(key_k, (batch_size, kv_heads, kv_seq_len, qk_head_dim)) - v = jax.random.normal(key_v, (batch_size, kv_heads, kv_seq_len, v_head_dim)) + q = jax.random.normal(key_q, (batch_size, q_heads, q_seq_len, qk_head_dim), dtype=jnp.bfloat16) + k = jax.random.normal(key_k, (batch_size, kv_heads, kv_seq_len, qk_head_dim), dtype=jnp.bfloat16) + v = jax.random.normal(key_v, (batch_size, kv_heads, kv_seq_len, v_head_dim), dtype=jnp.bfloat16) return q, k, v @@ -106,28 +175,14 @@ def tokamax_splash_attention_benchmark( kv_heads: int, qk_head_dim: int, v_head_dim: int, - mode: str = "fwd", # One of ('fwd', 'bwd', 'combined') + mode: Literal["fwd", "bwd"] = "fwd", causal: bool = True, - num_samples: int = 256, - tune_pallas_only: bool = True, num_runs: int = 10, - trace_dir: str = None, + trace_dir: Optional[str] = None, ) -> Dict[str, Any]: """Benchmarks the Tokamax Splash attention kernel.""" event_filter_regex = 
_pallas_call_hlo_pattern(mode, q_heads != kv_heads) - - hyperparams_override = {} - if mode == "bwd": - # Don't tune fwd only hyperparams - hyperparams_override = dict( - block_q=min(512, q_seq_len), - block_kv=min(1024, kv_seq_len), - block_kv_compute=min(512, kv_seq_len), - ) - elif mode == "combined": - mode = "bwd" - - # Generate QKV. + # Generate QKV in shape [batch, head_num, seq_len, head_dim]. q, k, v = generate_qkv_separate_dims( batch_size, q_seq_len, @@ -138,12 +193,68 @@ def tokamax_splash_attention_benchmark( v_head_dim, ) + key = ( + batch_size, + q_seq_len, + kv_seq_len, + q_heads, + kv_heads, + qk_head_dim, + v_head_dim, + causal, + ) + hyperparams: Optional[SplashAttentionLookupValue] = ( + SPLASH_ATTENTION_HYPERPARAMS_LOOKUP_TABLE.get(key, None) + ) + has_optimized = True + if hyperparams is None: + print(f"{key=} is not tuned") + has_optimized = False + hyperparams = DEFAULT_SPLASH_ATTENTION_HYPERPARAMS + + ( + block_q, + block_kv, + block_kv_compute, + block_q_dkv, + block_kv_dkv, + block_kv_dkv_compute, + q_layout, + k_layout, + v_layout, + use_experimental_scheduler, + ) = hyperparams + + # Pad q, kv to prevent the block size are not valid + if not has_optimized: + def _ceiling_div(a: int, b: int) -> int: + return (a + b - 1) // b + + + def _align_to(x: int, a: int) -> int: + return _ceiling_div(x, a) * a + + def _pad_token(t: jax.Array, size) -> jax.Array: + # tensor is [batch_size, num_head, token, head_dim] + result = jnp.pad(t, ((0, 0), (0, 0), (0, size), (0, 0)), constant_values=0) + return result + + q_len = q.shape[-2] + k_len = k.shape[-2] + + # Pad q, k, v sequence, align to block sizes + q = _pad_token(q, _align_to(q_len, block_q) - q_len) + k = _pad_token(k, _align_to(k_len, block_kv) - k_len) + v = _pad_token(v, _align_to(k_len, block_kv) - k_len) + + padded_q_len = q.shape[-2] + padded_kv_len = k.shape[-2] # Attention mask - mask = mask_lib.FullMask(_shape=(q_seq_len, kv_seq_len)) + mask = mask_lib.FullMask(_shape=(padded_q_len, 
padded_kv_len)) if causal: # Pick offset for causal masks for a "representative" slice of the causal - offset = v.shape[-2] - q.shape[-2] - mask = mask_lib.CausalMask(shape=(q_seq_len, kv_seq_len), offset=offset) + offset = padded_kv_len - padded_q_len + mask = mask_lib.CausalMask(shape=(padded_q_len, padded_kv_len), offset=offset) def attention_fn( q: jax.Array, @@ -155,8 +266,6 @@ def attention_fn( block_q_dkv: int | None, block_kv_dkv: int | None, block_kv_dkv_compute: int | None, - block_q_dq: int | None, - block_kv_dq: int | None, q_layout: splash.QKVLayout, k_layout: splash.QKVLayout, v_layout: splash.QKVLayout, @@ -165,6 +274,7 @@ def attention_fn( mqa: bool, use_experimental_scheduler: bool, ): + # dq kernel is not used config = splash.SplashConfig( block_q=block_q, block_kv=block_kv, @@ -172,8 +282,8 @@ def attention_fn( block_q_dkv=block_q_dkv, block_kv_dkv=block_kv_dkv, block_kv_dkv_compute=block_kv_dkv_compute, - block_q_dq=block_q_dq, - block_kv_dq=block_kv_dq, + block_q_dq=None, + block_kv_dq=None, q_layout=q_layout, k_layout=k_layout, v_layout=v_layout, @@ -190,38 +300,6 @@ def attention_fn( mqa=q_heads != kv_heads, # Determine if it's Multi-Query Attention ) - # Define the search space for tokamax splash attention hyperparameters. - tiles = [256, 512, 1024, 2048, 4096, 8192] - layouts = [splash.QKVLayout.HEAD_DIM_MINOR, splash.QKVLayout.SEQ_MINOR] - hyperparams = { - "block_q": tiles, - "block_kv": tiles, - "block_kv_compute": tiles, - "block_q_dkv": [None], - "block_kv_dkv": [None], - "block_kv_dkv_compute": [None], - "block_q_dq": [None], - "block_kv_dq": [None], - "q_layout": layouts, - "k_layout": layouts, - "v_layout": layouts, - "use_experimental_scheduler": [True, False], - } - - if mode == "bwd": - # If mode is backward, enable tuning for dKV-related block sizes. - # These parameters are only used during the backward pass. 
- hyperparams["block_q_dkv"] = tiles - hyperparams["block_kv_dkv"] = tiles - hyperparams["block_kv_dkv_compute"] = tiles - hyperparams["block_q_dq"] = tiles - hyperparams["block_kv_dq"] = tiles - - # Incorporate any potentially previously tuned hyperparameters - hyperparams = dict(hyperparams, **hyperparams_override) - - # Prepare the attention function for tuning. - tune_jax.CONFIG.allow_fallback_timing = False splash_fn = jax.jit( attention_fn, static_argnames=( @@ -231,8 +309,6 @@ def attention_fn( "block_q_dkv", "block_kv_dkv", "block_kv_dkv_compute", - "block_q_dq", - "block_kv_dq", "q_layout", "k_layout", "v_layout", @@ -240,26 +316,31 @@ def attention_fn( ), ) - # Tune the hyperparameters with tune_jax - tuned_splash = tune_jax.tune( + tuned_splash = partial( splash_fn, - hyperparams=hyperparams, - event_filter_regex=event_filter_regex if tune_pallas_only else None, - sample_num=num_samples, + block_q=block_q, + block_kv=block_kv, + block_kv_compute=block_kv_compute, + block_q_dkv=block_q_dkv, + block_kv_dkv=block_kv_dkv, + block_kv_dkv_compute=block_kv_dkv_compute, + q_layout=q_layout, + k_layout=k_layout, + v_layout=v_layout, + use_experimental_scheduler=use_experimental_scheduler, ) # Run once output = tuned_splash(q, k, v) jax.block_until_ready(output) - print("-" * 50) print( f"batch_size={batch_size}, q_seq_len={q_seq_len}, kv_seq_len={kv_seq_len}, " f"q_heads={q_heads}, kv_heads={kv_heads}, qk_head_dim={qk_head_dim}, " f"v_head_dim={v_head_dim}, mode={mode}, causal={causal}" ) - print(f"tuned_splash.optimal_hyperparams={tuned_splash.optimal_hyperparams}") + print(f"{hyperparams=}") print("-" * 50) # Run benchmark @@ -275,7 +356,11 @@ def attention_fn( f"{event_filter_regex}_no_residuals.1", ] ) - return {"time_ms_list": time_ms_list, "output": output} + return { + "time_ms_list": time_ms_list, + "output": output, + "has_optimized": has_optimized, + } def tokamax_splash_attention_benchmark_calculate_metrics( @@ -289,9 +374,8 @@ def 
tokamax_splash_attention_benchmark_calculate_metrics( v_head_dim: int, mode: str, causal: bool, - num_samples: int, - tune_pallas_only: bool, time_ms_list: list[float], + has_optimized: bool, # pylint: disable=unused-argument ) -> Dict[str, Any]: """Gathers metrics for the tokamax splash attention benchmark.""" From 9bf9e38d13515b638b3489484271c36379ea62ab Mon Sep 17 00:00:00 2001 From: Hong-Yi Lin Date: Thu, 12 Feb 2026 09:59:58 +0000 Subject: [PATCH 62/88] Add attention into automation --- Ironwood/guides/automation/aggregator.py | 4 ++-- Ironwood/guides/automation/automation_launch.sh | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/Ironwood/guides/automation/aggregator.py b/Ironwood/guides/automation/aggregator.py index b221c7e8..54df2146 100644 --- a/Ironwood/guides/automation/aggregator.py +++ b/Ironwood/guides/automation/aggregator.py @@ -47,7 +47,7 @@ "step_time_ms_p50", "step_time_ms_p90", "step_time_ms_p95", "step_time_ms_p99", "step_time_ms_avg", "step_time_ms_min", "step_time_ms_max", ], "attention": [ - "batch_size", "q_seq_len", "kv_seq_len", "q_heads", "kv_heads", "qk_head_dim", "v_head_dim", "mode", "causal", "step_time_ms_num_runs", "num_samples", + "batch_size", "q_seq_len", "kv_seq_len", "q_heads", "kv_heads", "qk_head_dim", "v_head_dim", "mode", "causal", "has_optimized", "step_time_ms_num_runs", "time_ms_p50", "time_ms_p90", "time_ms_p95", "time_ms_p99", "time_ms_avg", "time_ms_min", @@ -155,7 +155,7 @@ def aggregate_attention(directories: list[str], picked_columns: list[str]) -> pd } def aggregate_results(bucket_path: str, local_dir: str): - categories = ["collectives", "hbm", "host_device", "gemm", "bmm", "gemm_all_reduce"] + categories = ["collectives", "hbm", "host_device", "gemm", "bmm", "gemm_all_reduce", "attention"] directories = {} results = {} for category in categories: diff --git a/Ironwood/guides/automation/automation_launch.sh b/Ironwood/guides/automation/automation_launch.sh index 1d1fb498..133db969 100755 --- 
a/Ironwood/guides/automation/automation_launch.sh +++ b/Ironwood/guides/automation/automation_launch.sh @@ -17,6 +17,7 @@ yaml_names=( "tpu7x-2x2x1-gemm_all_reduce.yaml" "tpu7x-2x2x1-gemm.yaml" "tpu7x-2x2x1-bmm.yaml" + "tpu7x-2x2x1-attention.yaml" "tpu7x-2x2x1-collectives.yaml" "tpu7x-2x2x2-collectives.yaml" "tpu7x-2x2x4-collectives.yaml" From 1d36fa8933b7bff032dcde0b2505f8610085ec0c Mon Sep 17 00:00:00 2001 From: Hong-Yi Lin Date: Thu, 12 Feb 2026 11:24:38 +0000 Subject: [PATCH 63/88] Update attention aggregate logic --- Ironwood/guides/automation/aggregator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Ironwood/guides/automation/aggregator.py b/Ironwood/guides/automation/aggregator.py index 54df2146..c519635b 100644 --- a/Ironwood/guides/automation/aggregator.py +++ b/Ironwood/guides/automation/aggregator.py @@ -47,7 +47,7 @@ "step_time_ms_p50", "step_time_ms_p90", "step_time_ms_p95", "step_time_ms_p99", "step_time_ms_avg", "step_time_ms_min", "step_time_ms_max", ], "attention": [ - "batch_size", "q_seq_len", "kv_seq_len", "q_heads", "kv_heads", "qk_head_dim", "v_head_dim", "mode", "causal", "has_optimized", "step_time_ms_num_runs", + "batch_size", "q_seq_len", "kv_seq_len", "q_heads", "kv_heads", "qk_head_dim", "v_head_dim", "mode", "causal", "has_optimized", "time_ms_num_runs", "time_ms_p50", "time_ms_p90", "time_ms_p95", "time_ms_p99", "time_ms_avg", "time_ms_min", From 5d958cc5e4878c483d47d8458af264b44b9bcd7f Mon Sep 17 00:00:00 2001 From: Hong-Yi Lin Date: Thu, 12 Feb 2026 11:27:50 +0000 Subject: [PATCH 64/88] Set automation timeout to 2 hours --- Ironwood/guides/automation/automation_launch.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Ironwood/guides/automation/automation_launch.sh b/Ironwood/guides/automation/automation_launch.sh index 133db969..cb89e725 100755 --- a/Ironwood/guides/automation/automation_launch.sh +++ b/Ironwood/guides/automation/automation_launch.sh @@ -9,7 +9,7 @@ export 
GCS_SA_NAME="gcs-writer" # Service account with write access to GCS_BUCK export PROJECT_ID=$(gcloud config get-value project 2>/dev/null) MAX_RETRIES=3 -TIMEOUT_SECOND=3600 +TIMEOUT_SECOND=7200 yaml_names=( "tpu7x-2x2x1-hbm.yaml" From 1ab8008f3de1e52bc4213e0825f161a8815e8ef5 Mon Sep 17 00:00:00 2001 From: Hong-Yi Lin Date: Thu, 12 Feb 2026 11:39:10 +0000 Subject: [PATCH 65/88] Set attention num_runs to 20 --- Ironwood/configs/attention/attention.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Ironwood/configs/attention/attention.yaml b/Ironwood/configs/attention/attention.yaml index bc365cd9..4ee7e100 100644 --- a/Ironwood/configs/attention/attention.yaml +++ b/Ironwood/configs/attention/attention.yaml @@ -1,7 +1,7 @@ benchmarks: - benchmark_name: "tokamax_splash_attention" benchmark_sweep_params: - - {batch_size: 1, q_seq_len: 4096, kv_seq_len: 4096, q_heads: 128, kv_heads: 128, qk_head_dim: 256, v_head_dim: 256, mode: ["fwd", "bwd"], causal: [true, false]} + - {batch_size: 1, q_seq_len: 4096, kv_seq_len: 4096, q_heads: 128, kv_heads: 128, qk_head_dim: 256, v_head_dim: 256, mode: ["fwd", "bwd"], causal: [true, false], num_runs: 20} trace_dir: "../microbenchmarks/attention" csv_path: "../microbenchmarks/attention" xlml_metrics_dir: "../microbenchmarks/attention" From 9b4e8ded7479fbbd7e94a66ce134aaa5df3219a4 Mon Sep 17 00:00:00 2001 From: "Yu-Hsuan (Amy) Lin" Date: Wed, 11 Feb 2026 10:48:09 +0000 Subject: [PATCH 66/88] Try pinned memory --- Ironwood/configs/host_device/host_device.yaml | 6 +- Ironwood/guides/automation/aggregator.py | 2 +- Ironwood/src/benchmark_host_device.py | 276 +++++++++++------- 3 files changed, 181 insertions(+), 103 deletions(-) diff --git a/Ironwood/configs/host_device/host_device.yaml b/Ironwood/configs/host_device/host_device.yaml index c63075a0..e652f87f 100644 --- a/Ironwood/configs/host_device/host_device.yaml +++ b/Ironwood/configs/host_device/host_device.yaml @@ -2,9 +2,11 @@ benchmarks: - benchmark_name: 
host_device num_runs: 20 benchmark_sweep_params: - - h2d_type: "simple" + - transfer_type: "simple" data_size_mib_list: [1, 16, 128, 256, 512, 1024, 2048, 4096, 8192, 16384] - - h2d_type: "pipelined" + - transfer_type: "pipelined" + data_size_mib_list: [1, 16, 128, 256, 512, 1024, 2048, 4096, 8192, 16384] + - transfer_type: "pinned_memory" data_size_mib_list: [1, 16, 128, 256, 512, 1024, 2048, 4096, 8192, 16384] csv_path: "../microbenchmarks/host_device" trace_dir: "../microbenchmarks/host_device/trace" diff --git a/Ironwood/guides/automation/aggregator.py b/Ironwood/guides/automation/aggregator.py index c519635b..5ff4ce99 100644 --- a/Ironwood/guides/automation/aggregator.py +++ b/Ironwood/guides/automation/aggregator.py @@ -16,7 +16,7 @@ "time_ms_p50", "time_ms_p90", "time_ms_p95", "time_ms_p99", "time_ms_avg", "time_ms_min", "time_ms_max", ], "host_device": [ - "data_size_mib", "h2d_type", "H2D_bw (GiB/s)_num_runs", + "data_size_mib", "transfer_type", "H2D_bw (GiB/s)_num_runs", "H2D_bw (GiB/s)_p50", "H2D_bw (GiB/s)_p90", "H2D_bw (GiB/s)_p95", "H2D_bw (GiB/s)_p99", "H2D_bw (GiB/s)_avg", "H2D_bw (GiB/s)_min", "H2D_bw (GiB/s)_max", "D2H_bw (GiB/s)_p50", "D2H_bw (GiB/s)_p90", "D2H_bw (GiB/s)_p95", "D2H_bw (GiB/s)_p99", diff --git a/Ironwood/src/benchmark_host_device.py b/Ironwood/src/benchmark_host_device.py index a1f2467c..3c1a5621 100644 --- a/Ironwood/src/benchmark_host_device.py +++ b/Ironwood/src/benchmark_host_device.py @@ -8,6 +8,8 @@ from jax import numpy as jnp import numpy as np from benchmark_utils import MetricsStatistics +from jax.sharding import Mesh, NamedSharding, PartitionSpec +from jax.experimental import mesh_utils libtpu_init_args = [ @@ -19,9 +21,162 @@ os.environ["TPU_PREMAPPED_BUFFER_TRANSFER_THRESHOLD_BYTES"] = "68719476736" +import abc + +class TransferStrategy(abc.ABC): + """Abstract base class for transfer strategies.""" + + def __init__(self, trace_dir: str = None): + self.trace_dir = trace_dir + self.h2d_perf = [] + self.d2h_perf = [] + 
+ @abc.abstractmethod + def setup(self, data_size_mib: int, host_data: np.ndarray, devices: List[jax.Device]): + """Perform one-time setup before the benchmark loop.""" + pass + + @abc.abstractmethod + def run_h2d(self, host_data: np.ndarray, i: int) -> Any: + """Run H2D transfer for iteration i.""" + pass + + @abc.abstractmethod + def run_d2h(self, device_data: Any, i: int): + """Run D2H transfer for iteration i.""" + pass + + @abc.abstractmethod + def teardown(self): + """Clean up resources after the benchmark loop.""" + pass + + +class SimpleTransfer(TransferStrategy): + """Simple device_put/device_get strategy.""" + + def setup(self, data_size_mib: int, host_data: np.ndarray): + pass + + def run_h2d(self, host_data: np.ndarray, i: int) -> Any: + t0 = time.perf_counter() + device_array = jax.device_put(host_data) + device_array.block_until_ready() + t1 = time.perf_counter() + + # Verify H2D shape + assert device_array.shape == host_data.shape + self.h2d_perf.append((t1 - t0) * 1000) + return device_array + + def run_d2h(self, device_data: Any, i: int): + t2 = time.perf_counter() + _ = jax.device_get(device_data) + t3 = time.perf_counter() + self.d2h_perf.append((t3 - t2) * 1000) + device_data.delete() + + def teardown(self): + pass + + +class PipelinedTransfer(TransferStrategy): + """Pipelined transfer using chunking.""" + + def setup(self, data_size_mib: int, host_data: np.ndarray): + self.target_chunk_size_mib = 16 + num_devices_to_perform_h2d = 2 + self.target_devices = jax.devices()[:num_devices_to_perform_h2d] + self.num_devices = len(self.target_devices) + + data_per_dev = data_size_mib / self.num_devices + chunks_per_dev = int(data_per_dev / self.target_chunk_size_mib) + self.chunks_per_dev = max(1, chunks_per_dev) + if self.chunks_per_dev == 1: + print(f"Warning: Data size is not larger than target chunk size, falling back to standard JAX put.") + + def run_h2d(self, host_data: np.ndarray, i: int) -> Any: + if self.chunks_per_dev > 1: + chunks = 
np.array_split(host_data, self.chunks_per_dev * self.num_devices, axis=0) + tensors_on_device = [] + + t0 = time.perf_counter() + for idx, chunk in enumerate(chunks): + dev = self.target_devices[idx % self.num_devices] + tensors_on_device.append(jax.device_put(chunk, dev)) + + for device_tensor in tensors_on_device: + device_tensor.block_until_ready() + t1 = time.perf_counter() + + self.h2d_perf.append((t1 - t0) * 1000) + del chunks + return tensors_on_device + else: + t0 = time.perf_counter() + result = jax.device_put(host_data, self.target_devices[0]) + result.block_until_ready() + t1 = time.perf_counter() + self.h2d_perf.append((t1 - t0) * 1000) + return result + + def run_d2h(self, device_data: Any, i: int): + t2 = time.perf_counter() + if isinstance(device_data, list): + _ = jax.device_get(device_data) + for device_tensor in device_data: + device_tensor.delete() + else: + _ = jax.device_get(device_data) + device_data.delete() + t3 = time.perf_counter() + self.d2h_perf.append((t3 - t2) * 1000) + + def teardown(self): + pass + + +class PinnedMemoryTransfer(TransferStrategy): + """Pinned memory host-to-device with parallelized device-to-host transfer.""" + + def setup(self, data_size_mib: int, host_data: np.ndarray): + num_devices_to_perform_h2d = 2 + target_devices = jax.devices()[:num_devices_to_perform_h2d] + + mesh = Mesh(target_devices, ('x',)) + partition_spec = PartitionSpec('x') + host_sharding = NamedSharding(mesh, partition_spec, memory_kind='pinned_host') + self.pinned_device_sharding = NamedSharding(mesh, partition_spec) + + print(" Allocating Pinned Host Data...", flush=True) + self.pinned_host_input = jax.device_put(host_data, host_sharding) + self.pinned_host_input.block_until_ready() + + def run_h2d(self, host_data: np.ndarray, i: int) -> Any: + t_transfer_start = time.perf_counter() + device_array = jax.device_put(self.pinned_host_input, self.pinned_device_sharding) + device_array.block_until_ready() + t_transfer_end = time.perf_counter() + + 
self.h2d_perf.append((t_transfer_end - t_transfer_start) * 1000) + return device_array + + def run_d2h(self, device_data: Any, i: int): + t2 = time.perf_counter() + # Fetch addressable shards to enable pipelined D2H + _ = jax.device_get([s.data for s in device_data.addressable_shards]) + t3 = time.perf_counter() + self.d2h_perf.append((t3 - t2) * 1000) + device_data.delete() + + def teardown(self): + if hasattr(self, 'pinned_host_input'): + self.pinned_host_input.delete() + + def benchmark_host_device( data_size_mib: int, - h2d_type: str, + transfer_type: str, num_runs: int = 100, trace_dir: str = None, ) -> Dict[str, Any]: @@ -32,19 +187,23 @@ def benchmark_host_device( # Allocate Host Source Buffer column = 128 host_data = np.random.normal(size=(num_elements // column, column)).astype(np.float32) - - # Used in pipelined flow - # TODO: turn into a param - num_devices_to_perform_h2d = 1 - target_devices = jax.devices()[:num_devices_to_perform_h2d] print( - f"Benchmarking Transfer with Data Size: {data_size_mib} MB for {num_runs} iterations with {h2d_type=}", + f"Benchmarking Transfer with Data Size: {data_size_mib} MB for {num_runs} iterations with {transfer_type=}", flush=True ) - # Performance Lists - h2d_perf, d2h_perf = [], [] + strategies = { + "simple": SimpleTransfer, + "pipelined": PipelinedTransfer, + "pinned_memory": PinnedMemoryTransfer, + } + + if transfer_type not in strategies: + raise ValueError(f"Unknown transfer_type: {transfer_type}. 
Available: {list(strategies.keys())}") + + strategy = strategies[transfer_type](trace_dir) + strategy.setup(data_size_mib, host_data) # Profiling Context import contextlib @@ -70,103 +229,21 @@ def benchmark_host_device( step_context = contextlib.nullcontext() with step_context: - # H2D - if h2d_type == "simple": - t0 = time.perf_counter() - # Simple device_put - device_array = jax.device_put(host_data) - device_array.block_until_ready() - t1 = time.perf_counter() - - # Verify H2D shape - assert device_array.shape == host_data.shape - h2d_perf.append((t1 - t0) * 1000) - - # D2H - t2 = time.perf_counter() - # Simple device_get - # Note: device_get returns a numpy array (copy) - _ = jax.device_get(device_array) - t3 = time.perf_counter() - d2h_perf.append((t3 - t2) * 1000) - - device_array.delete() - elif h2d_type == "pipelined": - target_chunk_size_mib = 16 # Sweet spot from profiling - num_devices = len(target_devices) - - tensors_on_device = [] - - # Calculate chunks per device - data_per_dev = data_size_mib / num_devices - chunks_per_dev = int(data_per_dev / target_chunk_size_mib) - chunks_per_dev = max(1, chunks_per_dev) - - chunks = np.array_split(host_data, chunks_per_dev * num_devices, axis=0) - if chunks_per_dev > 1: - t0 = time.perf_counter() - # We need to map chunks to the correct device - # This simple example assumes chunks are perfectly divisible and ordered - # In production, use `jax.sharding` mesh logic for complex layouts - for idx, chunk in enumerate(chunks): - if num_devices > 1: - dev = target_devices[idx % num_devices] - else: - dev = target_devices[0] - tensors_on_device.append(jax.device_put(chunk, dev)) - for device_tensor in tensors_on_device: - device_tensor.block_until_ready() - t1 = time.perf_counter() - h2d_perf.append((t1 - t0) * 1000) - del chunks - - # D2H - tensor_stack = jnp.vstack(tensors_on_device) - - t2 = time.perf_counter() - _ = jax.device_get(tensor_stack) - t3 = time.perf_counter() - - d2h_perf.append((t3 - t2) * 1000) - 
tensor_stack.delete() - - for device_tensor in tensors_on_device: - device_tensor.delete() - del tensors_on_device - else: - t0 = time.perf_counter() - - print(f"Warning: {data_size_mib=} is not larger than {target_chunk_size_mib=}, falling back to standard JAX put.") - # Fallback to standard JAX put for small data - result = jax.device_put(host_data, target_devices[0]) - result.block_until_ready() - - t1 = time.perf_counter() - h2d_perf.append((t1 - t0) * 1000) - - # D2H - t2 = time.perf_counter() - # Simple device_get - # Note: device_get returns a numpy array (copy) - _ = jax.device_get(result) - - t3 = time.perf_counter() - d2h_perf.append((t3 - t2) * 1000) - result.delete() - - jax.clear_caches() + device_data = strategy.run_h2d(host_data, i) + strategy.run_d2h(device_data, i) + + strategy.teardown() return { - "H2D_Bandwidth_ms": h2d_perf, - "D2H_Bandwidth_ms": d2h_perf, + "H2D_Bandwidth_ms": strategy.h2d_perf, + "D2H_Bandwidth_ms": strategy.d2h_perf, } def benchmark_host_device_calculate_metrics( data_size_mib: int, - h2d_type: str, + transfer_type: str, H2D_Bandwidth_ms: List[float], D2H_Bandwidth_ms: List[float], - d2h_pipelined_method: str = "device_put", ) -> Tuple[Dict[str, Any], Dict[str, Any]]: """Calculates metrics for Host-Device transfer.""" params = locals().items() @@ -177,8 +254,7 @@ def benchmark_host_device_calculate_metrics( } metadata = {k: v for k, v in params if k in metadata_keys} metadata["dtype"] = "float32" - metadata["h2d_type"] = h2d_type - metadata["d2h_pipelined_method"] = d2h_pipelined_method + metadata["transfer_type"] = transfer_type metrics = {} From e0a9abcbe6670888dd44c0d6a054099e2dcab557 Mon Sep 17 00:00:00 2001 From: yuyanpeng-google Date: Fri, 13 Feb 2026 17:05:25 +0800 Subject: [PATCH 67/88] fix numeric error cause by padding and improve default block size (#112) Use segment id to filter out the padding KV if needed. Since the segment id would affect the latency, we should add them to reflect the padding situation. 
--- Ironwood/src/benchmark_attention.py | 63 ++++++++++++++++++++++------- 1 file changed, 49 insertions(+), 14 deletions(-) diff --git a/Ironwood/src/benchmark_attention.py b/Ironwood/src/benchmark_attention.py index 64192266..87139200 100644 --- a/Ironwood/src/benchmark_attention.py +++ b/Ironwood/src/benchmark_attention.py @@ -151,19 +151,19 @@ def _get_tokamax_benchmark_fn( kernel = splash.make_splash_mqa_single_device(mask, config=config) @jax.jit - def f(q, k, v): + def f(q, k, v, segment_ids): q = q.reshape(q.shape[:-3] + (k.shape[-3], -1) + q.shape[-2:]) - kernel_ = jax.vmap(kernel, in_axes=(0, 0, 0)) # batch vmap - kernel_ = jax.vmap(kernel_, in_axes=(0, 0, 0)) # mqa vmap - return kernel_(q, k, v) + kernel_ = jax.vmap(kernel, in_axes=(0, 0, 0, None)) # batch vmap + kernel_ = jax.vmap(kernel_, in_axes=(0, 0, 0, None)) # mqa vmap + return kernel_(q, k, v, segment_ids) else: kernel = splash.make_splash_mha_single_device(mask, config=config) - f = jax.jit(jax.vmap(kernel, in_axes=(0, 0, 0))) + f = jax.jit(jax.vmap(kernel, in_axes=(0, 0, 0, None))) if mode == "fwd": return f if mode == "bwd": - return jax.grad(lambda q, k, v: f(q, k, v).mean(), argnums=(0, 1, 2)) + return jax.grad(lambda q, k, v, segment_ids: f(q, k, v, segment_ids).mean(), argnums=(0, 1, 2)) raise ValueError(f"Invalid mode: {mode}") @@ -211,7 +211,7 @@ def tokamax_splash_attention_benchmark( print(f"{key=} is not tuned") has_optimized = False hyperparams = DEFAULT_SPLASH_ATTENTION_HYPERPARAMS - + ( block_q, block_kv, @@ -225,30 +225,62 @@ def tokamax_splash_attention_benchmark( use_experimental_scheduler, ) = hyperparams + segment_ids = None + # Pad q, kv to prevent the block size are not valid if not has_optimized: def _ceiling_div(a: int, b: int) -> int: return (a + b - 1) // b - def _align_to(x: int, a: int) -> int: return _ceiling_div(x, a) * a + q_len = q.shape[-2] + k_len = k.shape[-2] + + # handle the block size, seq_len need to be multiple of block size + # bkv need to be multiple of 
bkv_compute + block_q = min(q_len, block_q) + # Align to 128 per kernel request + block_q = _align_to(block_q, 128) + block_kv = min(k_len, block_kv) + block_kv = _align_to(block_kv, 128) + block_kv_compute = min(block_kv, 256) + # Align block_kv to block_kv_compute + block_kv = _align_to(block_kv, block_kv_compute) + block_q_dkv = min(q_len, block_q_dkv) + # Align to 128 per kernel request + block_q_dkv = _align_to(block_q_dkv, 128) + block_kv_dkv = min(k_len, block_kv_dkv) + block_kv_dkv = _align_to(block_kv_dkv, 128) + block_kv_dkv_compute = min(block_kv_dkv, 256) + # Align block_kv to block_kv_compute + block_kv_dkv = _align_to(block_kv_dkv, block_kv_dkv_compute) + def _pad_token(t: jax.Array, size) -> jax.Array: # tensor is [batch_size, num_head, token, head_dim] result = jnp.pad(t, ((0, 0), (0, 0), (0, size), (0, 0)), constant_values=0) return result - q_len = q.shape[-2] - k_len = k.shape[-2] - # Pad q, k, v sequence, align to block sizes q = _pad_token(q, _align_to(q_len, block_q) - q_len) k = _pad_token(k, _align_to(k_len, block_kv) - k_len) v = _pad_token(v, _align_to(k_len, block_kv) - k_len) + # Handle the k padding to avoid numeric error + if k.shape[-2] > k_len: + # padded q doesn't matter since it can directly strip out from result + segment_ids = splash.SegmentIds( + q=jnp.ones((q.shape[-2],), dtype=jnp.int32), + kv=jnp.pad( + jnp.ones((k_len,), dtype=jnp.int32), + ((0, k.shape[-2] - k_len),), + constant_values=0, + ), + ) padded_q_len = q.shape[-2] padded_kv_len = k.shape[-2] + print(f"{padded_q_len=}, {padded_kv_len=}") # Attention mask mask = mask_lib.FullMask(_shape=(padded_q_len, padded_kv_len)) if causal: @@ -260,6 +292,7 @@ def attention_fn( q: jax.Array, k: jax.Array, v: jax.Array, + segment_ids: Optional[splash.SegmentIds], block_q: int, block_kv: int, block_kv_compute: int, @@ -291,7 +324,7 @@ def attention_fn( ) f = _get_tokamax_benchmark_fn(mask, config, mode, mqa=mqa) - return f(q, k, v) + return f(q, k, v, segment_ids) attention_fn = 
partial( attention_fn, @@ -331,7 +364,7 @@ def attention_fn( ) # Run once - output = tuned_splash(q, k, v) + output = tuned_splash(q, k, v, segment_ids) jax.block_until_ready(output) print("-" * 50) @@ -343,17 +376,19 @@ def attention_fn( print(f"{hyperparams=}") print("-" * 50) + is_event_filter_segmented = "" if segment_ids is None else "segmented_" # Run benchmark time_ms_list = timeit_from_trace( tuned_splash, q, k, v, + segment_ids, tries=num_runs, task="tokamax_splash_attentionatt", trace_dir=trace_dir, event_name_str_list=[ - f"{event_filter_regex}_no_residuals.1", + f"{event_filter_regex}_{is_event_filter_segmented}no_residuals.1", ] ) return { From e7c1649aa14e6abd7949d897c115e0f2c962b64c Mon Sep 17 00:00:00 2001 From: "Yu-Hsuan (Amy) Lin" Date: Fri, 13 Feb 2026 08:42:11 +0000 Subject: [PATCH 68/88] Fix retry command --- Ironwood/guides/automation/automation_launch.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Ironwood/guides/automation/automation_launch.sh b/Ironwood/guides/automation/automation_launch.sh index cb89e725..03f781ca 100755 --- a/Ironwood/guides/automation/automation_launch.sh +++ b/Ironwood/guides/automation/automation_launch.sh @@ -230,7 +230,7 @@ if [[ ${#FAILED_JOBS[@]} -gt 0 ]]; then for yaml_file in "${FAILED_JOBS[@]}"; do job_name=$(basename "${yaml_file}" .yaml | tr '[:upper:]' '[:lower:]' | tr '_' '-') GCS_PATH="${GCS_BUCKET_ROOT_DIR}/${job_name}" - echo "JOB_NAME=\"${job_name}\" GCS_PATH=\"${GCS_PATH}\" envsubst '\${JOB_NAME} \${GCS_PATH}' < \"${SCRIPT_DIR}/${yaml_file}\" | kubectl apply -f -" + echo "JOB_NAME=\"${job_name}\" GCS_PATH=\"${GCS_PATH}\" GCS_SA_NAME=\"${GCS_SA_NAME}\" envsubst '\${JOB_NAME} \${GCS_PATH} \${GCS_SA_NAME}' < \"${SCRIPT_DIR}/${yaml_file}\" | kubectl apply -f -" done else echo "Success! All jobs finished." 
From c2bec50175a5f5d51bcea84a053a45e5af502793 Mon Sep 17 00:00:00 2001 From: Chi Shuen Lee Date: Fri, 13 Feb 2026 17:57:43 +0000 Subject: [PATCH 69/88] Remove BMM multi-host runs from the 2x2x1 yaml file to avoid confusion. --- Ironwood/guides/automation/tpu7x-2x2x1-bmm.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/Ironwood/guides/automation/tpu7x-2x2x1-bmm.yaml b/Ironwood/guides/automation/tpu7x-2x2x1-bmm.yaml index 6257acfc..1b5b9774 100644 --- a/Ironwood/guides/automation/tpu7x-2x2x1-bmm.yaml +++ b/Ironwood/guides/automation/tpu7x-2x2x1-bmm.yaml @@ -54,7 +54,6 @@ spec: GCS_BUCKET_DIR=${GCS_PATH} python Ironwood/src/run_benchmark.py --config="Ironwood/configs/bmm/single_device_bmm.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} - python Ironwood/src/run_benchmark.py --config="Ironwood/configs/bmm/multi_host_bmm.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} resources: requests: google.com/tpu: 4 From f4f89ee12224aabc762df8334b3829ab094fe857 Mon Sep 17 00:00:00 2001 From: Pulasthi Supun Date: Thu, 19 Feb 2026 13:02:07 -0800 Subject: [PATCH 70/88] Adding CCC based autoscaler files (#109) * Adding CCC based autoscaler files Signed-off-by: pulasthi * adding Readme file --------- Signed-off-by: pulasthi --- .../guides/automation/autoscaling/README.md | 111 ++++++++ .../autoscaling/automation_launch.sh | 267 ++++++++++++++++++ .../autoscaling/check_ccc_resources.sh | 82 ++++++ .../autoscaling/create_ccc_templates.sh | 98 +++++++ .../automation/autoscaling/job-queue-CCC.yaml | 40 +++ .../autoscaling/tpu-ccc-template.yaml | 19 ++ .../autoscaling/tpu7x-2x2x1-bmm.yaml | 62 ++++ .../autoscaling/tpu7x-2x2x1-collectives.yaml | 64 +++++ .../autoscaling/tpu7x-2x2x1-gemm.yaml | 62 ++++ .../tpu7x-2x2x1-gemm_all_reduce.yaml | 62 ++++ .../autoscaling/tpu7x-2x2x1-hbm.yaml | 62 ++++ .../autoscaling/tpu7x-2x2x1-host_device.yaml | 62 ++++ .../autoscaling/tpu7x-2x2x2-collectives.yaml | 64 +++++ .../autoscaling/tpu7x-2x2x4-collectives.yaml | 64 +++++ 
.../autoscaling/tpu7x-2x4x4-collectives.yaml | 64 +++++ .../autoscaling/tpu7x-4x4x4-collectives.yaml | 64 +++++ .../autoscaling/tpu7x-4x4x8-collectives.yaml | 64 +++++ 17 files changed, 1311 insertions(+) create mode 100644 Ironwood/guides/automation/autoscaling/README.md create mode 100755 Ironwood/guides/automation/autoscaling/automation_launch.sh create mode 100644 Ironwood/guides/automation/autoscaling/check_ccc_resources.sh create mode 100755 Ironwood/guides/automation/autoscaling/create_ccc_templates.sh create mode 100644 Ironwood/guides/automation/autoscaling/job-queue-CCC.yaml create mode 100644 Ironwood/guides/automation/autoscaling/tpu-ccc-template.yaml create mode 100644 Ironwood/guides/automation/autoscaling/tpu7x-2x2x1-bmm.yaml create mode 100644 Ironwood/guides/automation/autoscaling/tpu7x-2x2x1-collectives.yaml create mode 100644 Ironwood/guides/automation/autoscaling/tpu7x-2x2x1-gemm.yaml create mode 100644 Ironwood/guides/automation/autoscaling/tpu7x-2x2x1-gemm_all_reduce.yaml create mode 100644 Ironwood/guides/automation/autoscaling/tpu7x-2x2x1-hbm.yaml create mode 100644 Ironwood/guides/automation/autoscaling/tpu7x-2x2x1-host_device.yaml create mode 100644 Ironwood/guides/automation/autoscaling/tpu7x-2x2x2-collectives.yaml create mode 100644 Ironwood/guides/automation/autoscaling/tpu7x-2x2x4-collectives.yaml create mode 100644 Ironwood/guides/automation/autoscaling/tpu7x-2x4x4-collectives.yaml create mode 100644 Ironwood/guides/automation/autoscaling/tpu7x-4x4x4-collectives.yaml create mode 100644 Ironwood/guides/automation/autoscaling/tpu7x-4x4x8-collectives.yaml diff --git a/Ironwood/guides/automation/autoscaling/README.md b/Ironwood/guides/automation/autoscaling/README.md new file mode 100644 index 00000000..e8bbde50 --- /dev/null +++ b/Ironwood/guides/automation/autoscaling/README.md @@ -0,0 +1,111 @@ +# Ironwood Benchmark Automation With CCC for nodepool creation + +This directory contains the automation framework for running TPU 
microbenchmarks (HBM, Host-Device, Collectives, etc.) on GKE clusters with autoscaling enabled through CCC. The tool simplifies the workflow of launching multiple benchmark jobs via [Kueue](https://kueue.sigs.k8s.io/), monitoring their status, handling retries, and aggregating the final results into a unified format. + +The autoscaling version of the script uses CustomComputeClass (CCC) to manage the creation and deletion of the required nodepools automatically based on the workloads. + +## Overview + +The automation workflow consists of three main stages: +1. **Launch**: Submits Kubernetes Jobs for various benchmark configurations (e.g., different topologies like 2x2x1, 2x2x2) using Kueue for queue management. +2. **Monitor & Retry**: Watches the jobs until completion. If any job fails, it automatically retries them (up to 3 times by default). +3. **Aggregate**: Once all jobs succeed, an aggregator job is launched to collect all intermediate results from GCS and consolidate them into summary TSV files. + +## Prerequisites + +Before running the automation script, ensure the following requirements are met: + +### 1. Environment Setup +* **GKE Cluster**: You must have a GKE cluster. +* **Kubectl**: Ensure `kubectl` is installed and authenticated to your cluster. +* **GCS Bucket**: A Google Cloud Storage bucket is required to store intermediate and final aggregated results. + ```bash + gcloud storage buckets create gs://my-unique-bucket-name --location=us-central1 + ``` + +### 2. Install Kueue +The automation relies on Kueue for job queuing. Check if it's already installed: + +```bash +kubectl get namespace kueue-system +``` + +If you see `Error from server (NotFound)`, install it with: + +```bash +kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.16.0/manifests.yaml +``` + +### 3. Verify requirements for CCC +In order for CCC to work, the correct set of CCC templates needs to be created.
If you have not already done so, allowing pre-flight checks to run +when the script prompts for it will install all the required CCC templates (templates for different TPU topologies 2x2x1, 2x2x2, etc.) + +## Directory Structure + +* `automation_launch.sh`: The main entry point script. Manages the full lifecycle of the benchmark run. +* `check_ccc_resources.sh`: Validation script that makes sure all CCC related resources are created. +* `create_ccc_templates.sh`: Create the required CCC related resources. +* `../aggregator.py`: Python script that downloads results from GCS and produces summary tables. +* `../aggregator.yaml`: Kubernetes Job definition for running the aggregator. +* `job-queue-CCC.yaml`: Kueue resource definitions (ClusterQueue, LocalQueue). +* `*.yaml`: Benchmark job configurations (e.g., `tpu7x-2x2x1-hbm.yaml`). + +## Configuration + +You can configure the behavior using the following environment variable: + +| Variable | Description | Required | Default | +| :--- | :--- | :--- | :--- | +| `GCS_BUCKET_ROOT_DIR` | The root GCS path where results will be stored. Must start with `gs://`. | **Yes** | `gs://example-microbenchmark` (Change this!) | + +## Usage Guide + +1. **Clone the Repository**: + ```bash + git clone https://github.com/google/accelerator-microbenchmarks.git + cd accelerator-microbenchmarks + # Switch to the correct branch if necessary + git checkout tpu7x-auto + ``` + +2. **Set the GCS Bucket**: + Export the path to your GCS bucket. This is where all results will be saved. + ```bash + export GCS_BUCKET_ROOT_DIR="gs://your-unique-bucket-name/benchmark_runs/$(date +%Y%m%d_%H%M%S)" + ``` + +3. **Run the Automation Script**: + Execute the launch script from the root of the repository. + ```bash + bash Ironwood/guides/automation/automation_launch.sh + ``` + + **What happens next?** + * If pre-flight checks are enabled, it checks the CCC resources (and creates them if needed) and checks GCS permissions. + * It applies the Kueue job queue.
+ * It submits the benchmark jobs defined in the script (e.g., HBM tests). + * It waits for jobs to finish, retrying any failures up to 3 times. + * Finally, it launches the `aggregator` job. + +## Output + +After the automation completes, check your GCS bucket (`GCS_BUCKET_ROOT_DIR`). You will find: + +* **`aggregated_results/`**: Contains the final summary CSV/TSV files (e.g., `hbm.tsv`, `collectives.tsv`). +* **`/`**: Directories for each individual job containing intermediate results. + +## Troubleshooting + +### Job Failures +If jobs fail even after retries: +1. Check the script output to see which specific jobs failed. +2. Inspect the logs of a failed job using `kubectl logs job/`. +3. Manually retry a specific job if needed using the command printed by the script at the end of the run. + +### Missing Results +If the `aggregated_results` folder is empty: +1. Check the logs of the aggregator job: + ```bash + kubectl logs job/aggregator + ``` +2. Ensure the `GCS_BUCKET_ROOT_DIR` was accessible by the pods (check Workload Identity or service account permissions if running in a restricted project). diff --git a/Ironwood/guides/automation/autoscaling/automation_launch.sh b/Ironwood/guides/automation/autoscaling/automation_launch.sh new file mode 100755 index 00000000..2823a4ef --- /dev/null +++ b/Ironwood/guides/automation/autoscaling/automation_launch.sh @@ -0,0 +1,267 @@ +#!/usr/bin/env bash + +###################################################################### +# automation_launch.sh: Run a series of TPU microbenchmark jobs +###################################################################### +# This script automates the process of launching multiple TPU microbenchmark +# jobs defined in various YAML files. It handles: +# - Pre-flight checks for necessary CCC resources and GCS permissions. +# - Applying job YAMLs to a Kubernetes cluster. +# - Waiting for jobs to complete, with a timeout. +# - Retrying failed jobs up to a configurable number of times. 
+# - Aggregating results using a separate aggregator job. +# - Reporting on any jobs that ultimately failed. +# +# User-configurable variables are at the top of the script. +###################################################################### + +###################################################################### +# USER INPUT +###################################################################### +TIMESTAMP=$(date +%Y-%m-%d_%H-%M-%S) +export GCS_BUCKET_ROOT_DIR="gs://pulasthi-ccc-testb1/test5" +export GCS_SA_NAME="gcs-writer" # Service account with write access to GCS_BUCKET_ROOT_DIR +export PROJECT_ID=$(gcloud config get-value project 2>/dev/null) +MAX_RETRIES=3 +TIMEOUT_SECOND=3600 + +yaml_names=( + "tpu7x-2x2x1-hbm.yaml" + "tpu7x-2x4x4-collectives.yaml" + "tpu7x-2x2x1-gemm_all_reduce.yaml" +) + +################################################################################ +# COLOR OUTPUT +################################################################################ + +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +function print_success() { + echo -e "${GREEN}✅ $1${NC}" +} + +function print_error() { + echo -e "${RED}❌ $1${NC}" +} + +function print_info() { + echo -e "${BLUE}ℹ️ $1${NC}" +} + +function print_warning() { + echo -e "${YELLOW}⚠️ $1${NC}" +} + +###################################################################### +# VALIDATION & SETUP +###################################################################### + +if [[ -z "${GCS_BUCKET_ROOT_DIR}" || "${GCS_BUCKET_ROOT_DIR}" != "gs://"* ]]; then + print_error "GCS_BUCKET_ROOT_DIR must be set and start with gs://" + exit 1 +fi + +print_info "The intermediate result will be written to ${GCS_BUCKET_ROOT_DIR}" + +read -p "Run pre-flight checks (CCC resource validation & GCS permissions)? (y/n): " run_checks + +if [[ "$run_checks" == "y" ]]; then + print_info "Running CCC resource validation..." 
+ required_topologies=($(printf "%s\n" "${yaml_names[@]}" | grep -oE '[0-9]+x[0-9]+x[0-9]+' | sort -u)) + SCRIPT_DIR="$(dirname "$(realpath "$0")")" + if ! bash "${SCRIPT_DIR}/check_ccc_resources.sh"; then + print_error "Some required CCC resources are missing. Please run create_ccc_templates.sh first. Make sure to fill the requierd variables." + exit 1 + fi + + print_info "Running GCS permission check..." + export SA_NAME="${GCS_SA_NAME}" + export PROJECT_ID="${PROJECT_ID}" + if ! bash "${SCRIPT_DIR}/../check_gcs_permissions.sh"; then + print_error "GCS Permission Check Failed. Exiting." + exit 1 + fi +else + print_warning "Skipping pre-flight checks." +fi + +SCRIPT_DIR="$(dirname "$(realpath "$0")")" +kubectl apply -f ${SCRIPT_DIR}/job-queue-CCC.yaml + +###################################################################### +# LAUNCH JOBS & WAIT FOR COMPLETION +###################################################################### + + +# Function to wait for a job to complete or fail +wait_for_job_completion() { + local job_name="$1" + local timeout="$2" + local start_time=$(date +%s) + local end_time=$((start_time + timeout)) + + while true; do + current_time=$(date +%s) + if [[ $current_time -gt $end_time ]]; then + print_error "Timeout waiting for job ${job_name}" + return 2 + fi + + # Check for Complete condition + if kubectl get job "${job_name}" -o jsonpath='{.status.conditions[?(@.type=="Complete")].status}' 2>/dev/null | grep -q "True"; then + print_success "Job ${job_name} completed successfully!" + return 0 + fi + + # Check for Failed condition + if kubectl get job "${job_name}" -o jsonpath='{.status.conditions[?(@.type=="Failed")].status}' 2>/dev/null | grep -q "True"; then + print_error "Job ${job_name} FAILED!" 
+ return 1 + fi + + sleep 5 + done +} + +# Function to apply jobs and wait for them to complete +# Returns a list of failed yaml files in the variable FAILED_JOBS +apply_and_wait() { + local yaml_files=("$@") + local job_names_in_batch=() + FAILED_JOBS=() + + print_info "Processing batch of ${#yaml_files[@]} jobs..." + + # Launch all jobs + for yaml_file in "${yaml_files[@]}"; do + local filepath="${SCRIPT_DIR}/${yaml_file}" + # Derive job name: remove .yaml, lowercase, replace _ with - + local job_name=$(basename "${yaml_file}" .yaml | tr '[:upper:]' '[:lower:]' | tr '_' '-') + random_suffix=$(head /dev/urandom | tr -dc a-z0-9 | head -c 5) + export JOB_NAME="${job_name}-${random_suffix}" + export GCS_PATH="${GCS_BUCKET_ROOT_DIR}/${job_name}" + + print_info "Launching job: ${filepath} (name: ${JOB_NAME})" + envsubst '${JOB_NAME} ${GCS_PATH} ${GCS_SA_NAME}' < "${filepath}" | kubectl apply -f - + job_names_in_batch+=("${JOB_NAME}") + done + + # Monitor jobs + local start_time=$(date +%s) + local end_time=$((start_time + TIMEOUT_SECOND)) + local last_print_time=0 + + while true; do + local current_time=$(date +%s) + if [[ $current_time -gt $end_time ]]; then + print_error "Timeout waiting for batch completion" + break + fi + + # Identify active jobs + local active_jobs=() + for job_name in "${job_names_in_batch[@]}"; do + # Check for Complete + if kubectl get job "${job_name}" -o jsonpath='{.status.conditions[?(@.type=="Complete")].status}' 2>/dev/null | grep -q "True"; then + continue + fi + + # Check for Failed + if kubectl get job "${job_name}" -o jsonpath='{.status.conditions[?(@.type=="Failed")].status}' 2>/dev/null | grep -q "True"; then + continue + fi + + # If neither, it's pending/running + active_jobs+=("${job_name}") + done + + if [[ ${#active_jobs[@]} -eq 0 ]]; then + break + fi + + # Dashboard View - Print every 60 seconds + if [[ $((current_time - last_print_time)) -ge 60 ]]; then + print_info 
"======================================================================" + date "+%Y-%m-%d %H:%M:%S" + print_info "----------------------------------------------------------------------" + kubectl get jobs "${active_jobs[@]}" + print_info "======================================================================" + last_print_time=$current_time + fi + + sleep 10 + done + + # Collect results and cleanup + FAILED_JOBS=() + for i in "${!yaml_files[@]}"; do + local yaml_file="${yaml_files[$i]}" + local job_name="${job_names_in_batch[$i]}" + local filepath="${SCRIPT_DIR}/${yaml_file}" + + # Check if failed or still running (timeout) + if ! kubectl get job "${job_name}" -o jsonpath='{.status.conditions[?(@.type=="Complete")].status}' 2>/dev/null | grep -q "True"; then + FAILED_JOBS+=("${yaml_files[$i]}") + fi + + export JOB_NAME="${job_name}" + export GCS_PATH="${GCS_BUCKET_ROOT_DIR}/${job_name}" + envsubst '${JOB_NAME} ${GCS_PATH}' < "${filepath}" | kubectl delete -f - &> /dev/null + done +} + +# Retry loop +current_batch=("${yaml_names[@]}") + +for (( retry=1; retry<=MAX_RETRIES; retry++ )); do + apply_and_wait "${current_batch[@]}" + + if [[ ${#FAILED_JOBS[@]} -eq 0 ]]; then + print_success "All jobs completed successfully in Round ${retry}!" + break + fi + + print_error "Round ${retry} finished. ${#FAILED_JOBS[@]} jobs failed." + current_batch=("${FAILED_JOBS[@]}") + + if [[ ${retry} -lt ${MAX_RETRIES} ]]; then + print_info "Retrying failed jobs..." + print_info "========================================" + print_info "$((retry + 1)) / ${MAX_RETRIES} max retries" + print_info "========================================" + else + print_error "Max retries reached." + fi +done + +echo "" +print_info "Jobs completed. Aggregating results..." 
+echo "" + +# Ensure cleanup of any previous aggregator job to avoid immutable field errors +kubectl delete job aggregator --ignore-not-found=true + +envsubst '${GCS_BUCKET_ROOT_DIR} ${GCS_SA_NAME}' < ${SCRIPT_DIR}/../aggregator.yaml | kubectl apply -f - +wait_for_job_completion "aggregator" ${TIMEOUT_SECOND} +envsubst '${GCS_BUCKET_ROOT_DIR} ${GCS_SA_NAME}' < ${SCRIPT_DIR}/../aggregator.yaml | kubectl delete -f - + +# Print the failed jobs at the end for better visibility. + +if [[ ${#FAILED_JOBS[@]} -gt 0 ]]; then + print_error "The following jobs finally failed after ${MAX_RETRIES} rounds:" + printf '%s\n' "${FAILED_JOBS[@]}" + + echo -e "\nTo retry manually, run:" + for yaml_file in "${FAILED_JOBS[@]}"; do + job_name=$(basename "${yaml_file}" .yaml | tr '[:upper:]' '[:lower:]' | tr '_' '-') + GCS_PATH="${GCS_BUCKET_ROOT_DIR}/${job_name}" + echo "JOB_NAME=\"${job_name}\" GCS_PATH=\"${GCS_PATH}\" envsubst '\${JOB_NAME} \${GCS_PATH}' < \"${SCRIPT_DIR}/${yaml_file}\" | kubectl apply -f -" + done +else + print_success "Success! All jobs finished." +fi diff --git a/Ironwood/guides/automation/autoscaling/check_ccc_resources.sh b/Ironwood/guides/automation/autoscaling/check_ccc_resources.sh new file mode 100644 index 00000000..cb085d66 --- /dev/null +++ b/Ironwood/guides/automation/autoscaling/check_ccc_resources.sh @@ -0,0 +1,82 @@ +#!/bin/bash + +###################################################################### +# check_ccc_resources.sh: Validate existence of CCC resources +###################################################################### +# This script checks if the required Google Cloud Compute resource policies +# and Kubernetes Custom Compute Class (CCC) manifests exist for a given +# list of TPU topologies. +# +# It iterates through the provided TOPOLOGIES array: +# - For multi-host topologies, it verifies the presence of the +# expected workload policy using gcloud. 
+# - It checks for the existence of the Custom Compute Class resource +# in the Kubernetes cluster using kubectl. +# +# The script exits with status 1 if any required resource is missing, +# and status 0 if all resources are found. +###################################################################### + +export TOPOLOGIES=(2x2x1 2x2x2 2x2x4 2x4x4 4x4x4 4x4x8) +PROJECT_ID="${PROJECT_ID:-$(gcloud config get-value project 2>/dev/null)}" +export REGION=$(kubectl get nodes -o jsonpath='{.items[0].metadata.labels.topology\.kubernetes\.io/region}') +CLUSTER_NAME=$(kubectl config current-context | cut -d '_' -f 4) +export RESOURCE_NAME=${CLUSTER_NAME%-gke} + +################################################################################ +# COLOR OUTPUT +################################################################################ + +RED='\033[0;31m' +GREEN='\033[0;32m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +function print_success() { + echo -e "${GREEN}✅ $1${NC}" +} + +function print_error() { + echo -e "${RED}❌ $1${NC}" +} + +function print_info() { + echo -e "${BLUE}ℹ️ $1${NC}" +} + +print_info "Checking CCC resources for all topologies" +missing_resources=false + +for TOPOLOGY in "${TOPOLOGIES[@]}" +do + print_info "Checking resources for topology: ${TOPOLOGY}" + # Check workload policy for multi-host topologies + if [[ "${TOPOLOGY}" != "2x2x1" ]]; then + WORKLOAD_POLICY_NAME="${RESOURCE_NAME}-workload-policy${TOPOLOGY}" + if gcloud compute resource-policies describe ${WORKLOAD_POLICY_NAME} --project=${PROJECT_ID} --region=${REGION} &> /dev/null; then + print_success "Workload policy ${WORKLOAD_POLICY_NAME} exists." + else + print_error "Workload policy ${WORKLOAD_POLICY_NAME} is MISSING." + missing_resources=true + fi + else + print_info "Skipping workload policy check for single-host topology ${TOPOLOGY}." 
+ fi + + # Check Custom Compute Class + CCC_NAME="tpuv7-${TOPOLOGY}-class" + if kubectl get computeclass ${CCC_NAME} &> /dev/null; then + print_success "Custom Compute Class ${CCC_NAME} exists." + else + print_error "Custom Compute Class ${CCC_NAME} is MISSING." + missing_resources=true + fi +done + +if [[ "${missing_resources}" == "true" ]]; then + print_error "One or more required resources are missing. Please create them." + exit 1 +else + print_success "All required CCC resources exist." + exit 0 +fi diff --git a/Ironwood/guides/automation/autoscaling/create_ccc_templates.sh b/Ironwood/guides/automation/autoscaling/create_ccc_templates.sh new file mode 100755 index 00000000..5630885d --- /dev/null +++ b/Ironwood/guides/automation/autoscaling/create_ccc_templates.sh @@ -0,0 +1,98 @@ +#!/bin/bash + +###################################################################### +# create_ccc_templates.sh: Create Custom Compute Class templates +###################################################################### +# This script creates the necessary Google Cloud Compute resource policies +# and Kubernetes Custom Compute Class (CCC) manifests for various TPU +# topologies. +# +# It iterates through a predefined list of TOPOLOGIES: +# - For multi-host topologies, it creates a HIGH_THROUGHPUT +# workload policy if it doesn't already exist. +# - It then uses envsubst to populate a template YAML +# (tpu-ccc-template.yaml) with the correct TPU_TOPOLOGY, +# RESERVATION_NAME, PROJECT_ID, and POLICY_NAME. +# - The resulting manifest is applied to the Kubernetes cluster using +# kubectl apply. +# +# Required environment variables: +# - RESERVATION_NAME: The name of the GCE reservation to use. +# - PROJECT_ID: The Google Cloud Project ID. +# - REGION: The Google Cloud Region. +# - RESOURCE_NAME: A base name used for naming resources. 
+###################################################################### + +export RESERVATION_NAME="${RESERVATION_NAME:-}" # keep a caller-provided reservation name instead of overwriting it with an empty string + + +export TOPOLOGIES=(2x2x1 2x2x2 2x2x4 2x4x4 4x4x4 4x4x8) +SCRIPT_DIR="$(dirname "$(realpath "$0")")" +export PROJECT_ID="${PROJECT_ID:-$(gcloud config get-value project 2>/dev/null)}" # exported because envsubst only substitutes variables present in its environment +export REGION=$(kubectl get nodes -o jsonpath='{.items[0].metadata.labels.topology\.kubernetes\.io/region}') +CLUSTER_NAME=$(kubectl config current-context | cut -d '_' -f 4) +export RESOURCE_NAME=${CLUSTER_NAME%-gke} # assumes cluster was created with setup script which creates cluster with ${RESOURCE_NAME}-gke as name +################################################################################ +# COLOR OUTPUT +################################################################################ + +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +function print_header() { + echo -e "\n${BLUE}========================================${NC}" + echo -e "${BLUE}$1${NC}" + echo -e "${BLUE}========================================${NC}\n" +} + +function print_success() { + echo -e "${GREEN}✅ $1${NC}" +} + +function print_error() { + echo -e "${RED}❌ $1${NC}" +} + +function print_warning() { + echo -e "${YELLOW}⚠️ $1${NC}" +} + +function print_info() { + echo -e "${BLUE}ℹ️ $1${NC}" +} + +print_info "Creating CCC templates for all topologies" +# Create workload policy +for TOPOLOGY in "${TOPOLOGIES[@]}" +do + export TPU_TOPOLOGY=${TOPOLOGY} + if [[ "${TOPOLOGY}" == "2x2x1" ]]; then + print_warning "Skipping workload policy creation for ${TOPOLOGY} as it is not needed for single host topologies." + export POLICY_NAME="" # No policy for single host + else + WORKLOAD_POLICY_NAME="${RESOURCE_NAME}-workload-policy${TOPOLOGY}" + if gcloud compute resource-policies describe ${WORKLOAD_POLICY_NAME} --project=${PROJECT_ID} --region=${REGION} &> /dev/null; then + print_info "Workload policy ${WORKLOAD_POLICY_NAME} already exists."
+ else + print_info "Creating workload policy ${WORKLOAD_POLICY_NAME}..." + gcloud compute resource-policies create workload-policy ${WORKLOAD_POLICY_NAME} \ + --type HIGH_THROUGHPUT \ + --accelerator-topology ${TOPOLOGY} \ + --project ${PROJECT_ID} \ + --region ${REGION} + print_success "Workload policy ${WORKLOAD_POLICY_NAME} created." + fi + export POLICY_NAME=${WORKLOAD_POLICY_NAME} + fi + + echo "${TPU_TOPOLOGY} ${RESERVATION_NAME} ${PROJECT_ID} ${POLICY_NAME}" + if [[ "${TOPOLOGY}" == "2x2x1" ]]; then + envsubst '${TPU_TOPOLOGY} ${RESERVATION_NAME} ${PROJECT_ID}' < ${SCRIPT_DIR}/tpu-ccc-template.yaml | sed '/placement:/,/policyName:/d' | kubectl apply -f - + else + envsubst '${TPU_TOPOLOGY} ${RESERVATION_NAME} ${PROJECT_ID} ${POLICY_NAME}' < ${SCRIPT_DIR}/tpu-ccc-template.yaml | kubectl apply -f - + fi + print_success "Applied TPU Compute Class for ${TOPOLOGY}" +done diff --git a/Ironwood/guides/automation/autoscaling/job-queue-CCC.yaml b/Ironwood/guides/automation/autoscaling/job-queue-CCC.yaml new file mode 100644 index 00000000..c1c155ce --- /dev/null +++ b/Ironwood/guides/automation/autoscaling/job-queue-CCC.yaml @@ -0,0 +1,40 @@ +apiVersion: kueue.x-k8s.io/v1beta2 +kind: ResourceFlavor +metadata: + name: "flavor-tpu7x" +spec: + nodeLabels: + cloud.google.com/gke-tpu-accelerator: tpu7x +--- +apiVersion: kueue.x-k8s.io/v1beta2 +kind: ClusterQueue +metadata: + name: cluster-queue-tpu7x +spec: + flavorFungibility: + whenCanBorrow: MayStopSearch + whenCanPreempt: TryNextFlavor + namespaceSelector: {} + preemption: + borrowWithinCohort: + policy: Never + reclaimWithinCohort: Never + withinClusterQueue: LowerPriority + queueingStrategy: BestEffortFIFO + resourceGroups: + - coveredResources: + - google.com/tpu + flavors: + - name: flavor-tpu7x + resources: + - name: google.com/tpu + nominalQuota: 128 + stopPolicy: None +--- +apiVersion: kueue.x-k8s.io/v1beta1 +kind: LocalQueue +metadata: + namespace: default + name: "user-queue-tpu7x" +spec: + clusterQueue: 
"cluster-queue-tpu7x" \ No newline at end of file diff --git a/Ironwood/guides/automation/autoscaling/tpu-ccc-template.yaml b/Ironwood/guides/automation/autoscaling/tpu-ccc-template.yaml new file mode 100644 index 00000000..100175f4 --- /dev/null +++ b/Ironwood/guides/automation/autoscaling/tpu-ccc-template.yaml @@ -0,0 +1,19 @@ +apiVersion: cloud.google.com/v1 +kind: ComputeClass +metadata: + name: tpuv7-${TPU_TOPOLOGY}-class +spec: + priorities: + - tpu: + type: tpu7x + topology: ${TPU_TOPOLOGY} + count: 4 + reservations: + specific: + - name: ${RESERVATION_NAME} + project: ${PROJECT_ID} + affinity: Specific + placement: + policyName: ${POLICY_NAME} + nodePoolAutoCreation: + enabled: true \ No newline at end of file diff --git a/Ironwood/guides/automation/autoscaling/tpu7x-2x2x1-bmm.yaml b/Ironwood/guides/automation/autoscaling/tpu7x-2x2x1-bmm.yaml new file mode 100644 index 00000000..1a4c777e --- /dev/null +++ b/Ironwood/guides/automation/autoscaling/tpu7x-2x2x1-bmm.yaml @@ -0,0 +1,62 @@ +apiVersion: v1 +kind: Service +metadata: + name: headless-svc-${JOB_NAME} +spec: + clusterIP: None + selector: + job-name: ${JOB_NAME} +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: ${JOB_NAME} + labels: + kueue.x-k8s.io/queue-name: user-queue-tpu7x +spec: + completionMode: Indexed + suspend: true + parallelism: 1 + completions: 1 + backoffLimit: 0 + template: + spec: + subdomain: headless-svc-${JOB_NAME} + serviceAccountName: ${GCS_SA_NAME} + restartPolicy: Never + nodeSelector: + cloud.google.com/compute-class: tpuv7-2x2x1-class + cloud.google.com/gke-tpu-accelerator: tpu7x + cloud.google.com/gke-tpu-topology: 2x2x1 + containers: + - name: jax-tpu + image: python:3.12 + securityContext: + privileged: false + env: + - name: JAX_PLATFORMS + value: "tpu,cpu" + - name: TPU_VMODULE + value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" + - name: XLA_IR_DEBUG + value: "1" + - name: XLA_HLO_DEBUG + value: 
"1" + command: + - bash + - -c + - | + set -ex + + git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git + cd accelerator-microbenchmarks + git checkout tpu7x-auto + pip install -r requirements.txt + + GCS_BUCKET_DIR=${GCS_PATH} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/bmm/single_device_bmm.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + resources: + requests: + google.com/tpu: 4 + limits: + google.com/tpu: 4 \ No newline at end of file diff --git a/Ironwood/guides/automation/autoscaling/tpu7x-2x2x1-collectives.yaml b/Ironwood/guides/automation/autoscaling/tpu7x-2x2x1-collectives.yaml new file mode 100644 index 00000000..eb152986 --- /dev/null +++ b/Ironwood/guides/automation/autoscaling/tpu7x-2x2x1-collectives.yaml @@ -0,0 +1,64 @@ +apiVersion: v1 +kind: Service +metadata: + name: headless-svc-${JOB_NAME} +spec: + clusterIP: None + selector: + job-name: ${JOB_NAME} +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: ${JOB_NAME} + labels: + kueue.x-k8s.io/queue-name: user-queue-tpu7x +spec: + completionMode: Indexed + suspend: true + parallelism: 1 + completions: 1 + backoffLimit: 0 + template: + spec: + subdomain: headless-svc-${JOB_NAME} + serviceAccountName: ${GCS_SA_NAME} + restartPolicy: Never + nodeSelector: + cloud.google.com/compute-class: tpuv7-2x2x1-class + cloud.google.com/gke-tpu-accelerator: tpu7x + cloud.google.com/gke-tpu-topology: 2x2x1 + containers: + - name: jax-tpu + image: python:3.12 + securityContext: + privileged: false + env: + - name: JAX_PLATFORMS + value: "tpu,cpu" + - name: TPU_VMODULE + value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" + - name: XLA_IR_DEBUG + value: "1" + - name: XLA_HLO_DEBUG + value: "1" + command: + - bash + - -c + - | + set -ex + + git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git + cd accelerator-microbenchmarks + git checkout tpu7x-auto + pip install -r 
requirements.txt + + GCS_BUCKET_DIR=${GCS_PATH} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + resources: + requests: + google.com/tpu: 4 + limits: + google.com/tpu: 4 \ No newline at end of file diff --git a/Ironwood/guides/automation/autoscaling/tpu7x-2x2x1-gemm.yaml b/Ironwood/guides/automation/autoscaling/tpu7x-2x2x1-gemm.yaml new file mode 100644 index 00000000..822a2246 --- /dev/null +++ b/Ironwood/guides/automation/autoscaling/tpu7x-2x2x1-gemm.yaml @@ -0,0 +1,62 @@ +apiVersion: v1 +kind: Service +metadata: + name: headless-svc-${JOB_NAME} +spec: + clusterIP: None + selector: + job-name: ${JOB_NAME} +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: ${JOB_NAME} + labels: + kueue.x-k8s.io/queue-name: user-queue-tpu7x +spec: + completionMode: Indexed + suspend: true + parallelism: 1 + completions: 1 + backoffLimit: 0 + template: + spec: + subdomain: headless-svc-${JOB_NAME} + serviceAccountName: ${GCS_SA_NAME} + restartPolicy: Never + nodeSelector: + cloud.google.com/compute-class: tpuv7-2x2x1-class + cloud.google.com/gke-tpu-accelerator: tpu7x + cloud.google.com/gke-tpu-topology: 2x2x1 + containers: + - name: jax-tpu + image: python:3.12 + securityContext: + privileged: false + env: + - name: JAX_PLATFORMS + value: "tpu,cpu" + - name: TPU_VMODULE + value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" + - name: XLA_IR_DEBUG + value: "1" + - name: XLA_HLO_DEBUG + value: "1" + command: + - bash + - -c + - | + set -ex + + git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git + cd 
accelerator-microbenchmarks + git checkout tpu7x-auto + pip install -r requirements.txt + + GCS_BUCKET_DIR=${GCS_PATH} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/gemm/gemm_multiple_run_more.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + resources: + requests: + google.com/tpu: 4 + limits: + google.com/tpu: 4 \ No newline at end of file diff --git a/Ironwood/guides/automation/autoscaling/tpu7x-2x2x1-gemm_all_reduce.yaml b/Ironwood/guides/automation/autoscaling/tpu7x-2x2x1-gemm_all_reduce.yaml new file mode 100644 index 00000000..11b1fce7 --- /dev/null +++ b/Ironwood/guides/automation/autoscaling/tpu7x-2x2x1-gemm_all_reduce.yaml @@ -0,0 +1,62 @@ +apiVersion: v1 +kind: Service +metadata: + name: headless-svc-${JOB_NAME} +spec: + clusterIP: None + selector: + job-name: ${JOB_NAME} +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: ${JOB_NAME} + labels: + kueue.x-k8s.io/queue-name: user-queue-tpu7x +spec: + completionMode: Indexed + suspend: true + parallelism: 1 + completions: 1 + backoffLimit: 0 + template: + spec: + subdomain: headless-svc-${JOB_NAME} + serviceAccountName: ${GCS_SA_NAME} + restartPolicy: Never + nodeSelector: + cloud.google.com/compute-class: tpuv7-2x2x1-class + cloud.google.com/gke-tpu-accelerator: tpu7x + cloud.google.com/gke-tpu-topology: 2x2x1 + containers: + - name: jax-tpu + image: python:3.12 + securityContext: + privileged: false + env: + - name: JAX_PLATFORMS + value: "tpu,cpu" + - name: TPU_VMODULE + value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" + - name: XLA_IR_DEBUG + value: "1" + - name: XLA_HLO_DEBUG + value: "1" + command: + - bash + - -c + - | + set -ex + + git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git + cd accelerator-microbenchmarks + git checkout tpu7x-auto + pip install -r requirements.txt + + GCS_BUCKET_DIR=${GCS_PATH} + python Ironwood/src/run_benchmark.py 
--config="Ironwood/configs/gemm_all_reduce/gemm_all_reduce.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + resources: + requests: + google.com/tpu: 4 + limits: + google.com/tpu: 4 diff --git a/Ironwood/guides/automation/autoscaling/tpu7x-2x2x1-hbm.yaml b/Ironwood/guides/automation/autoscaling/tpu7x-2x2x1-hbm.yaml new file mode 100644 index 00000000..f589cc98 --- /dev/null +++ b/Ironwood/guides/automation/autoscaling/tpu7x-2x2x1-hbm.yaml @@ -0,0 +1,62 @@ +apiVersion: v1 +kind: Service +metadata: + name: headless-svc-${JOB_NAME} +spec: + clusterIP: None + selector: + job-name: ${JOB_NAME} +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: ${JOB_NAME} + labels: + kueue.x-k8s.io/queue-name: user-queue-tpu7x +spec: + completionMode: Indexed + suspend: true + parallelism: 1 + completions: 1 + backoffLimit: 0 + template: + spec: + subdomain: headless-svc-${JOB_NAME} + serviceAccountName: ${GCS_SA_NAME} + restartPolicy: Never + nodeSelector: + cloud.google.com/compute-class: tpuv7-2x2x1-class + cloud.google.com/gke-tpu-accelerator: tpu7x + cloud.google.com/gke-tpu-topology: 2x2x1 + containers: + - name: jax-tpu + image: python:3.12 + securityContext: + privileged: false + env: + - name: JAX_PLATFORMS + value: "tpu,cpu" + - name: TPU_VMODULE + value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" + - name: XLA_IR_DEBUG + value: "1" + - name: XLA_HLO_DEBUG + value: "1" + command: + - bash + - -c + - | + set -ex + + git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git + cd accelerator-microbenchmarks + git checkout tpu7x-auto + pip install -r requirements.txt + + GCS_BUCKET_DIR=${GCS_PATH} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/hbm/hbm.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + resources: + requests: + google.com/tpu: 4 + limits: + google.com/tpu: 4 \ No newline at end of file diff --git 
a/Ironwood/guides/automation/autoscaling/tpu7x-2x2x1-host_device.yaml b/Ironwood/guides/automation/autoscaling/tpu7x-2x2x1-host_device.yaml new file mode 100644 index 00000000..bc9c0819 --- /dev/null +++ b/Ironwood/guides/automation/autoscaling/tpu7x-2x2x1-host_device.yaml @@ -0,0 +1,62 @@ +apiVersion: v1 +kind: Service +metadata: + name: headless-svc-${JOB_NAME} +spec: + clusterIP: None + selector: + job-name: ${JOB_NAME} +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: ${JOB_NAME} + labels: + kueue.x-k8s.io/queue-name: user-queue-tpu7x +spec: + completionMode: Indexed + suspend: true + parallelism: 1 + completions: 1 + backoffLimit: 0 + template: + spec: + subdomain: headless-svc-${JOB_NAME} + serviceAccountName: ${GCS_SA_NAME} + restartPolicy: Never + nodeSelector: + cloud.google.com/compute-class: tpuv7-2x2x1-class + cloud.google.com/gke-tpu-accelerator: tpu7x + cloud.google.com/gke-tpu-topology: 2x2x1 + containers: + - name: jax-tpu + image: python:3.12 + securityContext: + privileged: false + env: + - name: JAX_PLATFORMS + value: "tpu,cpu" + - name: TPU_VMODULE + value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" + - name: XLA_IR_DEBUG + value: "1" + - name: XLA_HLO_DEBUG + value: "1" + command: + - bash + - -c + - | + set -ex + + git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git + cd accelerator-microbenchmarks + git checkout tpu7x-auto + pip install -r requirements.txt + + GCS_BUCKET_DIR=${GCS_PATH} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/host_device/host_device.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + resources: + requests: + google.com/tpu: 4 + limits: + google.com/tpu: 4 \ No newline at end of file diff --git a/Ironwood/guides/automation/autoscaling/tpu7x-2x2x2-collectives.yaml b/Ironwood/guides/automation/autoscaling/tpu7x-2x2x2-collectives.yaml new file mode 100644 index 00000000..7915ff28 --- /dev/null +++ 
b/Ironwood/guides/automation/autoscaling/tpu7x-2x2x2-collectives.yaml @@ -0,0 +1,64 @@ +apiVersion: v1 +kind: Service +metadata: + name: headless-svc-${JOB_NAME} +spec: + clusterIP: None + selector: + job-name: ${JOB_NAME} +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: ${JOB_NAME} + labels: + kueue.x-k8s.io/queue-name: user-queue-tpu7x +spec: + completionMode: Indexed + suspend: true + parallelism: 2 + completions: 2 + backoffLimit: 0 + template: + spec: + subdomain: headless-svc-${JOB_NAME} + serviceAccountName: ${GCS_SA_NAME} + restartPolicy: Never + nodeSelector: + cloud.google.com/compute-class: tpuv7-2x2x2-class + cloud.google.com/gke-tpu-accelerator: tpu7x + cloud.google.com/gke-tpu-topology: 2x2x2 + containers: + - name: jax-tpu + image: python:3.12 + securityContext: + privileged: false + env: + - name: JAX_PLATFORMS + value: "tpu,cpu" + - name: TPU_VMODULE + value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" + - name: XLA_IR_DEBUG + value: "1" + - name: XLA_HLO_DEBUG + value: "1" + command: + - bash + - -c + - | + set -ex + + git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git + cd accelerator-microbenchmarks + git checkout tpu7x-auto + pip install -r requirements.txt + + GCS_BUCKET_DIR=${GCS_PATH} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + resources: + requests: + google.com/tpu: 4 + limits: + google.com/tpu: 4 \ No newline at end of file diff --git a/Ironwood/guides/automation/autoscaling/tpu7x-2x2x4-collectives.yaml 
b/Ironwood/guides/automation/autoscaling/tpu7x-2x2x4-collectives.yaml new file mode 100644 index 00000000..60282962 --- /dev/null +++ b/Ironwood/guides/automation/autoscaling/tpu7x-2x2x4-collectives.yaml @@ -0,0 +1,64 @@ +apiVersion: v1 +kind: Service +metadata: + name: headless-svc-${JOB_NAME} +spec: + clusterIP: None + selector: + job-name: ${JOB_NAME} +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: ${JOB_NAME} + labels: + kueue.x-k8s.io/queue-name: user-queue-tpu7x +spec: + completionMode: Indexed + suspend: true + parallelism: 4 + completions: 4 + backoffLimit: 0 + template: + spec: + subdomain: headless-svc-${JOB_NAME} + serviceAccountName: ${GCS_SA_NAME} + restartPolicy: Never + nodeSelector: + cloud.google.com/compute-class: tpuv7-2x2x4-class + cloud.google.com/gke-tpu-accelerator: tpu7x + cloud.google.com/gke-tpu-topology: 2x2x4 + containers: + - name: jax-tpu + image: python:3.12 + securityContext: + privileged: false + env: + - name: JAX_PLATFORMS + value: "tpu,cpu" + - name: TPU_VMODULE + value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" + - name: XLA_IR_DEBUG + value: "1" + - name: XLA_HLO_DEBUG + value: "1" + command: + - bash + - -c + - | + set -ex + + git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git + cd accelerator-microbenchmarks + git checkout tpu7x-auto + pip install -r requirements.txt + + GCS_BUCKET_DIR=${GCS_PATH} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + resources: + requests: + google.com/tpu: 4 + limits: + google.com/tpu: 4 \ No newline at end 
of file diff --git a/Ironwood/guides/automation/autoscaling/tpu7x-2x4x4-collectives.yaml b/Ironwood/guides/automation/autoscaling/tpu7x-2x4x4-collectives.yaml new file mode 100644 index 00000000..343bbf01 --- /dev/null +++ b/Ironwood/guides/automation/autoscaling/tpu7x-2x4x4-collectives.yaml @@ -0,0 +1,64 @@ +apiVersion: v1 +kind: Service +metadata: + name: headless-svc-${JOB_NAME} +spec: + clusterIP: None + selector: + job-name: ${JOB_NAME} +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: ${JOB_NAME} + labels: + kueue.x-k8s.io/queue-name: user-queue-tpu7x +spec: + completionMode: Indexed + suspend: true + parallelism: 8 + completions: 8 + backoffLimit: 0 + template: + spec: + subdomain: headless-svc-${JOB_NAME} + serviceAccountName: ${GCS_SA_NAME} + restartPolicy: Never + nodeSelector: + cloud.google.com/compute-class: tpuv7-2x4x4-class + cloud.google.com/gke-tpu-accelerator: tpu7x + cloud.google.com/gke-tpu-topology: 2x4x4 + containers: + - name: jax-tpu + image: python:3.12 + securityContext: + privileged: false + env: + - name: JAX_PLATFORMS + value: "tpu,cpu" + - name: TPU_VMODULE + value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" + - name: XLA_IR_DEBUG + value: "1" + - name: XLA_HLO_DEBUG + value: "1" + command: + - bash + - -c + - | + set -ex + + git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git + cd accelerator-microbenchmarks + git checkout tpu7x-auto + pip install -r requirements.txt + + GCS_BUCKET_DIR=${GCS_PATH} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + 
resources: + requests: + google.com/tpu: 4 + limits: + google.com/tpu: 4 \ No newline at end of file diff --git a/Ironwood/guides/automation/autoscaling/tpu7x-4x4x4-collectives.yaml b/Ironwood/guides/automation/autoscaling/tpu7x-4x4x4-collectives.yaml new file mode 100644 index 00000000..23f0fb3a --- /dev/null +++ b/Ironwood/guides/automation/autoscaling/tpu7x-4x4x4-collectives.yaml @@ -0,0 +1,64 @@ +apiVersion: v1 +kind: Service +metadata: + name: headless-svc-${JOB_NAME} +spec: + clusterIP: None + selector: + job-name: ${JOB_NAME} +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: ${JOB_NAME} + labels: + kueue.x-k8s.io/queue-name: user-queue-tpu7x +spec: + completionMode: Indexed + suspend: true + parallelism: 16 + completions: 16 + backoffLimit: 0 + template: + spec: + subdomain: headless-svc-${JOB_NAME} + serviceAccountName: ${GCS_SA_NAME} + restartPolicy: Never + nodeSelector: + cloud.google.com/compute-class: tpuv7-4x4x4-class + cloud.google.com/gke-tpu-accelerator: tpu7x + cloud.google.com/gke-tpu-topology: 4x4x4 + containers: + - name: jax-tpu + image: python:3.12 + securityContext: + privileged: false + env: + - name: JAX_PLATFORMS + value: "tpu,cpu" + - name: TPU_VMODULE + value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" + - name: XLA_IR_DEBUG + value: "1" + - name: XLA_HLO_DEBUG + value: "1" + command: + - bash + - -c + - | + set -ex + + git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git + cd accelerator-microbenchmarks + git checkout tpu7x-auto + pip install -r requirements.txt + + GCS_BUCKET_DIR=${GCS_PATH} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + python Ironwood/src/run_benchmark.py 
--config="Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + resources: + requests: + google.com/tpu: 4 + limits: + google.com/tpu: 4 \ No newline at end of file diff --git a/Ironwood/guides/automation/autoscaling/tpu7x-4x4x8-collectives.yaml b/Ironwood/guides/automation/autoscaling/tpu7x-4x4x8-collectives.yaml new file mode 100644 index 00000000..25655ca5 --- /dev/null +++ b/Ironwood/guides/automation/autoscaling/tpu7x-4x4x8-collectives.yaml @@ -0,0 +1,64 @@ +apiVersion: v1 +kind: Service +metadata: + name: headless-svc-${JOB_NAME} +spec: + clusterIP: None + selector: + job-name: ${JOB_NAME} +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: ${JOB_NAME} + labels: + kueue.x-k8s.io/queue-name: user-queue-tpu7x +spec: + completionMode: Indexed + suspend: true + parallelism: 32 + completions: 32 + backoffLimit: 0 + template: + spec: + subdomain: headless-svc-${JOB_NAME} + serviceAccountName: ${GCS_SA_NAME} + restartPolicy: Never + nodeSelector: + cloud.google.com/compute-class: tpuv7-4x4x8-class + cloud.google.com/gke-tpu-accelerator: tpu7x + cloud.google.com/gke-tpu-topology: 4x4x8 + containers: + - name: jax-tpu + image: python:3.12 + securityContext: + privileged: false + env: + - name: JAX_PLATFORMS + value: "tpu,cpu" + - name: TPU_VMODULE + value: "singleton_tpu_system_manager=10,tpu_version_flag=10,device_util=10,device_scanner=10,mesh_builder=10,master=10" + - name: XLA_IR_DEBUG + value: "1" + - name: XLA_HLO_DEBUG + value: "1" + command: + - bash + - -c + - | + set -ex + + git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git + cd accelerator-microbenchmarks + git checkout tpu7x-auto + pip install -r requirements.txt + + GCS_BUCKET_DIR=${GCS_PATH} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + python Ironwood/src/run_benchmark.py 
--config="Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + resources: + requests: + google.com/tpu: 4 + limits: + google.com/tpu: 4 \ No newline at end of file From 1629d32d7fef47c18521f8b6cfe5c2c3b4946fe4 Mon Sep 17 00:00:00 2001 From: Pulasthi Supun Date: Fri, 20 Feb 2026 15:58:28 -0800 Subject: [PATCH 71/88] adding all benchmarks to automation script (#114) --- .../automation/autoscaling/automation_launch.sh | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/Ironwood/guides/automation/autoscaling/automation_launch.sh b/Ironwood/guides/automation/autoscaling/automation_launch.sh index 2823a4ef..bcd041a1 100755 --- a/Ironwood/guides/automation/autoscaling/automation_launch.sh +++ b/Ironwood/guides/automation/autoscaling/automation_launch.sh @@ -19,16 +19,24 @@ # USER INPUT ###################################################################### TIMESTAMP=$(date +%Y-%m-%d_%H-%M-%S) -export GCS_BUCKET_ROOT_DIR="gs://pulasthi-ccc-testb1/test5" +export GCS_BUCKET_ROOT_DIR="gs:///" export GCS_SA_NAME="gcs-writer" # Service account with write access to GCS_BUCKET_ROOT_DIR export PROJECT_ID=$(gcloud config get-value project 2>/dev/null) MAX_RETRIES=3 TIMEOUT_SECOND=3600 yaml_names=( + "tpu7x-2x2x1-bmm.yaml" + "tpu7x-2x2x1-collectives.yaml" + "tpu7x-2x2x1-gemm.yaml" + "tpu7x-2x2x1-gemm_all_reduce.yaml" "tpu7x-2x2x1-hbm.yaml" + "tpu7x-2x2x1-host_device.yaml" + "tpu7x-2x2x2-collectives.yaml" + "tpu7x-2x2x4-collectives.yaml" "tpu7x-2x4x4-collectives.yaml" - "tpu7x-2x2x1-gemm_all_reduce.yaml" + "tpu7x-4x4x4-collectives.yaml" + "tpu7x-4x4x8-collectives.yaml" ) ################################################################################ From 5885a2889b0635827b3e007151d7761f6515feab Mon Sep 17 00:00:00 2001 From: "Yu-Hsuan (Amy) Lin" Date: Wed, 25 Feb 2026 
08:41:32 +0000 Subject: [PATCH 72/88] Add missing 8192 gemm --- Ironwood/configs/gemm/gemm_multiple_run_more.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Ironwood/configs/gemm/gemm_multiple_run_more.yaml b/Ironwood/configs/gemm/gemm_multiple_run_more.yaml index ea89f98b..022a8285 100644 --- a/Ironwood/configs/gemm/gemm_multiple_run_more.yaml +++ b/Ironwood/configs/gemm/gemm_multiple_run_more.yaml @@ -11,6 +11,7 @@ benchmarks: - {m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'bfloat16'} - {m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'bfloat16'} - {m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'bfloat16'} + - {m: 8192, k: 8192, n: 8192, num_runs: 100, dtype: 'bfloat16'} - {m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'bfloat16'} - {m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'bfloat16'} @@ -26,6 +27,7 @@ benchmarks: - {m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'float32'} - {m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'float32'} - {m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'float32'} + - {m: 8192, k: 8192, n: 8192, num_runs: 100, dtype: 'float32'} - {m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float32'} - {m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float32'} @@ -41,6 +43,7 @@ benchmarks: - {m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'float16'} - {m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'float16'} - {m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'float16'} + - {m: 8192, k: 8192, n: 8192, num_runs: 100, dtype: 'float16'} - {m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float16'} - {m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float16'} @@ -56,6 +59,7 @@ benchmarks: - {m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'float8'} - {m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'float8'} - {m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'float8'} + - {m: 8192, k: 8192, n: 8192, num_runs: 100, dtype: 'float8'} - {m: 16384, k: 16384, n: 16384, num_runs: 100, 
dtype: 'float8'} - {m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float8'} @@ -71,5 +75,6 @@ benchmarks: - {m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'float4'} - {m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'float4'} - {m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'float4'} + - {m: 8192, k: 8192, n: 8192, num_runs: 100, dtype: 'float4'} - {m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float4'} - {m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float4'} \ No newline at end of file From 4a284030faa42e89b7b7d458c850f2eb845af50c Mon Sep 17 00:00:00 2001 From: "Amy (Yu-Hsuan) Lin" Date: Tue, 3 Mar 2026 16:11:20 +0800 Subject: [PATCH 73/88] Remove peak flops for fp32, which is unspecified in spec (#117) --- Ironwood/src/benchmark_bmm.py | 4 ++- Ironwood/src/benchmark_gemm.py | 4 +-- Ironwood/src/benchmark_gemm_all_reduce.py | 2 +- Ironwood/src/benchmark_utils.py | 32 ++++++++++++++--------- 4 files changed, 26 insertions(+), 16 deletions(-) diff --git a/Ironwood/src/benchmark_bmm.py b/Ironwood/src/benchmark_bmm.py index f988008e..486c1996 100644 --- a/Ironwood/src/benchmark_bmm.py +++ b/Ironwood/src/benchmark_bmm.py @@ -216,6 +216,8 @@ def multi_host_bmm_calculate_metrics( total_flops, total_flops_all_devices = handle_based_on_sharding( total_flops, sharding_strategy ) + peak_flops_multiplier = get_peak_flops_multiplier(dtype.dtype.name) + peak_flops = PEAK_FLOPS_PER_DEVICE * peak_flops_multiplier if peak_flops_multiplier is not None else None return unified_flops_metrics( m, n, @@ -223,7 +225,7 @@ def multi_host_bmm_calculate_metrics( time_ms_list, total_flops, total_flops_all_devices, - PEAK_FLOPS_PER_DEVICE, + peak_flops, dtype=dtype.dtype.name, b=b, ) diff --git a/Ironwood/src/benchmark_gemm.py b/Ironwood/src/benchmark_gemm.py index 422b1bc4..4d4b8ccc 100644 --- a/Ironwood/src/benchmark_gemm.py +++ b/Ironwood/src/benchmark_gemm.py @@ -147,7 +147,7 @@ def gemm_multiple_run_calculate_metrics( total_flops, SHARDING_STRATEGY ) 
peak_flops_multiplier = get_peak_flops_multiplier(dtype.dtype.name) - peak_flops = PEAK_FLOPS_PER_DEVICE * peak_flops_multiplier + peak_flops = PEAK_FLOPS_PER_DEVICE * peak_flops_multiplier if peak_flops_multiplier is not None else None return unified_flops_metrics( m, n, @@ -332,7 +332,7 @@ def gemm_simple_with_dtype_calculate_metrics( metadata, metrics = unified_flops_metrics( m, n, k, time_ms_list, total_flops, total_flops_all_devices, - PEAK_FLOPS_PER_DEVICE * peak_flops_multiplier) + PEAK_FLOPS_PER_DEVICE * peak_flops_multiplier if peak_flops_multiplier is not None else None) # Add dtype info to metadata for logging metadata["in_dtype"] = in_dtype_str diff --git a/Ironwood/src/benchmark_gemm_all_reduce.py b/Ironwood/src/benchmark_gemm_all_reduce.py index 55744593..11d9dc7b 100644 --- a/Ironwood/src/benchmark_gemm_all_reduce.py +++ b/Ironwood/src/benchmark_gemm_all_reduce.py @@ -200,7 +200,7 @@ def _calculate_metrics_base( dtype_str = dtype.dtype.name peak_flops_multiplier = get_peak_flops_multiplier(dtype_str) - peak_flops = PEAK_FLOPS_PER_DEVICE * peak_flops_multiplier + peak_flops = PEAK_FLOPS_PER_DEVICE * peak_flops_multiplier if peak_flops_multiplier is not None else None return unified_flops_metrics( m, n, k, time_ms_list, total_flops_per_device, total_flops_all_devices, peak_flops, dtype=dtype_str, diff --git a/Ironwood/src/benchmark_utils.py b/Ironwood/src/benchmark_utils.py index 0a45678d..aa111cba 100644 --- a/Ironwood/src/benchmark_utils.py +++ b/Ironwood/src/benchmark_utils.py @@ -1140,7 +1140,7 @@ def unified_flops_metrics( time_ms_list: list[float], total_flops: int, total_flops_all_devices: int, - peak_TFLOPS_per_device: float, + peak_TFLOPS_per_device: float | None = None, dtype: str = None, b: int = None, ) -> Dict[str, Any]: @@ -1158,10 +1158,19 @@ def unified_flops_metrics( total_flops_all_devices / average_time_s / 10**12 for average_time_s in average_time_s_list ] - mfu = [ - tflops_per_sec / peak_TFLOPS_per_device - for tflops_per_sec in 
tflops_per_sec_list - ] + if peak_TFLOPS_per_device is not None: + mfu = [ + tflops_per_sec / peak_TFLOPS_per_device + for tflops_per_sec in tflops_per_sec_list + ] + mfu_statistics = MetricsStatistics(metrics_list=mfu, metrics_name="MFU") + mfu_val = f"{mfu_statistics.statistics['p50']:.2%}" + mfu_raw = mfu_statistics.statistics["p50"] + else: + mfu_statistics = None + mfu_val = "N/A" + mfu_raw = "N/A" + average_time_ms_statistics = MetricsStatistics( metrics_list=time_ms_list, metrics_name="step_time_ms" ) @@ -1171,14 +1180,13 @@ def unified_flops_metrics( tflops_per_sec_all_devices_statistics = MetricsStatistics( metrics_list=tflops_per_sec_all_devices, metrics_name="tflops_per_sec" ) - mfu_statistics = MetricsStatistics(metrics_list=mfu, metrics_name="MFU") dtype_prefix = f"[{dtype}] " if dtype is not None else "" print( f"{dtype_prefix}" f"Total floating-point ops: {total_flops}, Step Time (median): {average_time_ms_statistics.statistics['p50']:.2f}, " f"Throughput (median): {tflops_per_sec_statistics.statistics['p50']:.2f} TFLOP / second / device, " f"TotalThroughput (median): {tflops_per_sec_all_devices_statistics.statistics['p50']:.2f} TFLOP / second, " - f"MFU: {mfu_statistics.statistics['p50']:.2%}" + f"MFU: {mfu_val}" ) # print() # time_ms_list = @@ -1193,7 +1201,7 @@ def unified_flops_metrics( "TotalThroughput(median,TFLOP/s)": tflops_per_sec_all_devices_statistics.statistics[ "p50" ], - "MFU": mfu_statistics.statistics["p50"], + "MFU": mfu_raw, "total_flops": total_flops, # "all_time_ms_list": f"{json.dumps(time_ms_list)}", } @@ -1201,7 +1209,8 @@ def unified_flops_metrics( metrics.update(average_time_ms_statistics.serialize_statistics()) metrics.update(tflops_per_sec_statistics.serialize_statistics()) metrics.update(tflops_per_sec_all_devices_statistics.serialize_statistics()) - metrics.update(mfu_statistics.serialize_statistics()) + if mfu_statistics is not None: + metrics.update(mfu_statistics.serialize_statistics()) metrics = {key: value for key, 
value in metrics.items() if value is not None} return metadata, metrics @@ -1286,7 +1295,7 @@ def str_to_dtype(dtype_str: str) -> jnp.dtype: else: raise ValueError(f"Unsupported dtype string: {dtype_str}") -def get_peak_flops_multiplier(in_dtype_str: str) -> float: +def get_peak_flops_multiplier(in_dtype_str: str) -> float | None: """ Returns the peak FLOPS multiplier relative to the baseline (PEAK_FLOPS_PER_DEVICE) based on the input data type. @@ -1301,8 +1310,7 @@ def get_peak_flops_multiplier(in_dtype_str: str) -> float: # BF16/FP16 is 2x slower than FP8 peak return 0.5 elif in_dtype_lower in ("fp32", "float32"): - # FP32 is 4x slower than FP8 peak - return 0.25 + return None elif in_dtype_lower in ("fp4", "float4_e2m1fn"): # FP4/INT4 is treated the same as FP8 return 1.0 From a495fd631578cc3653b1b02c62dc2a352884b7a1 Mon Sep 17 00:00:00 2001 From: "Yu-Hsuan (Amy) Lin" Date: Fri, 6 Mar 2026 07:31:16 +0000 Subject: [PATCH 74/88] Increase sweeping range for all reduce --- Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml index 93deef9f..8d5ccd90 100644 --- a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml +++ b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml @@ -1,7 +1,7 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 20} + - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 20} trace_dir: "../microbenchmarks/psum_tpu7x_4x4x4" csv_path: "../microbenchmarks/psum_tpu7x_4x4x4" xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_4x4x4" From 
c378bdbafb2e1e62c98189f9ceb0f64e2bcfe2c3 Mon Sep 17 00:00:00 2001 From: "Yu-Hsuan (Amy) Lin" Date: Fri, 6 Mar 2026 10:04:02 +0000 Subject: [PATCH 75/88] Extend configs for gemm and collectives --- .../configs/collectives/all_to_all_tpu7x_4x4x4.yaml | 2 +- Ironwood/configs/gemm/gemm_multiple_run_more.yaml | 12 +++++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml index 5b627a9a..bf30ef9d 100644 --- a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml +++ b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml @@ -1,7 +1,7 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 20} + - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 20} trace_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x4" csv_path: "../microbenchmarks/all_to_all_tpu7x_4x4x4" xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x4" diff --git a/Ironwood/configs/gemm/gemm_multiple_run_more.yaml b/Ironwood/configs/gemm/gemm_multiple_run_more.yaml index 022a8285..7b68aa5b 100644 --- a/Ironwood/configs/gemm/gemm_multiple_run_more.yaml +++ b/Ironwood/configs/gemm/gemm_multiple_run_more.yaml @@ -14,6 +14,8 @@ benchmarks: - {m: 8192, k: 8192, n: 8192, num_runs: 100, dtype: 'bfloat16'} - {m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'bfloat16'} - {m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'bfloat16'} + - {m: 32768, k: 4096, n: 4096, num_runs: 100, dtype: 'bfloat16'} + - {m: 32768, k: 8192, n: 8192, num_runs: 100, dtype: 'bfloat16'} - benchmark_name: "gemm_multiple_run" trace_dir: "../microbenchmarks/gemm_multiple_run_f32" @@ 
-30,6 +32,8 @@ benchmarks: - {m: 8192, k: 8192, n: 8192, num_runs: 100, dtype: 'float32'} - {m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float32'} - {m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float32'} + - {m: 32768, k: 4096, n: 4096, num_runs: 100, dtype: 'float32'} + - {m: 32768, k: 8192, n: 8192, num_runs: 100, dtype: 'float32'} - benchmark_name: "gemm_multiple_run" trace_dir: "../microbenchmarks/gemm_multiple_run_fp16" @@ -46,6 +50,8 @@ benchmarks: - {m: 8192, k: 8192, n: 8192, num_runs: 100, dtype: 'float16'} - {m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float16'} - {m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float16'} + - {m: 32768, k: 4096, n: 4096, num_runs: 100, dtype: 'float16'} + - {m: 32768, k: 8192, n: 8192, num_runs: 100, dtype: 'float16'} - benchmark_name: "gemm_multiple_run" trace_dir: "../microbenchmarks/gemm_multiple_run_fp8" @@ -62,6 +68,8 @@ benchmarks: - {m: 8192, k: 8192, n: 8192, num_runs: 100, dtype: 'float8'} - {m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float8'} - {m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float8'} + - {m: 32768, k: 4096, n: 4096, num_runs: 100, dtype: 'float8'} + - {m: 32768, k: 8192, n: 8192, num_runs: 100, dtype: 'float8'} - benchmark_name: "gemm_multiple_run" trace_dir: "../microbenchmarks/gemm_multiple_run_fp4" @@ -77,4 +85,6 @@ benchmarks: - {m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'float4'} - {m: 8192, k: 8192, n: 8192, num_runs: 100, dtype: 'float4'} - {m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float4'} - - {m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float4'} \ No newline at end of file + - {m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float4'} + - {m: 32768, k: 4096, n: 4096, num_runs: 100, dtype: 'float4'} + - {m: 32768, k: 8192, n: 8192, num_runs: 100, dtype: 'float4'} \ No newline at end of file From dc795d9482258661de0b7aa869d8f7dfc89cc05f Mon Sep 17 00:00:00 2001 From: "Yu-Hsuan (Amy) Lin" Date: Fri, 6 Mar 2026 
11:38:32 +0000 Subject: [PATCH 76/88] Extend configs for gemm and collectives --- Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml | 2 ++ Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml | 2 ++ Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml | 2 ++ 3 files changed, 6 insertions(+) diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml index ceb7bb52..a4dba8ff 100644 --- a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml +++ b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml @@ -2,6 +2,8 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 20} + - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "bfloat16", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 20} + - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float8", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 20} trace_dir: "../microbenchmarks/all_gather_tpu7x_4x4x4" csv_path: "../microbenchmarks/all_gather_tpu7x_4x4x4" xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_4x4x4" diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml index 8d5ccd90..d88ffa33 100644 --- a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml +++ b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml @@ -2,6 +2,8 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 20} + - {matrix_dim_range: {start: 64, end: 2097152, 
multiplier: 2}, dtype: "bfloat16", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 20} + - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float8", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 20} trace_dir: "../microbenchmarks/psum_tpu7x_4x4x4" csv_path: "../microbenchmarks/psum_tpu7x_4x4x4" xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_4x4x4" diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml index bf30ef9d..23e5e4fa 100644 --- a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml +++ b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml @@ -2,6 +2,8 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 20} + - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "bfloat16", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 20} + - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float8", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 20} trace_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x4" csv_path: "../microbenchmarks/all_to_all_tpu7x_4x4x4" xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x4" From 55fa0eaacf1fc945d48bc52f6bb021e465c6bcc8 Mon Sep 17 00:00:00 2001 From: "Yu-Hsuan (Amy) Lin" Date: Fri, 6 Mar 2026 12:15:40 +0000 Subject: [PATCH 77/88] Fix collectives aggregator for multi dtypes --- Ironwood/guides/automation/aggregator.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/Ironwood/guides/automation/aggregator.py b/Ironwood/guides/automation/aggregator.py index 
5ff4ce99..8e5f6751 100644 --- a/Ironwood/guides/automation/aggregator.py +++ b/Ironwood/guides/automation/aggregator.py @@ -71,10 +71,11 @@ def aggregate_collectives(directories: list[str], picked_columns: list[str]) -> return None aggregated_df = pd.DataFrame() for directory in directories: - file = glob.glob(f"{directory}/*.tsv")[0] - df = pd.read_csv(file, sep='\t') - df["topology"] = [file.split('/')[-4].split('-')[1] for _ in range(df.shape[0])] - aggregated_df = pd.concat([aggregated_df, df[picked_columns].rename(columns={"step_time_ms_num_runs": "num_runs"})], ignore_index=True) + files = glob.glob(f"{directory}/*.tsv") + for file in files: + df = pd.read_csv(file, sep='\t') + df["topology"] = [file.split('/')[-4].split('-')[1] for _ in range(df.shape[0])] + aggregated_df = pd.concat([aggregated_df, df[picked_columns].rename(columns={"step_time_ms_num_runs": "num_runs"})], ignore_index=True) return aggregated_df def aggregate_hbm(directories: list[str], picked_columns: list[str]) -> pd.DataFrame: From cb56a4309ec6bfd9794bc4e869d5d75f2a83f25a Mon Sep 17 00:00:00 2001 From: "Yu-Hsuan (Amy) Lin" Date: Fri, 6 Mar 2026 13:18:38 +0000 Subject: [PATCH 78/88] Address too much event issue --- Ironwood/src/benchmark_utils.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/Ironwood/src/benchmark_utils.py b/Ironwood/src/benchmark_utils.py index aa111cba..78718aad 100644 --- a/Ironwood/src/benchmark_utils.py +++ b/Ironwood/src/benchmark_utils.py @@ -171,7 +171,13 @@ def multiple_iteration_timeit_from_trace( if trace_dir and not is_local_directory_path(trace_dir): tmp_trace_dir = f"{LOCAL_TRACE_DIR}/{trace_name}" # data_args = data_generator() - with jax.profiler.trace(tmp_trace_dir): + options = jax.profiler.ProfileOptions() + options.advanced_configuration = { + "tpu_trace_mode" : "TRACE_ONLY_XLA", + "tpu_num_sparse_cores_to_trace": 0, + "tpu_num_sparse_core_tiles_to_trace": 0, + } + with jax.profiler.trace(tmp_trace_dir, 
profiler_options=options): for i in range(tries): if i % 10 == 0: print(f"[{task}] Running iteration {i} of {tries} with {matrix_dim}...") From dd61804c62b876b76173436f4347c9a38f2f75cc Mon Sep 17 00:00:00 2001 From: "Yu-Hsuan (Amy) Lin" Date: Mon, 9 Mar 2026 08:05:47 +0000 Subject: [PATCH 79/88] Use larger transfering size Test out larger matrix Test out larger matrix --- Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml | 4 +++- Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml | 4 +++- Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml | 4 +++- Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml | 4 +++- Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml | 4 +++- Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml | 4 +++- Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml | 4 +++- Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml | 4 +++- Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml | 4 +++- Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml | 4 +++- Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml | 4 +++- Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml | 4 +++- Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml | 4 +++- Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml | 4 +++- Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml | 4 +++- 15 files changed, 45 insertions(+), 15 deletions(-) diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml index 5b11ac8d..7f7e4b52 100644 --- a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml +++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml @@ -1,7 +1,9 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 20} + - {matrix_dim_range: {start: 64, 
end: 2097152, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 20} + - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "bfloat16", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 20} + - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float8", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 20} trace_dir: "../microbenchmarks/all_gather_tpu7x_2x2x1" csv_path: "../microbenchmarks/all_gather_tpu7x_2x2x1" xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x2x1" diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml index 3747b754..481d312e 100644 --- a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml +++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml @@ -1,7 +1,9 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 20} + - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 20} + - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "bfloat16", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 20} + - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float8", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 20} trace_dir: "../microbenchmarks/all_gather_tpu7x_2x2x2" csv_path: "../microbenchmarks/all_gather_tpu7x_2x2x2" xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x2x2" diff --git 
a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml index 9c25eb6b..59046a73 100644 --- a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml +++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml @@ -1,7 +1,9 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 20} + - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 20} + - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "bfloat16", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 20} + - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float8", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 20} trace_dir: "../microbenchmarks/all_gather_tpu7x_2x2x4" csv_path: "../microbenchmarks/all_gather_tpu7x_2x2x4" xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x2x4" diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml index 0ad03f56..0b917e88 100644 --- a/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml +++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml @@ -1,7 +1,9 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 20} + - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 20} + - 
{matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "bfloat16", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 20} + - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float8", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 20} trace_dir: "../microbenchmarks/all_gather_tpu7x_2x4x4" csv_path: "../microbenchmarks/all_gather_tpu7x_2x4x4" xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x4x4" diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml index 0218d6b0..7688fc33 100644 --- a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml +++ b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml @@ -1,7 +1,9 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 20} + - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 20} + - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "bfloat16", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 20} + - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float8", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 20} trace_dir: "../microbenchmarks/all_gather_tpu7x_4x4x8" csv_path: "../microbenchmarks/all_gather_tpu7x_4x4x8" xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_4x4x8" diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml index dbeb0407..030bfeea 100644 --- 
a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml +++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml @@ -1,7 +1,9 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 20} + - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 20} + - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "bfloat16", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 20} + - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float8", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 20} trace_dir: "../microbenchmarks/psum_tpu7x_2x2x1" csv_path: "../microbenchmarks/psum_tpu7x_2x2x1" xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x2x1" diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml index cca20bc2..6cc52c7c 100644 --- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml +++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml @@ -1,7 +1,9 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 20} + - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 20} + - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "bfloat16", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 20} + - 
{matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float8", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 20} trace_dir: "../microbenchmarks/psum_tpu7x_2x2x2" csv_path: "../microbenchmarks/psum_tpu7x_2x2x2" xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x2x2" diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml index 1cb29b11..c7dfdb51 100644 --- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml +++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml @@ -1,7 +1,9 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 20} + - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 20} + - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "bfloat16", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 20} + - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float8", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 20} trace_dir: "../microbenchmarks/psum_tpu7x_2x2x4" csv_path: "../microbenchmarks/psum_tpu7x_2x2x4" xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x2x4" diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml index 8366350a..73d437d2 100644 --- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml +++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml @@ -1,7 +1,9 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 32, end: 16384, multiplier: 
2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 20} + - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 20} + - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "bfloat16", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 20} + - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float8", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 20} trace_dir: "../microbenchmarks/psum_tpu7x_2x4x4" csv_path: "../microbenchmarks/psum_tpu7x_2x4x4" xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x4x4" diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml index 7b629828..78f4d0a2 100644 --- a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml +++ b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml @@ -1,7 +1,9 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 20} + - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 20} + - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "bfloat16", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 20} + - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float8", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 20} trace_dir: "../microbenchmarks/psum_tpu7x_4x4x8" csv_path: 
"../microbenchmarks/psum_tpu7x_4x4x8" xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_4x4x8" diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml index 42dcf9e1..2a5f453b 100644 --- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml +++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml @@ -1,7 +1,9 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 20} + - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 20} + - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "bfloat16", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 20} + - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float8", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 20} trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x1" csv_path: "../microbenchmarks/all_to_all_tpu7x_2x2x1" xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x1" diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml index 5b1bbb82..f3de48ff 100644 --- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml +++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml @@ -1,7 +1,9 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 20} + - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float32", 
mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 20} + - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "bfloat16", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 20} + - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float8", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 20} trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x2" csv_path: "../microbenchmarks/all_to_all_tpu7x_2x2x2" xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x2" diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml index f6004ce2..586397d7 100644 --- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml +++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml @@ -1,7 +1,9 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 20} + - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 20} + - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "bfloat16", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 20} + - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float8", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 20} trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x4" csv_path: "../microbenchmarks/all_to_all_tpu7x_2x2x4" xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x4" diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml 
b/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml index a0e16a92..f8b990ab 100644 --- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml +++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml @@ -1,7 +1,9 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 20} + - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 20} + - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "bfloat16", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 20} + - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float8", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 20} trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x4x4" csv_path: "../microbenchmarks/all_to_all_tpu7x_2x4x4" xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x4x4" diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml index 65742f12..5cdd637d 100644 --- a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml +++ b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml @@ -1,7 +1,9 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 20} + - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 20} + - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 
2}, dtype: "bfloat16", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 20} + - {matrix_dim_range: {start: 64, end: 2097152, multiplier: 2}, dtype: "float8", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 20} trace_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x8" csv_path: "../microbenchmarks/all_to_all_tpu7x_4x4x8" xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x8" From f924a7ee2902dff49300c95e0fc446c4b6bade08 Mon Sep 17 00:00:00 2001 From: "Yu-Hsuan (Amy) Lin" Date: Mon, 16 Mar 2026 13:27:01 +0000 Subject: [PATCH 80/88] Optimize H2D/D2H transfer pipelines and add comprehensive benchmark configs --- .../comprehensive_8dev_experiments.yaml | 9 ++++ .../comprehensive_experiments.yaml | 9 ++++ Ironwood/src/benchmark_host_device.py | 50 ++++++++++++------- Ironwood/src/run_benchmark.py | 2 +- 4 files changed, 52 insertions(+), 18 deletions(-) create mode 100644 Ironwood/configs/host_device/comprehensive_8dev_experiments.yaml create mode 100644 Ironwood/configs/host_device/comprehensive_experiments.yaml diff --git a/Ironwood/configs/host_device/comprehensive_8dev_experiments.yaml b/Ironwood/configs/host_device/comprehensive_8dev_experiments.yaml new file mode 100644 index 00000000..ea3a2320 --- /dev/null +++ b/Ironwood/configs/host_device/comprehensive_8dev_experiments.yaml @@ -0,0 +1,9 @@ +benchmarks: +- benchmark_name: host_device + num_runs: 20 + benchmark_sweep_params: + - transfer_type_list: ["pinned_memory", "simple", "pipelined"] + num_devices_list: [8] + data_size_mib_list: [1, 16, 128, 256, 512, 1024, 2048, 4096, 8192, 16384] + input_type_list: ["numpy", "jax"] + csv_path: "/tmp/microbenchmarks/host_device_numactl" diff --git a/Ironwood/configs/host_device/comprehensive_experiments.yaml b/Ironwood/configs/host_device/comprehensive_experiments.yaml new file mode 100644 index 00000000..842ea1f4 --- /dev/null +++ 
b/Ironwood/configs/host_device/comprehensive_experiments.yaml @@ -0,0 +1,9 @@ +benchmarks: +- benchmark_name: host_device + num_runs: 20 + benchmark_sweep_params: + - transfer_type_list: ["pinned_memory", "simple", "pipelined"] + num_devices_list: [1, 2, 8] + data_size_mib_list: [1, 16, 128, 256, 512, 1024, 2048, 4096, 8192, 16384] + input_type_list: ["numpy", "jax"] + csv_path: "/tmp/microbenchmarks/final_experiments" diff --git a/Ironwood/src/benchmark_host_device.py b/Ironwood/src/benchmark_host_device.py index 3c1a5621..d1c3481f 100644 --- a/Ironwood/src/benchmark_host_device.py +++ b/Ironwood/src/benchmark_host_device.py @@ -32,7 +32,7 @@ def __init__(self, trace_dir: str = None): self.d2h_perf = [] @abc.abstractmethod - def setup(self, data_size_mib: int, host_data: np.ndarray, devices: List[jax.Device]): + def setup(self, data_size_mib: int, host_data: np.ndarray, num_devices: int): """Perform one-time setup before the benchmark loop.""" pass @@ -55,12 +55,15 @@ def teardown(self): class SimpleTransfer(TransferStrategy): """Simple device_put/device_get strategy.""" - def setup(self, data_size_mib: int, host_data: np.ndarray): - pass + def setup(self, data_size_mib: int, host_data: np.ndarray, num_devices: int): + target_devices = jax.devices()[:num_devices] + self.mesh = Mesh(target_devices, ('x',)) + self.partition_spec = PartitionSpec('x') + self.sharding = NamedSharding(self.mesh, self.partition_spec) def run_h2d(self, host_data: np.ndarray, i: int) -> Any: t0 = time.perf_counter() - device_array = jax.device_put(host_data) + device_array = jax.device_put(host_data, self.sharding) device_array.block_until_ready() t1 = time.perf_counter() @@ -71,7 +74,8 @@ def run_h2d(self, host_data: np.ndarray, i: int) -> Any: def run_d2h(self, device_data: Any, i: int): t2 = time.perf_counter() - _ = jax.device_get(device_data) + # Retrieving addressable shards natively supports parallelism + _ = jax.device_get([s.data for s in device_data.addressable_shards]) t3 = 
time.perf_counter() self.d2h_perf.append((t3 - t2) * 1000) device_data.delete() @@ -83,9 +87,9 @@ def teardown(self): class PipelinedTransfer(TransferStrategy): """Pipelined transfer using chunking.""" - def setup(self, data_size_mib: int, host_data: np.ndarray): + def setup(self, data_size_mib: int, host_data: np.ndarray, num_devices: int): self.target_chunk_size_mib = 16 - num_devices_to_perform_h2d = 2 + num_devices_to_perform_h2d = num_devices self.target_devices = jax.devices()[:num_devices_to_perform_h2d] self.num_devices = len(self.target_devices) @@ -139,8 +143,8 @@ def teardown(self): class PinnedMemoryTransfer(TransferStrategy): """Pinned memory host-to-device with parallelized device-to-host transfer.""" - def setup(self, data_size_mib: int, host_data: np.ndarray): - num_devices_to_perform_h2d = 2 + def setup(self, data_size_mib: int, host_data: np.ndarray, num_devices: int): + num_devices_to_perform_h2d = num_devices target_devices = jax.devices()[:num_devices_to_perform_h2d] mesh = Mesh(target_devices, ('x',)) @@ -177,6 +181,8 @@ def teardown(self): def benchmark_host_device( data_size_mib: int, transfer_type: str, + num_devices: int, + input_type: str, num_runs: int = 100, trace_dir: str = None, ) -> Dict[str, Any]: @@ -186,7 +192,14 @@ def benchmark_host_device( # Allocate Host Source Buffer column = 128 - host_data = np.random.normal(size=(num_elements // column, column)).astype(np.float32) + np_data = np.random.normal(size=(num_elements // column, column)).astype(np.float32) + + if input_type == "numpy": + host_data = np_data + elif input_type == "jax": + host_data = jax.device_put(np_data, jax.devices("cpu")[0]) + else: + raise ValueError(f"Unknown input_type: {input_type}") print( f"Benchmarking Transfer with Data Size: {data_size_mib} MB for {num_runs} iterations with {transfer_type=}", @@ -203,7 +216,7 @@ def benchmark_host_device( raise ValueError(f"Unknown transfer_type: {transfer_type}. 
Available: {list(strategies.keys())}") strategy = strategies[transfer_type](trace_dir) - strategy.setup(data_size_mib, host_data) + strategy.setup(data_size_mib, host_data, num_devices) # Profiling Context import contextlib @@ -215,11 +228,10 @@ def benchmark_host_device( with profiler_context: # Warmup for _ in range(2): - device_array = jax.device_put(host_data) - device_array.block_until_ready() - host_out = np.array(device_array) - device_array.delete() - del host_out + device_array = strategy.run_h2d(host_data, -1) + strategy.run_d2h(device_array, -1) + strategy.h2d_perf.clear() + strategy.d2h_perf.clear() for i in range(num_runs): # Step Context @@ -242,6 +254,8 @@ def benchmark_host_device( def benchmark_host_device_calculate_metrics( data_size_mib: int, transfer_type: str, + num_devices: int, + input_type: str, H2D_Bandwidth_ms: List[float], D2H_Bandwidth_ms: List[float], ) -> Tuple[Dict[str, Any], Dict[str, Any]]: @@ -251,10 +265,12 @@ def benchmark_host_device_calculate_metrics( # Filter out list params from metadata to avoid explosion metadata_keys = { "data_size_mib", + "transfer_type", + "num_devices", + "input_type", } metadata = {k: v for k, v in params if k in metadata_keys} metadata["dtype"] = "float32" - metadata["transfer_type"] = transfer_type metrics = {} diff --git a/Ironwood/src/run_benchmark.py b/Ironwood/src/run_benchmark.py index 2d10db5d..aef590e8 100644 --- a/Ironwood/src/run_benchmark.py +++ b/Ironwood/src/run_benchmark.py @@ -410,7 +410,7 @@ def run_single_benchmark(benchmark_config: Dict[str, Any], output_path: str, gcs for id, benchmark_param in enumerate(benchmark_params): original_benchmark_param = copy.deepcopy(benchmark_param) benchmark_param = preprocess_benchmark_param( - benchmark_param, trace_dir=os.path.join(trace_dir, f"benchmark_{id}") + benchmark_param, trace_dir=os.path.join(trace_dir, f"benchmark_{id}") if trace_dir else None ) print(f"Running benchmark: {benchmark_name} with params: {benchmark_param}") test_start_time 
= ( From 7439b2a3cf0df6d2a7d8c472bfc18127efd48b03 Mon Sep 17 00:00:00 2001 From: "Yu-Hsuan (Amy) Lin" Date: Mon, 16 Mar 2026 13:33:47 +0000 Subject: [PATCH 81/88] Add benchmark guide and run script --- .../guides/host_device_benchmark/README.md | 39 +++++++++++++++++++ .../run_comprehensive.sh | 15 +++++++ 2 files changed, 54 insertions(+) create mode 100644 Ironwood/guides/host_device_benchmark/README.md create mode 100755 Ironwood/guides/host_device_benchmark/run_comprehensive.sh diff --git a/Ironwood/guides/host_device_benchmark/README.md b/Ironwood/guides/host_device_benchmark/README.md new file mode 100644 index 00000000..cbac7b11 --- /dev/null +++ b/Ironwood/guides/host_device_benchmark/README.md @@ -0,0 +1,39 @@ +# Host-Device Benchmark Guide + +This directory contains instructions and a script to run comprehensive Host-to-Device (H2D) and Device-to-Host (D2H) microbenchmarks on Cloud TPUs. + +## Overview + +The benchmarks measure the transfer bandwidth for various configurations: +- **Transfer Strategies**: `simple`, `pipelined`, `pinned_memory` +- **Data Sizes**: Ranging from 1 MiB to 16,384 MiB. +- **Input Types**: Replicating inputs using `numpy` arrays or pre-allocated `jax` arrays. +- **Device Counts**: Scaling across 1, 2, and 8 TPU devices. +- **NUMA Settings**: Testing the impact of `--interleave=all` with `numactl`. + +## How to Run + +A convenience script `run_comprehensive.sh` is provided. It executes two suites of tests sequentially from the `Ironwood/` directory: + +1. **Comprehensive Suite**: Sweeps through all transfer strategies, data sizes, input types, and device configurations (1, 2, and 8 devices) without any specific NUMA configuration. +2. **8-Device NUMA Suite**: Executes an 8-device specific sweep while enforcing `numactl --interleave=all` at the process level to balance memory allocations across NUMA nodes, heavily improving the pipelined D2H transfer bottleneck on multi-chip architectures. 
+ +### Execution + +Simply execute the script on your TPU VM: + +```bash +bash run_comprehensive.sh +``` + +## Configuration Files + +The executed configurations are located at: +- `Ironwood/configs/host_device/comprehensive_experiments.yaml` +- `Ironwood/configs/host_device/comprehensive_8dev_experiments.yaml` + +Refer to these files to adjust the tested parameters or trace directories. + +## Analyzing Results + +Resulting logs and TSV files will be exported to the directory specified within the configurations (by default, output typically goes to the console and/or a timestamped TSV). You can use simple Pandas scripts to analyze and extract the max P50 bandwidths. diff --git a/Ironwood/guides/host_device_benchmark/run_comprehensive.sh b/Ironwood/guides/host_device_benchmark/run_comprehensive.sh new file mode 100755 index 00000000..a63af1b7 --- /dev/null +++ b/Ironwood/guides/host_device_benchmark/run_comprehensive.sh @@ -0,0 +1,15 @@ +#!/bin/bash +set -e + +# Change to the root of the Ironwood directory so this script can be run from anywhere +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +IRONWOOD_DIR="$(dirname $(dirname "$DIR"))" +cd "$IRONWOOD_DIR" + +echo "Running comprehensive benchmarks across 1, 2, and 8 devices..." +python3 src/run_benchmark.py --config configs/host_device/comprehensive_experiments.yaml + +echo "Running 8-device benchmark with numactl interleaving..." +numactl --interleave=all python3 src/run_benchmark.py --config configs/host_device/comprehensive_8dev_experiments.yaml + +echo "Benchmarks completed successfully."
From 38ec530fa47714f2af1da7c11c0ef31588b5c6dc Mon Sep 17 00:00:00 2001 From: "Yu-Hsuan (Amy) Lin" Date: Wed, 13 May 2026 12:28:09 +0000 Subject: [PATCH 82/88] Allow sweeping dtype in host_device benchmarks --- .../host_device/comprehensive_8dev_experiments.yaml | 1 + .../configs/host_device/comprehensive_experiments.yaml | 1 + Ironwood/src/benchmark_host_device.py | 9 ++++++--- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/Ironwood/configs/host_device/comprehensive_8dev_experiments.yaml b/Ironwood/configs/host_device/comprehensive_8dev_experiments.yaml index ea3a2320..e15bcc4f 100644 --- a/Ironwood/configs/host_device/comprehensive_8dev_experiments.yaml +++ b/Ironwood/configs/host_device/comprehensive_8dev_experiments.yaml @@ -6,4 +6,5 @@ benchmarks: num_devices_list: [8] data_size_mib_list: [1, 16, 128, 256, 512, 1024, 2048, 4096, 8192, 16384] input_type_list: ["numpy", "jax"] + dtype_list: ["float32"] csv_path: "/tmp/microbenchmarks/host_device_numactl" diff --git a/Ironwood/configs/host_device/comprehensive_experiments.yaml b/Ironwood/configs/host_device/comprehensive_experiments.yaml index 842ea1f4..7974bc61 100644 --- a/Ironwood/configs/host_device/comprehensive_experiments.yaml +++ b/Ironwood/configs/host_device/comprehensive_experiments.yaml @@ -6,4 +6,5 @@ benchmarks: num_devices_list: [1, 2, 8] data_size_mib_list: [1, 16, 128, 256, 512, 1024, 2048, 4096, 8192, 16384] input_type_list: ["numpy", "jax"] + dtype_list: ["float32"] csv_path: "/tmp/microbenchmarks/final_experiments" diff --git a/Ironwood/src/benchmark_host_device.py b/Ironwood/src/benchmark_host_device.py index d1c3481f..8b6966b6 100644 --- a/Ironwood/src/benchmark_host_device.py +++ b/Ironwood/src/benchmark_host_device.py @@ -183,16 +183,18 @@ def benchmark_host_device( transfer_type: str, num_devices: int, input_type: str, + dtype: jnp.dtype = jnp.float32, num_runs: int = 100, trace_dir: str = None, ) -> Dict[str, Any]: """Benchmarks H2D/D2H transfer using 
device_put/device_get.""" - num_elements = 1024 * 1024 * data_size_mib // np.dtype(np.float32).itemsize + normalized_dtype = jnp.dtype(dtype) + num_elements = 1024 * 1024 * data_size_mib // normalized_dtype.itemsize # Allocate Host Source Buffer column = 128 - np_data = np.random.normal(size=(num_elements // column, column)).astype(np.float32) + np_data = np.random.normal(size=(num_elements // column, column)).astype(normalized_dtype) if input_type == "numpy": host_data = np_data @@ -258,6 +260,7 @@ def benchmark_host_device_calculate_metrics( input_type: str, H2D_Bandwidth_ms: List[float], D2H_Bandwidth_ms: List[float], + dtype: jnp.dtype = jnp.float32, ) -> Tuple[Dict[str, Any], Dict[str, Any]]: """Calculates metrics for Host-Device transfer.""" params = locals().items() @@ -270,7 +273,7 @@ def benchmark_host_device_calculate_metrics( "input_type", } metadata = {k: v for k, v in params if k in metadata_keys} - metadata["dtype"] = "float32" + metadata["dtype"] = jnp.dtype(dtype).name metrics = {} From aa1e67c37f21a2f1796a242db311a660c0bbbc72 Mon Sep 17 00:00:00 2001 From: Daniel Wu Date: Wed, 20 May 2026 06:23:25 +0000 Subject: [PATCH 83/88] Added sample variance as a metric for h2dd2h and increased the num_runs from 20 to 100. 
--- .../host_device/comprehensive_experiments.yaml | 2 +- Ironwood/src/benchmark_host_device.py | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/Ironwood/configs/host_device/comprehensive_experiments.yaml b/Ironwood/configs/host_device/comprehensive_experiments.yaml index 7974bc61..70329dac 100644 --- a/Ironwood/configs/host_device/comprehensive_experiments.yaml +++ b/Ironwood/configs/host_device/comprehensive_experiments.yaml @@ -1,6 +1,6 @@ benchmarks: - benchmark_name: host_device - num_runs: 20 + num_runs: 100 benchmark_sweep_params: - transfer_type_list: ["pinned_memory", "simple", "pipelined"] num_devices_list: [1, 2, 8] diff --git a/Ironwood/src/benchmark_host_device.py b/Ironwood/src/benchmark_host_device.py index 8b6966b6..6aa88451 100644 --- a/Ironwood/src/benchmark_host_device.py +++ b/Ironwood/src/benchmark_host_device.py @@ -291,6 +291,20 @@ def add_metric(name, ms_list): ) metrics.update(stats_bw.serialize_statistics()) + if len(bw_list) > 1: + bw_array = np.array(bw_list) + sample_variance = np.var(bw_array, ddof=1) + + metrics[f"{name}_bw (GiB/s)_sample_variance"] = sample_variance + + print( + f" {name}_bw (GiB/s) Sample Variance: {sample_variance:.4e}", + flush=True + ) + elif len(bw_list) == 1: + print(f" {name}_bw (GiB/s): Only one sample, variance cannot be calculated.", flush=True) + metrics[f"{name}_bw_variance_GiBs"] = 0.0 + add_metric("H2D", H2D_Bandwidth_ms) add_metric("D2H", D2H_Bandwidth_ms) From ac83feed5e9464f6cdc8add29ff6d1e5e23c0309 Mon Sep 17 00:00:00 2001 From: Daniel Wu Date: Wed, 20 May 2026 08:38:30 +0000 Subject: [PATCH 84/88] Triggering CLA recheck From f302e987bbe3d3f8b173b9e14b51ea5753bd0293 Mon Sep 17 00:00:00 2001 From: Daniel Wu Date: Wed, 20 May 2026 08:46:00 +0000 Subject: [PATCH 85/88] Triggering CLA recheck 2 From 9e88b7be5eb1837c87164c0fbfdf62a8103917ef Mon Sep 17 00:00:00 2001 From: Daniel Wu Date: Thu, 21 May 2026 06:54:33 +0000 Subject: [PATCH 86/88] shorten sample_variance as 
variance --- Ironwood/src/benchmark_host_device.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Ironwood/src/benchmark_host_device.py b/Ironwood/src/benchmark_host_device.py index 6aa88451..5e1e4fbb 100644 --- a/Ironwood/src/benchmark_host_device.py +++ b/Ironwood/src/benchmark_host_device.py @@ -295,7 +295,7 @@ def add_metric(name, ms_list): bw_array = np.array(bw_list) sample_variance = np.var(bw_array, ddof=1) - metrics[f"{name}_bw (GiB/s)_sample_variance"] = sample_variance + metrics[f"{name}_bw (GiB/s)_variance"] = sample_variance print( f" {name}_bw (GiB/s) Sample Variance: {sample_variance:.4e}", From c8eca6f6857ad550ceb153b72deff46b0750b863 Mon Sep 17 00:00:00 2001 From: Daniel Wu Date: Fri, 22 May 2026 00:29:19 +0000 Subject: [PATCH 87/88] check if the variance is nan and set the value to zero --- Ironwood/src/benchmark_host_device.py | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/Ironwood/src/benchmark_host_device.py b/Ironwood/src/benchmark_host_device.py index 5e1e4fbb..83a62aa8 100644 --- a/Ironwood/src/benchmark_host_device.py +++ b/Ironwood/src/benchmark_host_device.py @@ -291,19 +291,17 @@ def add_metric(name, ms_list): ) metrics.update(stats_bw.serialize_statistics()) - if len(bw_list) > 1: - bw_array = np.array(bw_list) - sample_variance = np.var(bw_array, ddof=1) - - metrics[f"{name}_bw (GiB/s)_variance"] = sample_variance - - print( - f" {name}_bw (GiB/s) Sample Variance: {sample_variance:.4e}", - flush=True - ) - elif len(bw_list) == 1: - print(f" {name}_bw (GiB/s): Only one sample, variance cannot be calculated.", flush=True) - metrics[f"{name}_bw_variance_GiBs"] = 0.0 + bw_array = np.array(bw_list) + sample_variance = np.var(bw_array, ddof=1) + if np.isnan(sample_variance): + sample_variance = 0.0 + + metrics[f"{name}_bw (GiB/s)_variance"] = sample_variance + + print( + f" {name}_bw (GiB/s) Sample Variance: {sample_variance:.4e}", + flush=True + ) add_metric("H2D", 
H2D_Bandwidth_ms) add_metric("D2H", D2H_Bandwidth_ms) From 07fc9b3aef4ae6ad868a31f2aa1424f5a7c5a53f Mon Sep 17 00:00:00 2001 From: Daniel Wu Date: Mon, 25 May 2026 00:57:49 +0000 Subject: [PATCH 88/88] Updated comprehensive_8dev_experiments.yaml from 20 to 100 num runs --- .../configs/host_device/comprehensive_8dev_experiments.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Ironwood/configs/host_device/comprehensive_8dev_experiments.yaml b/Ironwood/configs/host_device/comprehensive_8dev_experiments.yaml index e15bcc4f..d3403b10 100644 --- a/Ironwood/configs/host_device/comprehensive_8dev_experiments.yaml +++ b/Ironwood/configs/host_device/comprehensive_8dev_experiments.yaml @@ -1,6 +1,6 @@ benchmarks: - benchmark_name: host_device - num_runs: 20 + num_runs: 100 benchmark_sweep_params: - transfer_type_list: ["pinned_memory", "simple", "pipelined"] num_devices_list: [8]