#!/bin/bash

# This file is documentation for how to get started with DeepSeek v3.2.

# This file runs Step 2 on v5p-128 on a daily basis.
# 1. Convert the HuggingFace checkpoint (bf16) to a MaxText-compatible checkpoint (bf16):
#    the scanned format is better for training; the unscanned format is better for decoding.
# 2. Run the logit check, pre-training, fine-tuning, and decoding.

set -ex

export MODEL_NAME='deepseek3.2-671b'
export TOKENIZER_PATH='deepseek-ai/DeepSeek-V3.2'
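
# Note: the decode step at the end of this script reads ${HF_TOKEN} for the HuggingFace
# tokenizer, even though the model is not gated. Export your own token before running;
# the value below is a placeholder, not a real token:
# export HF_TOKEN=<your-huggingface-access-token>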

# Install torch for checkpoint conversion and forward_pass_logit_checker.py.
python3 -m pip install torch --index-url https://download.pytorch.org/whl/cpu

# e.g., $HOME/maxtext/src/maxtext
export MAXTEXT_PKG_DIR="${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext}"

if [ -z "${BASE_OUTPUT_PATH}" ]; then
  # Non-Googlers, please remember to point `BASE_OUTPUT_PATH` to a GCS bucket that you own; this script uses internal buckets for testing.
  # This bucket will store all the files generated by MaxText during a run.
  export BASE_OUTPUT_PATH=gs://runner-maxtext-logs/$(date +%Y-%m-%d-%H-%M)
  echo "BASE_OUTPUT_PATH was not set; defaulting to ${BASE_OUTPUT_PATH}"
fi
BASE_OUTPUT_PATH=${BASE_OUTPUT_PATH%/}
echo "Using BASE_OUTPUT_PATH = ${BASE_OUTPUT_PATH}"

# Step 2:
# Define the checkpoint paths here so they are easy to reuse in the `train.py` and `decode.py` commands below.
# export SCANNED_CKPT_PATH=${BASE_OUTPUT_PATH}/scanned/0/items
# export UNSCANNED_CKPT_PATH=${BASE_OUTPUT_PATH}/unscanned/0/items
# Use hard-coded golden checkpoints rather than checkpoints generated by Step 1, since Step 1 is not part of the daily test.
SCANNED_CKPT_PATH=gs://maxtext-deepseek/deepseek3.2/2026-02-20/scanned/0/items
UNSCANNED_CKPT_PATH=gs://maxtext-deepseek/deepseek3.2/2026-02-20/unscanned/0/items
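
# For reference, Step 1 (not run in this daily test) converts the HuggingFace bf16 checkpoint
# into the scanned/unscanned MaxText checkpoints above. A hedged sketch, assuming MaxText's
# DeepSeek checkpoint-conversion utility; the module name and flags are illustrative and may
# differ in your MaxText version:
# python3 -m maxtext.convert_deepseek_ckpt \
#   --base_model_path <path-to-hf-bf16-checkpoint> \
#   --maxtext_model_path ${BASE_OUTPUT_PATH}/scanned \
#   --model_size ${MODEL_NAME}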

# Non-Googlers, please remember to point `DATASET_PATH` to the GCS bucket where you have your training data.
export DATASET_PATH=gs://maxtext-dataset

# Test whether the forward-pass logits match the golden logits.
# The default golden_logits_path is /deps/tests/assets/golden_logits/golden_data_{MODEL_NAME}.jsonl, copied from gs://maxtext-test-assets/golden_data_${MODEL_NAME}.jsonl.
GOLDEN_LOGITS_DISK_LOCATION="/deps/tests/assets/golden_logits/golden_data_${MODEL_NAME}.jsonl"
if [ ! -f "${GOLDEN_LOGITS_DISK_LOCATION}" ]; then
  GOLDEN_LOGITS_PATH="gs://maxtext-test-assets/golden_data_${MODEL_NAME}.jsonl"
  GOLDEN_LOGITS_DISK_LOCATION=/tmp/golden_data.jsonl
  gcloud storage cp "${GOLDEN_LOGITS_PATH}" "${GOLDEN_LOGITS_DISK_LOCATION}"
fi
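
# Optional sanity check (illustrative; not in the original flow): fail fast if the golden-logits
# file is missing or empty, rather than letting the checker fail with a less obvious error.
# [ -s "${GOLDEN_LOGITS_DISK_LOCATION}" ] || { echo "golden logits missing or empty"; exit 1; }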

# Override deepseek3.2-671b.yml with indexer_topk=2 (passed inline in the command below).
# OVERRIDE="indexer_topk=2"
# OVERRIDE=""
python3 -m tests.utils.forward_pass_logit_checker ${MAXTEXT_CONFIGS_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/configs}/base.yml base_output_directory=${BASE_OUTPUT_PATH} run_name=forward_logits_check load_parameters_path=${SCANNED_CKPT_PATH} scan_layers=true attention=dot_product per_device_batch_size=1 model_name=${MODEL_NAME} max_prefill_predict_length=4 max_target_length=4 async_checkpointing=false sparse_matmul=false ici_fsdp_parallelism=1 ici_expert_parallelism=-1 checkpoint_storage_concurrent_gb=1024 weight_dtype=float32 dtype=float32 activations_in_float32=true matmul_precision=highest float32_logits=true float32_qk_product=true indexer_sparse_training=true indexer_topk=2 --golden_logits_path=${GOLDEN_LOGITS_DISK_LOCATION} --max_kl_div=0.3
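
# The header mentions pre-training and fine-tuning, but neither is run in this file. A hedged
# sketch of a short pre-training smoke test, mirroring the flags above; the module path, step
# count, and flag values are illustrative, not a verified configuration:
# python3 -m maxtext.train ${MAXTEXT_CONFIGS_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/configs}/base.yml \
#   base_output_directory=${BASE_OUTPUT_PATH} run_name=pre_training dataset_path=${DATASET_PATH} \
#   load_parameters_path=${SCANNED_CKPT_PATH} model_name=${MODEL_NAME} scan_layers=true \
#   per_device_batch_size=1 steps=5 async_checkpointing=false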

# Run decoding - tokamax_gmm implementation.
# Note: decode requires the HuggingFace access token for the tokenizer even if the model is not gated.
python3 -m maxtext.inference.decode ${MAXTEXT_CONFIGS_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/configs}/base.yml base_output_directory=${BASE_OUTPUT_PATH} run_name=decode model_name=${MODEL_NAME} tokenizer_type=huggingface tokenizer_path=${TOKENIZER_PATH} hf_access_token=${HF_TOKEN} load_parameters_path=${UNSCANNED_CKPT_PATH} scan_layers=False attention=dot_product sparse_matmul=True use_tokamax_gmm=True dtype=bfloat16 weight_dtype=bfloat16 per_device_batch_size=1 max_prefill_predict_length=2080 max_target_length=2560 ici_fsdp_parallelism=1 ici_tensor_parallelism=1 ici_expert_parallelism=-1 checkpoint_storage_concurrent_gb=1024 mla_naive_kvcache=false indexer_sparse_training=true prompt="An attention function can be described as mapping a query and a set of key-value pairs to an output, where the query, keys, values, and outputs are all vectors. The output is "
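
# With `set -e` above, any failing command aborts the script immediately, so reaching this
# point means both the logit check and the decode run completed successfully.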