Skip to content

Commit 931dd56

Browse files
committed
DeepSeek-V3.2 XLML: logit matching and decoding tests
1 parent d00c55e commit 931dd56

1 file changed

Lines changed: 56 additions & 0 deletions

File tree

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
#!/bin/bash

# This file is documentation for how to get started with DeepSeek v3.2.

# This file runs Step 2 on v5p-128 on a daily basis.
# 1. Convert the HuggingFace checkpoint (bf16) to MaxText-compatible checkpoint (bf16):
#    Scanned format is better for training; unscanned format is better for decoding.
# 2. Run logit check, pre-training, fine-tuning, and decoding.

# -e: abort on the first failing command; -x: echo each command, useful in CI logs.
set -ex
# Model and tokenizer identifiers consumed by every MaxText command below.
export MODEL_NAME='deepseek3.2-671b'
export TOKENIZER_PATH='deepseek-ai/DeepSeek-V3.2'
# Installing torch (CPU-only wheel) for checkpoint conversion and forward_pass_logit_checker.py
python3 -m pip install torch --index-url https://download.pytorch.org/whl/cpu
# Root of the MaxText Python package, e.g., $HOME/maxtext/src/maxtext.
# Respects a pre-set MAXTEXT_PKG_DIR; otherwise derives it from
# MAXTEXT_REPO_ROOT, falling back to the current working directory.
export MAXTEXT_PKG_DIR="${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext}"
if [ -z "${BASE_OUTPUT_PATH}" ]; then
  # Non-Googlers please remember to point `BASE_OUTPUT_PATH` to GCS buckets that you own, this script uses internal buckets for testing.
  # this bucket will store all the files generated by MaxText during a run
  export BASE_OUTPUT_PATH="gs://runner-maxtext-logs/$(date +%Y-%m-%d-%H-%M)"
  echo "BASE_OUTPUT_PATH is not set"
fi
# Drop a single trailing slash so later ${BASE_OUTPUT_PATH}/... joins stay clean.
BASE_OUTPUT_PATH=${BASE_OUTPUT_PATH%/}
echo "using BASE_OUTPUT_PATH = ${BASE_OUTPUT_PATH}"
# Step 2:
# We define the checkpoint paths. This way it is easier to use these paths in the `train.py` and `decode.py` commands
# export SCANNED_CKPT_PATH=${BASE_OUTPUT_PATH}/scanned/0/items
# export UNSCANNED_CKPT_PATH=${BASE_OUTPUT_PATH}/unscanned/0/items
# Use a hard-coded golden checkpoint, rather than checkpoints generated by Step 1 as it is not in daily test.
SCANNED_CKPT_PATH=gs://maxtext-deepseek/deepseek3.2/2026-02-20/scanned/0/items
UNSCANNED_CKPT_PATH=gs://maxtext-deepseek/deepseek3.2/2026-02-20/unscanned/0/items
# Non-Googlers please remember to point `DATASET_PATH` to the GCS bucket where you have your training data
export DATASET_PATH=gs://maxtext-dataset
# Test whether the forward pass logits match the golden logits
# default golden_logits_path=/deps/tests/assets/golden_logits/golden_data_{MODEL_NAME}.jsonl, copied from gs://maxtext-test-assets/golden_data_${MODEL_NAME}.jsonl
GOLDEN_LOGITS_DISK_LOCATION="/deps/tests/assets/golden_logits/golden_data_${MODEL_NAME}.jsonl"
if [ ! -f "${GOLDEN_LOGITS_DISK_LOCATION}" ]; then
  # Baked-in copy is missing; fetch the golden logits from GCS instead.
  GOLDEN_LOGITS_PATH="gs://maxtext-test-assets/golden_data_${MODEL_NAME}.jsonl"
  GOLDEN_LOGITS_DISK_LOCATION=/tmp/golden_data.jsonl
  gcloud storage cp "${GOLDEN_LOGITS_PATH}" "${GOLDEN_LOGITS_DISK_LOCATION}"
fi
# override deepseek3.2-671b.yml with indexer_topk=2
# OVERRIDE="indexer_topk=2"
# OVERRIDE = ""
# Logit-matching check against the golden logits: runs a forward pass on the
# scanned checkpoint in full float32 precision and compares per-token logits,
# failing if the KL divergence exceeds 0.3.
python3 -m tests.utils.forward_pass_logit_checker "${MAXTEXT_CONFIGS_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/configs}/base.yml" base_output_directory="${BASE_OUTPUT_PATH}" run_name=forward_logits_check load_parameters_path="${SCANNED_CKPT_PATH}" scan_layers=true attention=dot_product per_device_batch_size=1 model_name="${MODEL_NAME}" max_prefill_predict_length=4 max_target_length=4 async_checkpointing=false sparse_matmul=false ici_fsdp_parallelism=1 ici_expert_parallelism=-1 checkpoint_storage_concurrent_gb=1024 weight_dtype=float32 dtype=float32 activations_in_float32=true matmul_precision=highest float32_logits=true float32_qk_product=true indexer_sparse_training=true indexer_topk=2 --golden_logits_path="${GOLDEN_LOGITS_DISK_LOCATION}" --max_kl_div=0.3
# Run decoding - tokamax_gmm implementation
# Note decode requires the access token for huggingface tokenizer even if the model is not gated
python3 -m maxtext.inference.decode "${MAXTEXT_CONFIGS_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/configs}/base.yml" base_output_directory="${BASE_OUTPUT_PATH}" run_name=decode model_name="${MODEL_NAME}" tokenizer_type=huggingface tokenizer_path="${TOKENIZER_PATH}" hf_access_token="${HF_TOKEN}" load_parameters_path="${UNSCANNED_CKPT_PATH}" scan_layers=False attention=dot_product sparse_matmul=True use_tokamax_gmm=True dtype=bfloat16 weight_dtype=bfloat16 per_device_batch_size=1 max_prefill_predict_length=2080 max_target_length=2560 ici_fsdp_parallelism=1 ici_tensor_parallelism=1 ici_expert_parallelism=-1 checkpoint_storage_concurrent_gb=1024 mla_naive_kvcache=false indexer_sparse_training=true prompt="An attention function can be described as mapping a query and a set of key-value pairs to an output, where the query, keys, values, and outputs are all vectors. The output is "

0 commit comments

Comments
 (0)