#!/bin/bash

# This file is documentation for how to get started with DeepSeek v3.2.

# This file runs Step 2 on v5p-128 on a daily basis.
# 1. Convert the HuggingFace checkpoint (bf16) to a MaxText-compatible checkpoint (bf16):
#    the scanned format is better for training; the unscanned format is better for decoding.
# 2. Run the logit check, pre-training, fine-tuning, and decoding.

set -ex

export MODEL_NAME='deepseek3.2-671b'
export TOKENIZER_PATH='deepseek-ai/DeepSeek-V3.2'
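
# Note: the decode step at the end of this script reads ${HF_TOKEN} for the HuggingFace
# tokenizer, even though the model is not gated. Export your own token before running;
# the value below is a placeholder, not a real token:
# export HF_TOKEN=<your-huggingface-access-token>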

# Install torch for checkpoint conversion and forward_pass_logit_checker.py.
python3 -m pip install torch --index-url https://download.pytorch.org/whl/cpu

# e.g., $HOME/maxtext/src/maxtext
export MAXTEXT_PKG_DIR="${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext}"

if [ -z "${BASE_OUTPUT_PATH}" ]; then
  # Non-Googlers, please remember to point `BASE_OUTPUT_PATH` to a GCS bucket that you own; this script uses internal buckets for testing.
  # This bucket will store all the files generated by MaxText during a run.
  export BASE_OUTPUT_PATH=gs://runner-maxtext-logs/$(date +%Y-%m-%d-%H-%M)
  echo "BASE_OUTPUT_PATH was not set; defaulting to ${BASE_OUTPUT_PATH}"
fi
BASE_OUTPUT_PATH=${BASE_OUTPUT_PATH%/}
echo "Using BASE_OUTPUT_PATH = ${BASE_OUTPUT_PATH}"

# Step 2:
# Define the checkpoint paths here so they are easy to reuse in the `train.py` and `decode.py` commands below.
# export SCANNED_CKPT_PATH=${BASE_OUTPUT_PATH}/scanned/0/items
# export UNSCANNED_CKPT_PATH=${BASE_OUTPUT_PATH}/unscanned/0/items
# Use hard-coded golden checkpoints rather than checkpoints generated by Step 1, since Step 1 is not part of the daily test.
SCANNED_CKPT_PATH=gs://maxtext-deepseek/deepseek3.2/2026-02-20/scanned/0/items
UNSCANNED_CKPT_PATH=gs://maxtext-deepseek/deepseek3.2/2026-02-20/unscanned/0/items
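
# For reference, Step 1 (not run in this daily test) converts the HuggingFace bf16 checkpoint
# into the scanned/unscanned MaxText checkpoints above. A hedged sketch, assuming MaxText's
# DeepSeek checkpoint-conversion utility; the module name and flags are illustrative and may
# differ in your MaxText version:
# python3 -m maxtext.convert_deepseek_ckpt \
#   --base_model_path <path-to-hf-bf16-checkpoint> \
#   --maxtext_model_path ${BASE_OUTPUT_PATH}/scanned \
#   --model_size ${MODEL_NAME}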

# Non-Googlers, please remember to point `DATASET_PATH` to the GCS bucket where you have your training data.
export DATASET_PATH=gs://maxtext-dataset

# Test whether the forward-pass logits match the golden logits.
# The default golden_logits_path is /deps/tests/assets/golden_logits/golden_data_{MODEL_NAME}.jsonl, copied from gs://maxtext-test-assets/golden_data_${MODEL_NAME}.jsonl.
GOLDEN_LOGITS_DISK_LOCATION="/deps/tests/assets/golden_logits/golden_data_${MODEL_NAME}.jsonl"
if [ ! -f "${GOLDEN_LOGITS_DISK_LOCATION}" ]; then
  GOLDEN_LOGITS_PATH="gs://maxtext-test-assets/golden_data_${MODEL_NAME}.jsonl"
  GOLDEN_LOGITS_DISK_LOCATION=/tmp/golden_data.jsonl
  gcloud storage cp "${GOLDEN_LOGITS_PATH}" "${GOLDEN_LOGITS_DISK_LOCATION}"
fi
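
# Optional sanity check (illustrative; not in the original flow): fail fast if the golden-logits
# file is missing or empty, rather than letting the checker fail with a less obvious error.
# [ -s "${GOLDEN_LOGITS_DISK_LOCATION}" ] || { echo "golden logits missing or empty"; exit 1; }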

# Override deepseek3.2-671b.yml with indexer_topk=2 (passed inline in the command below).
# OVERRIDE="indexer_topk=2"
# OVERRIDE=""
python3 -m tests.utils.forward_pass_logit_checker ${MAXTEXT_CONFIGS_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/configs}/base.yml base_output_directory=${BASE_OUTPUT_PATH} run_name=forward_logits_check load_parameters_path=${SCANNED_CKPT_PATH} scan_layers=true attention=dot_product per_device_batch_size=1 model_name=${MODEL_NAME} max_prefill_predict_length=4 max_target_length=4 async_checkpointing=false sparse_matmul=false ici_fsdp_parallelism=1 ici_expert_parallelism=-1 checkpoint_storage_concurrent_gb=1024 weight_dtype=float32 dtype=float32 activations_in_float32=true matmul_precision=highest float32_logits=true float32_qk_product=true indexer_sparse_training=true indexer_topk=2 --golden_logits_path=${GOLDEN_LOGITS_DISK_LOCATION} --max_kl_div=0.3
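
# The header mentions pre-training and fine-tuning, but neither is run in this file. A hedged
# sketch of a short pre-training smoke test, mirroring the flags above; the module path, step
# count, and flag values are illustrative, not a verified configuration:
# python3 -m maxtext.train ${MAXTEXT_CONFIGS_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/configs}/base.yml \
#   base_output_directory=${BASE_OUTPUT_PATH} run_name=pre_training dataset_path=${DATASET_PATH} \
#   load_parameters_path=${SCANNED_CKPT_PATH} model_name=${MODEL_NAME} scan_layers=true \
#   per_device_batch_size=1 steps=5 async_checkpointing=false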

# Run decoding - tokamax_gmm implementation.
# Note: decode requires the HuggingFace access token for the tokenizer even if the model is not gated.
python3 -m maxtext.inference.decode ${MAXTEXT_CONFIGS_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/configs}/base.yml base_output_directory=${BASE_OUTPUT_PATH} run_name=decode model_name=${MODEL_NAME} tokenizer_type=huggingface tokenizer_path=${TOKENIZER_PATH} hf_access_token=${HF_TOKEN} load_parameters_path=${UNSCANNED_CKPT_PATH} scan_layers=False attention=dot_product sparse_matmul=True use_tokamax_gmm=True dtype=bfloat16 weight_dtype=bfloat16 per_device_batch_size=1 max_prefill_predict_length=2080 max_target_length=2560 ici_fsdp_parallelism=1 ici_tensor_parallelism=1 ici_expert_parallelism=-1 checkpoint_storage_concurrent_gb=1024 mla_naive_kvcache=false indexer_sparse_training=true prompt="An attention function can be described as mapping a query and a set of key-value pairs to an output, where the query, keys, values, and outputs are all vectors. The output is "
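
# With `set -e` above, any failing command aborts the script immediately, so reaching this
# point means both the logit check and the decode run completed successfully.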